添加dtk中的cub头文件

f8a481f8 · zhouxiang · 7b7c64c5 · f8a481f8 · f8a481f8 · f8a481f8
Commit f8a481f8 authored Oct 13, 2023 by zhouxiang
20 changed files
--- a/3rdparty/cub/rocprim/device/device_segmented_reduce.hpp
+++ b/3rdparty/cub/rocprim/device/device_segmented_reduce.hpp
+// Copyright (c) 2017-2021 Advanced Micro Devices, Inc. All rights reserved.
+//
+// Permission is hereby granted, free of charge, to any person obtaining a copy
+// of this software and associated documentation files (the "Software"), to deal
+// in the Software without restriction, including without limitation the rights
+// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+// copies of the Software, and to permit persons to whom the Software is
+// furnished to do so, subject to the following conditions:
+//
+// The above copyright notice and this permission notice shall be included in
+// all copies or substantial portions of the Software.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+// THE SOFTWARE.
+
+#ifndef ROCPRIM_DEVICE_DEVICE_SEGMENTED_REDUCE_HPP_
+#define ROCPRIM_DEVICE_DEVICE_SEGMENTED_REDUCE_HPP_
+
+#include <type_traits>
+#include <iterator>
+#include <chrono>
+
+#include "device_reduce_config.hpp"
+
+#include "../config.hpp"
+#include "../functional.hpp"
+#include "../detail/various.hpp"
+#include "../detail/match_result_type.hpp"
+
+#include "detail/device_segmented_reduce.hpp"
+
+BEGIN_ROCPRIM_NAMESPACE
+
+/// \addtogroup devicemodule
+/// @{
+
+namespace detail
+{
+
+// Kernel entry point of the segmented reduction.
+//
+// It does no work of its own: all six arguments are passed, unchanged and in
+// the same order, to the device function segmented_reduce<Config>.
+template<class Config,
+         class InputIterator,
+         class OutputIterator,
+         class OffsetIterator,
+         class ResultType,
+         class BinaryFunction>
+ROCPRIM_KERNEL
+__launch_bounds__(ROCPRIM_DEFAULT_MAX_BLOCK_SIZE)
+void segmented_reduce_kernel(InputIterator input,
+                             OutputIterator output,
+                             OffsetIterator begin_offsets,
+                             OffsetIterator end_offsets,
+                             BinaryFunction reduce_op,
+                             ResultType initial_value)
+{
+    segmented_reduce<Config>(input, output, begin_offsets, end_offsets, reduce_op, initial_value);
+}
+
+#define ROCPRIM_DETAIL_HIP_SYNC_AND_RETURN_ON_ERROR(name, size, start) \
+    { \
+        auto _error = cudaGetLastError(); \
+        if(_error != cudaSuccess) return _error; \
+        if(debug_synchronous) \
+        { \
+            std::cout << name << "(" << size << ")"; \
+            auto __error = cudaStreamSynchronize(stream); \
+            if(__error != cudaSuccess) return __error; \
+            auto _end = std::chrono::high_resolution_clock::now(); \
+            auto _d = std::chrono::duration_cast<std::chrono::duration<double>>(_end - start); \
+            std::cout << " " << _d.count() * 1000 << " ms" << '\n'; \
+        } \
+    }
+
+// Host-side launcher of the segmented reduction.
+//
+// When temporary_storage is null, only storage_size is written (a dummy,
+// non-zero size of 4 bytes) and nothing is launched. Otherwise, unless segments
+// is zero, segmented_reduce_kernel is launched on `stream` with `segments`
+// blocks of config block_size threads each; initial_value is converted to the
+// result type derived from the input value type and BinaryFunction.
+//
+// Returns the launch error or, when debug_synchronous is set, the stream
+// synchronization error; cudaSuccess otherwise.
+template<class Config,
+         class InputIterator,
+         class OutputIterator,
+         class OffsetIterator,
+         class InitValueType,
+         class BinaryFunction>
+inline
+cudaError_t segmented_reduce_impl(void * temporary_storage,
+                                 size_t& storage_size,
+                                 InputIterator input,
+                                 OutputIterator output,
+                                 unsigned int segments,
+                                 OffsetIterator begin_offsets,
+                                 OffsetIterator end_offsets,
+                                 BinaryFunction reduce_op,
+                                 InitValueType initial_value,
+                                 cudaStream_t stream,
+                                 bool debug_synchronous)
+{
+    using value_type = typename std::iterator_traits<InputIterator>::value_type;
+    using accum_type =
+        typename ::rocprim::detail::match_result_type<value_type, BinaryFunction>::type;
+
+    // Config itself, or the default reduce config when Config is default_config.
+    using selected_config = default_or_custom_config<
+        Config,
+        default_reduce_config<ROCPRIM_TARGET_ARCH, accum_type>
+    >;
+
+    // Size query: report a small non-zero size so the caller never ends up
+    // allocating 0 bytes (cudaMalloc returns nullptr for a zero size).
+    if(temporary_storage == nullptr)
+    {
+        storage_size = 4;
+        return cudaSuccess;
+    }
+
+    // Nothing to reduce.
+    if(segments == 0u)
+    {
+        return cudaSuccess;
+    }
+
+    constexpr unsigned int threads_per_block = selected_config::block_size;
+    const dim3 grid(segments);
+    const dim3 block(threads_per_block);
+
+    // Only sampled when the debug path is going to print the elapsed time.
+    std::chrono::high_resolution_clock::time_point launch_time;
+    if(debug_synchronous)
+    {
+        launch_time = std::chrono::high_resolution_clock::now();
+    }
+
+    segmented_reduce_kernel<selected_config><<<grid, block, 0, stream>>>(
+        input,
+        output,
+        begin_offsets,
+        end_offsets,
+        reduce_op,
+        static_cast<accum_type>(initial_value)
+    );
+    ROCPRIM_DETAIL_HIP_SYNC_AND_RETURN_ON_ERROR("segmented_reduce", segments, launch_time);
+
+    return cudaSuccess;
+}
+
+#undef ROCPRIM_DETAIL_HIP_SYNC_AND_RETURN_ON_ERROR
+
+} // end of detail namespace
+
+/// \brief Parallel segmented reduction primitive for device level.
+///
+/// segmented_reduce function performs a device-wide reduction operation across multiple sequences
+/// using binary \p reduce_op operator.
+///
+/// \par Overview
+/// * Returns the required size of \p temporary_storage in \p storage_size
+/// if \p temporary_storage is a null pointer.
+/// * Ranges specified by \p input must have at least \p size elements, \p output must have
+/// \p segments elements.
+/// * Ranges specified by \p begin_offsets and \p end_offsets must have
+/// at least \p segments elements. They may use the same sequence <tt>offsets</tt> of at least
+/// <tt>segments + 1</tt> elements: <tt>offsets</tt> for \p begin_offsets and
+/// <tt>offsets + 1</tt> for \p end_offsets.
+///
+/// \tparam Config - [optional] configuration of the primitive. It can be \p reduce_config or
+/// a custom class with the same members.
+/// \tparam InputIterator - random-access iterator type of the input range. Must meet the
+/// requirements of a C++ InputIterator concept. It can be a simple pointer type.
+/// \tparam OutputIterator - random-access iterator type of the output range. Must meet the
+/// requirements of a C++ OutputIterator concept. It can be a simple pointer type.
+/// \tparam OffsetIterator - random-access iterator type of segment offsets. Must meet the
+/// requirements of a C++ RandomAccessIterator concept. It can be a simple pointer type.
+/// \tparam BinaryFunction - type of binary function used for reduction. Default type
+/// is \p rocprim::plus<T>, where \p T is a \p value_type of \p InputIterator.
+/// \tparam InitValueType - type of the initial value.
+///
+/// \param [in] temporary_storage - pointer to a device-accessible temporary storage. When
+/// a null pointer is passed, the required allocation size (in bytes) is written to
+/// \p storage_size and function returns without performing the reduction operation.
+/// \param [in,out] storage_size - reference to a size (in bytes) of \p temporary_storage.
+/// \param [in] input - iterator to the first element in the range to reduce.
+/// \param [out] output - iterator to the first element in the output range.
+/// \param [in] segments - number of segments in the input range.
+/// \param [in] begin_offsets - iterator to the first element in the range of beginning offsets.
+/// \param [in] end_offsets - iterator to the first element in the range of ending offsets.
+/// \param [in] initial_value - initial value to start the reduction.
+/// \param [in] reduce_op - binary operation function object that will be used for reduction.
+/// The signature of the function should be equivalent to the following:
+/// <tt>T f(const T &a, const T &b);</tt>. The signature does not need to have
+/// <tt>const &</tt>, but function object must not modify the objects passed to it.
+/// The default value is \p BinaryFunction().
+/// \param [in] stream - [optional] HIP stream object. The default is \p 0 (default stream).
+/// \param [in] debug_synchronous - [optional] If true, synchronization after every kernel
+/// launch is forced in order to check for errors. The default value is \p false.
+///
+/// \returns \p cudaSuccess (\p 0) after successful reduction; otherwise a HIP runtime error of
+/// type \p cudaError_t.
+///
+/// \par Example
+/// \parblock
+/// In this example a device-level segmented min-reduction operation is performed on an array of
+/// integer values (<tt>short</tt>s are reduced into <tt>int</tt>s) using custom operator.
+///
+/// \code{.cpp}
+/// #include <rocprim/rocprim.hpp>
+///
+/// // custom reduce function
+/// auto min_op =
+///     [] __device__ (int a, int b) -> int
+///     {
+///         return a < b ? a : b;
+///     };
+///
+/// // Prepare input and output (declare pointers, allocate device memory etc.)
+/// unsigned int segments;   // e.g., 3
+/// short * input;           // e.g., [4, 7, 6, 2, 5, 1, 3, 8]
+/// int * output;            // empty array of 3 elements
+/// int * offsets;           // e.g. [0, 2, 3, 8]
+/// int init_value;          // e.g., 9
+///
+/// size_t temporary_storage_size_bytes;
+/// void * temporary_storage_ptr = nullptr;
+/// // Get required size of the temporary storage
+/// rocprim::segmented_reduce(
+///     temporary_storage_ptr, temporary_storage_size_bytes,
+///     input, output,
+///     segments, offsets, offsets + 1,
+///     min_op, init_value
+/// );
+///
+/// // allocate temporary storage
+/// cudaMalloc(&temporary_storage_ptr, temporary_storage_size_bytes);
+///
+/// // perform segmented reduction
+/// rocprim::segmented_reduce(
+///     temporary_storage_ptr, temporary_storage_size_bytes,
+///     input, output,
+///     segments, offsets, offsets + 1,
+///     min_op, init_value
+/// );
+/// // output: [4, 6, 1]
+/// \endcode
+/// \endparblock
+template<
+    class Config = default_config,
+    class InputIterator,
+    class OutputIterator,
+    class OffsetIterator,
+    class BinaryFunction = ::rocprim::plus<typename std::iterator_traits<InputIterator>::value_type>,
+    class InitValueType = typename std::iterator_traits<InputIterator>::value_type
+>
+inline
+cudaError_t segmented_reduce(void * temporary_storage,
+                            size_t& storage_size,
+                            InputIterator input,
+                            OutputIterator output,
+                            unsigned int segments,
+                            OffsetIterator begin_offsets,
+                            OffsetIterator end_offsets,
+                            BinaryFunction reduce_op = BinaryFunction(),
+                            InitValueType initial_value = InitValueType(),
+                            cudaStream_t stream = 0,
+                            bool debug_synchronous = false)
+{
+    // Public entry point: every argument is handed to the detail-level
+    // launcher unchanged; only Config is specified explicitly, the remaining
+    // template arguments are deduced.
+    const cudaError_t status = detail::segmented_reduce_impl<Config>(
+        temporary_storage,
+        storage_size,
+        input,
+        output,
+        segments,
+        begin_offsets,
+        end_offsets,
+        reduce_op,
+        initial_value,
+        stream,
+        debug_synchronous
+    );
+    return status;
+}
+
+/// @}
+// end of group devicemodule
+
+END_ROCPRIM_NAMESPACE
+
+#endif // ROCPRIM_DEVICE_DEVICE_SEGMENTED_REDUCE_HPP_
--- a/3rdparty/cub/rocprim/device/device_segmented_scan.hpp
+++ b/3rdparty/cub/rocprim/device/device_segmented_scan.hpp
+// Copyright (c) 2017-2021 Advanced Micro Devices, Inc. All rights reserved.
+//
+// Permission is hereby granted, free of charge, to any person obtaining a copy
+// of this software and associated documentation files (the "Software"), to deal
+// in the Software without restriction, including without limitation the rights
+// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+// copies of the Software, and to permit persons to whom the Software is
+// furnished to do so, subject to the following conditions:
+//
+// The above copyright notice and this permission notice shall be included in
+// all copies or substantial portions of the Software.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+// THE SOFTWARE.
+
+#ifndef ROCPRIM_DEVICE_DEVICE_SEGMENTED_SCAN_HPP_
+#define ROCPRIM_DEVICE_DEVICE_SEGMENTED_SCAN_HPP_
+
+#include <type_traits>
+#include <iterator>
+
+#include "../config.hpp"
+#include "../detail/various.hpp"
+#include "../detail/match_result_type.hpp"
+
+#include "../iterator/zip_iterator.hpp"
+#include "../iterator/discard_iterator.hpp"
+#include "../iterator/transform_iterator.hpp"
+#include "../iterator/counting_iterator.hpp"
+#include "../types/tuple.hpp"
+
+#include "device_scan_config.hpp"
+#include "device_scan.hpp"
+#include "detail/device_segmented_scan.hpp"
+
+BEGIN_ROCPRIM_NAMESPACE
+
+/// \addtogroup devicemodule
+/// @{
+
+namespace detail
+{
+
+// Kernel entry point of the segmented scan.
+//
+// Converts initial_value to ResultType and forwards everything else unchanged
+// to the device function segmented_scan<Exclusive, Config, ResultType>.
+template<bool Exclusive,
+         class Config,
+         class ResultType,
+         class InputIterator,
+         class OutputIterator,
+         class OffsetIterator,
+         class InitValueType,
+         class BinaryFunction>
+ROCPRIM_KERNEL
+__launch_bounds__(ROCPRIM_DEFAULT_MAX_BLOCK_SIZE)
+void segmented_scan_kernel(InputIterator input,
+                           OutputIterator output,
+                           OffsetIterator begin_offsets,
+                           OffsetIterator end_offsets,
+                           InitValueType initial_value,
+                           BinaryFunction scan_op)
+{
+    segmented_scan<Exclusive, Config, ResultType>(
+        input,
+        output,
+        begin_offsets,
+        end_offsets,
+        static_cast<ResultType>(initial_value),
+        scan_op
+    );
+}
+
+#define ROCPRIM_DETAIL_HIP_SYNC_AND_RETURN_ON_ERROR(name, size, start) \
+    { \
+        auto _error = cudaGetLastError(); \
+        if(_error != cudaSuccess) return _error; \
+        if(debug_synchronous) \
+        { \
+            std::cout << name << "(" << size << ")"; \
+            auto __error = cudaStreamSynchronize(stream); \
+            if(__error != cudaSuccess) return __error; \
+            auto _end = std::chrono::high_resolution_clock::now(); \
+            auto _d = std::chrono::duration_cast<std::chrono::duration<double>>(_end - start); \
+            std::cout << " " << _d.count() * 1000 << " ms" << '\n'; \
+        } \
+    }
+
+// Host-side launcher shared by the offset-based segmented scans.
+//
+// When temporary_storage is null, only storage_size is written (a dummy,
+// non-zero size of 4 bytes) and nothing is launched. Otherwise, unless segments
+// is zero, segmented_scan_kernel is launched on `stream` with `segments` blocks
+// of config block_size threads each. The scan result type is InitValueType for
+// an exclusive scan and the input value type for an inclusive one.
+//
+// Returns the launch error or, when debug_synchronous is set, the stream
+// synchronization error; cudaSuccess otherwise.
+template<bool Exclusive,
+         class Config,
+         class InputIterator,
+         class OutputIterator,
+         class OffsetIterator,
+         class InitValueType,
+         class BinaryFunction>
+inline
+cudaError_t segmented_scan_impl(void * temporary_storage,
+                               size_t& storage_size,
+                               InputIterator input,
+                               OutputIterator output,
+                               unsigned int segments,
+                               OffsetIterator begin_offsets,
+                               OffsetIterator end_offsets,
+                               const InitValueType initial_value,
+                               BinaryFunction scan_op,
+                               cudaStream_t stream,
+                               bool debug_synchronous)
+{
+    using value_type = typename std::iterator_traits<InputIterator>::value_type;
+    using scan_type = typename std::conditional<Exclusive, InitValueType, value_type>::type;
+
+    // Config itself, or the default scan config when Config is default_config.
+    using selected_config = default_or_custom_config<
+        Config,
+        default_scan_config<ROCPRIM_TARGET_ARCH, scan_type>
+    >;
+
+    // Size query: report a small non-zero size so the caller never ends up
+    // allocating 0 bytes (cudaMalloc returns nullptr for a zero size).
+    if(temporary_storage == nullptr)
+    {
+        storage_size = 4;
+        return cudaSuccess;
+    }
+
+    // Nothing to scan.
+    if(segments == 0u)
+    {
+        return cudaSuccess;
+    }
+
+    constexpr unsigned int threads_per_block = selected_config::block_size;
+    const dim3 grid(segments);
+    const dim3 block(threads_per_block);
+
+    // Only sampled when the debug path is going to print the elapsed time.
+    std::chrono::high_resolution_clock::time_point launch_time;
+    if(debug_synchronous)
+    {
+        launch_time = std::chrono::high_resolution_clock::now();
+    }
+
+    segmented_scan_kernel<Exclusive, selected_config, scan_type><<<grid, block, 0, stream>>>(
+        input,
+        output,
+        begin_offsets,
+        end_offsets,
+        initial_value,
+        scan_op
+    );
+    ROCPRIM_DETAIL_HIP_SYNC_AND_RETURN_ON_ERROR("segmented_scan", segments, launch_time);
+
+    return cudaSuccess;
+}
+
+#undef ROCPRIM_DETAIL_HIP_SYNC_AND_RETURN_ON_ERROR
+
+} // end of detail namespace
+
+/// \brief Parallel segmented inclusive scan primitive for device level.
+///
+/// segmented_inclusive_scan function performs a device-wide inclusive scan operation
+/// across multiple sequences from \p input using binary \p scan_op operator.
+///
+/// \par Overview
+/// * Returns the required size of \p temporary_storage in \p storage_size
+/// if \p temporary_storage is a null pointer.
+/// * Ranges specified by \p input and \p output must have at least \p size elements.
+/// * Ranges specified by \p begin_offsets and \p end_offsets must have
+/// at least \p segments elements. They may use the same sequence <tt>offsets</tt> of at least
+/// <tt>segments + 1</tt> elements: <tt>offsets</tt> for \p begin_offsets and
+/// <tt>offsets + 1</tt> for \p end_offsets.
+///
+/// \tparam Config - [optional] configuration of the primitive. It can be \p scan_config or
+/// a custom class with the same members.
+/// \tparam InputIterator - random-access iterator type of the input range. Must meet the
+/// requirements of a C++ RandomAccessIterator concept. It can be a simple pointer type.
+/// \tparam OutputIterator - random-access iterator type of the output range. Must meet the
+/// requirements of a C++ RandomAccessIterator concept. It can be a simple pointer type.
+/// \tparam OffsetIterator - random-access iterator type of segment offsets. Must meet the
+/// requirements of a C++ RandomAccessIterator concept. It can be a simple pointer type.
+/// \tparam BinaryFunction - type of binary function used for scan operation. Default type
+/// is \p rocprim::plus<T>, where \p T is a \p value_type of \p InputIterator.
+///
+/// \param [in] temporary_storage - pointer to a device-accessible temporary storage. When
+/// a null pointer is passed, the required allocation size (in bytes) is written to
+/// \p storage_size and function returns without performing the scan operation.
+/// \param [in,out] storage_size - reference to a size (in bytes) of \p temporary_storage.
+/// \param [in] input - iterator to the first element in the range to scan.
+/// \param [out] output - iterator to the first element in the output range.
+/// \param [in] segments - number of segments in the input range.
+/// \param [in] begin_offsets - iterator to the first element in the range of beginning offsets.
+/// \param [in] end_offsets - iterator to the first element in the range of ending offsets.
+/// \param [in] scan_op - binary operation function object that will be used for scan.
+/// The signature of the function should be equivalent to the following:
+/// <tt>T f(const T &a, const T &b);</tt>. The signature does not need to have
+/// <tt>const &</tt>, but function object must not modify the objects passed to it.
+/// The default value is \p BinaryFunction().
+/// \param [in] stream - [optional] HIP stream object. The default is \p 0 (default stream).
+/// \param [in] debug_synchronous - [optional] If true, synchronization after every kernel
+/// launch is forced in order to check for errors. The default value is \p false.
+///
+/// \returns \p cudaSuccess (\p 0) after successful scan; otherwise a HIP runtime error of
+/// type \p cudaError_t.
+///
+/// \par Example
+/// \parblock
+/// In this example a device-level segmented inclusive min-scan operation is performed on
+/// an array of integer values (<tt>short</tt>s are scanned into <tt>int</tt>s) using custom operator.
+///
+/// \code{.cpp}
+/// #include <rocprim/rocprim.hpp>
+///
+/// // custom scan function
+/// auto min_op =
+///     [] __device__ (int a, int b) -> int
+///     {
+///         return a < b ? a : b;
+///     };
+///
+/// // Prepare input and output (declare pointers, allocate device memory etc.)
+/// short * input;        // e.g., [4, 7, 6, 2, 5, 1, 3, 8]
+/// int   * output;       // empty array of 8 elements
+/// size_t segments;      // e.g., 3
+/// int * offsets;        // e.g. [0, 2, 4, 8]
+///
+/// size_t temporary_storage_size_bytes;
+/// void * temporary_storage_ptr = nullptr;
+/// // Get required size of the temporary storage
+/// rocprim::segmented_inclusive_scan(
+///     temporary_storage_ptr, temporary_storage_size_bytes,
+///     input, output, segments, offsets, offsets + 1, min_op
+/// );
+///
+/// // allocate temporary storage
+/// cudaMalloc(&temporary_storage_ptr, temporary_storage_size_bytes);
+///
+/// // perform scan
+/// rocprim::segmented_inclusive_scan(
+///     temporary_storage_ptr, temporary_storage_size_bytes,
+///     input, output, segments, offsets, offsets + 1, min_op
+/// );
+/// // output: [4, 4, 6, 2, 5, 1, 1, 1]
+/// \endcode
+/// \endparblock
+template<
+    class Config = default_config,
+    class InputIterator,
+    class OutputIterator,
+    class OffsetIterator,
+    class BinaryFunction = ::rocprim::plus<typename std::iterator_traits<InputIterator>::value_type>
+>
+inline
+cudaError_t segmented_inclusive_scan(void * temporary_storage,
+                                    size_t& storage_size,
+                                    InputIterator input,
+                                    OutputIterator output,
+                                    unsigned int segments,
+                                    OffsetIterator begin_offsets,
+                                    OffsetIterator end_offsets,
+                                    BinaryFunction scan_op = BinaryFunction(),
+                                    cudaStream_t stream = 0,
+                                    bool debug_synchronous = false)
+{
+    // The shared launcher always takes an initial value; the inclusive variant
+    // (Exclusive = false) passes a value-initialized object of the input
+    // value type.
+    using value_type = typename std::iterator_traits<InputIterator>::value_type;
+
+    return detail::segmented_scan_impl<false, Config>(
+        temporary_storage,
+        storage_size,
+        input,
+        output,
+        segments,
+        begin_offsets,
+        end_offsets,
+        value_type(),
+        scan_op,
+        stream,
+        debug_synchronous
+    );
+}
+
+/// \brief Parallel segmented exclusive scan primitive for device level.
+///
+/// segmented_exclusive_scan function performs a device-wide exclusive scan operation
+/// across multiple sequences from \p input using binary \p scan_op operator.
+///
+/// \par Overview
+/// * Returns the required size of \p temporary_storage in \p storage_size
+/// if \p temporary_storage is a null pointer.
+/// * Ranges specified by \p input and \p output must have at least \p size elements.
+/// * Ranges specified by \p begin_offsets and \p end_offsets must have
+/// at least \p segments elements. They may use the same sequence <tt>offsets</tt> of at least
+/// <tt>segments + 1</tt> elements: <tt>offsets</tt> for \p begin_offsets and
+/// <tt>offsets + 1</tt> for \p end_offsets.
+///
+/// \tparam Config - [optional] configuration of the primitive. It can be \p scan_config or
+/// a custom class with the same members.
+/// \tparam InputIterator - random-access iterator type of the input range. Must meet the
+/// requirements of a C++ RandomAccessIterator concept. It can be a simple pointer type.
+/// \tparam OutputIterator - random-access iterator type of the output range. Must meet the
+/// requirements of a C++ RandomAccessIterator concept. It can be a simple pointer type.
+/// \tparam OffsetIterator - random-access iterator type of segment offsets. Must meet the
+/// requirements of a C++ RandomAccessIterator concept. It can be a simple pointer type.
+/// \tparam InitValueType - type of the initial value.
+/// \tparam BinaryFunction - type of binary function used for scan operation. Default type
+/// is \p rocprim::plus<T>, where \p T is a \p value_type of \p InputIterator.
+///
+/// \param [in] temporary_storage - pointer to a device-accessible temporary storage. When
+/// a null pointer is passed, the required allocation size (in bytes) is written to
+/// \p storage_size and function returns without performing the scan operation.
+/// \param [in,out] storage_size - reference to a size (in bytes) of \p temporary_storage.
+/// \param [in] input - iterator to the first element in the range to scan.
+/// \param [out] output - iterator to the first element in the output range.
+/// \param [in] segments - number of segments in the input range.
+/// \param [in] begin_offsets - iterator to the first element in the range of beginning offsets.
+/// \param [in] end_offsets - iterator to the first element in the range of ending offsets.
+/// \param [in] initial_value - initial value to start the scan.
+/// \param [in] scan_op - binary operation function object that will be used for scan.
+/// The signature of the function should be equivalent to the following:
+/// <tt>T f(const T &a, const T &b);</tt>. The signature does not need to have
+/// <tt>const &</tt>, but function object must not modify the objects passed to it.
+/// The default value is \p BinaryFunction().
+/// \param [in] stream - [optional] HIP stream object. The default is \p 0 (default stream).
+/// \param [in] debug_synchronous - [optional] If true, synchronization after every kernel
+/// launch is forced in order to check for errors. The default value is \p false.
+///
+/// \returns \p cudaSuccess (\p 0) after successful scan; otherwise a HIP runtime error of
+/// type \p cudaError_t.
+///
+/// \par Example
+/// \parblock
+/// In this example a device-level segmented exclusive min-scan operation is performed on
+/// an array of integer values (<tt>short</tt>s are scanned into <tt>int</tt>s) using custom operator.
+///
+/// \code{.cpp}
+/// #include <rocprim/rocprim.hpp>
+///
+/// // custom scan function
+/// auto min_op =
+///     [] __device__ (int a, int b) -> int
+///     {
+///         return a < b ? a : b;
+///     };
+///
+/// // Prepare input and output (declare pointers, allocate device memory etc.)
+/// int start_value;      // e.g., 9
+/// short * input;        // e.g., [4, 7, 6, 2, 5, 1, 3, 8]
+/// int   * output;       // empty array of 8 elements
+/// size_t segments;      // e.g., 3
+/// int * offsets;        // e.g. [0, 2, 4, 8]
+///
+/// size_t temporary_storage_size_bytes;
+/// void * temporary_storage_ptr = nullptr;
+/// // Get required size of the temporary storage
+/// rocprim::segmented_exclusive_scan(
+///     temporary_storage_ptr, temporary_storage_size_bytes,
+///     input, output, segments, offsets, offsets + 1,
+///     start_value, min_op
+/// );
+///
+/// // allocate temporary storage
+/// cudaMalloc(&temporary_storage_ptr, temporary_storage_size_bytes);
+///
+/// // perform scan
+/// rocprim::segmented_exclusive_scan(
+///     temporary_storage_ptr, temporary_storage_size_bytes,
+///     input, output, segments, offsets, offsets + 1,
+///     start_value, min_op
+/// );
+/// // output: [9, 4, 9, 6, 9, 5, 1, 1]
+/// \endcode
+/// \endparblock
+template<
+    class Config = default_config,
+    class InputIterator,
+    class OutputIterator,
+    class OffsetIterator,
+    class InitValueType,
+    class BinaryFunction = ::rocprim::plus<typename std::iterator_traits<InputIterator>::value_type>
+>
+inline
+cudaError_t segmented_exclusive_scan(void * temporary_storage,
+                                    size_t& storage_size,
+                                    InputIterator input,
+                                    OutputIterator output,
+                                    unsigned int segments,
+                                    OffsetIterator begin_offsets,
+                                    OffsetIterator end_offsets,
+                                    const InitValueType initial_value,
+                                    BinaryFunction scan_op = BinaryFunction(),
+                                    cudaStream_t stream = 0,
+                                    bool debug_synchronous = false)
+{
+    // Exclusive variant of the shared launcher (Exclusive = true); all
+    // arguments, including the caller's initial value, are passed through
+    // unchanged.
+    const cudaError_t status = detail::segmented_scan_impl<true, Config>(
+        temporary_storage,
+        storage_size,
+        input,
+        output,
+        segments,
+        begin_offsets,
+        end_offsets,
+        initial_value,
+        scan_op,
+        stream,
+        debug_synchronous
+    );
+    return status;
+}
+
+/// \brief Parallel segmented inclusive scan primitive for device level.
+///
+/// segmented_inclusive_scan function performs a device-wide inclusive scan operation
+/// across multiple sequences from \p input using binary \p scan_op operator. Beginnings
+/// of the segments should be marked by value convertible to \p true at corresponding
+/// position in \p flags range.
+///
+/// \par Overview
+/// * Returns the required size of \p temporary_storage in \p storage_size
+/// if \p temporary_storage is a null pointer.
+/// * Ranges specified by \p input, \p output, and \p flags must have at least \p size elements.
+/// * \p value_type of \p HeadFlagIterator iterator should be convertible to \p bool type.
+///
+/// \tparam Config - [optional] configuration of the primitive. It can be \p scan_config or
+/// a custom class with the same members.
+/// \tparam InputIterator - random-access iterator type of the input range. Must meet the
+/// requirements of a C++ RandomAccessIterator concept. It can be a simple pointer type.
+/// \tparam OutputIterator - random-access iterator type of the output range. Must meet the
+/// requirements of a C++ RandomAccessIterator concept. It can be a simple pointer type.
+/// \tparam HeadFlagIterator - random-access iterator type of flags. Must meet the
+/// requirements of a C++ RandomAccessIterator concept. It can be a simple pointer type.
+/// \tparam BinaryFunction - type of binary function used for scan operation. Default type
+/// is \p rocprim::plus<T>, where \p T is a \p value_type of \p InputIterator.
+///
+/// \param [in] temporary_storage - pointer to a device-accessible temporary storage. When
+/// a null pointer is passed, the required allocation size (in bytes) is written to
+/// \p storage_size and function returns without performing the scan operation.
+/// \param [in,out] storage_size - reference to a size (in bytes) of \p temporary_storage.
+/// \param [in] input - iterator to the first element in the range to scan.
+/// \param [out] output - iterator to the first element in the output range.
+/// \param [in] head_flags - iterator to the first element in the range of head flags marking
+/// beginnings of each segment in the input range.
+/// \param [in] size - number of element in the input range.
+/// \param [in] scan_op - binary operation function object that will be used for scan.
+/// The signature of the function should be equivalent to the following:
+/// <tt>T f(const T &a, const T &b);</tt>. The signature does not need to have
+/// <tt>const &</tt>, but function object must not modify the objects passed to it.
+/// The default value is \p BinaryFunction().
+/// \param [in] stream - [optional] HIP stream object. The default is \p 0 (default stream).
+/// \param [in] debug_synchronous - [optional] If true, synchronization after every kernel
+/// launch is forced in order to check for errors. The default value is \p false.
+///
+/// \returns \p cudaSuccess (\p 0) after successful scan; otherwise a HIP runtime error of
+/// type \p cudaError_t.
+///
+/// \par Example
+/// \parblock
+/// In this example a device-level segmented inclusive sum operation is performed on
+/// an array of integer values (<tt>short</tt>s are added into <tt>int</tt>s).
+///
+/// \code{.cpp}
+/// #include <rocprim/rocprim.hpp>
+///
+/// // Prepare input and output (declare pointers, allocate device memory etc.)
+/// size_t size;      // e.g., 8
+/// short * input;    // e.g., [1, 2, 3, 4, 5, 6, 7, 8]
+/// int * flags;      // e.g., [1, 0, 0, 1, 0, 1, 0, 0]
+/// int * output;     // empty array of 8 elements
+///
+/// size_t temporary_storage_size_bytes;
+/// void * temporary_storage_ptr = nullptr;
+/// // Get required size of the temporary storage
+/// rocprim::segmented_inclusive_scan(
+///     temporary_storage_ptr, temporary_storage_size_bytes,
+///     input, output, flags, size, ::rocprim::plus<int>()
+/// );
+///
+/// // allocate temporary storage
+/// cudaMalloc(&temporary_storage_ptr, temporary_storage_size_bytes);
+///
+/// // perform scan
+/// rocprim::segmented_inclusive_scan(
+///     temporary_storage_ptr, temporary_storage_size_bytes,
+///     input, output, flags, size, ::rocprim::plus<int>()
+/// );
+/// // output: [1, 3, 6, 4, 9, 6, 13, 21]
+/// \endcode
+/// \endparblock
+template<
+    class Config = default_config,
+    class InputIterator,
+    class OutputIterator,
+    class HeadFlagIterator,
+    class BinaryFunction = ::rocprim::plus<typename std::iterator_traits<InputIterator>::value_type>
+>
+inline
+cudaError_t segmented_inclusive_scan(void * temporary_storage,
+                                    size_t& storage_size,
+                                    InputIterator input,
+                                    OutputIterator output,
+                                    HeadFlagIterator head_flags,
+                                    size_t size,
+                                    BinaryFunction scan_op = BinaryFunction(),
+                                    cudaStream_t stream = 0,
+                                    bool debug_synchronous = false)
+{
+    // The scan result uses the value type of the input range.
+    using value_type = typename std::iterator_traits<InputIterator>::value_type;
+    using head_flag_type = typename std::iterator_traits<HeadFlagIterator>::value_type;
+    // Operator type built from scan_op and handed to the underlying scan; it works on
+    // the (value, head flag) elements produced by the zipped input below.
+    using segmented_op_type =
+        detail::headflag_scan_op_wrapper<value_type, head_flag_type, BinaryFunction>;
+
+    // Delegate to the regular inclusive scan:
+    //  * input  = zip of (input, head_flags),
+    //  * output = zip of (output, discard iterator), so only the value part is stored.
+    return inclusive_scan<Config>(
+        temporary_storage,
+        storage_size,
+        rocprim::make_zip_iterator(rocprim::make_tuple(input, head_flags)),
+        rocprim::make_zip_iterator(rocprim::make_tuple(output, rocprim::make_discard_iterator())),
+        size,
+        segmented_op_type(scan_op),
+        stream,
+        debug_synchronous
+    );
+}
+
+/// \brief Parallel segmented exclusive scan primitive for device level.
+///
+/// segmented_exclusive_scan function performs a device-wide exclusive scan operation
+/// across multiple sequences from \p input using binary \p scan_op operator. Beginnings
+/// of the segments should be marked by value convertible to \p true at corresponding
+/// position in \p head_flags range.
+///
+/// \par Overview
+/// * Returns the required size of \p temporary_storage in \p storage_size
+/// if \p temporary_storage is a null pointer.
+/// * Ranges specified by \p input, \p output, and \p head_flags must have at least \p size elements.
+/// * \p value_type of \p HeadFlagIterator iterator should be convertible to \p bool type.
+///
+/// \tparam Config - [optional] configuration of the primitive. It can be \p scan_config or
+/// a custom class with the same members.
+/// \tparam InputIterator - random-access iterator type of the input range. Must meet the
+/// requirements of a C++ RandomAccessIterator concept. It can be a simple pointer type.
+/// \tparam OutputIterator - random-access iterator type of the output range. Must meet the
+/// requirements of a C++ RandomAccessIterator concept. It can be a simple pointer type.
+/// \tparam HeadFlagIterator - random-access iterator type of flags. Must meet the
+/// requirements of a C++ RandomAccessIterator concept. It can be a simple pointer type.
+/// \tparam InitValueType - type of the initial value.
+/// \tparam BinaryFunction - type of binary function used for scan operation. Default type
+/// is \p rocprim::plus<T>, where \p T is a \p value_type of \p InputIterator.
+///
+/// \param [in] temporary_storage - pointer to a device-accessible temporary storage. When
+/// a null pointer is passed, the required allocation size (in bytes) is written to
+/// \p storage_size and function returns without performing the scan operation.
+/// \param [in,out] storage_size - reference to a size (in bytes) of \p temporary_storage.
+/// \param [in] input - iterator to the first element in the range to scan.
+/// \param [out] output - iterator to the first element in the output range.
+/// \param [in] head_flags - iterator to the first element in the range of head flags marking
+/// beginnings of each segment in the input range.
+/// \param [in] initial_value - initial value to start the scan.
+/// \param [in] size - number of element in the input range.
+/// \param [in] scan_op - binary operation function object that will be used for scan.
+/// The signature of the function should be equivalent to the following:
+/// <tt>T f(const T &a, const T &b);</tt>. The signature does not need to have
+/// <tt>const &</tt>, but function object must not modify the objects passed to it.
+/// The default value is \p BinaryFunction().
+/// \param [in] stream - [optional] HIP stream object. The default is \p 0 (default stream).
+/// \param [in] debug_synchronous - [optional] If true, synchronization after every kernel
+/// launch is forced in order to check for errors. The default value is \p false.
+///
+/// \returns \p cudaSuccess (\p 0) after successful scan; otherwise a HIP runtime error of
+/// type \p cudaError_t.
+///
+/// \par Example
+/// \parblock
+/// In this example a device-level segmented exclusive sum operation is performed on
+/// an array of integer values (<tt>short</tt>s are added into <tt>int</tt>s).
+///
+/// \code{.cpp}
+/// #include <rocprim/rocprim.hpp>
+///
+/// // Prepare input and output (declare pointers, allocate device memory etc.)
+/// size_t size;      // e.g., 8
+/// short * input;    // e.g., [1, 2, 3, 4, 5, 6, 7, 8]
+/// int * flags;      // e.g., [1, 0, 0, 1, 0, 1, 0, 0]
+/// int init;         // e.g., 9
+/// int * output;     // empty array of 8 elements
+///
+/// size_t temporary_storage_size_bytes;
+/// void * temporary_storage_ptr = nullptr;
+/// // Get required size of the temporary storage
+/// rocprim::segmented_exclusive_scan(
+///     temporary_storage_ptr, temporary_storage_size_bytes,
+///     input, output, flags, init, size, ::rocprim::plus<int>()
+/// );
+///
+/// // allocate temporary storage
+/// cudaMalloc(&temporary_storage_ptr, temporary_storage_size_bytes);
+///
+/// // perform scan
+/// rocprim::segmented_exclusive_scan(
+///     temporary_storage_ptr, temporary_storage_size_bytes,
+///     input, output, flags, init, size, ::rocprim::plus<int>()
+/// );
+/// // output: [9, 10, 12, 9, 13, 9, 15, 22]
+/// \endcode
+/// \endparblock
+template<
+    class Config = default_config,
+    class InputIterator,
+    class OutputIterator,
+    class InitValueType,
+    class HeadFlagIterator,
+    class BinaryFunction = ::rocprim::plus<typename std::iterator_traits<InputIterator>::value_type>
+>
+inline
+cudaError_t segmented_exclusive_scan(void * temporary_storage,
+                                    size_t& storage_size,
+                                    InputIterator input,
+                                    OutputIterator output,
+                                    HeadFlagIterator head_flags,
+                                    const InitValueType initial_value,
+                                    size_t size,
+                                    BinaryFunction scan_op = BinaryFunction(),
+                                    cudaStream_t stream = 0,
+                                    bool debug_synchronous = false)
+{
+    // The scan result uses the type of the initial value.
+    using value_type = InitValueType;
+    using head_flag_type = typename std::iterator_traits<HeadFlagIterator>::value_type;
+    // Operator type built from scan_op and handed to the underlying scan.
+    using segmented_op_type =
+        detail::headflag_scan_op_wrapper<value_type, head_flag_type, BinaryFunction>;
+
+    const value_type init = static_cast<value_type>(initial_value);
+
+    // The exclusive scan is fed a generated sequence instead of the raw input:
+    // element i carries the head flag of element i + 1 (false for the last element),
+    // and whenever that flag is set its value is replaced by the initial value.
+    // In other words, the last item of every segment acts as the head of the next one.
+    return exclusive_scan<Config>(
+        temporary_storage,
+        storage_size,
+        rocprim::make_transform_iterator(
+            rocprim::make_counting_iterator<size_t>(0),
+            [input, head_flags, init, size]
+            ROCPRIM_DEVICE
+            (const size_t index)
+            {
+                const size_t next = index + 1;
+
+                // Head flag of the following element; nothing follows the last one.
+                head_flag_type next_is_head(false);
+                if(next < size)
+                {
+                    next_is_head = head_flags[next];
+                }
+
+                // Use the initial value when the next element starts a segment,
+                // otherwise the actual input value.
+                value_type item = init;
+                if(!next_is_head)
+                {
+                    item = input[index];
+                }
+
+                return rocprim::make_tuple(item, next_is_head);
+            }
+        ),
+        // Only the value part of each result is stored; the flag part is discarded.
+        rocprim::make_zip_iterator(rocprim::make_tuple(output, rocprim::make_discard_iterator())),
+        // The scan's initial element is flagged as the head of the first segment.
+        rocprim::make_tuple(init, head_flag_type(true)),
+        size,
+        segmented_op_type(scan_op),
+        stream,
+        debug_synchronous
+    );
+}
+
+/// @}
+// end of group devicemodule
+
+END_ROCPRIM_NAMESPACE
+
+#endif // ROCPRIM_DEVICE_DEVICE_SEGMENTED_SCAN_HPP_
--- a/3rdparty/cub/rocprim/device/device_select.hpp
+++ b/3rdparty/cub/rocprim/device/device_select.hpp
+// Copyright (c) 2018-2019 Advanced Micro Devices, Inc. All rights reserved.
+//
+// Permission is hereby granted, free of charge, to any person obtaining a copy
+// of this software and associated documentation files (the "Software"), to deal
+// in the Software without restriction, including without limitation the rights
+// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+// copies of the Software, and to permit persons to whom the Software is
+// furnished to do so, subject to the following conditions:
+//
+// The above copyright notice and this permission notice shall be included in
+// all copies or substantial portions of the Software.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+// THE SOFTWARE.
+
+#ifndef ROCPRIM_DEVICE_DEVICE_SELECT_HPP_
+#define ROCPRIM_DEVICE_DEVICE_SELECT_HPP_
+
+#include <type_traits>
+#include <iterator>
+
+#include "../config.hpp"
+#include "../detail/various.hpp"
+#include "../detail/binary_op_wrappers.hpp"
+
+#include "../iterator/transform_iterator.hpp"
+
+#include "device_scan.hpp"
+#include "device_partition.hpp"
+
+BEGIN_ROCPRIM_NAMESPACE
+
+/// \addtogroup devicemodule
+/// @{
+
+namespace detail
+{
+
+// Debug helper macro: checks a previously obtained runtime status and, when
+// debug synchronization is requested, waits for the stream and prints timing
+// information.
+//
+// The expansion refers to three names that must already exist at the point of
+// use: `error` (a status comparable with cudaSuccess), `debug_synchronous` and
+// `stream`.
+//  - If `error` is not cudaSuccess, it is returned from the enclosing function.
+//  - If `debug_synchronous` is true: `name` and `size` are printed, then
+//    cudaStreamSynchronize(stream) is called (its result is stored in an inner
+//    `error` that shadows the outer one and is returned on failure), and finally
+//    the time elapsed since `start` is printed in milliseconds.
+//
+// NOTE(review): the expansion uses std::cout and std::chrono, but this header
+// does not include <iostream> or <chrono> directly - confirm they are provided
+// by the headers included above before expanding the macro. No expansion of the
+// macro is visible in this header before it is #undef'd at the end of the file.
+#define ROCPRIM_DETAIL_HIP_SYNC_AND_RETURN_ON_ERROR(name, size, start) \
+    { \
+        if(error != cudaSuccess) return error; \
+        if(debug_synchronous) \
+        { \
+            std::cout << name << "(" << size << ")"; \
+            auto error = cudaStreamSynchronize(stream); \
+            if(error != cudaSuccess) return error; \
+            auto end = std::chrono::high_resolution_clock::now(); \
+            auto d = std::chrono::duration_cast<std::chrono::duration<double>>(end - start); \
+            std::cout << " " << d.count() * 1000 << " ms" << '\n'; \
+        } \
+    }
+
+} // end detail namespace
+
+/// \brief Parallel select primitive for device level using range of flags.
+///
+/// Performs a device-wide selection based on input \p flags. If a value from \p input
+/// should be selected and copied into \p output range the corresponding item from
+/// \p flags range should be set to such value that can be implicitly converted to
+/// \p true (\p bool type).
+///
+/// \par Overview
+/// * Returns the required size of \p temporary_storage in \p storage_size
+/// if \p temporary_storage is a null pointer.
+/// * Ranges specified by \p input and \p flags must have at least \p size elements.
+/// * Range specified by \p output must have at least so many elements, that all positively
+/// flagged values can be copied into it.
+/// * Range specified by \p selected_count_output must have at least 1 element.
+/// * Values of \p flag range should be implicitly convertible to `bool` type.
+///
+/// \tparam Config - [optional] configuration of the primitive. It can be \p select_config or
+/// a custom class with the same members.
+/// \tparam InputIterator - random-access iterator type of the input range. It can be
+/// a simple pointer type.
+/// \tparam FlagIterator - random-access iterator type of the flag range. It can be
+/// a simple pointer type.
+/// \tparam OutputIterator - random-access iterator type of the output range. It can be
+/// a simple pointer type.
+/// \tparam SelectedCountOutputIterator - random-access iterator type of the selected_count_output
+/// value. It can be a simple pointer type.
+///
+/// \param [in] temporary_storage - pointer to a device-accessible temporary storage. When
+/// a null pointer is passed, the required allocation size (in bytes) is written to
+/// \p storage_size and function returns without performing the select operation.
+/// \param [in,out] storage_size - reference to a size (in bytes) of \p temporary_storage.
+/// \param [in] input - iterator to the first element in the range to select values from.
+/// \param [in] flags - iterator to the selection flag corresponding to the first element from \p input range.
+/// \param [out] output - iterator to the first element in the output range.
+/// \param [out] selected_count_output - iterator to the total number of selected values (length of \p output).
+/// \param [in] size - number of element in the input range.
+/// \param [in] stream - [optional] HIP stream object. The default is \p 0 (default stream).
+/// \param [in] debug_synchronous - [optional] If true, synchronization after every kernel
+/// launch is forced in order to check for errors. The default value is \p false.
+///
+/// \par Example
+/// \parblock
+/// In this example a device-level select operation is performed on an array of
+/// integer values with array of <tt>char</tt>s used as flags.
+///
+/// \code{.cpp}
+/// #include <rocprim/rocprim.hpp>
+///
+/// // Prepare input and output (declare pointers, allocate device memory etc.)
+/// size_t input_size;     // e.g., 8
+/// int * input;           // e.g., [1, 2, 3, 4, 5, 6, 7, 8]
+/// char * flags;          // e.g., [0, 1, 1, 0, 0, 1, 0, 1]
+/// int * output;          // empty array of 8 elements
+/// size_t * output_count; // empty array of 1 element
+///
+/// size_t temporary_storage_size_bytes;
+/// void * temporary_storage_ptr = nullptr;
+/// // Get required size of the temporary storage
+/// rocprim::select(
+///     temporary_storage_ptr, temporary_storage_size_bytes,
+///     input, flags,
+///     output, output_count,
+///     input_size
+/// );
+///
+/// // allocate temporary storage
+/// cudaMalloc(&temporary_storage_ptr, temporary_storage_size_bytes);
+///
+/// // perform selection
+/// rocprim::select(
+///     temporary_storage_ptr, temporary_storage_size_bytes,
+///     input, flags,
+///     output, output_count,
+///     input_size
+/// );
+/// // output: [2, 3, 6, 8]
+/// // output_count: 4
+/// \endcode
+/// \endparblock
+template<
+    class Config = default_config,
+    class InputIterator,
+    class FlagIterator,
+    class OutputIterator,
+    class SelectedCountOutputIterator
+>
+inline
+cudaError_t select(void * temporary_storage,
+                  size_t& storage_size,
+                  InputIterator input,
+                  FlagIterator flags,
+                  OutputIterator output,
+                  SelectedCountOutputIterator selected_count_output,
+                  const size_t size,
+                  const cudaStream_t stream = 0,
+                  const bool debug_synchronous = false)
+{
+    // Flag-based selection needs neither a predicate nor an inequality operation,
+    // so both are passed as empty placeholder objects.
+    using placeholder_type = ::rocprim::empty_type;
+    using index_type = unsigned int;
+
+    // Keys only: there are no value ranges to move along with the input.
+    placeholder_type* const null_values = nullptr;
+
+    return detail::partition_impl<detail::select_method::flag, true, Config, index_type>(
+        temporary_storage,
+        storage_size,
+        input,
+        null_values,
+        flags,
+        output,
+        null_values,
+        selected_count_output,
+        size,
+        placeholder_type(), // dummy inequality operation
+        stream,
+        debug_synchronous,
+        placeholder_type()  // dummy unary predicate
+    );
+}
+
+/// \brief Parallel select primitive for device level using selection operator.
+///
+/// Performs a device-wide selection using selection operator. If a value \p x from \p input
+/// should be selected and copied into \p output range, then <tt>predicate(x)</tt> has to
+/// return \p true.
+///
+/// \par Overview
+/// * Returns the required size of \p temporary_storage in \p storage_size
+/// if \p temporary_storage is a null pointer.
+/// * Range specified by \p input must have at least \p size elements.
+/// * Range specified by \p output must have at least so many elements, that all selected
+/// values can be copied into it.
+/// * Range specified by \p selected_count_output must have at least 1 element.
+///
+/// \tparam Config - [optional] configuration of the primitive. It can be \p select_config or
+/// a custom class with the same members.
+/// \tparam InputIterator - random-access iterator type of the input range. It can be
+/// a simple pointer type.
+/// \tparam OutputIterator - random-access iterator type of the output range. It can be
+/// a simple pointer type.
+/// \tparam SelectedCountOutputIterator - random-access iterator type of the selected_count_output
+/// value. It can be a simple pointer type.
+/// \tparam UnaryPredicate - type of a unary selection predicate.
+///
+/// \param [in] temporary_storage - pointer to a device-accessible temporary storage. When
+/// a null pointer is passed, the required allocation size (in bytes) is written to
+/// \p storage_size and function returns without performing the select operation.
+/// \param [in,out] storage_size - reference to a size (in bytes) of \p temporary_storage.
+/// \param [in] input - iterator to the first element in the range to select values from.
+/// \param [out] output - iterator to the first element in the output range.
+/// \param [out] selected_count_output - iterator to the total number of selected values (length of \p output).
+/// \param [in] size - number of element in the input range.
+/// \param [in] predicate - unary function object that will be used for selecting values.
+/// The signature of the function should be equivalent to the following:
+/// <tt>bool f(const T &a);</tt>. The signature does not need to have
+/// <tt>const &</tt>, but function object must not modify the object passed to it.
+/// \param [in] stream - [optional] HIP stream object. The default is \p 0 (default stream).
+/// \param [in] debug_synchronous - [optional] If true, synchronization after every kernel
+/// launch is forced in order to check for errors. The default value is \p false.
+///
+/// \par Example
+/// \parblock
+/// In this example a device-level select operation is performed on an array of
+/// integer values, only even values are selected.
+///
+/// \code{.cpp}
+/// #include <rocprim/rocprim.hpp>
+///
+/// auto predicate =
+///     [] __device__ (int a) -> bool
+///     {
+///         return (a%2) == 0;
+///     };
+///
+/// // Prepare input and output (declare pointers, allocate device memory etc.)
+/// size_t input_size;     // e.g., 8
+/// int * input;           // e.g., [1, 2, 3, 4, 5, 6, 7, 8]
+/// int * output;          // empty array of 8 elements
+/// size_t * output_count; // empty array of 1 element
+///
+/// size_t temporary_storage_size_bytes;
+/// void * temporary_storage_ptr = nullptr;
+/// // Get required size of the temporary storage
+/// rocprim::select(
+///     temporary_storage_ptr, temporary_storage_size_bytes,
+///     input, output, output_count,
+///     input_size, predicate
+/// );
+///
+/// // allocate temporary storage
+/// cudaMalloc(&temporary_storage_ptr, temporary_storage_size_bytes);
+///
+/// // perform selection
+/// rocprim::select(
+///     temporary_storage_ptr, temporary_storage_size_bytes,
+///     input, output, output_count,
+///     input_size, predicate
+/// );
+/// // output: [2, 4, 6, 8]
+/// // output_count: 4
+/// \endcode
+/// \endparblock
+template<
+    class Config = default_config,
+    class InputIterator,
+    class OutputIterator,
+    class SelectedCountOutputIterator,
+    class UnaryPredicate
+>
+inline
+cudaError_t select(void * temporary_storage,
+                  size_t& storage_size,
+                  InputIterator input,
+                  OutputIterator output,
+                  SelectedCountOutputIterator selected_count_output,
+                  const size_t size,
+                  UnaryPredicate predicate,
+                  const cudaStream_t stream = 0,
+                  const bool debug_synchronous = false)
+{
+    // Predicate-based selection uses neither flags nor an inequality operation;
+    // empty placeholders stand in for both.
+    using placeholder_type = ::rocprim::empty_type;
+    using index_type = unsigned int;
+
+    // Dummy flag range.
+    placeholder_type * null_flags = nullptr;
+    // Keys only: there are no value ranges to move along with the input.
+    placeholder_type* const null_values = nullptr;
+
+    return detail::partition_impl<detail::select_method::predicate, true, Config, index_type>(
+        temporary_storage,
+        storage_size,
+        input,
+        null_values,
+        null_flags,
+        output,
+        null_values,
+        selected_count_output,
+        size,
+        placeholder_type(), // dummy inequality operation
+        stream,
+        debug_synchronous,
+        predicate
+    );
+}
+
+/// \brief Device-level parallel unique primitive.
+///
+/// From given \p input range unique primitive eliminates all but the first element from every
+/// consecutive group of equivalent elements and copies them into \p output.
+///
+/// \par Overview
+/// * Returns the required size of \p temporary_storage in \p storage_size
+/// if \p temporary_storage is a null pointer.
+/// * Range specified by \p input must have at least \p size elements.
+/// * Range specified by \p output must have at least so many elements, that all selected
+/// values can be copied into it.
+/// * Range specified by \p unique_count_output must have at least 1 element.
+/// * By default <tt>InputIterator::value_type</tt>'s equality operator is used to check
+/// if elements are equivalent.
+///
+/// \tparam InputIterator - random-access iterator type of the input range. It can be
+/// a simple pointer type.
+/// \tparam OutputIterator - random-access iterator type of the output range. It can be
+/// a simple pointer type.
+/// \tparam UniqueCountOutputIterator - random-access iterator type of the unique_count_output
+/// value used to return number of unique values. It can be a simple pointer type.
+/// \tparam EqualityOp - type of an binary operator used to compare values for equality.
+///
+/// \param [in] temporary_storage - pointer to a device-accessible temporary storage. When
+/// a null pointer is passed, the required allocation size (in bytes) is written to
+/// \p storage_size and function returns without performing the unique operation.
+/// \param [in,out] storage_size - reference to a size (in bytes) of \p temporary_storage.
+/// \param [in] input - iterator to the first element in the range to select values from.
+/// \param [out] output - iterator to the first element in the output range.
+/// \param [out] unique_count_output - iterator to the total number of selected values (length of \p output).
+/// \param [in] size - number of element in the input range.
+/// \param [in] equality_op - [optional] binary function object used to compare input values for equality.
+/// The signature of the function should be equivalent to the following:
+/// <tt>bool equal_to(const T &a, const T &b);</tt>. The signature does not need to have
+/// <tt>const &</tt>, but function object must not modify the object passed to it.
+/// \param [in] stream - [optional] HIP stream object. The default is \p 0 (default stream).
+/// \param [in] debug_synchronous - [optional] If true, synchronization after every kernel
+/// launch is forced in order to check for errors. The default value is \p false.
+///
+/// \par Example
+/// \parblock
+/// In this example a device-level unique operation is performed on an array of integer values.
+///
+/// \code{.cpp}
+/// #include <rocprim/rocprim.hpp>
+///
+/// // Prepare input and output (declare pointers, allocate device memory etc.)
+/// size_t input_size;     // e.g., 8
+/// int * input;           // e.g., [1, 4, 2, 4, 4, 7, 7, 7]
+/// int * output;          // empty array of 8 elements
+/// size_t * output_count; // empty array of 1 element
+///
+/// size_t temporary_storage_size_bytes;
+/// void * temporary_storage_ptr = nullptr;
+/// // Get required size of the temporary storage
+/// rocprim::unique(
+///     temporary_storage_ptr, temporary_storage_size_bytes,
+///     input, output, output_count,
+///     input_size
+/// );
+///
+/// // allocate temporary storage
+/// cudaMalloc(&temporary_storage_ptr, temporary_storage_size_bytes);
+///
+/// // perform unique operation
+/// rocprim::unique(
+///     temporary_storage_ptr, temporary_storage_size_bytes,
+///     input, output, output_count,
+///     input_size
+/// );
+/// // output: [1, 4, 2, 4, 7]
+/// // output_count: 5
+/// \endcode
+/// \endparblock
+template<
+    class Config = default_config,
+    class InputIterator,
+    class OutputIterator,
+    class UniqueCountOutputIterator,
+    class EqualityOp = ::rocprim::equal_to<typename std::iterator_traits<InputIterator>::value_type>
+>
+inline
+cudaError_t unique(void * temporary_storage,
+                  size_t& storage_size,
+                  InputIterator input,
+                  OutputIterator output,
+                  UniqueCountOutputIterator unique_count_output,
+                  const size_t size,
+                  EqualityOp equality_op = EqualityOp(),
+                  const cudaStream_t stream = 0,
+                  const bool debug_synchronous = false)
+{
+    // The unique method uses neither flags nor a unary predicate;
+    // empty placeholders stand in for both.
+    using placeholder_type = ::rocprim::empty_type;
+    using index_type = unsigned int;
+
+    // Dummy flag range.
+    const placeholder_type * null_flags = nullptr;
+    // Keys only: there are no value ranges to move along with the input.
+    placeholder_type* const null_values = nullptr;
+
+    // The implementation takes an inequality operation, so the user-provided
+    // equality operator is wrapped accordingly.
+    auto not_equal_op = detail::inequality_wrapper<EqualityOp>(equality_op);
+
+    return detail::partition_impl<detail::select_method::unique, true, Config, index_type>(
+        temporary_storage,
+        storage_size,
+        input,
+        null_values,
+        null_flags,
+        output,
+        null_values,
+        unique_count_output,
+        size,
+        not_equal_op,
+        stream,
+        debug_synchronous,
+        placeholder_type() // dummy unary predicate
+    );
+}
+
+/// \brief Device-level parallel unique by key primitive.
+///
+/// From the given \p keys_input range the unique_by_key primitive eliminates all but the first
+/// element from every consecutive group of equivalent keys and copies the remaining keys and
+/// their corresponding values into \p keys_output and \p values_output.
+///
+/// \par Overview
+/// * Returns the required size of \p temporary_storage in \p storage_size
+/// if \p temporary_storage is a null pointer.
+/// * Ranges specified by \p keys_input and \p values_input must have at least \p size elements each.
+/// * Ranges specified by \p keys_output and \p values_output each must have at least so many elements,
+/// that all selected values can be copied into them.
+/// * Range specified by \p unique_count_output must have at least 1 element.
+/// * By default <tt>KeyIterator::value_type</tt>'s equality operator is used to check
+/// if keys are equivalent.
+///
+/// \tparam KeyIterator - random-access iterator type of the input key range. It can be
+/// a simple pointer type.
+/// \tparam ValueIterator - random-access iterator type of the input value range. It can be
+/// a simple pointer type.
+/// \tparam OutputKeyIterator - random-access iterator type of the output key range. It can be
+/// a simple pointer type.
+/// \tparam OutputValueIterator - random-access iterator type of the output value range. It can be
+/// a simple pointer type.
+/// \tparam UniqueCountOutputIterator - random-access iterator type of the unique_count_output
+/// value used to return number of unique keys and values. It can be a simple pointer type.
+/// \tparam EqualityOp - type of an binary operator used to compare keys for equality.
+///
+/// \param [in] temporary_storage - pointer to a device-accessible temporary storage. When
+/// a null pointer is passed, the required allocation size (in bytes) is written to
+/// \p storage_size and function returns without performing the unique operation.
+/// \param [in,out] storage_size - reference to a size (in bytes) of \p temporary_storage.
+/// \param [in] keys_input - iterator to the first element in the range to select keys from.
+/// \param [in] values_input - iterator to the first element in the range of values corresponding to keys
+/// \param [out] keys_output - iterator to the first element in the output key range.
+/// \param [out] values_output - iterator to the first element in the output value range.
+/// \param [out] unique_count_output - iterator to the total number of selected values (length of \p keys_output and \p values_output).
+/// \param [in] size - number of element in the input range.
+/// \param [in] equality_op - [optional] binary function object used to compare input values for equality.
+/// The signature of the function should be equivalent to the following:
+/// <tt>bool equal_to(const T &a, const T &b);</tt>. The signature does not need to have
+/// <tt>const &</tt>, but function object must not modify the object passed to it.
+/// \param [in] stream - [optional] HIP stream object. The default is \p 0 (default stream).
+/// \param [in] debug_synchronous - [optional] If true, synchronization after every kernel
+/// launch is forced in order to check for errors. The default value is \p false.
+template <typename Config = default_config,
+          typename KeyIterator,
+          typename ValueIterator,
+          typename OutputKeyIterator,
+          typename OutputValueIterator,
+          typename UniqueCountOutputIterator,
+          typename EqualityOp
+          = ::rocprim::equal_to<typename std::iterator_traits<KeyIterator>::value_type>>
+inline cudaError_t unique_by_key(void*                           temporary_storage,
+                                size_t&                         storage_size,
+                                const KeyIterator               keys_input,
+                                const ValueIterator             values_input,
+                                const OutputKeyIterator         keys_output,
+                                const OutputValueIterator       values_output,
+                                const UniqueCountOutputIterator unique_count_output,
+                                const size_t                    size,
+                                const EqualityOp                equality_op       = EqualityOp(),
+                                const cudaStream_t               stream            = 0,
+                                const bool                      debug_synchronous = false)
+{
+    using placeholder_type = ::rocprim::empty_type;
+    using index_type       = unsigned int;
+
+    // The unique method uses neither flags nor a unary predicate;
+    // empty placeholders stand in for both.
+    placeholder_type* const null_flags = nullptr;
+    const placeholder_type  null_predicate{};
+
+    // The implementation takes an inequality operation, so the user-provided
+    // equality operator is wrapped accordingly.
+    const detail::inequality_wrapper<EqualityOp> not_equal_op(equality_op);
+
+    return detail::partition_impl<detail::select_method::unique, true, Config, index_type>(
+        temporary_storage, storage_size,
+        keys_input, values_input,
+        null_flags,
+        keys_output, values_output,
+        unique_count_output,
+        size,
+        not_equal_op,
+        stream, debug_synchronous,
+        null_predicate);
+}
+
+#undef ROCPRIM_DETAIL_HIP_SYNC_AND_RETURN_ON_ERROR
+
+/// @}
+// end of group devicemodule
+
+END_ROCPRIM_NAMESPACE
+
+#endif // ROCPRIM_DEVICE_DEVICE_SELECT_HPP_
--- a/3rdparty/cub/rocprim/device/device_select_config.hpp
+++ b/3rdparty/cub/rocprim/device/device_select_config.hpp
+// Copyright (c) 2018-2019 Advanced Micro Devices, Inc. All rights reserved.
+//
+// Permission is hereby granted, free of charge, to any person obtaining a copy
+// of this software and associated documentation files (the "Software"), to deal
+// in the Software without restriction, including without limitation the rights
+// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+// copies of the Software, and to permit persons to whom the Software is
+// furnished to do so, subject to the following conditions:
+//
+// The above copyright notice and this permission notice shall be included in
+// all copies or substantial portions of the Software.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+// THE SOFTWARE.
+
+#ifndef ROCPRIM_DEVICE_DEVICE_SELECT_CONFIG_HPP_
+#define ROCPRIM_DEVICE_DEVICE_SELECT_CONFIG_HPP_
+
+#include <type_traits>
+
+#include "../config.hpp"
+#include "../detail/various.hpp"
+
+#include "../block/block_load.hpp"
+#include "../block/block_scan.hpp"
+
+#include "config_types.hpp"
+
+/// \addtogroup primitivesmodule_deviceconfigs
+/// @{
+
+BEGIN_ROCPRIM_NAMESPACE
+
+/// \brief Compile-time configuration of the device-level select operation.
+///
+/// Every template argument is re-exported as a static constant of the same meaning.
+///
+/// \tparam BlockSize - number of threads in a block.
+/// \tparam ItemsPerThread - number of items processed by each thread.
+/// \tparam KeyBlockLoadMethod - method for loading input keys.
+/// \tparam ValueBlockLoadMethod - method for loading input values.
+/// \tparam FlagBlockLoadMethod - method for loading flag values.
+/// \tparam BlockScanMethod - algorithm for block scan.
+/// \tparam SizeLimit - limit on the number of items for a single select kernel launch
+/// (defaults to \p ROCPRIM_GRID_SIZE_LIMIT).
+template<unsigned int                    BlockSize,
+         unsigned int                    ItemsPerThread,
+         ::rocprim::block_load_method    KeyBlockLoadMethod,
+         ::rocprim::block_load_method    ValueBlockLoadMethod,
+         ::rocprim::block_load_method    FlagBlockLoadMethod,
+         ::rocprim::block_scan_algorithm BlockScanMethod,
+         unsigned int                    SizeLimit = ROCPRIM_GRID_SIZE_LIMIT>
+struct select_config
+{
+    /// \brief Threads per block (\p BlockSize).
+    static constexpr unsigned int block_size = BlockSize;
+
+    /// \brief Items handled by each thread (\p ItemsPerThread).
+    static constexpr unsigned int items_per_thread = ItemsPerThread;
+
+    /// \brief Block load method used for input keys.
+    static constexpr ::rocprim::block_load_method key_block_load_method = KeyBlockLoadMethod;
+
+    /// \brief Block load method used for input values.
+    static constexpr ::rocprim::block_load_method value_block_load_method = ValueBlockLoadMethod;
+
+    /// \brief Block load method used for flag values.
+    static constexpr ::rocprim::block_load_method flag_block_load_method = FlagBlockLoadMethod;
+
+    /// \brief Block scan algorithm.
+    static constexpr ::rocprim::block_scan_algorithm block_scan_method = BlockScanMethod;
+
+    /// \brief Upper bound on the number of items for a single select kernel launch.
+    static constexpr unsigned int size_limit = SizeLimit;
+};
+
+namespace detail
+{
+
+/// \brief Select configuration that default_select_config maps to target
+/// architecture 803 and also uses as its fallback.
+///
+/// \tparam Key - type whose size scales the block size limit and items per thread.
+template<class Key>
+struct select_config_803
+{
+    /// \brief sizeof(Key) divided by sizeof(int) through ceiling_div.
+    static constexpr unsigned int item_scale
+        = ::rocprim::detail::ceiling_div<unsigned int>(sizeof(Key), sizeof(int));
+
+    /// \brief Resulting configuration: 13 items per thread divided by
+    /// \p item_scale (never fewer than one), transposed block loads for keys,
+    /// values and flags, and the warp-scan based block scan.
+    using type
+        = select_config<limit_block_size<256U, sizeof(Key), ROCPRIM_WARP_SIZE_64>::value,
+                        ::rocprim::max(1u, 13u / item_scale),
+                        ::rocprim::block_load_method::block_load_transpose,
+                        ::rocprim::block_load_method::block_load_transpose,
+                        ::rocprim::block_load_method::block_load_transpose,
+                        ::rocprim::block_scan_algorithm::using_warp_scan>;
+};
+
+/// \brief Select configuration that default_select_config maps to target
+/// architecture 900.
+///
+/// \tparam Key - type whose size scales the block size limit and items per thread.
+template<class Key>
+struct select_config_900
+{
+    /// \brief sizeof(Key) divided by sizeof(int) through ceiling_div.
+    static constexpr unsigned int item_scale
+        = ::rocprim::detail::ceiling_div<unsigned int>(sizeof(Key), sizeof(int));
+
+    /// \brief Resulting configuration: 15 items per thread divided by
+    /// \p item_scale (never fewer than one), transposed block loads for keys,
+    /// values and flags, and the warp-scan based block scan.
+    using type
+        = select_config<limit_block_size<256U, sizeof(Key), ROCPRIM_WARP_SIZE_64>::value,
+                        ::rocprim::max(1u, 15u / item_scale),
+                        ::rocprim::block_load_method::block_load_transpose,
+                        ::rocprim::block_load_method::block_load_transpose,
+                        ::rocprim::block_load_method::block_load_transpose,
+                        ::rocprim::block_scan_algorithm::using_warp_scan>;
+};
+
+/// \brief Select configuration that default_select_config maps to target
+/// architecture ROCPRIM_ARCH_90a.
+///
+/// \tparam Key - type whose size scales the block size limit and items per thread
+/// (default_select_config instantiates this template with its key type).
+template<class Key>
+struct select_config_90a
+{
+    /// \brief sizeof(Key) divided by sizeof(int) through ceiling_div.
+    static constexpr unsigned int item_scale
+        = ::rocprim::detail::ceiling_div<unsigned int>(sizeof(Key), sizeof(int));
+
+    /// \brief Resulting configuration: 15 items per thread divided by
+    /// \p item_scale (never fewer than one), transposed block loads for keys,
+    /// values and flags, and the warp-scan based block scan.
+    using type
+        = select_config<limit_block_size<256U, sizeof(Key), ROCPRIM_WARP_SIZE_64>::value,
+                        ::rocprim::max(1u, 15u / item_scale),
+                        ::rocprim::block_load_method::block_load_transpose,
+                        ::rocprim::block_load_method::block_load_transpose,
+                        ::rocprim::block_load_method::block_load_transpose,
+                        ::rocprim::block_scan_algorithm::using_warp_scan>;
+};
+
+/// \brief Select configuration that default_select_config maps to target
+/// architecture 1030.
+///
+/// Unlike the other select configurations in this file, the block size limit
+/// is computed with ROCPRIM_WARP_SIZE_32.
+///
+/// \tparam Key - type whose size scales the block size limit and items per thread
+/// (default_select_config instantiates this template with its key type).
+template<class Key>
+struct select_config_1030
+{
+    /// \brief sizeof(Key) divided by sizeof(int) through ceiling_div.
+    static constexpr unsigned int item_scale
+        = ::rocprim::detail::ceiling_div<unsigned int>(sizeof(Key), sizeof(int));
+
+    /// \brief Resulting configuration: 15 items per thread divided by
+    /// \p item_scale (never fewer than one), transposed block loads for keys,
+    /// values and flags, and the warp-scan based block scan.
+    using type
+        = select_config<limit_block_size<256U, sizeof(Key), ROCPRIM_WARP_SIZE_32>::value,
+                        ::rocprim::max(1u, 15u / item_scale),
+                        ::rocprim::block_load_method::block_load_transpose,
+                        ::rocprim::block_load_method::block_load_transpose,
+                        ::rocprim::block_load_method::block_load_transpose,
+                        ::rocprim::block_scan_algorithm::using_warp_scan>;
+};
+
+
+/// \brief Picks the select configuration for \p TargetArch.
+///
+/// Architectures 803, 900, ROCPRIM_ARCH_90a and 1030 each have a dedicated
+/// case; the last argument, select_config_803, is the fallback entry.
+/// Only \p Key takes part in the selection; the third (value) type parameter
+/// is unnamed and unused.
+template<unsigned int TargetArch, class Key, class /*Value*/>
+struct default_select_config
+    : select_arch<TargetArch,
+                  select_arch_case<803, select_config_803<Key>>,
+                  select_arch_case<900, select_config_900<Key>>,
+                  select_arch_case<ROCPRIM_ARCH_90a, select_config_90a<Key>>,
+                  select_arch_case<1030, select_config_1030<Key>>,
+                  select_config_803<Key>>
+{
+};
+
+} // end namespace detail
+
+END_ROCPRIM_NAMESPACE
+
+/// @}
+// end of group primitivesmodule_deviceconfigs
+
+#endif // ROCPRIM_DEVICE_DEVICE_SELECT_CONFIG_HPP_
--- a/3rdparty/cub/rocprim/device/device_transform.hpp
+++ b/3rdparty/cub/rocprim/device/device_transform.hpp
+// Copyright (c) 2017-2021 Advanced Micro Devices, Inc. All rights reserved.
+//
+// Permission is hereby granted, free of charge, to any person obtaining a copy
+// of this software and associated documentation files (the "Software"), to deal
+// in the Software without restriction, including without limitation the rights
+// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+// copies of the Software, and to permit persons to whom the Software is
+// furnished to do so, subject to the following conditions:
+//
+// The above copyright notice and this permission notice shall be included in
+// all copies or substantial portions of the Software.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+// THE SOFTWARE.
+
+#ifndef ROCPRIM_DEVICE_DEVICE_TRANSFORM_HPP_
+#define ROCPRIM_DEVICE_DEVICE_TRANSFORM_HPP_
+
+#include <algorithm>
+#include <type_traits>
+#include <iterator>
+
+#include "../config.hpp"
+#include "../detail/various.hpp"
+#include "../detail/match_result_type.hpp"
+#include "../types/tuple.hpp"
+#include "../iterator/zip_iterator.hpp"
+
+#include "device_transform_config.hpp"
+#include "detail/device_transform.hpp"
+#include <chrono>
+
+BEGIN_ROCPRIM_NAMESPACE
+
+/// \addtogroup devicemodule
+/// @{
+
+namespace detail
+{
+
+/// \brief Kernel entry point of the device-level transform.
+///
+/// Declared with \p __launch_bounds__(BlockSize) and does nothing but forward
+/// its arguments to \p transform_kernel_impl.
+///
+/// \tparam BlockSize - launch bound of the kernel, forwarded to the implementation.
+/// \tparam ItemsPerThread - forwarded to the implementation.
+/// \tparam ResultType - forwarded to the implementation.
+template<unsigned int BlockSize,
+         unsigned int ItemsPerThread,
+         class ResultType,
+         class InputIterator,
+         class OutputIterator,
+         class UnaryFunction>
+ROCPRIM_KERNEL
+__launch_bounds__(BlockSize)
+void transform_kernel(InputIterator  input,
+                      const size_t   size,
+                      OutputIterator output,
+                      UnaryFunction  transform_op)
+{
+    transform_kernel_impl<BlockSize, ItemsPerThread, ResultType>(input,
+                                                                 size,
+                                                                 output,
+                                                                 transform_op);
+}
+
+#define ROCPRIM_DETAIL_HIP_SYNC_AND_RETURN_ON_ERROR(name, size, start) \
+    { \
+        auto _error = cudaGetLastError(); \
+        if(_error != cudaSuccess) return _error; \
+        if(debug_synchronous) \
+        { \
+            std::cout << name << "(" << size << ")"; \
+            _error = cudaStreamSynchronize(stream); \
+            if(_error != cudaSuccess) return _error; \
+            auto _end = std::chrono::high_resolution_clock::now(); \
+            auto _d = std::chrono::duration_cast<std::chrono::duration<double>>(_end - start); \
+            std::cout << " " << _d.count() * 1000 << " ms" << '\n'; \
+        } \
+    }
+
+} // end of detail namespace
+
+/// \brief Parallel transform primitive for device level.
+///
+/// transform function performs a device-wide transformation operation
+/// using unary \p transform_op operator.
+///
+/// \par Overview
+/// * Ranges specified by \p input and \p output must have at least \p size elements.
+///
+/// \tparam Config - [optional] configuration of the primitive. It can be \p transform_config or
+/// a custom class with the same members.
+/// \tparam InputIterator - random-access iterator type of the input range. Must meet the
+/// requirements of a C++ InputIterator concept. It can be a simple pointer type.
+/// \tparam OutputIterator - random-access iterator type of the output range. Must meet the
+/// requirements of a C++ OutputIterator concept. It can be a simple pointer type.
+/// \tparam UnaryFunction - type of unary function used for transform.
+///
+/// \param [in] input - iterator to the first element in the range to transform.
+/// \param [out] output - iterator to the first element in the output range.
+/// \param [in] size - number of elements in the input range.
+/// \param [in] transform_op - unary operation function object that will be used for transform.
+/// The signature of the function should be equivalent to the following:
+/// <tt>U f(const T &a);</tt>. The signature does not need to have
+/// <tt>const &</tt>, but function object must not modify the object passed to it.
+/// \param [in] stream - [optional] HIP stream object. The default is \p 0 (default stream).
+/// \param [in] debug_synchronous - [optional] If true, synchronization after every kernel
+/// launch is forced in order to check for errors. The default value is \p false.
+///
+/// \par Example
+/// \parblock
+/// In this example a device-level transform operation is performed on an array of
+/// integer values (<tt>short</tt>s are transformed into <tt>int</tt>s).
+///
+/// \code{.cpp}
+/// #include <rocprim/rocprim.hpp>
+///
+/// // custom transform function
+/// auto transform_op =
+///     [] __device__ (int a) -> int
+///     {
+///         return a + 5;
+///     };
+///
+/// // Prepare input and output (declare pointers, allocate device memory etc.)
+/// size_t input_size;    // e.g., 8
+/// short * input;        // e.g., [1, 2, 3, 4, 5, 6, 7, 8]
+/// int * output;         // empty array of 8 elements
+///
+/// // perform transform
+/// rocprim::transform(
+///     input, output, input_size, transform_op
+/// );
+/// // output: [6, 7, 8, 9, 10, 11, 12, 13]
+/// \endcode
+/// \endparblock
+template<
+    class Config = default_config,
+    class InputIterator,
+    class OutputIterator,
+    class UnaryFunction
+>
+inline
+cudaError_t transform(InputIterator input,
+                     OutputIterator output,
+                     const size_t size,
+                     UnaryFunction transform_op,
+                     const cudaStream_t stream = 0,
+                     bool debug_synchronous = false)
+{
+    // An empty range needs no kernel launch.
+    if(size == 0)
+    {
+        return cudaSuccess;
+    }
+
+    using input_type  = typename std::iterator_traits<InputIterator>::value_type;
+    using result_type = typename ::rocprim::detail::invoke_result<UnaryFunction, input_type>::type;
+
+    // Config is used as given unless it is default_config, in which case the
+    // default transform configuration for the target architecture is taken.
+    using config = detail::default_or_custom_config<
+        Config,
+        detail::default_transform_config<ROCPRIM_TARGET_ARCH, result_type>
+    >;
+
+    static constexpr unsigned int block_size       = config::block_size;
+    static constexpr unsigned int items_per_thread = config::items_per_thread;
+    static constexpr auto         items_per_block  = block_size * items_per_thread;
+
+    // A single launch covers at most size_limit items, rounded down to whole
+    // blocks but never fewer than one block.
+    static constexpr auto size_limit = config::size_limit;
+    static constexpr auto number_of_blocks_limit
+        = ::rocprim::max<size_t>(size_limit / items_per_block, 1);
+    static constexpr auto aligned_size_limit = number_of_blocks_limit * items_per_block;
+
+    const auto number_of_blocks = (size + items_per_block - 1) / items_per_block;
+    if(debug_synchronous)
+    {
+        std::cout << "block_size " << block_size << '\n';
+        std::cout << "number of blocks " << number_of_blocks << '\n';
+        std::cout << "number of blocks limit " << number_of_blocks_limit << '\n';
+        std::cout << "items_per_block " << items_per_block << '\n';
+    }
+
+    // Time stamp taken right before each launch (only set in debug mode).
+    std::chrono::high_resolution_clock::time_point launch_start;
+
+    // The input is processed in consecutive chunks of aligned_size_limit
+    // items; only the last chunk may be shorter.
+    const auto launch_count = (size + aligned_size_limit - 1) / aligned_size_limit;
+    size_t     offset       = 0;
+    for(size_t launch = 0; launch < launch_count; ++launch)
+    {
+        const auto remaining = size - offset;
+        const auto current_size
+            = (aligned_size_limit < remaining) ? aligned_size_limit : remaining;
+        const auto current_blocks = (current_size + items_per_block - 1) / items_per_block;
+
+        if(debug_synchronous)
+        {
+            launch_start = std::chrono::high_resolution_clock::now();
+        }
+
+        detail::transform_kernel<block_size,
+                                 items_per_thread,
+                                 result_type,
+                                 InputIterator,
+                                 OutputIterator,
+                                 UnaryFunction>
+            <<<dim3(current_blocks), dim3(block_size), 0, stream>>>(
+                input + offset, current_size, output + offset, transform_op);
+        ROCPRIM_DETAIL_HIP_SYNC_AND_RETURN_ON_ERROR("transform_kernel", current_size, launch_start);
+
+        offset += aligned_size_limit;
+    }
+
+    return cudaSuccess;
+}
+
+/// \brief Parallel device-level transform primitive for two inputs.
+///
+/// transform function performs a device-wide transformation operation
+/// on two input ranges using binary \p transform_op operator.
+///
+/// \par Overview
+/// * Ranges specified by \p input1, \p input2, and \p output must have at least \p size elements.
+///
+/// \tparam Config - [optional] configuration of the primitive. It can be \p transform_config or
+/// a custom class with the same members.
+/// \tparam InputIterator1 - random-access iterator type of the input range. Must meet the
+/// requirements of a C++ InputIterator concept. It can be a simple pointer type.
+/// \tparam InputIterator2 - random-access iterator type of the input range. Must meet the
+/// requirements of a C++ InputIterator concept. It can be a simple pointer type.
+/// \tparam OutputIterator - random-access iterator type of the output range. Must meet the
+/// requirements of a C++ OutputIterator concept. It can be a simple pointer type.
+/// \tparam BinaryFunction - type of binary function used for transform.
+///
+/// \param [in] input1 - iterator to the first element in the 1st range to transform.
+/// \param [in] input2 - iterator to the first element in the 2nd range to transform.
+/// \param [out] output - iterator to the first element in the output range.
+/// \param [in] size - number of elements in the input range.
+/// \param [in] transform_op - binary operation function object that will be used for transform.
+/// The signature of the function should be equivalent to the following:
+/// <tt>U f(const T1& a, const T2& b);</tt>. The signature does not need to have
+/// <tt>const &</tt>, but function object must not modify the object passed to it.
+/// \param [in] stream - [optional] HIP stream object. The default is \p 0 (default stream).
+/// \param [in] debug_synchronous - [optional] If true, synchronization after every kernel
+/// launch is forced. Default value is \p false.
+///
+/// \par Example
+/// \parblock
+/// In this example a device-level transform operation is performed on two arrays of
+/// integer values (element-wise sum is performed).
+///
+/// \code{.cpp}
+/// #include <rocprim/rocprim.hpp>
+///
+/// // custom transform function
+/// auto transform_op =
+///     [] __device__ (int a, int b) -> int
+///     {
+///         return a + b;
+///     };
+///
+/// // Prepare input and output (declare pointers, allocate device memory etc.)
+/// size_t size;   // e.g., 8
+/// int* input1;   // e.g., [1, 2, 3, 4, 5, 6, 7, 8]
+/// int* input2;   // e.g., [1, 2, 3, 4, 5, 6, 7, 8]
+/// int* output;   // empty array of 8 elements
+///
+/// // perform transform
+/// rocprim::transform(
+///     input1, input2, output, size, transform_op
+/// );
+/// // output: [2, 4, 6, 8, 10, 12, 14, 16]
+/// \endcode
+/// \endparblock
+template<
+    class Config = default_config,
+    class InputIterator1,
+    class InputIterator2,
+    class OutputIterator,
+    class BinaryFunction
+>
+inline
+cudaError_t transform(InputIterator1 input1,
+                     InputIterator2 input2,
+                     OutputIterator output,
+                     const size_t size,
+                     BinaryFunction transform_op,
+                     const cudaStream_t stream = 0,
+                     bool debug_synchronous = false)
+{
+    using first_value_type  = typename std::iterator_traits<InputIterator1>::value_type;
+    using second_value_type = typename std::iterator_traits<InputIterator2>::value_type;
+
+    // Wrapper that lets the binary functor be used by the single-input overload.
+    using unpacked_op_type
+        = detail::unpack_binary_op<first_value_type, second_value_type, BinaryFunction>;
+
+    // Both inputs are zipped into one iterator and handed to the single-input
+    // overload together with the wrapped functor.
+    return transform<Config>(
+        ::rocprim::make_zip_iterator(::rocprim::make_tuple(input1, input2)),
+        output,
+        size,
+        unpacked_op_type(transform_op),
+        stream,
+        debug_synchronous);
+}
+
+#undef ROCPRIM_DETAIL_HIP_SYNC_AND_RETURN_ON_ERROR
+
+/// @}
+// end of group devicemodule
+
+END_ROCPRIM_NAMESPACE
+
+#endif // ROCPRIM_DEVICE_DEVICE_TRANSFORM_HPP_
--- a/3rdparty/cub/rocprim/device/device_transform_config.hpp
+++ b/3rdparty/cub/rocprim/device/device_transform_config.hpp
+// Copyright (c) 2018-2021 Advanced Micro Devices, Inc. All rights reserved.
+//
+// Permission is hereby granted, free of charge, to any person obtaining a copy
+// of this software and associated documentation files (the "Software"), to deal
+// in the Software without restriction, including without limitation the rights
+// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+// copies of the Software, and to permit persons to whom the Software is
+// furnished to do so, subject to the following conditions:
+//
+// The above copyright notice and this permission notice shall be included in
+// all copies or substantial portions of the Software.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+// THE SOFTWARE.
+
+#ifndef ROCPRIM_DEVICE_DEVICE_TRANSFORM_CONFIG_HPP_
+#define ROCPRIM_DEVICE_DEVICE_TRANSFORM_CONFIG_HPP_
+
+#include <type_traits>
+
+#include "../config.hpp"
+#include "../functional.hpp"
+#include "../detail/various.hpp"
+
+#include "config_types.hpp"
+
+/// \addtogroup primitivesmodule_deviceconfigs
+/// @{
+
+BEGIN_ROCPRIM_NAMESPACE
+
+/// \brief Configuration of device-level transform primitives.
+///
+/// Alias of \p kernel_config with the same three arguments.
+///
+/// \tparam BlockSize - first argument forwarded to \p kernel_config.
+/// \tparam ItemsPerThread - second argument forwarded to \p kernel_config.
+/// \tparam SizeLimit - third argument forwarded to \p kernel_config;
+/// defaults to \p ROCPRIM_GRID_SIZE_LIMIT.
+template<unsigned int BlockSize,
+         unsigned int ItemsPerThread,
+         unsigned int SizeLimit = ROCPRIM_GRID_SIZE_LIMIT>
+using transform_config = kernel_config<BlockSize, ItemsPerThread, SizeLimit>;
+
+namespace detail
+{
+
+/// \brief Transform configuration that default_transform_config maps to
+/// target architecture 803.
+///
+/// \tparam Value - type whose size scales the number of items per thread.
+template<class Value>
+struct transform_config_803
+{
+    /// \brief sizeof(Value) divided by sizeof(int) through ceiling_div.
+    static constexpr unsigned int item_scale
+        = ::rocprim::detail::ceiling_div<unsigned int>(sizeof(Value), sizeof(int));
+
+    /// \brief 256 as block size; 16 items per thread divided by
+    /// \p item_scale, never fewer than one.
+    using type = transform_config<256,
+                                  ::rocprim::max(1u, 16u / item_scale)>;
+};
+
+/// \brief Transform configuration that default_transform_config maps to
+/// target architecture 900 and also uses as its fallback.
+///
+/// \tparam Value - type whose size scales the number of items per thread.
+template<class Value>
+struct transform_config_900
+{
+    /// \brief sizeof(Value) divided by sizeof(int) through ceiling_div.
+    static constexpr unsigned int item_scale
+        = ::rocprim::detail::ceiling_div<unsigned int>(sizeof(Value), sizeof(int));
+
+    /// \brief 256 as block size; 16 items per thread divided by
+    /// \p item_scale, never fewer than one.
+    using type = transform_config<256,
+                                  ::rocprim::max(1u, 16u / item_scale)>;
+};
+
+/// \brief Transform configuration that default_transform_config maps to
+/// target architecture ROCPRIM_ARCH_90a.
+///
+/// \tparam Value - type whose size scales the number of items per thread.
+template<class Value>
+struct transform_config_90a
+{
+    /// \brief sizeof(Value) divided by sizeof(int) through ceiling_div.
+    static constexpr unsigned int item_scale
+        = ::rocprim::detail::ceiling_div<unsigned int>(sizeof(Value), sizeof(int));
+
+    /// \brief 256 as block size; 16 items per thread divided by
+    /// \p item_scale, never fewer than one.
+    using type = transform_config<256,
+                                  ::rocprim::max(1u, 16u / item_scale)>;
+};
+
+/// \brief Transform configuration that default_transform_config maps to
+/// target architecture 1030.
+///
+/// \tparam Value - type whose size scales the number of items per thread.
+template<class Value>
+struct transform_config_1030
+{
+    /// \brief sizeof(Value) divided by sizeof(int) through ceiling_div.
+    static constexpr unsigned int item_scale
+        = ::rocprim::detail::ceiling_div<unsigned int>(sizeof(Value), sizeof(int));
+
+    /// \brief 256 as block size; 16 items per thread divided by
+    /// \p item_scale, never fewer than one.
+    using type = transform_config<256,
+                                  ::rocprim::max(1u, 16u / item_scale)>;
+};
+
+/// \brief Picks the transform configuration for \p TargetArch.
+///
+/// Architectures 803, 900, ROCPRIM_ARCH_90a and 1030 each have a dedicated
+/// case; the last argument, transform_config_900, is the fallback entry.
+template<unsigned int TargetArch, class Value>
+struct default_transform_config
+    : select_arch<TargetArch,
+                  select_arch_case<803, transform_config_803<Value>>,
+                  select_arch_case<900, transform_config_900<Value>>,
+                  select_arch_case<ROCPRIM_ARCH_90a, transform_config_90a<Value>>,
+                  select_arch_case<1030, transform_config_1030<Value>>,
+                  transform_config_900<Value>>
+{
+};
+
+} // end namespace detail
+
+END_ROCPRIM_NAMESPACE
+
+/// @}
+// end of group primitivesmodule_deviceconfigs
+
+#endif // ROCPRIM_DEVICE_DEVICE_TRANSFORM_CONFIG_HPP_
--- a/3rdparty/cub/rocprim/device/specialization/device_radix_merge_sort.hpp
+++ b/3rdparty/cub/rocprim/device/specialization/device_radix_merge_sort.hpp
+// Copyright (c) 2021 Advanced Micro Devices, Inc. All rights reserved.
+//
+// Permission is hereby granted, free of charge, to any person obtaining a copy
+// of this software and associated documentation files (the "Software"), to deal
+// in the Software without restriction, including without limitation the rights
+// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+// copies of the Software, and to permit persons to whom the Software is
+// furnished to do so, subject to the following conditions:
+//
+// The above copyright notice and this permission notice shall be included in
+// all copies or substantial portions of the Software.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+// THE SOFTWARE.
+
+#ifndef ROCPRIM_DEVICE_SPECIALIZATION_DEVICE_RADIX_MERGE_SORT_HPP_
+#define ROCPRIM_DEVICE_SPECIALIZATION_DEVICE_RADIX_MERGE_SORT_HPP_
+
+#include "../detail/device_radix_sort.hpp"
+#include "../specialization/device_radix_single_sort.hpp"
+
+BEGIN_ROCPRIM_NAMESPACE
+
+namespace detail
+{
+    /// \brief Kernel entry point of one block-merge pass.
+    ///
+    /// Declared with \p __launch_bounds__(BlockSize) and does nothing but
+    /// forward its arguments to \p radix_block_merge_impl.
+    template<unsigned int BlockSize,
+             unsigned int ItemsPerThread,
+             class KeysInputIterator,
+             class KeysOutputIterator,
+             class ValuesInputIterator,
+             class ValuesOutputIterator,
+             class BinaryFunction>
+    ROCPRIM_KERNEL
+    __launch_bounds__(BlockSize)
+    void radix_block_merge_kernel(KeysInputIterator    keys_input,
+                                  KeysOutputIterator   keys_output,
+                                  ValuesInputIterator  values_input,
+                                  ValuesOutputIterator values_output,
+                                  const size_t         input_size,
+                                  const unsigned int   merge_items_per_block_size,
+                                  BinaryFunction       compare_function)
+    {
+        radix_block_merge_impl<BlockSize, ItemsPerThread>(keys_input,
+                                                          keys_output,
+                                                          values_input,
+                                                          values_output,
+                                                          input_size,
+                                                          merge_items_per_block_size,
+                                                          compare_function);
+    }
+
+    /// \brief Sorts \p size items with one \p sort_single_kernel launch followed by
+    /// \p radix_block_merge_kernel passes whose merge width doubles every pass.
+    ///
+    /// The first kernel writes into \p keys_buffer / \p values_buffer. Merge passes
+    /// then alternate between the buffers and the outputs, starting with
+    /// buffers -> outputs. If the last result ends up in the buffers (including
+    /// the case where no merge pass runs), it is copied to the outputs with
+    /// \p ::rocprim::transform and an identity functor.
+    ///
+    /// \tparam Config - provides \p sort_merge::items_per_thread and \p sort_merge::block_size.
+    /// \tparam Descending - forwarded to \p sort_single_kernel and \p radix_merge_compare.
+    ///
+    /// \param [in] keys_input - keys to sort.
+    /// \param [in,out] keys_buffer - temporary key storage.
+    /// \param [out] keys_output - destination of the sorted keys.
+    /// \param [in] values_input - values associated with the keys.
+    /// \param [in,out] values_buffer - temporary value storage.
+    /// \param [out] values_output - destination of the values.
+    /// \param [in] size - number of items. Zero returns \p cudaSuccess without
+    /// launching anything.
+    /// \param [in] bit - first bit passed to the kernels and the comparator.
+    /// \param [in] end_bit - <tt>end_bit - bit</tt> is the number of radix bits; when it
+    /// equals the bit width of the key type the comparator is built without bit range.
+    /// \param [in] stream - stream all kernels are launched on.
+    /// \param [in] debug_synchronous - if true, launch parameters are printed and the
+    /// stream is synchronized after every kernel launch.
+    ///
+    /// \return \p cudaSuccess, or the first error reported after a launch or by
+    /// \p ::rocprim::transform.
+    template<
+        class Config,
+        bool Descending,
+        class KeysInputIterator,
+        class KeysOutputIterator,
+        class ValuesInputIterator,
+        class ValuesOutputIterator
+    >
+    inline
+    cudaError_t radix_sort_merge(KeysInputIterator keys_input,
+                                typename std::iterator_traits<KeysInputIterator>::value_type * keys_buffer,
+                                KeysOutputIterator keys_output,
+                                ValuesInputIterator values_input,
+                                typename std::iterator_traits<ValuesInputIterator>::value_type * values_buffer,
+                                ValuesOutputIterator values_output,
+                                unsigned int size,
+                                unsigned int bit,
+                                unsigned int end_bit,
+                                cudaStream_t stream,
+                                bool debug_synchronous)
+    {
+        using key_type = typename std::iterator_traits<KeysInputIterator>::value_type;
+        using value_type = typename std::iterator_traits<ValuesInputIterator>::value_type;
+
+        constexpr bool with_values = !std::is_same<value_type, ::rocprim::empty_type>::value;
+
+        constexpr unsigned int items_per_thread = Config::sort_merge::items_per_thread;
+        constexpr unsigned int block_size = Config::sort_merge::block_size;
+        constexpr unsigned int items_per_block = block_size * items_per_thread;
+
+        // Nothing to sort: avoid launching kernels with an empty grid.
+        if(size == 0)
+        {
+            return cudaSuccess;
+        }
+
+        const unsigned int current_radix_bits = end_bit - bit;
+        // Ceiling division written so that it cannot wrap around for sizes
+        // close to the maximum of unsigned int.
+        const unsigned int number_of_blocks
+            = size / items_per_block + (size % items_per_block != 0 ? 1u : 0u);
+
+        std::chrono::high_resolution_clock::time_point start;
+        if(debug_synchronous)
+        {
+            std::cout << "block size " << block_size << '\n';
+            std::cout << "items per thread " << items_per_thread << '\n';
+            std::cout << "number of blocks " << number_of_blocks << '\n';
+            std::cout << "bit " << bit << '\n';
+            std::cout << "current_radix_bits " << current_radix_bits << '\n';
+        }
+
+        if(debug_synchronous) start = std::chrono::high_resolution_clock::now();
+
+        sort_single_kernel<
+                block_size, items_per_thread , Descending
+            >
+            <<<dim3(number_of_blocks), dim3(block_size), 0, stream>>>(
+            keys_input, keys_buffer, values_input, values_buffer,
+            size, bit, current_radix_bits
+        );
+        ROCPRIM_DETAIL_HIP_SYNC_AND_RETURN_ON_ERROR("radix_sort_single", size, start);
+
+        // The comparator needs the bit range only when a subset of the key bits is sorted.
+        const bool all_key_bits = (current_radix_bits == sizeof(key_type) * 8);
+
+        // true  -> the most recent result is in keys_buffer / values_buffer
+        // false -> the most recent result is in keys_output / values_output
+        bool temporary_store = true;
+
+        // The merge width is tracked in 64 bits: doubling an unsigned int that is
+        // at least 2^31 would wrap around to 0 and the loop would never terminate.
+        for(unsigned long long merge_width = items_per_block; merge_width < size; merge_width *= 2)
+        {
+            // Lossless: merge_width < size <= maximum of unsigned int inside the loop.
+            const unsigned int block = static_cast<unsigned int>(merge_width);
+
+            temporary_store = !temporary_store;
+
+            if(debug_synchronous) start = std::chrono::high_resolution_clock::now();
+            if(temporary_store)
+            {
+                // This pass reads the outputs and writes the temporary buffers.
+                if(all_key_bits)
+                {
+                    radix_block_merge_kernel<block_size, items_per_thread>
+                        <<<dim3(number_of_blocks), dim3(block_size), 0, stream>>>(
+                        keys_output, keys_buffer, values_output, values_buffer,
+                        size, block, radix_merge_compare<Descending, false, key_type>()
+                    );
+                }
+                else
+                {
+                    radix_block_merge_kernel<block_size, items_per_thread>
+                        <<<dim3(number_of_blocks), dim3(block_size), 0, stream>>>(
+                        keys_output, keys_buffer, values_output, values_buffer,
+                        size, block, radix_merge_compare<Descending, true, key_type>(bit, current_radix_bits)
+                    );
+                }
+            }
+            else
+            {
+                // This pass reads the temporary buffers and writes the outputs.
+                if(all_key_bits)
+                {
+                    radix_block_merge_kernel<block_size, items_per_thread>
+                        <<<dim3(number_of_blocks), dim3(block_size), 0, stream>>>(
+                        keys_buffer, keys_output, values_buffer, values_output,
+                        size, block, radix_merge_compare<Descending, false, key_type>()
+                    );
+                }
+                else
+                {
+                    radix_block_merge_kernel<block_size, items_per_thread>
+                        <<<dim3(number_of_blocks), dim3(block_size), 0, stream>>>(
+                        keys_buffer, keys_output, values_buffer, values_output,
+                        size, block, radix_merge_compare<Descending, true, key_type>(bit, current_radix_bits)
+                    );
+                }
+            }
+            ROCPRIM_DETAIL_HIP_SYNC_AND_RETURN_ON_ERROR("radix_block_merge_kernel", size, start);
+        }
+
+        // The final result is still in the temporary buffers: copy it to the outputs.
+        if(temporary_store)
+        {
+            cudaError_t error = ::rocprim::transform(
+                keys_buffer, keys_output, size,
+                ::rocprim::identity<key_type>(), stream, debug_synchronous
+            );
+            if(error != cudaSuccess) return error;
+
+            if(with_values)
+            {
+                error = ::rocprim::transform(
+                    values_buffer, values_output, size,
+                    ::rocprim::identity<value_type>(), stream, debug_synchronous
+                );
+                if(error != cudaSuccess) return error;
+            }
+        }
+
+        return cudaSuccess;
+    }
+} // end namespace detail
+
+END_ROCPRIM_NAMESPACE
+
+#endif // ROCPRIM_DEVICE_SPECIALIZATION_DEVICE_RADIX_MERGE_SORT_HPP_
--- a/3rdparty/cub/rocprim/device/specialization/device_radix_single_sort.hpp
+++ b/3rdparty/cub/rocprim/device/specialization/device_radix_single_sort.hpp
+// Copyright (c) 2021 Advanced Micro Devices, Inc. All rights reserved.
+//
+// Permission is hereby granted, free of charge, to any person obtaining a copy
+// of this software and associated documentation files (the "Software"), to deal
+// in the Software without restriction, including without limitation the rights
+// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+// copies of the Software, and to permit persons to whom the Software is
+// furnished to do so, subject to the following conditions:
+//
+// The above copyright notice and this permission notice shall be included in
+// all copies or substantial portions of the Software.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+// THE SOFTWARE.
+
+#ifndef ROCPRIM_DEVICE_SPECIALIZATION_DEVICE_RADIX_SINGLE_SORT_HPP_
+#define ROCPRIM_DEVICE_SPECIALIZATION_DEVICE_RADIX_SINGLE_SORT_HPP_
+
+#include "../detail/device_radix_sort.hpp"
+
+BEGIN_ROCPRIM_NAMESPACE
+
+namespace detail
+{
+
+// Post-launch error check used after each kernel launch in this file.
+//
+// Expands to a braced block that:
+//   1. calls cudaGetLastError() and returns the error from the enclosing
+//      function if it is not cudaSuccess;
+//   2. when `debug_synchronous` is true, prints `name` and `size` to std::cout,
+//      calls cudaStreamSynchronize(stream) (returning its error on failure),
+//      and prints the time elapsed since `start` in milliseconds.
+//
+// `debug_synchronous` and `stream` are not macro parameters: they are looked up
+// at the expansion site, so the macro is only usable inside a function that has
+// both names in scope and returns the CUDA error code type.
+// Because the expansion is a `{ ... }` block, a trailing semicolon at the call
+// site is optional.
+#define ROCPRIM_DETAIL_HIP_SYNC_AND_RETURN_ON_ERROR(name, size, start) \
+    { \
+        auto _error = cudaGetLastError(); \
+        if(_error != cudaSuccess) return _error; \
+        if(debug_synchronous) \
+        { \
+            std::cout << name << "(" << size << ")"; \
+            auto __error = cudaStreamSynchronize(stream); \
+            if(__error != cudaSuccess) return __error; \
+            auto _end = std::chrono::high_resolution_clock::now(); \
+            auto _d = std::chrono::duration_cast<std::chrono::duration<double>>(_end - start); \
+            std::cout << " " << _d.count() * 1000 << " ms" << '\n'; \
+        } \
+    }
+
+    // Kernel entry point for the single-block sort: forwards every argument to
+    // sort_single<BlockSize, ItemsPerThread, Descending>.
+    //
+    // BlockSize is also used as the __launch_bounds__ limit, so the kernel must
+    // not be launched with more than BlockSize threads per block.
+    // `bit` and `current_radix_bits` are passed through to sort_single
+    // unchanged (presumably the first key bit and the number of key bits to
+    // compare -- confirm against sort_single).
+    template<unsigned int BlockSize,
+             unsigned int ItemsPerThread,
+             bool         Descending,
+             class KeysInputIterator,
+             class KeysOutputIterator,
+             class ValuesInputIterator,
+             class ValuesOutputIterator>
+    ROCPRIM_KERNEL
+    __launch_bounds__(BlockSize)
+    void sort_single_kernel(KeysInputIterator    keys_input,
+                            KeysOutputIterator   keys_output,
+                            ValuesInputIterator  values_input,
+                            ValuesOutputIterator values_output,
+                            unsigned int         size,
+                            unsigned int         bit,
+                            unsigned int         current_radix_bits)
+    {
+        sort_single<BlockSize, ItemsPerThread, Descending>(keys_input,
+                                                           keys_output,
+                                                           values_input,
+                                                           values_output,
+                                                           size,
+                                                           bit,
+                                                           current_radix_bits);
+    }
+
+    // Launches sort_single_kernel as one block of BlockSize threads on `stream`
+    // and checks the launch for errors.
+    //
+    // The iterators and `size` are handed to the kernel unchanged; the kernel
+    // additionally receives `bit` and the bit count `end_bit - bit`.
+    // When `debug_synchronous` is true, the launch parameters are written to
+    // std::cout before the launch and ROCPRIM_DETAIL_HIP_SYNC_AND_RETURN_ON_ERROR
+    // synchronizes the stream and prints the elapsed time after it.
+    //
+    // Returns cudaSuccess, or the error reported by the post-launch check.
+    // NOTE(review): a single block is launched, so callers presumably guarantee
+    // size <= BlockSize * ItemsPerThread -- confirm against sort_single.
+    template<
+        unsigned int BlockSize,
+        unsigned int ItemsPerThread,
+        bool Descending,
+        class KeysInputIterator,
+        class KeysOutputIterator,
+        class ValuesInputIterator,
+        class ValuesOutputIterator
+    >
+    inline
+    cudaError_t radix_sort_single(KeysInputIterator keys_input,
+                                KeysOutputIterator keys_output,
+                                ValuesInputIterator values_input,
+                                ValuesOutputIterator values_output,
+                                unsigned int size,
+                                unsigned int bit,
+                                unsigned int end_bit,
+                                cudaStream_t stream,
+                                bool debug_synchronous)
+    {
+        // Width of the bit range [bit, end_bit) given to the kernel.
+        const unsigned int current_radix_bits = end_bit - bit;
+
+        // Timestamp for the debug timing; only assigned and read in debug mode.
+        std::chrono::high_resolution_clock::time_point start;
+        if(debug_synchronous)
+        {
+            std::cout << "BlockSize " << BlockSize << '\n'
+                      << "ItemsPerThread " << ItemsPerThread << '\n'
+                      << "bit " << bit << '\n'
+                      << "current_radix_bits " << current_radix_bits << '\n';
+            start = std::chrono::high_resolution_clock::now();
+        }
+
+        // One block, BlockSize threads, no dynamic shared memory.
+        dim3 grid_dim(1);
+        dim3 block_dim(BlockSize);
+        sort_single_kernel<BlockSize, ItemsPerThread, Descending>
+            <<<grid_dim, block_dim, 0, stream>>>(
+                keys_input, keys_output, values_input, values_output,
+                size, bit, current_radix_bits
+            );
+        ROCPRIM_DETAIL_HIP_SYNC_AND_RETURN_ON_ERROR("radix_sort_single", size, start)
+
+        return cudaSuccess;
+    }
+
+
+    // Smallest tier of the single-block sort: always launches
+    // radix_sort_single<64, 1, Descending> (BlockSize = 64, ItemsPerThread = 1).
+    //
+    // All arguments are forwarded unchanged and the callee's cudaError_t is
+    // returned. Config is accepted for signature parity with the larger tiers
+    // but is not read here.
+    // NOTE(review): presumably callers guarantee size <= 64 -- confirm.
+    template<
+        class Config,
+        bool Descending,
+        class KeysInputIterator,
+        class KeysOutputIterator,
+        class ValuesInputIterator,
+        class ValuesOutputIterator
+    >
+    inline
+    cudaError_t radix_sort_single_limit64(KeysInputIterator keys_input,
+                                         KeysOutputIterator keys_output,
+                                         ValuesInputIterator values_input,
+                                         ValuesOutputIterator values_output,
+                                         unsigned int size,
+                                         unsigned int bit,
+                                         unsigned int end_bit,
+                                         cudaStream_t stream,
+                                         bool debug_synchronous)
+    {
+        // Launch shape of this tier.
+        constexpr unsigned int tier_block_size       = 64U;
+        constexpr unsigned int tier_items_per_thread = 1U;
+
+        return radix_sort_single<tier_block_size, tier_items_per_thread, Descending>(
+            keys_input, keys_output, values_input, values_output,
+            size, bit, end_bit, stream, debug_synchronous
+        );
+    }
+
+    // Size-tiered dispatch, "limit128" tier.
+    //
+    // If Config::force_single_kernel_config is false and size <= 64, the call is
+    // delegated to radix_sort_single_limit64; otherwise the sort is launched via
+    // radix_sort_single<64, 2, Descending> (BlockSize = 64, ItemsPerThread = 2).
+    // All arguments are forwarded unchanged; the callee's cudaError_t is returned.
+    // NOTE(review): presumably callers guarantee size <= 128 -- confirm.
+    template<
+        class Config,
+        bool Descending,
+        class KeysInputIterator,
+        class KeysOutputIterator,
+        class ValuesInputIterator,
+        class ValuesOutputIterator
+    >
+    inline
+    cudaError_t radix_sort_single_limit128(KeysInputIterator keys_input,
+                                          KeysOutputIterator keys_output,
+                                          ValuesInputIterator values_input,
+                                          ValuesOutputIterator values_output,
+                                          unsigned int size,
+                                          unsigned int bit,
+                                          unsigned int end_bit,
+                                          cudaStream_t stream,
+                                          bool debug_synchronous)
+    {
+        // Use this tier's own kernel when the config pins it or the input is
+        // too large for the next smaller tier.
+        if( Config::force_single_kernel_config || size > 64U )
+        {
+            return radix_sort_single<64U, 2U, Descending>(
+                keys_input, keys_output, values_input, values_output,
+                size, bit, end_bit, stream, debug_synchronous
+            );
+        }
+
+        return radix_sort_single_limit64<Config, Descending>(
+            keys_input, keys_output, values_input, values_output,
+            size, bit, end_bit, stream, debug_synchronous
+        );
+    }
+
+    // Size-tiered dispatch, "limit192" tier.
+    //
+    // If Config::force_single_kernel_config is false and size <= 128, the call is
+    // delegated to radix_sort_single_limit128; otherwise the sort is launched via
+    // radix_sort_single<64, 3, Descending> (BlockSize = 64, ItemsPerThread = 3).
+    // All arguments are forwarded unchanged; the callee's cudaError_t is returned.
+    // NOTE(review): presumably callers guarantee size <= 192 -- confirm.
+    template<
+        class Config,
+        bool Descending,
+        class KeysInputIterator,
+        class KeysOutputIterator,
+        class ValuesInputIterator,
+        class ValuesOutputIterator
+    >
+    inline
+    cudaError_t radix_sort_single_limit192(KeysInputIterator keys_input,
+                                          KeysOutputIterator keys_output,
+                                          ValuesInputIterator values_input,
+                                          ValuesOutputIterator values_output,
+                                          unsigned int size,
+                                          unsigned int bit,
+                                          unsigned int end_bit,
+                                          cudaStream_t stream,
+                                          bool debug_synchronous)
+    {
+        // Use this tier's own kernel when the config pins it or the input is
+        // too large for the next smaller tier.
+        if( Config::force_single_kernel_config || size > 128U )
+        {
+            return radix_sort_single<64U, 3U, Descending>(
+                keys_input, keys_output, values_input, values_output,
+                size, bit, end_bit, stream, debug_synchronous
+            );
+        }
+
+        return radix_sort_single_limit128<Config, Descending>(
+            keys_input, keys_output, values_input, values_output,
+            size, bit, end_bit, stream, debug_synchronous
+        );
+    }
+
+    // Size-tiered dispatch, "limit256" tier.
+    //
+    // If Config::force_single_kernel_config is false and size <= 192, the call is
+    // delegated to radix_sort_single_limit192; otherwise the sort is launched via
+    // radix_sort_single<64, 4, Descending> (BlockSize = 64, ItemsPerThread = 4).
+    // All arguments are forwarded unchanged; the callee's cudaError_t is returned.
+    // NOTE(review): presumably callers guarantee size <= 256 -- confirm.
+    template<
+        class Config,
+        bool Descending,
+        class KeysInputIterator,
+        class KeysOutputIterator,
+        class ValuesInputIterator,
+        class ValuesOutputIterator
+    >
+    inline
+    cudaError_t radix_sort_single_limit256(KeysInputIterator keys_input,
+                                          KeysOutputIterator keys_output,
+                                          ValuesInputIterator values_input,
+                                          ValuesOutputIterator values_output,
+                                          unsigned int size,
+                                          unsigned int bit,
+                                          unsigned int end_bit,
+                                          cudaStream_t stream,
+                                          bool debug_synchronous)
+    {
+        // Use this tier's own kernel when the config pins it or the input is
+        // too large for the next smaller tier.
+        if( Config::force_single_kernel_config || size > 192U )
+        {
+            return radix_sort_single<64U, 4U, Descending>(
+                keys_input, keys_output, values_input, values_output,
+                size, bit, end_bit, stream, debug_synchronous
+            );
+        }
+
+        return radix_sort_single_limit192<Config, Descending>(
+            keys_input, keys_output, values_input, values_output,
+            size, bit, end_bit, stream, debug_synchronous
+        );
+    }
+
+    // Size-tiered dispatch, "limit320" tier.
+    //
+    // If Config::force_single_kernel_config is false and size <= 256, the call is
+    // delegated to radix_sort_single_limit256; otherwise the sort is launched via
+    // radix_sort_single<64, 5, Descending> (BlockSize = 64, ItemsPerThread = 5).
+    // All arguments are forwarded unchanged; the callee's cudaError_t is returned.
+    // NOTE(review): presumably callers guarantee size <= 320 -- confirm.
+    template<
+        class Config,
+        bool Descending,
+        class KeysInputIterator,
+        class KeysOutputIterator,
+        class ValuesInputIterator,
+        class ValuesOutputIterator
+    >
+    inline
+    cudaError_t radix_sort_single_limit320(KeysInputIterator keys_input,
+                                          KeysOutputIterator keys_output,
+                                          ValuesInputIterator values_input,
+                                          ValuesOutputIterator values_output,
+                                          unsigned int size,
+                                          unsigned int bit,
+                                          unsigned int end_bit,
+                                          cudaStream_t stream,
+                                          bool debug_synchronous)
+    {
+        // Use this tier's own kernel when the config pins it or the input is
+        // too large for the next smaller tier.
+        if( Config::force_single_kernel_config || size > 256U )
+        {
+            return radix_sort_single<64U, 5U, Descending>(
+                keys_input, keys_output, values_input, values_output,
+                size, bit, end_bit, stream, debug_synchronous
+            );
+        }
+
+        return radix_sort_single_limit256<Config, Descending>(
+            keys_input, keys_output, values_input, values_output,
+            size, bit, end_bit, stream, debug_synchronous
+        );
+    }
+
+    // Size-tiered dispatch, "limit512" tier.
+    //
+    // If Config::force_single_kernel_config is false and size <= 320, the call is
+    // delegated to radix_sort_single_limit320; otherwise the sort is launched via
+    // radix_sort_single<256, 2, Descending> (BlockSize = 256, ItemsPerThread = 2).
+    // All arguments are forwarded unchanged; the callee's cudaError_t is returned.
+    // NOTE(review): presumably callers guarantee size <= 512 -- confirm.
+    template<
+        class Config,
+        bool Descending,
+        class KeysInputIterator,
+        class KeysOutputIterator,
+        class ValuesInputIterator,
+        class ValuesOutputIterator
+    >
+    inline
+    cudaError_t radix_sort_single_limit512(KeysInputIterator keys_input,
+                                          KeysOutputIterator keys_output,
+                                          ValuesInputIterator values_input,
+                                          ValuesOutputIterator values_output,
+                                          unsigned int size,
+                                          unsigned int bit,
+                                          unsigned int end_bit,
+                                          cudaStream_t stream,
+                                          bool debug_synchronous)
+    {
+        // Use this tier's own kernel when the config pins it or the input is
+        // too large for the next smaller tier.
+        if( Config::force_single_kernel_config || size > 320U )
+        {
+            return radix_sort_single<256U, 2U, Descending>(
+                keys_input, keys_output, values_input, values_output,
+                size, bit, end_bit, stream, debug_synchronous
+            );
+        }
+
+        return radix_sort_single_limit320<Config, Descending>(
+            keys_input, keys_output, values_input, values_output,
+            size, bit, end_bit, stream, debug_synchronous
+        );
+    }
+
+    // Size-tiered dispatch, "limit768" tier.
+    //
+    // If Config::force_single_kernel_config is false and size <= 512, the call is
+    // delegated to radix_sort_single_limit512; otherwise the sort is launched via
+    // radix_sort_single<256, 3, Descending> (BlockSize = 256, ItemsPerThread = 3).
+    // All arguments are forwarded unchanged; the callee's cudaError_t is returned.
+    // NOTE(review): presumably callers guarantee size <= 768 -- confirm.
+    template<
+        class Config,
+        bool Descending,
+        class KeysInputIterator,
+        class KeysOutputIterator,
+        class ValuesInputIterator,
+        class ValuesOutputIterator
+    >
+    inline
+    cudaError_t radix_sort_single_limit768(KeysInputIterator keys_input,
+                                          KeysOutputIterator keys_output,
+                                          ValuesInputIterator values_input,
+                                          ValuesOutputIterator values_output,
+                                          unsigned int size,
+                                          unsigned int bit,
+                                          unsigned int end_bit,
+                                          cudaStream_t stream,
+                                          bool debug_synchronous)
+    {
+        // Use this tier's own kernel when the config pins it or the input is
+        // too large for the next smaller tier.
+        if( Config::force_single_kernel_config || size > 512U )
+        {
+            return radix_sort_single<256U, 3U, Descending>(
+                keys_input, keys_output, values_input, values_output,
+                size, bit, end_bit, stream, debug_synchronous
+            );
+        }
+
+        return radix_sort_single_limit512<Config, Descending>(
+            keys_input, keys_output, values_input, values_output,
+            size, bit, end_bit, stream, debug_synchronous
+        );
+    }
+
+    // Size-tiered dispatch, "limit1024" tier.
+    //
+    // If Config::force_single_kernel_config is false and size <= 768, the call is
+    // delegated to radix_sort_single_limit768; otherwise the sort is launched via
+    // radix_sort_single<256, 4, Descending> (BlockSize = 256, ItemsPerThread = 4).
+    // All arguments are forwarded unchanged; the callee's cudaError_t is returned.
+    // NOTE(review): presumably callers guarantee size <= 1024 -- confirm.
+    template<
+        class Config,
+        bool Descending,
+        class KeysInputIterator,
+        class KeysOutputIterator,
+        class ValuesInputIterator,
+        class ValuesOutputIterator
+    >
+    inline
+    cudaError_t radix_sort_single_limit1024(KeysInputIterator keys_input,
+                                           KeysOutputIterator keys_output,
+                                           ValuesInputIterator values_input,
+                                           ValuesOutputIterator values_output,
+                                           unsigned int size,
+                                           unsigned int bit,
+                                           unsigned int end_bit,
+                                           cudaStream_t stream,
+                                           bool debug_synchronous)
+    {
+        // Use this tier's own kernel when the config pins it or the input is
+        // too large for the next smaller tier.
+        if( Config::force_single_kernel_config || size > 768U )
+        {
+            return radix_sort_single<256U, 4U, Descending>(
+                keys_input, keys_output, values_input, values_output,
+                size, bit, end_bit, stream, debug_synchronous
+            );
+        }
+
+        return radix_sort_single_limit768<Config, Descending>(
+            keys_input, keys_output, values_input, values_output,
+            size, bit, end_bit, stream, debug_synchronous
+        );
+    }
+
+
+
+    // Size-tiered dispatch, "limit1536" tier.
+    //
+    // If Config::force_single_kernel_config is false and size <= 1024, the call
+    // is delegated to radix_sort_single_limit1024; otherwise the sort is launched
+    // via radix_sort_single<256, 6, Descending> (BlockSize = 256,
+    // ItemsPerThread = 6).
+    // All arguments are forwarded unchanged; the callee's cudaError_t is returned.
+    // NOTE(review): presumably callers guarantee size <= 1536 -- confirm.
+    template<
+        class Config,
+        bool Descending,
+        class KeysInputIterator,
+        class KeysOutputIterator,
+        class ValuesInputIterator,
+        class ValuesOutputIterator
+    >
+    inline
+    cudaError_t radix_sort_single_limit1536(KeysInputIterator keys_input,
+                                           KeysOutputIterator keys_output,
+                                           ValuesInputIterator values_input,
+                                           ValuesOutputIterator values_output,
+                                           unsigned int size,
+                                           unsigned int bit,
+                                           unsigned int end_bit,
+                                           cudaStream_t stream,
+                                           bool debug_synchronous)
+    {
+        // Use this tier's own kernel when the config pins it or the input is
+        // too large for the next smaller tier.
+        if( Config::force_single_kernel_config || size > 1024U )
+        {
+            return radix_sort_single<256U, 6U, Descending>(
+                keys_input, keys_output, values_input, values_output,
+                size, bit, end_bit, stream, debug_synchronous
+            );
+        }
+
+        return radix_sort_single_limit1024<Config, Descending>(
+            keys_input, keys_output, values_input, values_output,
+            size, bit, end_bit, stream, debug_synchronous
+        );
+    }
+
+    // Size-tiered dispatch, "limit2048" tier.
+    //
+    // If Config::force_single_kernel_config is false and size <= 1536, the call
+    // is delegated to radix_sort_single_limit1536; otherwise the sort is launched
+    // via radix_sort_single<256, 8, Descending> (BlockSize = 256,
+    // ItemsPerThread = 8).
+    // All arguments are forwarded unchanged; the callee's cudaError_t is returned.
+    // NOTE(review): presumably callers guarantee size <= 2048 -- confirm.
+    template<
+        class Config,
+        bool Descending,
+        class KeysInputIterator,
+        class KeysOutputIterator,
+        class ValuesInputIterator,
+        class ValuesOutputIterator
+    >
+    inline
+    cudaError_t radix_sort_single_limit2048(KeysInputIterator keys_input,
+                                           KeysOutputIterator keys_output,
+                                           ValuesInputIterator values_input,
+                                           ValuesOutputIterator values_output,
+                                           unsigned int size,
+                                           unsigned int bit,
+                                           unsigned int end_bit,
+                                           cudaStream_t stream,
+                                           bool debug_synchronous)
+    {
+        // Use this tier's own kernel when the config pins it or the input is
+        // too large for the next smaller tier.
+        if( Config::force_single_kernel_config || size > 1536U )
+        {
+            return radix_sort_single<256U, 8U, Descending>(
+                keys_input, keys_output, values_input, values_output,
+                size, bit, end_bit, stream, debug_synchronous
+            );
+        }
+
+        return radix_sort_single_limit1536<Config, Descending>(
+            keys_input, keys_output, values_input, values_output,
+            size, bit, end_bit, stream, debug_synchronous
+        );
+    }
+
+    // Size-tiered dispatch, "limit2560" tier.
+    //
+    // If Config::force_single_kernel_config is false and size <= 2048, the call
+    // is delegated to radix_sort_single_limit2048; otherwise the sort is launched
+    // via radix_sort_single<256, 10, Descending> (BlockSize = 256,
+    // ItemsPerThread = 10).
+    // All arguments are forwarded unchanged; the callee's cudaError_t is returned.
+    // NOTE(review): presumably callers guarantee size <= 2560 -- confirm.
+    template<
+        class Config,
+        bool Descending,
+        class KeysInputIterator,
+        class KeysOutputIterator,
+        class ValuesInputIterator,
+        class ValuesOutputIterator
+    >
+    inline
+    cudaError_t radix_sort_single_limit2560(KeysInputIterator keys_input,
+                                           KeysOutputIterator keys_output,
+                                           ValuesInputIterator values_input,
+                                           ValuesOutputIterator values_output,
+                                           unsigned int size,
+                                           unsigned int bit,
+                                           unsigned int end_bit,
+                                           cudaStream_t stream,
+                                           bool debug_synchronous)
+    {
+        // Use this tier's own kernel when the config pins it or the input is
+        // too large for the next smaller tier.
+        if( Config::force_single_kernel_config || size > 2048U )
+        {
+            return radix_sort_single<256U, 10U, Descending>(
+                keys_input, keys_output, values_input, values_output,
+                size, bit, end_bit, stream, debug_synchronous
+            );
+        }
+
+        return radix_sort_single_limit2048<Config, Descending>(
+            keys_input, keys_output, values_input, values_output,
+            size, bit, end_bit, stream, debug_synchronous
+        );
+    }
+
+    // Size-tiered dispatch, "limit3072" tier.
+    //
+    // If Config::force_single_kernel_config is false and size <= 2560, the call
+    // is delegated to radix_sort_single_limit2560; otherwise the sort is launched
+    // via radix_sort_single<256, 12, Descending> (BlockSize = 256,
+    // ItemsPerThread = 12).
+    // All arguments are forwarded unchanged; the callee's cudaError_t is returned.
+    // NOTE(review): presumably callers guarantee size <= 3072 -- confirm.
+    template<
+        class Config,
+        bool Descending,
+        class KeysInputIterator,
+        class KeysOutputIterator,
+        class ValuesInputIterator,
+        class ValuesOutputIterator
+    >
+    inline
+    cudaError_t radix_sort_single_limit3072(KeysInputIterator keys_input,
+                                           KeysOutputIterator keys_output,
+                                           ValuesInputIterator values_input,
+                                           ValuesOutputIterator values_output,
+                                           unsigned int size,
+                                           unsigned int bit,
+                                           unsigned int end_bit,
+                                           cudaStream_t stream,
+                                           bool debug_synchronous)
+    {
+        // Use this tier's own kernel when the config pins it or the input is
+        // too large for the next smaller tier.
+        if( Config::force_single_kernel_config || size > 2560U )
+        {
+            return radix_sort_single<256U, 12U, Descending>(
+                keys_input, keys_output, values_input, values_output,
+                size, bit, end_bit, stream, debug_synchronous
+            );
+        }
+
+        return radix_sort_single_limit2560<Config, Descending>(
+            keys_input, keys_output, values_input, values_output,
+            size, bit, end_bit, stream, debug_synchronous
+        );
+    }
+
+    // Size-tiered dispatch, "limit3584" tier.
+    //
+    // If Config::force_single_kernel_config is false and size <= 3072, the call
+    // is delegated to radix_sort_single_limit3072; otherwise the sort is launched
+    // via radix_sort_single<256, 14, Descending> (BlockSize = 256,
+    // ItemsPerThread = 14).
+    // All arguments are forwarded unchanged; the callee's cudaError_t is returned.
+    // NOTE(review): presumably callers guarantee size <= 3584 -- confirm.
+    template<
+        class Config,
+        bool Descending,
+        class KeysInputIterator,
+        class KeysOutputIterator,
+        class ValuesInputIterator,
+        class ValuesOutputIterator
+    >
+    inline
+    cudaError_t radix_sort_single_limit3584(KeysInputIterator keys_input,
+                                           KeysOutputIterator keys_output,
+                                           ValuesInputIterator values_input,
+                                           ValuesOutputIterator values_output,
+                                           unsigned int size,
+                                           unsigned int bit,
+                                           unsigned int end_bit,
+                                           cudaStream_t stream,
+                                           bool debug_synchronous)
+    {
+        // Use this tier's own kernel when the config pins it or the input is
+        // too large for the next smaller tier.
+        if( Config::force_single_kernel_config || size > 3072U )
+        {
+            return radix_sort_single<256U, 14U, Descending>(
+                keys_input, keys_output, values_input, values_output,
+                size, bit, end_bit, stream, debug_synchronous
+            );
+        }
+
+        return radix_sort_single_limit3072<Config, Descending>(
+            keys_input, keys_output, values_input, values_output,
+            size, bit, end_bit, stream, debug_synchronous
+        );
+    }
+
+    // Size-tiered dispatch, "limit4096" tier.
+    //
+    // If Config::force_single_kernel_config is false and size <= 3584, the call
+    // is delegated to radix_sort_single_limit3584; otherwise the sort is launched
+    // via radix_sort_single<256, 16, Descending> (BlockSize = 256,
+    // ItemsPerThread = 16).
+    // All arguments are forwarded unchanged; the callee's cudaError_t is returned.
+    // NOTE(review): presumably callers guarantee size <= 4096 -- confirm.
+    template<
+        class Config,
+        bool Descending,
+        class KeysInputIterator,
+        class KeysOutputIterator,
+        class ValuesInputIterator,
+        class ValuesOutputIterator
+    >
+    inline
+    cudaError_t radix_sort_single_limit4096(KeysInputIterator keys_input,
+                                           KeysOutputIterator keys_output,
+                                           ValuesInputIterator values_input,
+                                           ValuesOutputIterator values_output,
+                                           unsigned int size,
+                                           unsigned int bit,
+                                           unsigned int end_bit,
+                                           cudaStream_t stream,
+                                           bool debug_synchronous)
+    {
+        // Use this tier's own kernel when the config pins it or the input is
+        // too large for the next smaller tier.
+        if( Config::force_single_kernel_config || size > 3584U )
+        {
+            return radix_sort_single<256U, 16U, Descending>(
+                keys_input, keys_output, values_input, values_output,
+                size, bit, end_bit, stream, debug_synchronous
+            );
+        }
+
+        return radix_sort_single_limit3584<Config, Descending>(
+            keys_input, keys_output, values_input, values_output,
+            size, bit, end_bit, stream, debug_synchronous
+        );
+    }
+
+    // Config-selected entry point of the single-block sort.
+    //
+    // This overload takes part in overload resolution only when
+    // Config::sort_single::items_per_thread * Config::sort_single::block_size
+    // is at most 64. It forwards every argument unchanged to
+    // radix_sort_single_limit64 and returns that call's cudaError_t.
+    template<
+        class Config,
+        bool Descending,
+        class KeysInputIterator,
+        class KeysOutputIterator,
+        class ValuesInputIterator,
+        class ValuesOutputIterator
+    >
+    inline
+    auto radix_sort_single(KeysInputIterator keys_input,
+                           KeysOutputIterator keys_output,
+                           ValuesInputIterator values_input,
+                           ValuesOutputIterator values_output,
+                           unsigned int size,
+                           unsigned int bit,
+                           unsigned int end_bit,
+                           cudaStream_t stream,
+                           bool debug_synchronous)
+        -> typename std::enable_if<
+            Config::sort_single::items_per_thread * Config::sort_single::block_size <= 64U,
+            cudaError_t
+        >::type
+    {
+        return radix_sort_single_limit64<Config, Descending>(
+            keys_input, keys_output, values_input, values_output,
+            size, bit, end_bit, stream, debug_synchronous
+        );
+    }
+
+    // Config-selected entry point of the single-block sort.
+    //
+    // This overload takes part in overload resolution only when
+    // Config::sort_single::items_per_thread * Config::sort_single::block_size
+    // is greater than 64 and at most 128. It forwards every argument unchanged
+    // to radix_sort_single_limit128 and returns that call's cudaError_t.
+    template<
+        class Config,
+        bool Descending,
+        class KeysInputIterator,
+        class KeysOutputIterator,
+        class ValuesInputIterator,
+        class ValuesOutputIterator
+    >
+    inline
+    auto radix_sort_single(KeysInputIterator keys_input,
+                           KeysOutputIterator keys_output,
+                           ValuesInputIterator values_input,
+                           ValuesOutputIterator values_output,
+                           unsigned int size,
+                           unsigned int bit,
+                           unsigned int end_bit,
+                           cudaStream_t stream,
+                           bool debug_synchronous)
+        -> typename std::enable_if<
+            (Config::sort_single::items_per_thread * Config::sort_single::block_size > 64U) &&
+            Config::sort_single::items_per_thread * Config::sort_single::block_size <= 128U,
+            cudaError_t
+        >::type
+    {
+        return radix_sort_single_limit128<Config, Descending>(
+            keys_input, keys_output, values_input, values_output,
+            size, bit, end_bit, stream, debug_synchronous
+        );
+    }
+
+    // Config-selected entry point of the single-block sort.
+    //
+    // This overload takes part in overload resolution only when
+    // Config::sort_single::items_per_thread * Config::sort_single::block_size
+    // is greater than 128 and at most 192. It forwards every argument unchanged
+    // to radix_sort_single_limit192 and returns that call's cudaError_t.
+    template<
+        class Config,
+        bool Descending,
+        class KeysInputIterator,
+        class KeysOutputIterator,
+        class ValuesInputIterator,
+        class ValuesOutputIterator
+    >
+    inline
+    auto radix_sort_single(KeysInputIterator keys_input,
+                           KeysOutputIterator keys_output,
+                           ValuesInputIterator values_input,
+                           ValuesOutputIterator values_output,
+                           unsigned int size,
+                           unsigned int bit,
+                           unsigned int end_bit,
+                           cudaStream_t stream,
+                           bool debug_synchronous)
+        -> typename std::enable_if<
+            (Config::sort_single::items_per_thread * Config::sort_single::block_size > 128U) &&
+            Config::sort_single::items_per_thread * Config::sort_single::block_size <= 192U,
+            cudaError_t
+        >::type
+    {
+        return radix_sort_single_limit192<Config, Descending>(
+            keys_input, keys_output, values_input, values_output,
+            size, bit, end_bit, stream, debug_synchronous
+        );
+    }
+
+    // Config-selected entry point of the single-block sort.
+    //
+    // This overload takes part in overload resolution only when
+    // Config::sort_single::items_per_thread * Config::sort_single::block_size
+    // is greater than 192 and at most 256. It forwards every argument unchanged
+    // to radix_sort_single_limit256 and returns that call's cudaError_t.
+    template<
+        class Config,
+        bool Descending,
+        class KeysInputIterator,
+        class KeysOutputIterator,
+        class ValuesInputIterator,
+        class ValuesOutputIterator
+    >
+    inline
+    auto radix_sort_single(KeysInputIterator keys_input,
+                           KeysOutputIterator keys_output,
+                           ValuesInputIterator values_input,
+                           ValuesOutputIterator values_output,
+                           unsigned int size,
+                           unsigned int bit,
+                           unsigned int end_bit,
+                           cudaStream_t stream,
+                           bool debug_synchronous)
+        -> typename std::enable_if<
+            (Config::sort_single::items_per_thread * Config::sort_single::block_size > 192U) &&
+            Config::sort_single::items_per_thread * Config::sort_single::block_size <= 256U,
+            cudaError_t
+        >::type
+    {
+        return radix_sort_single_limit256<Config, Descending>(
+            keys_input, keys_output, values_input, values_output,
+            size, bit, end_bit, stream, debug_synchronous
+        );
+    }
+
+    // Config-selected entry point of the single-block sort.
+    //
+    // This overload takes part in overload resolution only when
+    // Config::sort_single::items_per_thread * Config::sort_single::block_size
+    // is greater than 256 and at most 320. It forwards every argument unchanged
+    // to radix_sort_single_limit320 and returns that call's cudaError_t.
+    template<
+        class Config,
+        bool Descending,
+        class KeysInputIterator,
+        class KeysOutputIterator,
+        class ValuesInputIterator,
+        class ValuesOutputIterator
+    >
+    inline
+    auto radix_sort_single(KeysInputIterator keys_input,
+                           KeysOutputIterator keys_output,
+                           ValuesInputIterator values_input,
+                           ValuesOutputIterator values_output,
+                           unsigned int size,
+                           unsigned int bit,
+                           unsigned int end_bit,
+                           cudaStream_t stream,
+                           bool debug_synchronous)
+        -> typename std::enable_if<
+            (Config::sort_single::items_per_thread * Config::sort_single::block_size > 256U) &&
+            Config::sort_single::items_per_thread * Config::sort_single::block_size <= 320U,
+            cudaError_t
+        >::type
+    {
+        return radix_sort_single_limit320<Config, Descending>(
+            keys_input, keys_output, values_input, values_output,
+            size, bit, end_bit, stream, debug_synchronous
+        );
+    }
+
+    // Config-selected entry point of the single-block sort.
+    //
+    // This overload takes part in overload resolution only when
+    // Config::sort_single::items_per_thread * Config::sort_single::block_size
+    // is greater than 320 and at most 512. It forwards every argument unchanged
+    // to radix_sort_single_limit512 and returns that call's cudaError_t.
+    template<
+        class Config,
+        bool Descending,
+        class KeysInputIterator,
+        class KeysOutputIterator,
+        class ValuesInputIterator,
+        class ValuesOutputIterator
+    >
+    inline
+    auto radix_sort_single(KeysInputIterator keys_input,
+                           KeysOutputIterator keys_output,
+                           ValuesInputIterator values_input,
+                           ValuesOutputIterator values_output,
+                           unsigned int size,
+                           unsigned int bit,
+                           unsigned int end_bit,
+                           cudaStream_t stream,
+                           bool debug_synchronous)
+        -> typename std::enable_if<
+            (Config::sort_single::items_per_thread * Config::sort_single::block_size > 320U) &&
+            Config::sort_single::items_per_thread * Config::sort_single::block_size <= 512U,
+            cudaError_t
+        >::type
+    {
+        return radix_sort_single_limit512<Config, Descending>(
+            keys_input, keys_output, values_input, values_output,
+            size, bit, end_bit, stream, debug_synchronous
+        );
+    }
+
+    // Config-selected entry point of the single-block sort.
+    //
+    // This overload takes part in overload resolution only when
+    // Config::sort_single::items_per_thread * Config::sort_single::block_size
+    // is greater than 512 and at most 768. It forwards every argument unchanged
+    // to radix_sort_single_limit768 and returns that call's cudaError_t.
+    template<
+        class Config,
+        bool Descending,
+        class KeysInputIterator,
+        class KeysOutputIterator,
+        class ValuesInputIterator,
+        class ValuesOutputIterator
+    >
+    inline
+    auto radix_sort_single(KeysInputIterator keys_input,
+                           KeysOutputIterator keys_output,
+                           ValuesInputIterator values_input,
+                           ValuesOutputIterator values_output,
+                           unsigned int size,
+                           unsigned int bit,
+                           unsigned int end_bit,
+                           cudaStream_t stream,
+                           bool debug_synchronous)
+        -> typename std::enable_if<
+            (Config::sort_single::items_per_thread * Config::sort_single::block_size > 512U) &&
+            Config::sort_single::items_per_thread * Config::sort_single::block_size <= 768U,
+            cudaError_t
+        >::type
+    {
+        return radix_sort_single_limit768<Config, Descending>(
+            keys_input, keys_output, values_input, values_output,
+            size, bit, end_bit, stream, debug_synchronous
+        );
+    }
+
+    /// \brief Single-sort dispatch for configurations whose
+    /// items_per_thread * block_size product is above 768 and at most 1024.
+    ///
+    /// Enabled only for that range; all arguments are forwarded unchanged to
+    /// radix_sort_single_limit1024 and its result is returned.
+    template<
+        class Config,
+        bool Descending,
+        class KeysInputIterator,
+        class KeysOutputIterator,
+        class ValuesInputIterator,
+        class ValuesOutputIterator
+    >
+    inline
+    auto radix_sort_single(KeysInputIterator keys_input,
+                           KeysOutputIterator keys_output,
+                           ValuesInputIterator values_input,
+                           ValuesOutputIterator values_output,
+                           unsigned int size,
+                           unsigned int bit,
+                           unsigned int end_bit,
+                           cudaStream_t stream,
+                           bool debug_synchronous)
+        -> typename std::enable_if<
+            (Config::sort_single::items_per_thread * Config::sort_single::block_size > 768U) &&
+            Config::sort_single::items_per_thread * Config::sort_single::block_size <= 1024U,
+            cudaError_t
+        >::type
+    {
+        const cudaError_t status = radix_sort_single_limit1024<Config, Descending>(
+            keys_input, keys_output, values_input, values_output,
+            size, bit, end_bit, stream, debug_synchronous
+        );
+        return status;
+    }
+
+    /// \brief Single-sort dispatch for configurations whose
+    /// items_per_thread * block_size product is above 1024 and at most 1536.
+    ///
+    /// Enabled only for that range; all arguments are forwarded unchanged to
+    /// radix_sort_single_limit1536 and its result is returned.
+    template<
+        class Config,
+        bool Descending,
+        class KeysInputIterator,
+        class KeysOutputIterator,
+        class ValuesInputIterator,
+        class ValuesOutputIterator
+    >
+    inline
+    auto radix_sort_single(KeysInputIterator keys_input,
+                           KeysOutputIterator keys_output,
+                           ValuesInputIterator values_input,
+                           ValuesOutputIterator values_output,
+                           unsigned int size,
+                           unsigned int bit,
+                           unsigned int end_bit,
+                           cudaStream_t stream,
+                           bool debug_synchronous)
+        -> typename std::enable_if<
+            (Config::sort_single::items_per_thread * Config::sort_single::block_size > 1024U) &&
+            Config::sort_single::items_per_thread * Config::sort_single::block_size <= 1536U,
+            cudaError_t
+        >::type
+    {
+        const cudaError_t status = radix_sort_single_limit1536<Config, Descending>(
+            keys_input, keys_output, values_input, values_output,
+            size, bit, end_bit, stream, debug_synchronous
+        );
+        return status;
+    }
+
+    /// \brief Single-sort dispatch for configurations whose
+    /// items_per_thread * block_size product is above 1536 and at most 2048.
+    ///
+    /// Enabled only for that range; all arguments are forwarded unchanged to
+    /// radix_sort_single_limit2048 and its result is returned.
+    template<
+        class Config,
+        bool Descending,
+        class KeysInputIterator,
+        class KeysOutputIterator,
+        class ValuesInputIterator,
+        class ValuesOutputIterator
+    >
+    inline
+    auto radix_sort_single(KeysInputIterator keys_input,
+                           KeysOutputIterator keys_output,
+                           ValuesInputIterator values_input,
+                           ValuesOutputIterator values_output,
+                           unsigned int size,
+                           unsigned int bit,
+                           unsigned int end_bit,
+                           cudaStream_t stream,
+                           bool debug_synchronous)
+        -> typename std::enable_if<
+            (Config::sort_single::items_per_thread * Config::sort_single::block_size > 1536U) &&
+            Config::sort_single::items_per_thread * Config::sort_single::block_size <= 2048U,
+            cudaError_t
+        >::type
+    {
+        const cudaError_t status = radix_sort_single_limit2048<Config, Descending>(
+            keys_input, keys_output, values_input, values_output,
+            size, bit, end_bit, stream, debug_synchronous
+        );
+        return status;
+    }
+
+    /// \brief Single-sort dispatch for configurations whose
+    /// items_per_thread * block_size product is above 2048 and at most 2560.
+    ///
+    /// Enabled only for that range; all arguments are forwarded unchanged to
+    /// radix_sort_single_limit2560 and its result is returned.
+    template<
+        class Config,
+        bool Descending,
+        class KeysInputIterator,
+        class KeysOutputIterator,
+        class ValuesInputIterator,
+        class ValuesOutputIterator
+    >
+    inline
+    auto radix_sort_single(KeysInputIterator keys_input,
+                           KeysOutputIterator keys_output,
+                           ValuesInputIterator values_input,
+                           ValuesOutputIterator values_output,
+                           unsigned int size,
+                           unsigned int bit,
+                           unsigned int end_bit,
+                           cudaStream_t stream,
+                           bool debug_synchronous)
+        -> typename std::enable_if<
+            (Config::sort_single::items_per_thread * Config::sort_single::block_size > 2048U) &&
+            Config::sort_single::items_per_thread * Config::sort_single::block_size <= 2560U,
+            cudaError_t
+        >::type
+    {
+        const cudaError_t status = radix_sort_single_limit2560<Config, Descending>(
+            keys_input, keys_output, values_input, values_output,
+            size, bit, end_bit, stream, debug_synchronous
+        );
+        return status;
+    }
+
+    /// \brief Single-sort dispatch for configurations whose
+    /// items_per_thread * block_size product is above 2560 and at most 3072.
+    ///
+    /// Enabled only for that range; all arguments are forwarded unchanged to
+    /// radix_sort_single_limit3072 and its result is returned.
+    template<
+        class Config,
+        bool Descending,
+        class KeysInputIterator,
+        class KeysOutputIterator,
+        class ValuesInputIterator,
+        class ValuesOutputIterator
+    >
+    inline
+    auto radix_sort_single(KeysInputIterator keys_input,
+                           KeysOutputIterator keys_output,
+                           ValuesInputIterator values_input,
+                           ValuesOutputIterator values_output,
+                           unsigned int size,
+                           unsigned int bit,
+                           unsigned int end_bit,
+                           cudaStream_t stream,
+                           bool debug_synchronous)
+        -> typename std::enable_if<
+            (Config::sort_single::items_per_thread * Config::sort_single::block_size > 2560) &&
+            Config::sort_single::items_per_thread * Config::sort_single::block_size <= 3072,
+            cudaError_t
+        >::type
+    {
+        const cudaError_t status = radix_sort_single_limit3072<Config, Descending>(
+            keys_input, keys_output, values_input, values_output,
+            size, bit, end_bit, stream, debug_synchronous
+        );
+        return status;
+    }
+
+    /// \brief Single-sort dispatch for configurations whose
+    /// items_per_thread * block_size product is above 3072 and at most 3584.
+    ///
+    /// Enabled only for that range; all arguments are forwarded unchanged to
+    /// radix_sort_single_limit3584 and its result is returned.
+    template<
+        class Config,
+        bool Descending,
+        class KeysInputIterator,
+        class KeysOutputIterator,
+        class ValuesInputIterator,
+        class ValuesOutputIterator
+    >
+    inline
+    auto radix_sort_single(KeysInputIterator keys_input,
+                           KeysOutputIterator keys_output,
+                           ValuesInputIterator values_input,
+                           ValuesOutputIterator values_output,
+                           unsigned int size,
+                           unsigned int bit,
+                           unsigned int end_bit,
+                           cudaStream_t stream,
+                           bool debug_synchronous)
+        -> typename std::enable_if<
+            (Config::sort_single::items_per_thread * Config::sort_single::block_size > 3072) &&
+            Config::sort_single::items_per_thread * Config::sort_single::block_size <= 3584,
+            cudaError_t
+        >::type
+    {
+        const cudaError_t status = radix_sort_single_limit3584<Config, Descending>(
+            keys_input, keys_output, values_input, values_output,
+            size, bit, end_bit, stream, debug_synchronous
+        );
+        return status;
+    }
+
+    /// \brief Single-sort dispatch for configurations whose
+    /// items_per_thread * block_size product is above 3584 and at most 4096.
+    ///
+    /// Enabled only for that range; all arguments are forwarded unchanged to
+    /// radix_sort_single_limit4096 and its result is returned.
+    template<
+        class Config,
+        bool Descending,
+        class KeysInputIterator,
+        class KeysOutputIterator,
+        class ValuesInputIterator,
+        class ValuesOutputIterator
+    >
+    inline
+    auto radix_sort_single(KeysInputIterator keys_input,
+                           KeysOutputIterator keys_output,
+                           ValuesInputIterator values_input,
+                           ValuesOutputIterator values_output,
+                           unsigned int size,
+                           unsigned int bit,
+                           unsigned int end_bit,
+                           cudaStream_t stream,
+                           bool debug_synchronous)
+        -> typename std::enable_if<
+            (Config::sort_single::items_per_thread * Config::sort_single::block_size > 3584) &&
+            Config::sort_single::items_per_thread * Config::sort_single::block_size <= 4096,
+            cudaError_t
+        >::type
+    {
+        const cudaError_t status = radix_sort_single_limit4096<Config, Descending>(
+            keys_input, keys_output, values_input, values_output,
+            size, bit, end_bit, stream, debug_synchronous
+        );
+        return status;
+    }
+
+    /// \brief Single-sort dispatch for configurations whose
+    /// items_per_thread * block_size product exceeds 4096.
+    ///
+    /// The choice is made at run time from \p size: inputs with fewer than 4096
+    /// items go to radix_sort_single_limit4096, all others (including exactly
+    /// 4096 items) go to the radix_sort_single overload parameterized by the
+    /// configured block size and items per thread.
+    template<
+        class Config,
+        bool Descending,
+        class KeysInputIterator,
+        class KeysOutputIterator,
+        class ValuesInputIterator,
+        class ValuesOutputIterator
+    >
+    inline
+    auto radix_sort_single(KeysInputIterator keys_input,
+                           KeysOutputIterator keys_output,
+                           ValuesInputIterator values_input,
+                           ValuesOutputIterator values_output,
+                           unsigned int size,
+                           unsigned int bit,
+                           unsigned int end_bit,
+                           cudaStream_t stream,
+                           bool debug_synchronous)
+        -> typename std::enable_if<
+            (Config::sort_single::items_per_thread * Config::sort_single::block_size > 4096),
+            cudaError_t
+        >::type
+    {
+        if( size >= 4096 )
+        {
+            // Large input: use the kernel sized by the configuration itself.
+            return radix_sort_single<
+                Config::sort_single::block_size,
+                Config::sort_single::items_per_thread,
+                Descending
+            >(
+                    keys_input, keys_output, values_input, values_output,
+                    size, bit, end_bit, stream, debug_synchronous
+            );
+        }
+
+        // Small input: the 4096-item variant is sufficient.
+        return radix_sort_single_limit4096<Config, Descending>(
+            keys_input, keys_output, values_input, values_output,
+            size, bit, end_bit, stream, debug_synchronous
+        );
+    }
+
+} // end namespace detail
+
+END_ROCPRIM_NAMESPACE
+
+#endif // ROCPRIM_DEVICE_SPECIALIZATION_DEVICE_RADIX_SINGLE_SORT_HPP_
--- a/3rdparty/cub/rocprim/functional.hpp
+++ b/3rdparty/cub/rocprim/functional.hpp
+// Copyright (c) 2017-2022 Advanced Micro Devices, Inc. All rights reserved.
+//
+// Permission is hereby granted, free of charge, to any person obtaining a copy
+// of this software and associated documentation files (the "Software"), to deal
+// in the Software without restriction, including without limitation the rights
+// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+// copies of the Software, and to permit persons to whom the Software is
+// furnished to do so, subject to the following conditions:
+//
+// The above copyright notice and this permission notice shall be included in
+// all copies or substantial portions of the Software.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+// THE SOFTWARE.
+
+#ifndef ROCPRIM_FUNCTIONAL_HPP_
+#define ROCPRIM_FUNCTIONAL_HPP_
+
+#include <functional>
+
+// Meta configuration for rocPRIM
+#include "config.hpp"
+
+BEGIN_ROCPRIM_NAMESPACE
+
+/// \addtogroup utilsmodule_functional
+/// @{
+
+// Device-side helper that prints \p message once per launch.
+// It sums (threadIdx + blockIdx * blockDim) over the x, y and z dimensions and
+// only the thread for which that sum is zero calls printf.
+// The argument is stringified with '#', so it is printed exactly as spelled at
+// the call site (a string literal keeps its quotes) and is never evaluated.
+// NOTE(review): expands to a bare brace block, not do { } while(0); take care
+// when using it as the body of an unbraced if/else.
+#define ROCPRIM_PRINT_ERROR_ONCE(message) \
+{                                          \
+    unsigned int idx = threadIdx.x + (blockIdx.x * blockDim.x); \
+    idx += threadIdx.y + (blockIdx.y * blockDim.y);             \
+    idx += threadIdx.z + (blockIdx.z * blockDim.z);             \
+    if (idx == 0)                                                        \
+        printf("%s\n", #message);                                        \
+}
+
+/// \brief Returns a copy of the larger of the two arguments, compared with operator<.
+/// When neither argument is less than the other, the first one is returned.
+template<class T>
+ROCPRIM_HOST_DEVICE inline
+constexpr T max(const T& lhs, const T& rhs)
+{
+    return (lhs < rhs) ? rhs : lhs;
+}
+
+/// \brief Returns a copy of the smaller of the two arguments, compared with operator<.
+/// When the first argument is not less than the second, the second one is returned.
+template<class T>
+ROCPRIM_HOST_DEVICE inline
+constexpr T min(const T& lhs, const T& rhs)
+{
+    return (lhs < rhs) ? lhs : rhs;
+}
+
+/// \brief Exchanges the values of \p a and \p b using one copy and two copy-assignments.
+template<class T>
+ROCPRIM_HOST_DEVICE inline
+void swap(T& a, T& b)
+{
+    T saved = a; // keep the old value of a while it is overwritten
+    a = b;
+    b = saved;
+}
+
+/// \brief Function object comparing two values of type \p T with operator<.
+template<class T = void>
+struct less
+{
+    /// \brief Returns true when \p lhs < \p rhs.
+    ROCPRIM_HOST_DEVICE inline
+    constexpr bool operator()(const T& lhs, const T& rhs) const
+    {
+        return lhs < rhs;
+    }
+};
+
+/// \brief Specialization of less that deduces both operand types at the call.
+template<>
+struct less<void>
+{
+    /// \brief Returns true when \p lhs < \p rhs; the operands may differ in type.
+    template<class Lhs, class Rhs>
+    ROCPRIM_HOST_DEVICE inline
+    constexpr bool operator()(const Lhs& lhs, const Rhs& rhs) const
+    {
+        return lhs < rhs;
+    }
+};
+
+/// \brief Function object comparing two values of type \p T with operator<=.
+template<class T = void>
+struct less_equal
+{
+    /// \brief Returns true when \p lhs <= \p rhs.
+    ROCPRIM_HOST_DEVICE inline
+    constexpr bool operator()(const T& lhs, const T& rhs) const
+    {
+        return lhs <= rhs;
+    }
+};
+
+/// \brief Specialization of less_equal that deduces the operand types at the call.
+template<>
+struct less_equal<void>
+{
+    /// \brief Returns true when \p a <= \p b.
+    ///
+    /// The two operand types are deduced independently, so mixed-type
+    /// comparisons are accepted in the same way as by less<void>.
+    template<class T, class U>
+    ROCPRIM_HOST_DEVICE inline
+    constexpr bool operator()(const T& a, const U& b) const
+    {
+        return a <= b;
+    }
+};
+
+/// \brief Function object comparing two values of type \p T with operator>.
+template<class T = void>
+struct greater
+{
+    /// \brief Returns true when \p lhs > \p rhs.
+    ROCPRIM_HOST_DEVICE inline
+    constexpr bool operator()(const T& lhs, const T& rhs) const
+    {
+        return lhs > rhs;
+    }
+};
+
+/// \brief Specialization of greater that deduces the operand types at the call.
+template<>
+struct greater<void>
+{
+    /// \brief Returns true when \p a > \p b.
+    ///
+    /// The two operand types are deduced independently, so mixed-type
+    /// comparisons are accepted in the same way as by less<void>.
+    template<class T, class U>
+    ROCPRIM_HOST_DEVICE inline
+    constexpr bool operator()(const T& a, const U& b) const
+    {
+        return a > b;
+    }
+};
+
+/// \brief Function object comparing two values of type \p T with operator>=.
+template<class T = void>
+struct greater_equal
+{
+    /// \brief Returns true when \p lhs >= \p rhs.
+    ROCPRIM_HOST_DEVICE inline
+    constexpr bool operator()(const T& lhs, const T& rhs) const
+    {
+        return lhs >= rhs;
+    }
+};
+
+/// \brief Specialization of greater_equal that deduces the operand types at the call.
+template<>
+struct greater_equal<void>
+{
+    /// \brief Returns true when \p a >= \p b.
+    ///
+    /// The two operand types are deduced independently, so mixed-type
+    /// comparisons are accepted in the same way as by less<void>.
+    template<class T, class U>
+    ROCPRIM_HOST_DEVICE inline
+    constexpr bool operator()(const T& a, const U& b) const
+    {
+        return a >= b;
+    }
+};
+
+/// \brief Function object comparing two values of type \p T with operator==.
+template<class T = void>
+struct equal_to
+{
+    /// \brief Returns true when \p lhs == \p rhs.
+    ROCPRIM_HOST_DEVICE inline
+    constexpr bool operator()(const T& lhs, const T& rhs) const
+    {
+        return lhs == rhs;
+    }
+};
+
+/// \brief Specialization of equal_to that deduces the operand types at the call.
+template<>
+struct equal_to<void>
+{
+    /// \brief Returns true when \p a == \p b.
+    ///
+    /// The two operand types are deduced independently, so mixed-type
+    /// comparisons are accepted in the same way as by less<void>.
+    template<class T, class U>
+    ROCPRIM_HOST_DEVICE inline
+    constexpr bool operator()(const T& a, const U& b) const
+    {
+        return a == b;
+    }
+};
+
+/// \brief Function object comparing two values of type \p T with operator!=.
+template<class T = void>
+struct not_equal_to
+{
+    /// \brief Returns true when \p lhs != \p rhs.
+    ROCPRIM_HOST_DEVICE inline
+    constexpr bool operator()(const T& lhs, const T& rhs) const
+    {
+        return lhs != rhs;
+    }
+};
+
+/// \brief Specialization of not_equal_to that deduces the operand types at the call.
+template<>
+struct not_equal_to<void>
+{
+    /// \brief Returns true when \p a != \p b.
+    ///
+    /// The two operand types are deduced independently, so mixed-type
+    /// comparisons are accepted in the same way as by less<void>.
+    template<class T, class U>
+    ROCPRIM_HOST_DEVICE inline
+    constexpr bool operator()(const T& a, const U& b) const
+    {
+        return a != b;
+    }
+};
+
+/// \brief Function object adding two values of type \p T.
+template<class T = void>
+struct plus
+{
+    /// \brief Returns \p lhs + \p rhs converted to \p T.
+    ROCPRIM_HOST_DEVICE inline
+    constexpr T operator()(const T& lhs, const T& rhs) const
+    {
+        return lhs + rhs;
+    }
+};
+
+/// \brief Specialization of plus that deduces the operand type at the call.
+template<>
+struct plus<void>
+{
+    /// \brief Returns \p lhs + \p rhs; both operands share the deduced type, which is also the result type.
+    template <typename Value>
+    ROCPRIM_HOST_DEVICE inline
+    constexpr Value operator()(const Value& lhs, const Value& rhs) const
+    {
+        return lhs + rhs;
+    }
+};
+
+/// \brief Function object subtracting two values of type \p T.
+template<class T = void>
+struct minus
+{
+    /// \brief Returns \p lhs - \p rhs converted to \p T.
+    ROCPRIM_HOST_DEVICE inline
+    constexpr T operator()(const T& lhs, const T& rhs) const
+    {
+        return lhs - rhs;
+    }
+};
+
+/// \brief Specialization of minus that deduces the operand type at the call.
+template<>
+struct minus<void>
+{
+    /// \brief Returns \p lhs - \p rhs; both operands share the deduced type, which is also the result type.
+    template <typename Value>
+    ROCPRIM_HOST_DEVICE inline
+    constexpr Value operator()(const Value& lhs, const Value& rhs) const
+    {
+        return lhs - rhs;
+    }
+};
+
+/// \brief Function object multiplying two values of type \p T.
+template<class T = void>
+struct multiplies
+{
+    /// \brief Returns \p lhs * \p rhs converted to \p T.
+    ROCPRIM_HOST_DEVICE inline
+    constexpr T operator()(const T& lhs, const T& rhs) const
+    {
+        return lhs * rhs;
+    }
+};
+
+/// \brief Specialization of multiplies that deduces the operand type at the call.
+template<>
+struct multiplies<void>
+{
+    /// \brief Returns \p lhs * \p rhs; both operands share the deduced type, which is also the result type.
+    template <typename Value>
+    ROCPRIM_HOST_DEVICE inline
+    constexpr Value operator()(const Value& lhs, const Value& rhs) const
+    {
+        return lhs * rhs;
+    }
+};
+
+/// \brief Function object selecting the larger of two values of type \p T.
+template<class T = void>
+struct maximum
+{
+    /// \brief Returns \p rhs when \p lhs < \p rhs, otherwise \p lhs.
+    ROCPRIM_HOST_DEVICE inline
+    constexpr T operator()(const T& lhs, const T& rhs) const
+    {
+        return (lhs < rhs) ? rhs : lhs;
+    }
+};
+
+/// \brief Specialization of maximum that deduces the operand type at the call.
+template<>
+struct maximum<void>
+{
+    /// \brief Returns \p rhs when \p lhs < \p rhs, otherwise \p lhs.
+    template <typename Value>
+    ROCPRIM_HOST_DEVICE inline
+    constexpr Value operator()(const Value& lhs, const Value& rhs) const
+    {
+        return (lhs < rhs) ? rhs : lhs;
+    }
+};
+
+/// \brief Function object selecting the smaller of two values of type \p T.
+template<class T = void>
+struct minimum
+{
+    /// \brief Returns \p lhs when \p lhs < \p rhs, otherwise \p rhs.
+    ROCPRIM_HOST_DEVICE inline
+    constexpr T operator()(const T& lhs, const T& rhs) const
+    {
+        return (lhs < rhs) ? lhs : rhs;
+    }
+};
+
+/// \brief Specialization of minimum that deduces the operand type at the call.
+template<>
+struct minimum<void>
+{
+    /// \brief Returns \p lhs when \p lhs < \p rhs, otherwise \p rhs.
+    template <typename Value>
+    ROCPRIM_HOST_DEVICE inline
+    constexpr Value operator()(const Value& lhs, const Value& rhs) const
+    {
+        return (lhs < rhs) ? lhs : rhs;
+    }
+};
+
+/// \brief Function object returning its argument of type \p T unchanged.
+template<class T = void>
+struct identity
+{
+    /// \brief Returns a copy of \p value.
+    ROCPRIM_HOST_DEVICE inline
+    constexpr T operator()(const T& value) const
+    {
+        return value;
+    }
+};
+
+/// \brief Specialization of identity that deduces the argument type at the call.
+template<>
+struct identity<void>
+{
+    /// \brief Returns a copy of \p value.
+    template <typename Value>
+    ROCPRIM_HOST_DEVICE inline
+    constexpr Value operator()(const Value& value) const
+    {
+        return value;
+    }
+};
+
+/**
+ * \brief Statically determine log2(N), rounded up.
+ *
+ * This primary template is the inductive step: each instantiation shifts
+ * \p Remaining right by one bit and increments \p Steps. The recursion ends
+ * in the partial specialization for a remaining value of zero.
+ *
+ * For example:
+ *     Log2<8>::VALUE   // 3
+ *     Log2<3>::VALUE   // 2
+ */
+template <int N, int Remaining = N, int Steps = 0>
+struct Log2
+{
+    /// Static logarithm value
+    enum { VALUE = Log2<N, (Remaining >> 1), Steps + 1>::VALUE };
+};
+
+#ifndef DOXYGEN_SHOULD_SKIP_THIS    // Do not document
+
+// Base case of Log2, selected once the shifted value has reached zero.
+// Steps holds the number of shifts performed; the result is Steps when
+// 2^(Steps - 1) is still below N, and Steps - 1 otherwise.
+template <int N, int Steps>
+struct Log2<N, 0, Steps>
+{
+    enum {VALUE = ((1 << (Steps - 1)) < N) ? Steps : Steps - 1 };
+};
+
+#endif // DOXYGEN_SHOULD_SKIP_THIS
+
+/******************************************************************************
+ * Conditional types
+ ******************************************************************************/
+
+/**
+ * \brief Type equality test
+ *
+ * Primary template, used when the two types differ: VALUE is 0 and NEGATE is 1.
+ */
+template <typename First, typename Second>
+struct Equals
+{
+    enum {
+        VALUE = 0,
+        NEGATE = 1
+    };
+};
+
+#ifndef DOXYGEN_SHOULD_SKIP_THIS    // Do not document
+
+// Specialization chosen when both type arguments are the same type:
+// VALUE is 1 and NEGATE is 0.
+template <typename Same>
+struct Equals <Same, Same>
+{
+    enum {
+        VALUE = 1,
+        NEGATE = 0
+    };
+};
+
+#endif // DOXYGEN_SHOULD_SKIP_THIS
+
+/// \brief Wraps the compile-time integer \p Constant in a distinct type and
+/// exposes it as the enumerator VALUE.
+template <int Constant>
+struct Int2Type
+{
+   enum {VALUE = Constant};
+};
+
+/// @}
+// end of group utilsmodule_functional
+
+END_ROCPRIM_NAMESPACE
+
+#endif // ROCPRIM_FUNCTIONAL_HPP_
--- a/3rdparty/cub/rocprim/intrinsics.hpp
+++ b/3rdparty/cub/rocprim/intrinsics.hpp
+// Copyright (c) 2017-2020 Advanced Micro Devices, Inc. All rights reserved.
+//
+// Permission is hereby granted, free of charge, to any person obtaining a copy
+// of this software and associated documentation files (the "Software"), to deal
+// in the Software without restriction, including without limitation the rights
+// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+// copies of the Software, and to permit persons to whom the Software is
+// furnished to do so, subject to the following conditions:
+//
+// The above copyright notice and this permission notice shall be included in
+// all copies or substantial portions of the Software.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+// THE SOFTWARE.
+
+#ifndef ROCPRIM_INTRINSICS_HPP_
+#define ROCPRIM_INTRINSICS_HPP_
+
+// Meta configuration for rocPRIM
+#include "config.hpp"
+
+#include "intrinsics/atomic.hpp"
+#include "intrinsics/bit.hpp"
+#include "intrinsics/thread.hpp"
+#include "intrinsics/warp.hpp"
+#include "intrinsics/warp_shuffle.hpp"
+
+#endif // ROCPRIM_INTRINSICS_HPP_
--- a/3rdparty/cub/rocprim/intrinsics/atomic.hpp
+++ b/3rdparty/cub/rocprim/intrinsics/atomic.hpp
+// Copyright (c) 2017-2022 Advanced Micro Devices, Inc. All rights reserved.
+//
+// Permission is hereby granted, free of charge, to any person obtaining a copy
+// of this software and associated documentation files (the "Software"), to deal
+// in the Software without restriction, including without limitation the rights
+// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+// copies of the Software, and to permit persons to whom the Software is
+// furnished to do so, subject to the following conditions:
+//
+// The above copyright notice and this permission notice shall be included in
+// all copies or substantial portions of the Software.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+// THE SOFTWARE.
+
+#ifndef ROCPRIM_INTRINSICS_ATOMIC_HPP_
+#define ROCPRIM_INTRINSICS_ATOMIC_HPP_
+
+#include "../config.hpp"
+
+BEGIN_ROCPRIM_NAMESPACE
+
+namespace detail
+{
+    /// \brief Forwards to ::atomicAdd for unsigned int and returns its result.
+    ROCPRIM_DEVICE ROCPRIM_INLINE
+    unsigned int atomic_add(unsigned int * address, unsigned int value)
+    {
+        const unsigned int result = ::atomicAdd(address, value);
+        return result;
+    }
+
+    /// \brief Forwards to ::atomicAdd for int and returns its result.
+    ROCPRIM_DEVICE ROCPRIM_INLINE
+    int atomic_add(int * address, int value)
+    {
+        const int result = ::atomicAdd(address, value);
+        return result;
+    }
+
+    /// \brief Forwards to ::atomicAdd for float and returns its result.
+    ROCPRIM_DEVICE ROCPRIM_INLINE
+    float atomic_add(float * address, float value)
+    {
+        const float result = ::atomicAdd(address, value);
+        return result;
+    }
+
+    /// \brief Forwards to ::atomicAdd for unsigned long long and returns its result.
+    ROCPRIM_DEVICE ROCPRIM_INLINE
+    unsigned long long atomic_add(unsigned long long * address, unsigned long long value)
+    {
+        const unsigned long long result = ::atomicAdd(address, value);
+        return result;
+    }
+
+    /// \brief Forwards to ::atomicInc and returns its result.
+    ROCPRIM_DEVICE ROCPRIM_INLINE
+    unsigned int atomic_wrapinc(unsigned int * address, unsigned int value)
+    {
+        const unsigned int result = ::atomicInc(address, value);
+        return result;
+    }
+
+    /// \brief Forwards to ::atomicExch for unsigned int and returns its result.
+    ROCPRIM_DEVICE ROCPRIM_INLINE
+    unsigned int atomic_exch(unsigned int * address, unsigned int value)
+    {
+        const unsigned int result = ::atomicExch(address, value);
+        return result;
+    }
+
+    /// \brief Forwards to ::atomicExch for unsigned long long and returns its result.
+    ROCPRIM_DEVICE ROCPRIM_INLINE
+    unsigned long long atomic_exch(unsigned long long * address, unsigned long long value)
+    {
+        const unsigned long long result = ::atomicExch(address, value);
+        return result;
+    }
+}
+
+END_ROCPRIM_NAMESPACE
+
+#endif // ROCPRIM_INTRINSICS_ATOMIC_HPP_
--- a/3rdparty/cub/rocprim/intrinsics/bit.hpp
+++ b/3rdparty/cub/rocprim/intrinsics/bit.hpp
+// Copyright (c) 2017-2021 Advanced Micro Devices, Inc. All rights reserved.
+//
+// Permission is hereby granted, free of charge, to any person obtaining a copy
+// of this software and associated documentation files (the "Software"), to deal
+// in the Software without restriction, including without limitation the rights
+// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+// copies of the Software, and to permit persons to whom the Software is
+// furnished to do so, subject to the following conditions:
+//
+// The above copyright notice and this permission notice shall be included in
+// all copies or substantial portions of the Software.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+// THE SOFTWARE.
+
+#ifndef ROCPRIM_INTRINSICS_BIT_HPP_
+#define ROCPRIM_INTRINSICS_BIT_HPP_
+
+#include "../config.hpp"
+
+BEGIN_ROCPRIM_NAMESPACE
+
+/// \addtogroup intrinsicsmodule
+/// @{
+
+/// \brief Returns the bit of \p x at position \p i (0 or 1).
+ROCPRIM_DEVICE ROCPRIM_INLINE
+int get_bit(int x, int i)
+{
+    // Move the requested bit to position 0, then mask everything else off.
+    const int shifted = x >> i;
+    return shifted & 1;
+}
+
+/// \brief Bit count
+///
+/// Returns the number of bits set in \p x, as computed by __popc.
+ROCPRIM_DEVICE ROCPRIM_INLINE
+unsigned int bit_count(unsigned int x)
+{
+    return static_cast<unsigned int>(__popc(x));
+}
+
+/// \brief Bit count
+///
+/// Returns the number of bits set in \p x, as computed by __popcll.
+ROCPRIM_DEVICE ROCPRIM_INLINE
+unsigned int bit_count(unsigned long long x)
+{
+    return static_cast<unsigned int>(__popcll(x));
+}
+
+/// @}
+// end of group intrinsicsmodule
+
+END_ROCPRIM_NAMESPACE
+
+#endif // ROCPRIM_INTRINSICS_BIT_HPP_
--- a/3rdparty/cub/rocprim/intrinsics/thread.hpp
+++ b/3rdparty/cub/rocprim/intrinsics/thread.hpp
+// Copyright (c) 2017-2021 Advanced Micro Devices, Inc. All rights reserved.
+//
+// Permission is hereby granted, free of charge, to any person obtaining a copy
+// of this software and associated documentation files (the "Software"), to deal
+// in the Software without restriction, including without limitation the rights
+// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+// copies of the Software, and to permit persons to whom the Software is
+// furnished to do so, subject to the following conditions:
+//
+// The above copyright notice and this permission notice shall be included in
+// all copies or substantial portions of the Software.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+// THE SOFTWARE.
+
+#ifndef ROCPRIM_INTRINSICS_THREAD_HPP_
+#define ROCPRIM_INTRINSICS_THREAD_HPP_
+
+#include <atomic>
+
+#include "../config.hpp"
+#include "../detail/various.hpp"
+
+BEGIN_ROCPRIM_NAMESPACE
+
+/// \addtogroup intrinsicsmodule
+/// @{
+
+// Sizes
+
+/// \brief [DEPRECATED] Returns the number of threads in a hardware warp.
+///
+/// It is constant for a device.
+/// This function is not supported for the gfx1030 architecture and will be removed in a future release.
+/// Please use the new host_warp_size() and device_warp_size() functions.
+///
+/// NOTE(review): the body returns the built-in \c warpSize from a constexpr
+/// host/device function; confirm the toolchain accepts this on the host side.
+ROCPRIM_HOST_DEVICE inline
+constexpr unsigned int warp_size()
+{
+    return warpSize;
+}
+
+/// \brief Returns the number of threads in a hardware warp for the current device.
+/// On the host side this value is available at run time only.
+///
+/// It is constant for a device.
+///
+/// The current device is obtained with cudaGetDevice and its properties with
+/// cudaGetDeviceProperties. If either call fails, the function returns
+/// static_cast<unsigned int>(-1), the same sentinel value as before; the
+/// device properties are no longer queried when the device id itself could
+/// not be obtained.
+ROCPRIM_HOST inline
+unsigned int host_warp_size()
+{
+    const unsigned int failure = static_cast<unsigned int>(-1);
+
+    int current_device = 0;
+    cudaError_t status = cudaGetDevice(&current_device);
+    if(status != cudaSuccess)
+    {
+        // Without a valid device id the property query below is meaningless.
+        return failure;
+    }
+
+    cudaDeviceProp device_prop;
+    status = cudaGetDeviceProperties(&device_prop, current_device);
+    if(status != cudaSuccess)
+    {
+        return failure;
+    }
+
+    return device_prop.warpSize;
+}
+
+/// \brief Returns the number of threads in a hardware warp for the actual target.
+/// On the device side this constant is available at compile time.
+///
+/// It is constant for a device. The value is the built-in \c warpSize.
+ROCPRIM_DEVICE ROCPRIM_INLINE
+constexpr unsigned int device_warp_size()
+{
+    return warpSize;
+}
+
+/// \brief Returns the flat size of a multidimensional block (tile):
+/// the product of the block's x, y and z extents.
+ROCPRIM_DEVICE ROCPRIM_INLINE
+unsigned int flat_block_size()
+{
+    return blockDim.x * blockDim.y * blockDim.z;
+}
+
+/// \brief Returns the flat size of a multidimensional tile (block).
+///
+/// Alias of flat_block_size().
+ROCPRIM_DEVICE ROCPRIM_INLINE
+unsigned int flat_tile_size()
+{
+    return flat_block_size();
+}
+
+// IDs
+
+/// \brief Returns the thread identifier within a warp.
+///
+/// Unless __HIP_CPU_RT__ is defined, the value comes from ::__lane_id().
+/// With __HIP_CPU_RT__ defined, it is the current fiber's id modulo warpSize.
+ROCPRIM_DEVICE ROCPRIM_INLINE
+unsigned int lane_id()
+{
+#ifndef __HIP_CPU_RT__
+    return ::__lane_id();
+#else
+    using namespace hip::detail;
+    return id(Fiber::this_fiber()) % warpSize;
+#endif
+}
+
+/// \brief Returns the flat (linear, 1D) thread identifier in a multidimensional block (tile).
+///
+/// x is the fastest-varying dimension, followed by y and then z.
+ROCPRIM_DEVICE ROCPRIM_INLINE
+unsigned int flat_block_thread_id()
+{
+    const unsigned int plane_offset = threadIdx.z * blockDim.y * blockDim.x;
+    const unsigned int row_offset   = threadIdx.y * blockDim.x;
+    return plane_offset + row_offset + threadIdx.x;
+}
+
+/// \brief Returns the flat (linear, 1D) thread identifier in a multidimensional block (tile).
+/// Use the template parameters to optimize 1D or 2D kernels.
+///
+/// This overload is enabled when BlockSizeY and BlockSizeZ are both 1, in
+/// which case the flat id is simply the x index.
+template<unsigned int BlockSizeX, unsigned int BlockSizeY, unsigned int BlockSizeZ>
+ROCPRIM_DEVICE ROCPRIM_INLINE
+auto flat_block_thread_id()
+    -> typename std::enable_if<(BlockSizeY == 1 && BlockSizeZ == 1), unsigned int>::type
+{
+    return threadIdx.x;
+}
+
+/// \brief Flat thread identifier for 2D blocks: enabled when BlockSizeY > 1
+/// and BlockSizeZ == 1, so the z term is left out.
+template<unsigned int BlockSizeX, unsigned int BlockSizeY, unsigned int BlockSizeZ>
+ROCPRIM_DEVICE ROCPRIM_INLINE
+auto flat_block_thread_id()
+    -> typename std::enable_if<(BlockSizeY > 1 && BlockSizeZ == 1), unsigned int>::type
+{
+    const unsigned int row_offset = threadIdx.y * blockDim.x;
+    return row_offset + threadIdx.x;
+}
+
+/// \brief Flat thread identifier for 3D blocks: enabled whenever BlockSizeZ > 1.
+///
+/// The overload previously also required BlockSizeY > 1, which left block
+/// shapes with BlockSizeY == 1 and BlockSizeZ > 1 without any viable
+/// overload. The full x/y/z formula is valid for every BlockSizeY, so the
+/// condition now depends on BlockSizeZ only. The other two overloads require
+/// BlockSizeZ == 1, so the three remain mutually exclusive.
+template<unsigned int BlockSizeX, unsigned int BlockSizeY, unsigned int BlockSizeZ>
+ROCPRIM_DEVICE ROCPRIM_INLINE
+auto flat_block_thread_id()
+    -> typename std::enable_if<(BlockSizeZ > 1), unsigned int>::type
+{
+    return threadIdx.x + (threadIdx.y * blockDim.x) +
+           (threadIdx.z * blockDim.y * blockDim.x);
+}
+
+/// \brief Returns the flat (linear, 1D) thread identifier in a multidimensional tile (block).
+///
+/// Alias of the non-template flat_block_thread_id().
+ROCPRIM_DEVICE ROCPRIM_INLINE
+unsigned int flat_tile_thread_id()
+{
+    return flat_block_thread_id();
+}
+
+/// \brief Returns the warp id in a block (tile): the flat thread id divided
+/// by the device warp size.
+ROCPRIM_DEVICE ROCPRIM_INLINE
+unsigned int warp_id()
+{
+    const unsigned int thread_id = flat_block_thread_id();
+    return thread_id / device_warp_size();
+}
+
+/// \brief Returns the warp id for the given flat thread id \p flat_id:
+/// \p flat_id divided by the device warp size.
+ROCPRIM_DEVICE ROCPRIM_INLINE
+unsigned int warp_id(unsigned int flat_id)
+{
+    const unsigned int threads_per_warp = device_warp_size();
+    return flat_id / threads_per_warp;
+}
+
+/// \brief Returns the warp id in a block (tile). Use the template parameters
+/// to optimize 1D or 2D kernels.
+///
+/// The flat thread id comes from the templated flat_block_thread_id overload
+/// for the given block shape.
+template<unsigned int BlockSizeX, unsigned int BlockSizeY, unsigned int BlockSizeZ>
+ROCPRIM_DEVICE ROCPRIM_INLINE
+unsigned int warp_id()
+{
+    const unsigned int thread_id = flat_block_thread_id<BlockSizeX, BlockSizeY, BlockSizeZ>();
+    return thread_id / device_warp_size();
+}
+
+/// \brief Returns the flat (linear, 1D) block identifier in a multidimensional grid.
+///
+/// x is the fastest-varying dimension, followed by y and then z.
+ROCPRIM_DEVICE ROCPRIM_INLINE
+unsigned int flat_block_id()
+{
+    const unsigned int plane_offset = blockIdx.z * gridDim.y * gridDim.x;
+    const unsigned int row_offset   = blockIdx.y * gridDim.x;
+    return plane_offset + row_offset + blockIdx.x;
+}
+
+/// \brief Flat block identifier, enabled when BlockSizeY and BlockSizeZ are
+/// both 1; only the x block index is used.
+template<unsigned int BlockSizeX, unsigned int BlockSizeY, unsigned int BlockSizeZ>
+ROCPRIM_DEVICE ROCPRIM_INLINE
+auto flat_block_id()
+    -> typename std::enable_if<(BlockSizeY == 1 && BlockSizeZ == 1), unsigned int>::type
+{
+    return blockIdx.x;
+}
+
+/// \brief Flat block identifier, enabled when BlockSizeY > 1 and
+/// BlockSizeZ == 1; the z term is left out.
+template<unsigned int BlockSizeX, unsigned int BlockSizeY, unsigned int BlockSizeZ>
+ROCPRIM_DEVICE ROCPRIM_INLINE
+auto flat_block_id()
+    -> typename std::enable_if<(BlockSizeY > 1 && BlockSizeZ == 1), unsigned int>::type
+{
+    const unsigned int row_offset = blockIdx.y * gridDim.x;
+    return row_offset + blockIdx.x;
+}
+
+/// \brief Flat block identifier using all three grid dimensions: enabled
+/// whenever BlockSizeZ > 1.
+///
+/// The overload previously also required BlockSizeY > 1, which left shapes
+/// with BlockSizeY == 1 and BlockSizeZ > 1 without any viable overload. The
+/// full x/y/z formula is valid for every BlockSizeY, so the condition now
+/// depends on BlockSizeZ only. The other two overloads require
+/// BlockSizeZ == 1, so the three remain mutually exclusive.
+template<unsigned int BlockSizeX, unsigned int BlockSizeY, unsigned int BlockSizeZ>
+ROCPRIM_DEVICE ROCPRIM_INLINE
+auto flat_block_id()
+    -> typename std::enable_if<(BlockSizeZ > 1), unsigned int>::type
+{
+    return blockIdx.x + (blockIdx.y * gridDim.x) +
+           (blockIdx.z * gridDim.y * gridDim.x);
+}
+
+// Sync
+
+/// \brief Synchronizes all threads in a block (tile).
+///
+/// Thin wrapper around __syncthreads().
+ROCPRIM_DEVICE ROCPRIM_INLINE
+void syncthreads()
+{
+    __syncthreads();
+}
+
+/// \brief All lanes in a wave come to the convergence point simultaneously
+/// with SIMT, thus no special instruction is needed in the ISA.
+///
+/// Calls the compiler builtin __builtin_amdgcn_wave_barrier().
+/// NOTE(review): this is an AMD GPU builtin; confirm the target toolchain
+/// provides it.
+ROCPRIM_DEVICE ROCPRIM_INLINE
+void wave_barrier()
+{
+    __builtin_amdgcn_wave_barrier();
+}
+
+namespace detail
+{
+    /// \brief Returns thread identifier in a multidimensional block (tile) by dimension.
+    ///
+    /// This primary template is only instantiated for values of \p Dim without
+    /// an explicit specialization, i.e. for invalid dimensions. The assertion
+    /// was inverted (Dim > 2), so it passed exactly for those invalid values
+    /// and the function silently returned 0; it now rejects them at compile time.
+    template<unsigned int Dim>
+    ROCPRIM_DEVICE ROCPRIM_INLINE
+    unsigned int block_thread_id()
+    {
+        static_assert(Dim <= 2, "Dim must be 0, 1 or 2");
+        // dummy return, correct values handled by specializations
+        return 0;
+    }
+
+    /// \brief Returns block identifier in a multidimensional grid by dimension.
+    ///
+    /// This primary template is only a fallback: the valid dimensions 0, 1 and 2
+    /// are served by explicit specializations. Instantiating it with any other
+    /// \p Dim is a compile-time error.
+    ///
+    /// \tparam Dim - dimension index (0 = x, 1 = y, 2 = z).
+    template<unsigned int Dim>
+    ROCPRIM_DEVICE ROCPRIM_INLINE
+    unsigned int block_id()
+    {
+        // Only an out-of-range Dim reaches this body, so the assertion must
+        // state the valid range; it then fails for exactly those values.
+        static_assert(Dim <= 2, "Dim must be 0, 1 or 2");
+        // dummy return, correct values handled by specializations
+        return 0;
+    }
+
+    /// \brief Returns block size in a multidimensional grid by dimension.
+    ///
+    /// This primary template is only a fallback: the valid dimensions 0, 1 and 2
+    /// are served by explicit specializations. Instantiating it with any other
+    /// \p Dim is a compile-time error.
+    ///
+    /// \tparam Dim - dimension index (0 = x, 1 = y, 2 = z).
+    template<unsigned int Dim>
+    ROCPRIM_DEVICE ROCPRIM_INLINE
+    unsigned int block_size()
+    {
+        // Only an out-of-range Dim reaches this body, so the assertion must
+        // state the valid range; it then fails for exactly those values.
+        static_assert(Dim <= 2, "Dim must be 0, 1 or 2");
+        // dummy return, correct values handled by specializations
+        return 0;
+    }
+
+    /// \brief Returns grid size by dimension.
+    ///
+    /// This primary template is only a fallback: the valid dimensions 0, 1 and 2
+    /// are served by explicit specializations. Instantiating it with any other
+    /// \p Dim is a compile-time error.
+    ///
+    /// \tparam Dim - dimension index (0 = x, 1 = y, 2 = z).
+    template<unsigned int Dim>
+    ROCPRIM_DEVICE ROCPRIM_INLINE
+    unsigned int grid_size()
+    {
+        // Only an out-of-range Dim reaches this body, so the assertion must
+        // state the valid range; it then fails for exactly those values.
+        static_assert(Dim <= 2, "Dim must be 0, 1 or 2");
+        // dummy return, correct values handled by specializations
+        return 0;
+    }
+
+    // Helper macros that stamp out the explicit specializations (Dim = 0, 1, 2)
+    // of block_thread_id, block_id, block_size and grid_size. All three macros
+    // are #undef'd again right after use.
+
+    // Places A and B next to each other; with A = "threadIdx." and B = "x" the
+    // expansion is the member access "threadIdx. x".
+    #define ROCPRIM_DETAIL_CONCAT(A, B) A B
+    // Defines the specialization name<dim>() that returns the builtin member
+    // formed from prefix and suffix.
+    #define ROCPRIM_DETAIL_DEFINE_HIP_API_ID_FUNC(name, prefix, dim, suffix) \
+        template<> \
+        ROCPRIM_DEVICE ROCPRIM_INLINE \
+        unsigned int name<dim>() \
+        { \
+            return ROCPRIM_DETAIL_CONCAT(prefix, suffix); \
+        }
+    // Defines the three specializations of name: 0 -> x, 1 -> y, 2 -> z.
+    #define ROCPRIM_DETAIL_DEFINE_HIP_API_ID_FUNCS(name, prefix) \
+        ROCPRIM_DETAIL_DEFINE_HIP_API_ID_FUNC(name, prefix, 0, x) \
+        ROCPRIM_DETAIL_DEFINE_HIP_API_ID_FUNC(name, prefix, 1, y) \
+        ROCPRIM_DETAIL_DEFINE_HIP_API_ID_FUNC(name, prefix, 2, z)
+
+    // Each accessor is bound to the builtin variable it reads.
+    ROCPRIM_DETAIL_DEFINE_HIP_API_ID_FUNCS(block_thread_id, threadIdx.)
+    ROCPRIM_DETAIL_DEFINE_HIP_API_ID_FUNCS(block_id, blockIdx.)
+    ROCPRIM_DETAIL_DEFINE_HIP_API_ID_FUNCS(block_size, blockDim.)
+    ROCPRIM_DETAIL_DEFINE_HIP_API_ID_FUNCS(grid_size, gridDim.)
+
+    // Keep the helper macros from leaking out of this header.
+    #undef ROCPRIM_DETAIL_DEFINE_HIP_API_ID_FUNCS
+    #undef ROCPRIM_DETAIL_DEFINE_HIP_API_ID_FUNC
+    #undef ROCPRIM_DETAIL_CONCAT
+
+    // Returns the lane index of the calling thread inside a "logical warp",
+    // which may be smaller than the hardware warp. This overload is chosen
+    // when LogicalWarpSize is a power of two.
+    template<unsigned int LogicalWarpSize>
+    ROCPRIM_DEVICE ROCPRIM_INLINE
+    auto logical_lane_id()
+        -> typename std::enable_if<detail::is_power_of_two(LogicalWarpSize), unsigned int>::type
+    {
+        // For a power-of-two size, masking the low bits equals
+        // lane_id() % LogicalWarpSize.
+        constexpr unsigned int low_bits = LogicalWarpSize - 1;
+        return lane_id() & low_bits;
+    }
+
+    // Overload chosen when LogicalWarpSize is not a power of two: the lane
+    // index inside the logical warp is the remainder of the hardware lane id.
+    template<unsigned int LogicalWarpSize>
+    ROCPRIM_DEVICE ROCPRIM_INLINE
+    auto logical_lane_id()
+        -> typename std::enable_if<!detail::is_power_of_two(LogicalWarpSize), unsigned int>::type
+    {
+        const auto hardware_lane = lane_id();
+        return hardware_lane % LogicalWarpSize;
+    }
+
+    // Specialization for a logical warp as wide as the hardware warp: the
+    // hardware lane id is returned unchanged.
+    template<>
+    ROCPRIM_DEVICE ROCPRIM_INLINE
+    unsigned int logical_lane_id<device_warp_size()>()
+    {
+        const unsigned int hardware_lane = lane_id();
+        return hardware_lane;
+    }
+
+    // Returns the index of the calling thread's "logical warp" inside the
+    // block: the flat thread id divided by LogicalWarpSize.
+    template<unsigned int LogicalWarpSize>
+    ROCPRIM_DEVICE ROCPRIM_INLINE
+    unsigned int logical_warp_id()
+    {
+        const auto flat_tid = flat_block_thread_id();
+        return flat_tid / LogicalWarpSize;
+    }
+
+    // Specialization for a logical warp as wide as the hardware warp: defers
+    // to warp_id().
+    template<>
+    ROCPRIM_DEVICE ROCPRIM_INLINE
+    unsigned int logical_warp_id<device_warp_size()>()
+    {
+        const unsigned int hardware_warp = warp_id();
+        return hardware_warp;
+    }
+
+    // Memory fence wrapper: forwards to the global-namespace
+    // __threadfence_system() intrinsic.
+    ROCPRIM_DEVICE ROCPRIM_INLINE
+    void memory_fence_system()
+    {
+        ::__threadfence_system();
+    }
+
+    // Memory fence wrapper: forwards to the global-namespace
+    // __threadfence_block() intrinsic.
+    ROCPRIM_DEVICE ROCPRIM_INLINE
+    void memory_fence_block()
+    {
+        ::__threadfence_block();
+    }
+
+    // Memory fence wrapper: forwards to the global-namespace
+    // __threadfence() intrinsic.
+    ROCPRIM_DEVICE ROCPRIM_INLINE
+    void memory_fence_device()
+    {
+        ::__threadfence();
+    }
+}
+
+/// @}
+// end of group intrinsicsmodule
+
+END_ROCPRIM_NAMESPACE
+
+#endif // ROCPRIM_INTRINSICS_THREAD_HPP_
--- a/3rdparty/cub/rocprim/intrinsics/warp.hpp
+++ b/3rdparty/cub/rocprim/intrinsics/warp.hpp
+// Copyright (c) 2017-2021 Advanced Micro Devices, Inc. All rights reserved.
+//
+// Permission is hereby granted, free of charge, to any person obtaining a copy
+// of this software and associated documentation files (the "Software"), to deal
+// in the Software without restriction, including without limitation the rights
+// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+// copies of the Software, and to permit persons to whom the Software is
+// furnished to do so, subject to the following conditions:
+//
+// The above copyright notice and this permission notice shall be included in
+// all copies or substantial portions of the Software.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+// THE SOFTWARE.
+
+#ifndef ROCPRIM_INTRINSICS_WARP_HPP_
+#define ROCPRIM_INTRINSICS_WARP_HPP_
+
+#include "../config.hpp"
+
+BEGIN_ROCPRIM_NAMESPACE
+
+/// \addtogroup intrinsicsmodule
+/// @{
+
+/// Evaluate predicate for all active work-items in the warp and return an integer
+/// whose <tt>i</tt>-th bit is set if and only if \p predicate is <tt>true</tt>
+/// for the <tt>i</tt>-th thread of the warp and the <tt>i</tt>-th thread is active.
+///
+/// \param predicate - input to be evaluated for all active lanes
+ROCPRIM_DEVICE ROCPRIM_INLINE
+lane_mask_type ballot(int predicate)
+{
+    // Collect the per-lane votes through the HIP __ballot intrinsic.
+    const lane_mask_type votes = ::__ballot(predicate);
+    return votes;
+}
+
+/// \brief Masked bit count
+///
+/// For each thread, this function returns the number of active threads which
+/// have <tt>i</tt>-th bit of \p x set and come before the current thread.
+// Implementation notes:
+//  * \p add is passed as the initial accumulator to the mbcnt intrinsics, so
+//    it is included in the returned count.
+//  * The code path is selected at preprocessing time: HIP-CPU runtime vs.
+//    device, 32-wide vs. wider wavefront, and __CUDACC__ vs. not.
+ROCPRIM_DEVICE ROCPRIM_INLINE
+unsigned int masked_bit_count(lane_mask_type x, unsigned int add = 0)
+{
+    // NOTE(review): the accumulator is a signed int although the function
+    // returns unsigned int; the value is converted on return.
+    int c;
+    #ifndef __HIP_CPU_RT__
+        #if __AMDGCN_WAVEFRONT_SIZE == 32
+            // 32-wide wavefront: a single "lo" count covers the whole mask.
+            #ifdef __CUDACC__
+            c = ::__builtin_amdgcn_mbcnt_lo(x, add);
+            #else
+            c = ::__mbcnt_lo(x, add);
+            #endif
+        #else
+            // Wider wavefront: count the low 32 bits first, then feed that
+            // result as the accumulator for the high 32 bits.
+            #ifdef __CUDACC__
+            c = ::__builtin_amdgcn_mbcnt_lo(static_cast<int>(x), add);
+            c = ::__builtin_amdgcn_mbcnt_hi(static_cast<int>(x >> 32), c);
+            #else
+            c = ::__mbcnt_lo(static_cast<int>(x), add);
+            c = ::__mbcnt_hi(static_cast<int>(x >> 32), c);
+            #endif
+        #endif
+    #else
+        // HIP-CPU runtime emulation.
+        // NOTE(review): the right shift by (warpSize - tidx) keeps the upper
+        // bits of x, and for tidx == 0 the shift amount equals warpSize.
+        // Confirm this matches the "bits before the current thread" contract.
+        using namespace hip::detail;
+        const auto tidx{id(Fiber::this_fiber()) % warpSize};
+        std::bitset<warpSize> bits{x >> (warpSize - tidx)};
+        c = static_cast<unsigned int>(bits.count()) + add;
+    #endif
+    return c;
+}
+
+namespace detail
+{
+
+// Returns the result of the warp-wide "any" vote on \p predicate.
+// On the device this forwards to the __any intrinsic; under the HIP-CPU
+// runtime the vote is emulated with a bitset kept in tile scratchpad storage.
+ROCPRIM_DEVICE ROCPRIM_INLINE
+int warp_any(int predicate)
+{
+#ifndef __HIP_CPU_RT__
+    return ::__any(predicate);
+#else
+    using namespace hip::detail;
+    // Slot of the calling fiber inside the bitset.
+    const auto tidx{id(Fiber::this_fiber()) % warpSize};
+    auto& lds{Tile::scratchpad<std::bitset<warpSize>, 1>()[0]};
+
+    // Publish this fiber's vote.
+    lds[tidx] = static_cast<bool>(predicate);
+
+    // Barrier on the current tile before the combined result is read.
+    barrier(Tile::this_tile());
+
+    return lds.any();
+#endif
+}
+
+// Returns the result of the warp-wide "all" vote on \p predicate.
+// On the device this forwards to the __all intrinsic; under the HIP-CPU
+// runtime the vote is emulated with a bitset kept in tile scratchpad storage.
+ROCPRIM_DEVICE ROCPRIM_INLINE
+int warp_all(int predicate)
+{
+#ifndef __HIP_CPU_RT__
+    return ::__all(predicate);
+#else
+    using namespace hip::detail;
+    // Slot of the calling fiber inside the bitset.
+    const auto tidx{id(Fiber::this_fiber()) % warpSize};
+    auto& lds{Tile::scratchpad<std::bitset<warpSize>, 1>()[0]};
+
+    // Publish this fiber's vote.
+    lds[tidx] = static_cast<bool>(predicate);
+
+    // Barrier on the current tile before the combined result is read.
+    barrier(Tile::this_tile());
+
+    return lds.all();
+#endif
+}
+
+} // end detail namespace
+
+/// @}
+// end of group intrinsicsmodule
+
+/**
+ * Compute a 32b mask of threads having the same least-significant
+ * LABEL_BITS of \p label as the calling thread.
+ *
+ * For every label bit the warp is split with ballot() into the lanes that have
+ * the bit set and the remaining lanes; the calling thread keeps the group it
+ * belongs to and intersects the groups of all bits.
+ *
+ * Note: the group of lanes with a cleared bit is the bitwise complement of the
+ * ballot result, so bits of lanes that did not take part in the ballot remain
+ * set in it.
+ *
+ * \tparam LABEL_BITS - number of low bits of \p label that are compared.
+ * \param label - value to compare between the threads of the warp.
+ * \return the intersection mask truncated to <tt>unsigned int</tt>. If
+ * LABEL_BITS <= 0 nothing is compared and all bits of the result are set.
+ */
+template <int LABEL_BITS>
+ROCPRIM_DEVICE ROCPRIM_INLINE
+unsigned int MatchAny(unsigned int label)
+{
+    // Start from "every lane matches" so the result is well defined even when
+    // the loop below does not execute.
+    unsigned int retval = ~0u;
+
+    // Extract masks of common threads for each bit
+    ROCPRIM_UNROLL
+    for (int BIT = 0; BIT < LABEL_BITS; ++BIT)
+    {
+        // Unsigned shift: a signed 1 << 31 would sign-extend when widened.
+        const unsigned int current_bit = 1u << BIT;
+        const bool bit_match = (label & current_bit) != 0;
+
+        // Lanes whose label has this bit set.
+        unsigned long long mask = ballot(bit_match);
+        if(!bit_match)
+        {
+            // Bitwise complement selects the lanes whose bit is clear; a
+            // logical '!' would collapse the whole mask to 0 or 1.
+            mask = ~mask;
+        }
+        // Remove peers who differ
+        retval &= static_cast<unsigned int>(mask);
+    }
+
+    return retval;
+
+}
+END_ROCPRIM_NAMESPACE
+
+#endif // ROCPRIM_INTRINSICS_WARP_HPP_
--- a/3rdparty/cub/rocprim/intrinsics/warp_shuffle.hpp
+++ b/3rdparty/cub/rocprim/intrinsics/warp_shuffle.hpp
+// Copyright (c) 2017-2022 Advanced Micro Devices, Inc. All rights reserved.
+//
+// Permission is hereby granted, free of charge, to any person obtaining a copy
+// of this software and associated documentation files (the "Software"), to deal
+// in the Software without restriction, including without limitation the rights
+// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+// copies of the Software, and to permit persons to whom the Software is
+// furnished to do so, subject to the following conditions:
+//
+// The above copyright notice and this permission notice shall be included in
+// all copies or substantial portions of the Software.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+// THE SOFTWARE.
+
+#ifndef ROCPRIM_INTRINSICS_WARP_SHUFFLE_HPP_
+#define ROCPRIM_INTRINSICS_WARP_SHUFFLE_HPP_
+
+#include <type_traits>
+
+#include "../config.hpp"
+#include "thread.hpp"
+
+/// \addtogroup warpmodule
+/// @{
+
+BEGIN_ROCPRIM_NAMESPACE
+
+namespace detail
+{
+
+#ifdef __HIP_CPU_RT__
+// TODO: consider adding macro checks relaying to std::bit_cast when compiled
+//       using C++20.
+// Reinterprets the object representation of src as a value of type To.
+// Participates in overload resolution only when both types have the same size
+// and are trivially copyable.
+template <class To, class From>
+typename std::enable_if<
+    sizeof(To) == sizeof(From) &&
+    std::is_trivially_copyable<From>::value &&
+    std::is_trivially_copyable<To>::value,
+    To>::type
+// not constexpr: that would need compiler support
+bit_cast(const From& src) noexcept
+{
+    To result;
+    std::memcpy(&result, &src, sizeof(To));
+    return result;
+}
+#endif
+
+// Applies op to every int-sized word of input and reassembles the results
+// into a T. This overload handles trivially copyable types whose size is a
+// multiple of sizeof(int), so the value can be bit-cast to an array of ints.
+template<class T, class ShuffleOp>
+ROCPRIM_DEVICE ROCPRIM_INLINE
+typename std::enable_if<std::is_trivially_copyable<T>::value && (sizeof(T) % sizeof(int) == 0), T>::type
+warp_shuffle_op(const T& input, ShuffleOp&& op)
+{
+    constexpr int word_count = (sizeof(T) + sizeof(int) - 1) / sizeof(int);
+
+    // Same size as T, viewed as a sequence of ints.
+    struct word_pack { int words[word_count]; };
+
+#ifdef __HIP_CPU_RT__
+    word_pack packed = bit_cast<word_pack>(input);
+#else
+    word_pack packed = __builtin_bit_cast(word_pack, input);
+#endif
+
+    ROCPRIM_UNROLL
+    for(int w = 0; w < word_count; w++)
+    {
+        const int moved = op(packed.words[w]);
+        packed.words[w] = moved;
+    }
+
+#ifdef __HIP_CPU_RT__
+    return bit_cast<T>(packed);
+#else
+    return __builtin_bit_cast(T, packed);
+#endif
+}
+
+// Applies op to every int-sized word of input and reassembles the results
+// into a T. This is the generic overload for types that are not trivially
+// copyable or whose size is not a multiple of sizeof(int): the value is
+// copied word by word, and the last word may be shorter than an int.
+//
+// \param input - value whose bytes are passed through op
+// \param op - callable taking and returning an int
+// \return value assembled from the transformed words
+template<class T, class ShuffleOp>
+ROCPRIM_DEVICE ROCPRIM_INLINE
+typename std::enable_if<!(std::is_trivially_copyable<T>::value && (sizeof(T) % sizeof(int) == 0)), T>::type
+warp_shuffle_op(const T& input, ShuffleOp&& op)
+{
+    constexpr int words_no = (sizeof(T) + sizeof(int) - 1) / sizeof(int);
+
+    T output;
+    ROCPRIM_UNROLL
+    for(int i = 0; i < words_no; i++)
+    {
+        // Number of valid bytes in this word (less than sizeof(int) only for
+        // a trailing partial word).
+        const size_t s = std::min(sizeof(int), sizeof(T) - i * sizeof(int));
+        // Zero-initialized so that op never reads indeterminate bytes when the
+        // copy below fills only part of the word.
+        int word = 0;
+#ifdef __HIP_CPU_RT__
+        std::memcpy(&word, reinterpret_cast<const char*>(&input) + i * sizeof(int), s);
+#else
+        __builtin_memcpy(&word, reinterpret_cast<const char*>(&input) + i * sizeof(int), s);
+#endif
+        word = op(word);
+        // Only the s valid bytes are written back, so output is never overrun.
+#ifdef __HIP_CPU_RT__
+        std::memcpy(reinterpret_cast<char*>(&output) + i * sizeof(int), &word, s);
+#else
+        __builtin_memcpy(reinterpret_cast<char*>(&output) + i * sizeof(int), &word, s);
+#endif
+    }
+
+    return output;
+
+}
+
+// Passes every int-sized word of input through the DPP move builtin, using the
+// control values given as template arguments, and returns the reassembled
+// value. Under the HIP-CPU runtime the words are returned unchanged.
+//
+// \tparam dpp_ctrl, row_mask, bank_mask, bound_ctrl - forwarded verbatim to
+// the builtin.
+// \param input - value to move
+template<class T, int dpp_ctrl, int row_mask = 0xf, int bank_mask = 0xf, bool bound_ctrl = false>
+ROCPRIM_DEVICE ROCPRIM_INLINE
+T warp_move_dpp(const T& input)
+{
+    return detail::warp_shuffle_op(
+        input,
+        [=](int v) -> int
+        {
+            // TODO: clean-up, this function activates based ROCPRIM_DETAIL_USE_DPP, however inclusion and
+            //       parsing of the template happens unconditionally. The condition causing compilation to
+            //       fail is ordinary host-compilers looking at the headers. Non-hipcc compilers don't define
+            //       __builtin_amdgcn_update_dpp, hence fail to parse the template altogether. (Except MSVC
+            //       because even using /permissive- they somehow still do delayed parsing of the body of
+            //       function templates, even though they pinky-swear they don't.)
+            // NOTE(review): the TODO above names __builtin_amdgcn_update_dpp, but the call below uses
+            //       __builtin_amdgcn_mov_dpp - confirm which builtin is intended.
+#if !defined(__HIP_CPU_RT__)
+            return ::__builtin_amdgcn_mov_dpp(v, dpp_ctrl, row_mask, bank_mask, bound_ctrl);
+#else
+            // HIP-CPU runtime: no cross-lane move, the word is returned as is.
+            return v;
+#endif
+        }
+    );
+}
+
+/// \brief Swizzle for any data type.
+///
+/// Each thread in warp obtains \p input from <tt>src_lane</tt>-th thread
+/// in warp, where <tt>src_lane</tt> is current lane with a <tt>mask</tt> applied.
+///
+/// \param input - input to pass to other threads
+template<class T, int mask>
+ROCPRIM_DEVICE ROCPRIM_INLINE
+T warp_swizzle(const T& input)
+{
+    // Swizzle applied to a single int-sized word of the value.
+    const auto swizzle_word = [=](int word) -> int
+    {
+        return ::__builtin_amdgcn_ds_swizzle(word, mask);
+    };
+    return detail::warp_shuffle_op(input, swizzle_word);
+}
+
+} // end namespace detail
+
+/// \brief Shuffle for any data type.
+///
+/// Each thread in warp obtains \p input from <tt>src_lane</tt>-th thread
+/// in warp. If \p width is less than device_warp_size() then each subsection of the
+/// warp behaves as a separate entity with a starting logical lane id of 0.
+/// If \p src_lane is not in [0; \p width) range, the returned value is
+/// equal to \p input passed by the <tt>src_lane modulo width</tt> thread.
+///
+/// Note: The optional \p width parameter must be a power of 2; results are
+/// undefined if it is not a power of 2, or it is greater than device_warp_size().
+///
+/// \param input - input to pass to other threads
+/// \param src_lane - lane id (within the warp) of the thread whose \p input should be returned
+/// \param width - logical warp width
+template<class T>
+ROCPRIM_DEVICE ROCPRIM_INLINE
+T warp_shuffle(const T& input, const int src_lane, const int width = device_warp_size())
+{
+    // Shuffle of a single int-sized word; warp_shuffle_op applies it to every
+    // word of the value.
+    const auto shuffle_word = [=](int word) -> int
+    {
+        return __shfl(word, src_lane, width);
+    };
+    return detail::warp_shuffle_op(input, shuffle_word);
+}
+
+/// \brief Shuffle up for any data type.
+///
+/// <tt>i</tt>-th thread in warp obtains \p input from <tt>i-delta</tt>-th
+/// thread in warp. If \p <tt>i-delta</tt> is not in [0; \p width) range,
+/// thread's own \p input is returned.
+///
+/// Note: The optional \p width parameter must be a power of 2; results are
+/// undefined if it is not a power of 2, or it is greater than device_warp_size().
+///
+/// \param input - input to pass to other threads
+/// \param delta - offset for calculating source lane id
+/// \param width - logical warp width
+template<class T>
+ROCPRIM_DEVICE ROCPRIM_INLINE
+T warp_shuffle_up(const T& input, const unsigned int delta, const int width = device_warp_size())
+{
+    // Shuffle-up of a single int-sized word; warp_shuffle_op applies it to
+    // every word of the value.
+    const auto shuffle_word = [=](int word) -> int
+    {
+        return __shfl_up(word, delta, width);
+    };
+    return detail::warp_shuffle_op(input, shuffle_word);
+}
+
+/// \brief Shuffle down for any data type.
+///
+/// <tt>i</tt>-th thread in warp obtains \p input from <tt>i+delta</tt>-th
+/// thread in warp. If \p <tt>i+delta</tt> is not in [0; \p width) range,
+/// thread's own \p input is returned.
+///
+/// Note: The optional \p width parameter must be a power of 2; results are
+/// undefined if it is not a power of 2, or it is greater than device_warp_size().
+///
+/// \param input - input to pass to other threads
+/// \param delta - offset for calculating source lane id
+/// \param width - logical warp width
+template<class T>
+ROCPRIM_DEVICE ROCPRIM_INLINE
+T warp_shuffle_down(const T& input, const unsigned int delta, const int width = device_warp_size())
+{
+    // Shuffle-down of a single int-sized word; warp_shuffle_op applies it to
+    // every word of the value.
+    const auto shuffle_word = [=](int word) -> int
+    {
+        return __shfl_down(word, delta, width);
+    };
+    return detail::warp_shuffle_op(input, shuffle_word);
+}
+
+/// \brief Shuffle XOR for any data type.
+///
+/// <tt>i</tt>-th thread in warp obtains \p input from <tt>i^lane_mask</tt>-th
+/// thread in warp.
+///
+/// Note: The optional \p width parameter must be a power of 2; results are
+/// undefined if it is not a power of 2, or it is greater than device_warp_size().
+///
+/// \param input - input to pass to other threads
+/// \param lane_mask - mask used for calculating source lane id
+/// \param width - logical warp width
+template<class T>
+ROCPRIM_DEVICE ROCPRIM_INLINE
+T warp_shuffle_xor(const T& input, const int lane_mask, const int width = device_warp_size())
+{
+    // XOR shuffle of a single int-sized word; warp_shuffle_op applies it to
+    // every word of the value.
+    const auto shuffle_word = [=](int word) -> int
+    {
+        return __shfl_xor(word, lane_mask, width);
+    };
+    return detail::warp_shuffle_op(input, shuffle_word);
+}
+
+END_ROCPRIM_NAMESPACE
+
+#endif // ROCPRIM_INTRINSICS_WARP_SHUFFLE_HPP_
+
+/// @}
+// end of group warpmodule
--- a/3rdparty/cub/rocprim/iterator.hpp
+++ b/3rdparty/cub/rocprim/iterator.hpp
+// Copyright (c) 2017-2021 Advanced Micro Devices, Inc. All rights reserved.
+//
+// Permission is hereby granted, free of charge, to any person obtaining a copy
+// of this software and associated documentation files (the "Software"), to deal
+// in the Software without restriction, including without limitation the rights
+// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+// copies of the Software, and to permit persons to whom the Software is
+// furnished to do so, subject to the following conditions:
+//
+// The above copyright notice and this permission notice shall be included in
+// all copies or substantial portions of the Software.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+// THE SOFTWARE.
+
+#ifndef ROCPRIM_ITERATOR_HPP_
+#define ROCPRIM_ITERATOR_HPP_
+
+// Meta configuration for rocPRIM
+#include "config.hpp"
+
+#include "iterator/arg_index_iterator.hpp"
+#include "iterator/constant_iterator.hpp"
+#include "iterator/counting_iterator.hpp"
+#include "iterator/discard_iterator.hpp"
+#ifndef __HIP_CPU_RT__
+#include "iterator/texture_cache_iterator.hpp"
+#endif
+#include "iterator/transform_iterator.hpp"
+#include "iterator/zip_iterator.hpp"
+
+#endif // ROCPRIM_ITERATOR_HPP_
--- a/3rdparty/cub/rocprim/iterator/arg_index_iterator.hpp
+++ b/3rdparty/cub/rocprim/iterator/arg_index_iterator.hpp
+// Copyright (c) 2017-2021 Advanced Micro Devices, Inc. All rights reserved.
+//
+// Permission is hereby granted, free of charge, to any person obtaining a copy
+// of this software and associated documentation files (the "Software"), to deal
+// in the Software without restriction, including without limitation the rights
+// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+// copies of the Software, and to permit persons to whom the Software is
+// furnished to do so, subject to the following conditions:
+//
+// The above copyright notice and this permission notice shall be included in
+// all copies or substantial portions of the Software.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+// THE SOFTWARE.
+
+#ifndef ROCPRIM_ITERATOR_ARG_INDEX_ITERATOR_HPP_
+#define ROCPRIM_ITERATOR_ARG_INDEX_ITERATOR_HPP_
+
+#include <iterator>
+#include <iostream>
+#include <cstddef>
+#include <type_traits>
+
+#include "../config.hpp"
+#include "../types/key_value_pair.hpp"
+
+/// \addtogroup iteratormodule
+/// @{
+
+BEGIN_ROCPRIM_NAMESPACE
+
+/// \class arg_index_iterator
+/// \brief A random-access input (read-only) iterator adaptor for pairing dereferenced values
+/// with their indices.
+///
+/// \par Overview
+/// * Dereferencing arg_index_iterator return a value of \p key_value_pair<Difference, InputValueType>
+/// type, which includes value from the underlying range and its index in that range.
+/// * \p std::iterator_traits<InputIterator>::value_type should be convertible to \p InputValueType.
+///
+/// \tparam InputIterator - type of the underlying random-access input iterator. Must be
+/// a random-access iterator.
+/// \tparam Difference - type used for identify distance between iterators and as the index type
+/// in the output pair type (see \p value_type).
+/// \tparam InputValueType - value type used in the output pair type (see \p value_type).
+template<
+    class InputIterator,
+    class Difference = std::ptrdiff_t,
+    class InputValueType = typename std::iterator_traits<InputIterator>::value_type
+>
+class arg_index_iterator
+{
+private:
+    using input_category = typename std::iterator_traits<InputIterator>::iterator_category;
+
+public:
+    /// The type of the value that can be obtained by dereferencing the iterator.
+    using value_type = ::rocprim::key_value_pair<Difference, InputValueType>;
+    /// \brief A reference type of the type iterated over (\p value_type).
+    /// It's `const` since arg_index_iterator is a read-only iterator.
+    using reference = const value_type&;
+    /// \brief A pointer type of the type iterated over (\p value_type).
+    /// It's `const` since arg_index_iterator is a read-only iterator.
+    using pointer = const value_type*;
+    /// A type used for identify distance between iterators.
+    using difference_type = Difference;
+    /// The category of the iterator.
+    using iterator_category = std::random_access_iterator_tag;
+
+#ifndef DOXYGEN_SHOULD_SKIP_THIS
+    using self_type = arg_index_iterator;
+#endif
+
+    static_assert(
+        std::is_same<input_category, iterator_category>::value,
+        "InputIterator must be a random-access iterator"
+    );
+
+    ROCPRIM_HOST_DEVICE inline
+    ~arg_index_iterator() = default;
+
+    /// \brief Creates a new arg_index_iterator.
+    ///
+    /// \param iterator input iterator pointing to the input range.
+    /// \param offset index of the \p iterator in the input range.
+    ROCPRIM_HOST_DEVICE inline
+    arg_index_iterator(InputIterator iterator, difference_type offset = 0)
+        : iterator_(iterator), offset_(offset)
+    {
+    }
+
+    /// Pre-increment: advances the wrapped iterator and the index together.
+    ROCPRIM_HOST_DEVICE inline
+    arg_index_iterator& operator++()
+    {
+        iterator_++;
+        offset_++;
+        return *this;
+    }
+
+    //! \skip_doxy_start
+    // Post-increment: returns a copy taken before advancing.
+    ROCPRIM_HOST_DEVICE inline
+    arg_index_iterator operator++(int)
+    {
+        arg_index_iterator old_ai = *this;
+        iterator_++;
+        offset_++;
+        return old_ai;
+    }
+
+    // Builds the (index, value) pair for the current position; returned by value.
+    ROCPRIM_HOST_DEVICE inline
+    value_type operator*() const
+    {
+        value_type ret(offset_, *iterator_);
+        return ret;
+    }
+
+    // NOTE(review): operator* returns by value, so this takes the address of a
+    // temporary pair. Left unchanged because a fix needs a proxy return type.
+    ROCPRIM_HOST_DEVICE inline
+    pointer operator->() const
+    {
+        return &(*(*this));
+    }
+
+    ROCPRIM_HOST_DEVICE inline
+    arg_index_iterator operator+(difference_type distance) const
+    {
+        return arg_index_iterator(iterator_ + distance, offset_ + distance);
+    }
+
+    ROCPRIM_HOST_DEVICE inline
+    arg_index_iterator& operator+=(difference_type distance)
+    {
+        iterator_ += distance;
+        offset_ += distance;
+        return *this;
+    }
+
+    ROCPRIM_HOST_DEVICE inline
+    arg_index_iterator operator-(difference_type distance) const
+    {
+        return arg_index_iterator(iterator_ - distance, offset_ - distance);
+    }
+
+    ROCPRIM_HOST_DEVICE inline
+    arg_index_iterator& operator-=(difference_type distance)
+    {
+        iterator_ -= distance;
+        offset_ -= distance;
+        return *this;
+    }
+
+    // Distance between the wrapped iterators; the stored indices are not used.
+    ROCPRIM_HOST_DEVICE inline
+    difference_type operator-(arg_index_iterator other) const
+    {
+        return iterator_ - other.iterator_;
+    }
+
+    ROCPRIM_HOST_DEVICE inline
+    value_type operator[](difference_type distance) const
+    {
+        arg_index_iterator i = (*this) + distance;
+        return *i;
+    }
+
+    // Equality requires both the wrapped iterator and the index to match.
+    ROCPRIM_HOST_DEVICE inline
+    bool operator==(arg_index_iterator other) const
+    {
+        return (iterator_ == other.iterator_) && (offset_ == other.offset_);
+    }
+
+    ROCPRIM_HOST_DEVICE inline
+    bool operator!=(arg_index_iterator other) const
+    {
+        return (iterator_ != other.iterator_) || (offset_ != other.offset_);
+    }
+
+    // Ordering follows the wrapped iterators: a < b holds exactly when
+    // (a - b) is negative, i.e. a comes before b in the range.
+    ROCPRIM_HOST_DEVICE inline
+    bool operator<(arg_index_iterator other) const
+    {
+        return (iterator_ - other.iterator_) < 0;
+    }
+
+    ROCPRIM_HOST_DEVICE inline
+    bool operator<=(arg_index_iterator other) const
+    {
+        return (iterator_ - other.iterator_) <= 0;
+    }
+
+    ROCPRIM_HOST_DEVICE inline
+    bool operator>(arg_index_iterator other) const
+    {
+        return (iterator_ - other.iterator_) > 0;
+    }
+
+    ROCPRIM_HOST_DEVICE inline
+    bool operator>=(arg_index_iterator other) const
+    {
+        return (iterator_ - other.iterator_) >= 0;
+    }
+
+    // Resets the stored index to 0 without moving the wrapped iterator.
+    ROCPRIM_HOST_DEVICE inline
+    void normalize()
+    {
+        offset_ = 0;
+    }
+
+    // Stream insertion is accepted but writes nothing.
+    friend std::ostream& operator<<(std::ostream& os, const arg_index_iterator& /* iter */)
+    {
+        return os;
+    }
+    //! \skip_doxy_end
+
+private:
+    InputIterator iterator_;   // current position in the underlying range
+    difference_type offset_;   // index reported as the key of the pair
+};
+
+// Commutative form of iterator addition (distance + iterator); defined in
+// terms of the member operator+ of arg_index_iterator.
+template<
+    class InputIterator,
+    class Difference,
+    class InputValueType
+>
+ROCPRIM_HOST_DEVICE inline
+arg_index_iterator<InputIterator, Difference, InputValueType>
+operator+(typename arg_index_iterator<InputIterator, Difference, InputValueType>::difference_type distance,
+          const arg_index_iterator<InputIterator, Difference, InputValueType>& iterator)
+{
+    auto shifted = iterator + distance;
+    return shifted;
+}
+
+
+/// make_arg_index_iterator creates a arg_index_iterator using \p iterator as
+/// the underlying iterator and \p offset as the position (index) of \p iterator
+/// in the input range.
+///
+/// \tparam InputIterator - type of the underlying random-access input iterator. Must be
+/// a random-access iterator.
+/// \tparam Difference - type used for identify distance between iterators and as the index type
+/// in the output pair type (see \p value_type in arg_index_iterator).
+/// \tparam InputValueType - value type used in the output pair type (see \p value_type
+/// in arg_index_iterator).
+///
+/// \param iterator input iterator pointing to the input range.
+/// \param offset index of the \p iterator in the input range.
+template<
+    class InputIterator,
+    class Difference = std::ptrdiff_t,
+    class InputValueType = typename std::iterator_traits<InputIterator>::value_type
+>
+ROCPRIM_HOST_DEVICE inline
+arg_index_iterator<InputIterator, Difference, InputValueType>
+make_arg_index_iterator(InputIterator iterator, Difference offset = 0)
+{
+    // Spell the result type once and construct it from the two arguments.
+    using result_type = arg_index_iterator<InputIterator, Difference, InputValueType>;
+    return result_type(iterator, offset);
+}
+
+END_ROCPRIM_NAMESPACE
+
+/// @}
+// end of group iteratormodule
+
+#endif // ROCPRIM_ITERATOR_ARG_INDEX_ITERATOR_HPP_
--- a/3rdparty/cub/rocprim/iterator/constant_iterator.hpp
+++ b/3rdparty/cub/rocprim/iterator/constant_iterator.hpp
+// Copyright (c) 2017-2021 Advanced Micro Devices, Inc. All rights reserved.
+//
+// Permission is hereby granted, free of charge, to any person obtaining a copy
+// of this software and associated documentation files (the "Software"), to deal
+// in the Software without restriction, including without limitation the rights
+// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+// copies of the Software, and to permit persons to whom the Software is
+// furnished to do so, subject to the following conditions:
+//
+// The above copyright notice and this permission notice shall be included in
+// all copies or substantial portions of the Software.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+// THE SOFTWARE.
+
+#ifndef ROCPRIM_ITERATOR_CONSTANT_ITERATOR_HPP_
+#define ROCPRIM_ITERATOR_CONSTANT_ITERATOR_HPP_
+
+#include <iterator>
+#include <iostream>
+#include <cstddef>
+#include <type_traits>
+
+#include "../config.hpp"
+
+/// \addtogroup iteratormodule
+/// @{
+
+BEGIN_ROCPRIM_NAMESPACE
+
+/// \class constant_iterator
+/// \brief A random-access input (read-only) iterator which generates a sequence
+/// of homogeneous values.
+///
+/// \par Overview
+/// * A constant_iterator represents a pointer into a range of same values.
+/// * Using it for simulating a range filled with a sequence of same values saves
+/// memory capacity and bandwidth.
+///
+/// \tparam ValueType - type of value that can be obtained by dereferencing the iterator.
+/// \tparam Difference - a type used for identify distance between iterators
+template<
+    class ValueType,
+    class Difference = std::ptrdiff_t
+>
+class constant_iterator
+{
+public:
+    /// The type of the value that can be obtained by dereferencing the iterator.
+    using value_type = typename std::remove_const<ValueType>::type;
+    /// \brief A reference type of the type iterated over (\p value_type).
+    /// It is the same as `value_type` since constant_iterator is a read-only
+    /// iterator and does not have an underlying buffer.
+    using reference = value_type; // constant_iterator is not writable
+    /// \brief A pointer type of the type iterated over (\p value_type).
+    /// It's `const` since constant_iterator is a read-only iterator.
+    using pointer = const value_type*; // constant_iterator is not writable
+    /// A type used to express the distance between iterators.
+    using difference_type = Difference;
+    /// The category of the iterator.
+    using iterator_category = std::random_access_iterator_tag;
+
+#ifndef DOXYGEN_SHOULD_SKIP_THIS
+    using self_type = constant_iterator;
+#endif
+
+    /// \brief Creates constant_iterator and sets its initial value to \p value.
+    ///
+    /// \param value initial value
+    /// \param index optional index for constant_iterator
+    ROCPRIM_HOST_DEVICE inline
+    explicit constant_iterator(const value_type value, const size_t index = 0)
+        : value_(value), index_(index)
+    {
+    }
+
+    ROCPRIM_HOST_DEVICE inline
+    ~constant_iterator() = default;
+
+    //! \skip_doxy_start
+    // Dereferencing always yields a copy of the stored value, whatever the index is.
+    ROCPRIM_HOST_DEVICE inline
+    value_type operator*() const
+    {
+        return value_;
+    }
+
+    // Gives member access to the value stored inside this iterator object.
+    ROCPRIM_HOST_DEVICE inline
+    pointer operator->() const
+    {
+        return &value_;
+    }
+
+    // Increment/decrement only move the index; the stored value never changes.
+    ROCPRIM_HOST_DEVICE inline
+    constant_iterator& operator++()
+    {
+        index_++;
+        return *this;
+    }
+
+    ROCPRIM_HOST_DEVICE inline
+    constant_iterator operator++(int)
+    {
+        constant_iterator old_ci = *this;
+        index_++;
+        return old_ci;
+    }
+
+    ROCPRIM_HOST_DEVICE inline
+    constant_iterator& operator--()
+    {
+        index_--;
+        return *this;
+    }
+
+    ROCPRIM_HOST_DEVICE inline
+    constant_iterator operator--(int)
+    {
+        constant_iterator old_ci = *this;
+        index_--;
+        return old_ci;
+    }
+
+    // Returns an iterator with the same value whose index is moved by distance.
+    ROCPRIM_HOST_DEVICE inline
+    constant_iterator operator+(difference_type distance) const
+    {
+        return constant_iterator(value_, index_ + distance);
+    }
+
+    ROCPRIM_HOST_DEVICE inline
+    constant_iterator& operator+=(difference_type distance)
+    {
+        index_ += distance;
+        return *this;
+    }
+
+    ROCPRIM_HOST_DEVICE inline
+    constant_iterator operator-(difference_type distance) const
+    {
+        return constant_iterator(value_, index_ - distance);
+    }
+
+    ROCPRIM_HOST_DEVICE inline
+    constant_iterator& operator-=(difference_type distance)
+    {
+        index_ -= distance;
+        return *this;
+    }
+
+    // Distance between two iterators, computed from their indices only.
+    ROCPRIM_HOST_DEVICE inline
+    difference_type operator-(constant_iterator other) const
+    {
+        return static_cast<difference_type>(index_ - other.index_);
+    }
+    //! \skip_doxy_end
+
+    /// Constant_iterator is not writable, so we don't return reference,
+    /// just something convertible to reference. That matches requirement
+    /// of RandomAccessIterator concept
+    ROCPRIM_HOST_DEVICE inline
+    value_type operator[](difference_type) const
+    {
+        return value_;
+    }
+
+    //! \skip_doxy_start
+    // Two iterators are equal only when both the value and the index match.
+    ROCPRIM_HOST_DEVICE inline
+    bool operator==(constant_iterator other) const
+    {
+        return value_ == other.value_ && index_ == other.index_;
+    }
+
+    ROCPRIM_HOST_DEVICE inline
+    bool operator!=(constant_iterator other) const
+    {
+        return !(*this == other);
+    }
+
+    // Ordering is defined by the indices only (see distance_to).
+    ROCPRIM_HOST_DEVICE inline
+    bool operator<(constant_iterator other) const
+    {
+        return distance_to(other) > 0;
+    }
+
+    ROCPRIM_HOST_DEVICE inline
+    bool operator<=(constant_iterator other) const
+    {
+        return distance_to(other) >= 0;
+    }
+
+    ROCPRIM_HOST_DEVICE inline
+    bool operator>(constant_iterator other) const
+    {
+        return distance_to(other) < 0;
+    }
+
+    ROCPRIM_HOST_DEVICE inline
+    bool operator>=(constant_iterator other) const
+    {
+        return distance_to(other) <= 0;
+    }
+
+    // Host-only stream output; prints the stored value as "[value]".
+    friend std::ostream& operator<<(std::ostream& os, const constant_iterator& iter)
+    {
+        os << "[" << iter.value_ << "]";
+        return os;
+    }
+    //! \skip_doxy_end
+
+private:
+    // Signed number of steps from this iterator to other (other.index_ - index_).
+    // Marked ROCPRIM_HOST_DEVICE because the host/device relational operators
+    // above call it; without the qualifier it would be a host-only function.
+    ROCPRIM_HOST_DEVICE inline
+    difference_type distance_to(const constant_iterator& other) const
+    {
+        return difference_type(other.index_) - difference_type(index_);
+    }
+
+    value_type value_; // value returned by every dereference
+    size_t index_;     // position of the iterator within the simulated range
+};
+
+/// Allows writing `distance + iter`; the result is \p iter advanced by \p distance.
+template<class ValueType, class Difference>
+ROCPRIM_HOST_DEVICE inline
+constant_iterator<ValueType, Difference>
+operator+(typename constant_iterator<ValueType, Difference>::difference_type distance,
+          const constant_iterator<ValueType, Difference>& iter)
+{
+    // Delegate to the member operator+ so both operand orders share one implementation.
+    return iter.operator+(distance);
+}
+
+/// Convenience factory that builds a constant_iterator holding \p value and
+/// positioned at \p index.
+///
+/// \tparam ValueType - type of value that can be obtained by dereferencing created iterator.
+/// \tparam Difference - a type used to express the distance between constant_iterator iterators.
+///
+/// \param value - initial value for constant_iterator.
+/// \param index - optional index for constant_iterator.
+template<class ValueType, class Difference = std::ptrdiff_t>
+ROCPRIM_HOST_DEVICE inline
+constant_iterator<ValueType, Difference>
+make_constant_iterator(ValueType value, size_t index = 0)
+{
+    using iterator_type = constant_iterator<ValueType, Difference>;
+    return iterator_type(value, index);
+}
+
+END_ROCPRIM_NAMESPACE
+
+/// @}
+// end of group iteratormodule
+
+#endif // ROCPRIM_ITERATOR_CONSTANT_ITERATOR_HPP_
--- a/3rdparty/cub/rocprim/iterator/counting_iterator.hpp
+++ b/3rdparty/cub/rocprim/iterator/counting_iterator.hpp
+// Copyright (c) 2017-2021 Advanced Micro Devices, Inc. All rights reserved.
+//
+// Permission is hereby granted, free of charge, to any person obtaining a copy
+// of this software and associated documentation files (the "Software"), to deal
+// in the Software without restriction, including without limitation the rights
+// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+// copies of the Software, and to permit persons to whom the Software is
+// furnished to do so, subject to the following conditions:
+//
+// The above copyright notice and this permission notice shall be included in
+// all copies or substantial portions of the Software.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+// THE SOFTWARE.
+
+#ifndef ROCPRIM_ITERATOR_COUNTING_ITERATOR_HPP_
+#define ROCPRIM_ITERATOR_COUNTING_ITERATOR_HPP_
+
+#include <iterator>
+#include <iostream>
+#include <cstddef>
+#include <type_traits>
+
+#include "../config.hpp"
+#include "../type_traits.hpp"
+
+/// \addtogroup iteratormodule
+/// @{
+
+BEGIN_ROCPRIM_NAMESPACE
+
+/// \class counting_iterator
+/// \brief A random-access input (read-only) iterator over a sequence of consecutive integer values.
+///
+/// \par Overview
+/// * A counting_iterator represents a pointer into a range of sequentially increasing values.
+/// * Using it for simulating a range filled with a sequence of consecutive values saves
+/// memory capacity and bandwidth.
+///
+/// \tparam Incrementable - type of value that can be obtained by dereferencing the iterator.
+/// \tparam Difference - a type used to express the distance between iterators
+template<
+    class Incrementable,
+    class Difference = std::ptrdiff_t
+>
+class counting_iterator
+{
+public:
+    /// The type of the value that can be obtained by dereferencing the iterator.
+    using value_type = typename std::remove_const<Incrementable>::type;
+    /// \brief A reference type of the type iterated over (\p value_type).
+    /// It is the same as `value_type` since counting_iterator is a read-only
+    /// iterator and does not have an underlying buffer.
+    using reference = value_type; // counting_iterator is not writable
+    /// \brief A pointer type of the type iterated over (\p value_type).
+    /// It's `const` since counting_iterator is a read-only iterator.
+    using pointer = const value_type*; // counting_iterator is not writable
+    /// A type used to express the distance between iterators.
+    using difference_type = Difference;
+    /// The category of the iterator.
+    using iterator_category = std::random_access_iterator_tag;
+
+    static_assert(std::is_integral<value_type>::value, "Incrementable must be integral type");
+
+#ifndef DOXYGEN_SHOULD_SKIP_THIS
+    using self_type = counting_iterator;
+#endif
+
+    /// \brief Creates counting_iterator without an explicit initial value.
+    ///
+    /// The stored value is zero only when the iterator is value-initialized
+    /// (e.g. `counting_iterator<int>{}`); a default-initialized iterator
+    /// holds an indeterminate value.
+    ROCPRIM_HOST_DEVICE inline
+    counting_iterator() = default;
+
+    ROCPRIM_HOST_DEVICE inline
+    ~counting_iterator() = default;
+
+    /// \brief Creates counting_iterator and sets its initial value to \p value.
+    ///
+    /// \param value initial value
+    ROCPRIM_HOST_DEVICE inline
+    explicit counting_iterator(const value_type value) : value_(value)
+    {
+    }
+
+    //! \skip_doxy_start
+    // Moving the iterator changes the value it yields: the value is the position.
+    ROCPRIM_HOST_DEVICE inline
+    counting_iterator& operator++()
+    {
+        value_++;
+        return *this;
+    }
+
+    ROCPRIM_HOST_DEVICE inline
+    counting_iterator operator++(int)
+    {
+        counting_iterator old_ci = *this;
+        value_++;
+        return old_ci;
+    }
+
+    ROCPRIM_HOST_DEVICE inline
+    counting_iterator& operator--()
+    {
+        value_--;
+        return *this;
+    }
+
+    ROCPRIM_HOST_DEVICE inline
+    counting_iterator operator--(int)
+    {
+        counting_iterator old_ci = *this;
+        value_--;
+        return old_ci;
+    }
+
+    // Dereferencing yields a copy of the current counter value.
+    ROCPRIM_HOST_DEVICE inline
+    value_type operator*() const
+    {
+        return value_;
+    }
+
+    // Gives member access to the counter stored inside this iterator object.
+    ROCPRIM_HOST_DEVICE inline
+    pointer operator->() const
+    {
+        return &value_;
+    }
+
+    // Arithmetic with a distance converts the distance to value_type first.
+    ROCPRIM_HOST_DEVICE inline
+    counting_iterator operator+(difference_type distance) const
+    {
+        return counting_iterator(value_ + static_cast<value_type>(distance));
+    }
+
+    ROCPRIM_HOST_DEVICE inline
+    counting_iterator& operator+=(difference_type distance)
+    {
+        value_ += static_cast<value_type>(distance);
+        return *this;
+    }
+
+    ROCPRIM_HOST_DEVICE inline
+    counting_iterator operator-(difference_type distance) const
+    {
+        return counting_iterator(value_ - static_cast<value_type>(distance));
+    }
+
+    ROCPRIM_HOST_DEVICE inline
+    counting_iterator& operator-=(difference_type distance)
+    {
+        value_ -= static_cast<value_type>(distance);
+        return *this;
+    }
+
+    // Signed distance from other to this iterator (value_ - other.value_).
+    ROCPRIM_HOST_DEVICE inline
+    difference_type operator-(counting_iterator other) const
+    {
+        // Subtract in the unsigned counterpart of difference_type instead of in
+        // value_type. Subtracting in an unsigned value_type that is narrower than
+        // difference_type (e.g. unsigned int with a 64-bit difference_type) wraps
+        // at the width of value_type, so a negative distance would come out as a
+        // large positive number and disagree with the relational operators.
+        // Unsigned arithmetic also keeps the subtraction free of signed overflow.
+        using unsigned_difference = typename std::make_unsigned<difference_type>::type;
+        return static_cast<difference_type>(
+            static_cast<unsigned_difference>(value_)
+            - static_cast<unsigned_difference>(other.value_)
+        );
+    }
+
+    // counting_iterator is not writable, so we don't return reference,
+    // just something convertible to reference. That matches requirement
+    // of RandomAccessIterator concept
+    ROCPRIM_HOST_DEVICE inline
+    value_type operator[](difference_type distance) const
+    {
+        return value_ + static_cast<value_type>(distance);
+    }
+
+    // Iterators compare equal when their counter values are equal.
+    ROCPRIM_HOST_DEVICE inline
+    bool operator==(counting_iterator other) const
+    {
+        return this->equal_value(value_, other.value_);
+    }
+
+    ROCPRIM_HOST_DEVICE inline
+    bool operator!=(counting_iterator other) const
+    {
+        return !(*this == other);
+    }
+
+    // Ordering follows the sign of distance_to(other).
+    ROCPRIM_HOST_DEVICE inline
+    bool operator<(counting_iterator other) const
+    {
+        return distance_to(other) > 0;
+    }
+
+    ROCPRIM_HOST_DEVICE inline
+    bool operator<=(counting_iterator other) const
+    {
+        return distance_to(other) >= 0;
+    }
+
+    ROCPRIM_HOST_DEVICE inline
+    bool operator>(counting_iterator other) const
+    {
+        return distance_to(other) < 0;
+    }
+
+    ROCPRIM_HOST_DEVICE inline
+    bool operator>=(counting_iterator other) const
+    {
+        return distance_to(other) <= 0;
+    }
+
+    // Host-only stream output; prints the current counter as "[value]".
+    friend std::ostream& operator<<(std::ostream& os, const counting_iterator& iter)
+    {
+        os << "[" << iter.value_ << "]";
+        return os;
+    }
+    //! \skip_doxy_end
+
+private:
+    // The private helpers are marked ROCPRIM_HOST_DEVICE because the
+    // host/device comparison operators above call them; without the qualifier
+    // they would be host-only functions.
+    template<class T>
+    ROCPRIM_HOST_DEVICE inline
+    bool equal_value(const T& x, const T& y) const
+    {
+        return (x == y);
+    }
+
+    // Signed number of steps from this iterator to other (other.value_ - value_).
+    ROCPRIM_HOST_DEVICE inline
+    difference_type distance_to(const counting_iterator& other) const
+    {
+        return difference_type(other.value_) - difference_type(value_);
+    }
+
+    value_type value_; // current counter; doubles as the iterator position
+};
+
+/// Allows writing `distance + iter`; the result is \p iter advanced by \p distance.
+template<class Incrementable, class Difference>
+ROCPRIM_HOST_DEVICE inline
+counting_iterator<Incrementable, Difference>
+operator+(typename counting_iterator<Incrementable, Difference>::difference_type distance,
+          const counting_iterator<Incrementable, Difference>& iter)
+{
+    // Delegate to the member operator+ so both operand orders share one implementation.
+    return iter.operator+(distance);
+}
+
+/// Convenience factory that builds a counting_iterator starting at \p value.
+///
+/// \tparam Incrementable - type of value that can be obtained by dereferencing created iterator.
+/// \tparam Difference - a type used to express the distance between counting_iterator iterators.
+///
+/// \param value - initial value for counting_iterator.
+template<class Incrementable, class Difference = std::ptrdiff_t>
+ROCPRIM_HOST_DEVICE inline
+counting_iterator<Incrementable, Difference>
+make_counting_iterator(Incrementable value)
+{
+    using iterator_type = counting_iterator<Incrementable, Difference>;
+    return iterator_type(value);
+}
+
+END_ROCPRIM_NAMESPACE
+
+/// @}
+// end of group iteratormodule
+
+#endif // ROCPRIM_ITERATOR_COUNTING_ITERATOR_HPP_
--- a/3rdparty/cub/rocprim/iterator/detail/replace_first_iterator.hpp
+++ b/3rdparty/cub/rocprim/iterator/detail/replace_first_iterator.hpp
+// Copyright (c) 2017-2021 Advanced Micro Devices, Inc. All rights reserved.
+//
+// Permission is hereby granted, free of charge, to any person obtaining a copy
+// of this software and associated documentation files (the "Software"), to deal
+// in the Software without restriction, including without limitation the rights
+// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+// copies of the Software, and to permit persons to whom the Software is
+// furnished to do so, subject to the following conditions:
+//
+// The above copyright notice and this permission notice shall be included in
+// all copies or substantial portions of the Software.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+// THE SOFTWARE.
+
+#ifndef ROCPRIM_ITERATOR_REPLACE_FIRST_ITERATOR_HPP_
+#define ROCPRIM_ITERATOR_REPLACE_FIRST_ITERATOR_HPP_
+
+#include <iterator>
+#include <cstddef>
+#include <type_traits>
+
+#include "../../config.hpp"
+
+BEGIN_ROCPRIM_NAMESPACE
+
+namespace detail
+{
+
+// Iterator adaptor that presents a range whose first element is substituted by a
+// caller-supplied value. Used in exclusive scan-by-key and exclusive segmented scan
+// so that no additional memory has to be allocated and no additional kernel has to run.
+//
+// Important: the wrapped iterator is never dereferenced while the adaptor's index
+// is 0, so it does not matter if it is an invalid pointer at that position.
+//
+// Usage:
+// * input - start of your input range
+// * value - value that should be used as first element of new range.
+//
+// replace_first_iterator<InputIterator>(input - 1, value);
+//
+// (input - 1) will never be dereferenced.
+template<class InputIterator>
+class replace_first_iterator
+{
+private:
+    using input_category = typename std::iterator_traits<InputIterator>::iterator_category;
+    static_assert(
+        std::is_same<input_category, std::random_access_iterator_tag>::value,
+        "InputIterator must be a random-access iterator"
+    );
+
+public:
+    using value_type = typename std::iterator_traits<InputIterator>::value_type;
+    using reference = value_type;
+    using pointer = const value_type*;
+    using difference_type = typename std::iterator_traits<InputIterator>::difference_type;
+    using iterator_category = std::random_access_iterator_tag;
+
+    // iterator - wrapped iterator, value - replacement for the element at index 0,
+    // index - position of this adaptor within the presented range.
+    ROCPRIM_HOST_DEVICE inline
+    replace_first_iterator(InputIterator iterator, value_type value, size_t index = 0)
+        : iterator_(iterator), value_(value), index_(index)
+    {
+    }
+
+    ROCPRIM_HOST_DEVICE inline
+    ~replace_first_iterator() = default;
+
+    // Pre-increment: advances the wrapped iterator and the index together.
+    ROCPRIM_HOST_DEVICE inline
+    replace_first_iterator& operator++()
+    {
+        iterator_++;
+        index_++;
+        return *this;
+    }
+
+    // Post-increment: returns the state from before the advance.
+    ROCPRIM_HOST_DEVICE inline
+    replace_first_iterator operator++(int)
+    {
+        replace_first_iterator previous(*this);
+        ++(*this);
+        return previous;
+    }
+
+    // Yields the replacement value at index 0 and the wrapped element elsewhere.
+    ROCPRIM_HOST_DEVICE inline
+    value_type operator*() const
+    {
+        if(index_ != 0)
+        {
+            return *iterator_;
+        }
+        return value_;
+    }
+
+    // Element access relative to the current position.
+    ROCPRIM_HOST_DEVICE inline
+    value_type operator[](difference_type distance) const
+    {
+        return *((*this) + distance);
+    }
+
+    // Returns a copy moved by distance; the replacement value is carried along.
+    ROCPRIM_HOST_DEVICE inline
+    replace_first_iterator operator+(difference_type distance) const
+    {
+        return replace_first_iterator(iterator_ + distance, value_, index_ + distance);
+    }
+
+    // Moves this adaptor by distance in place.
+    ROCPRIM_HOST_DEVICE inline
+    replace_first_iterator& operator+=(difference_type distance)
+    {
+        iterator_ += distance;
+        index_ += distance;
+        return *this;
+    }
+
+private:
+    InputIterator iterator_; // wrapped iterator, kept in step with index_
+    value_type value_;       // value returned while index_ is 0
+    size_t index_;           // position within the presented range
+};
+
+} // end of detail namespace
+
+END_ROCPRIM_NAMESPACE
+
+#endif // ROCPRIM_ITERATOR_REPLACE_FIRST_ITERATOR_HPP_