添加dtk中的cub头文件

f8a481f8 · zhouxiang · 7b7c64c5 · f8a481f8 · f8a481f8 · f8a481f8
Commit f8a481f8 authored Oct 13, 2023 by zhouxiang
20 changed files
--- a/3rdparty/cub/rocprim/device/device_histogram_config.hpp
+++ b/3rdparty/cub/rocprim/device/device_histogram_config.hpp
+// Copyright (c) 2018-2022 Advanced Micro Devices, Inc. All rights reserved.
+//
+// Permission is hereby granted, free of charge, to any person obtaining a copy
+// of this software and associated documentation files (the "Software"), to deal
+// in the Software without restriction, including without limitation the rights
+// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+// copies of the Software, and to permit persons to whom the Software is
+// furnished to do so, subject to the following conditions:
+//
+// The above copyright notice and this permission notice shall be included in
+// all copies or substantial portions of the Software.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+// THE SOFTWARE.
+#ifndef ROCPRIM_DEVICE_DEVICE_HISTOGRAM_CONFIG_HPP_
+#define ROCPRIM_DEVICE_DEVICE_HISTOGRAM_CONFIG_HPP_
+#include <type_traits>
+#include "../config.hpp"
+#include "../detail/various.hpp"
+#include "config_types.hpp"
+/// \addtogroup primitivesmodule_deviceconfigs
+/// @{
+BEGIN_ROCPRIM_NAMESPACE
+/// \brief Configuration of device-level histogram operation.
+///
+/// \tparam HistogramConfig - configuration of histogram kernel. Must be \p kernel_config.
+/// \tparam MaxGridSize - maximum number of blocks to launch. Defaults to \p 1024.
+/// \tparam SharedImplMaxBins - maximum total number of bins for all active channels
+/// for the shared memory histogram implementation (samples -> shared memory bins -> global memory bins),
+/// when exceeded the global memory implementation is used (samples -> global memory bins).
+/// Defaults to \p 2048.
+template<
+    class HistogramConfig,
+    unsigned int MaxGridSize = 1024,
+    unsigned int SharedImplMaxBins = 2048
+>
+struct histogram_config
+{
+#ifndef DOXYGEN_SHOULD_SKIP_THIS
+    // The members below simply re-export the template arguments under fixed
+    // names; they are hidden from the generated documentation.
+    // Kernel configuration, i.e. the HistogramConfig template argument.
+    using histogram = HistogramConfig;
+    // Value of the MaxGridSize template argument.
+    static constexpr unsigned int max_grid_size = MaxGridSize;
+    // Value of the SharedImplMaxBins template argument.
+    static constexpr unsigned int shared_impl_max_bins = SharedImplMaxBins;
+#endif
+};
+#ifndef DOXYGEN_SHOULD_SKIP_THIS
+// Out-of-class definitions of the static constexpr data members declared in
+// histogram_config. Before C++17 such a definition is needed whenever the member
+// is odr-used (e.g. bound to a reference); since C++17 static constexpr members
+// are implicitly inline, so these redeclarations are redundant but still valid.
+template<
+    class HistogramConfig,
+    unsigned int MaxGridSize,
+    unsigned int SharedImplMaxBins
+> constexpr unsigned int
+histogram_config<HistogramConfig, MaxGridSize, SharedImplMaxBins>::max_grid_size;
+// Same as above, for shared_impl_max_bins.
+template<
+    class HistogramConfig,
+    unsigned int MaxGridSize,
+    unsigned int SharedImplMaxBins
+> constexpr unsigned int
+histogram_config<HistogramConfig, MaxGridSize, SharedImplMaxBins>::shared_impl_max_bins;
+#endif
+namespace detail
+{
+// Histogram tuning used by default_histogram_config for target architecture 803.
+// ActiveChannels is not used by this struct.
+template<class Sample, unsigned int Channels, unsigned int ActiveChannels>
+struct histogram_config_803
+{
+    // Size of one Sample expressed in units of sizeof(int)
+    // (ceiling_div is presumably a rounded-up division - TODO confirm).
+    static constexpr unsigned int item_scale = ::rocprim::detail::ceiling_div(sizeof(Sample), sizeof(int));
+    // kernel_config<256, N> with N = max(10 / Channels / item_scale, 1): N shrinks as the
+    // channel count or the sample size grows, but is clamped so it never drops below 1.
+    // MaxGridSize and SharedImplMaxBins keep the histogram_config defaults.
+    using type = histogram_config<kernel_config<256, ::rocprim::max(10u / Channels / item_scale, 1u)>>;
+};
+// Histogram tuning used by default_histogram_config for target architecture 900;
+// it is also the last argument passed to select_arch there.
+// ActiveChannels is not used by this struct.
+template<class Sample, unsigned int Channels, unsigned int ActiveChannels>
+struct histogram_config_900
+{
+    // Size of one Sample expressed in units of sizeof(int)
+    // (ceiling_div is presumably a rounded-up division - TODO confirm).
+    static constexpr unsigned int item_scale = ::rocprim::detail::ceiling_div(sizeof(Sample), sizeof(int));
+    // kernel_config<256, N> with N = max(8 / Channels / item_scale, 1), clamped to at least 1.
+    // MaxGridSize and SharedImplMaxBins keep the histogram_config defaults.
+    using type = histogram_config<kernel_config<256, ::rocprim::max(8u / Channels / item_scale, 1u)>>;
+};
+// Histogram tuning used by default_histogram_config for ROCPRIM_ARCH_90a.
+// The values are currently identical to histogram_config_900.
+// ActiveChannels is not used by this struct.
+// TODO: We need to update these parameters
+template<class Sample, unsigned int Channels, unsigned int ActiveChannels>
+struct histogram_config_90a
+{
+    // Size of one Sample expressed in units of sizeof(int)
+    // (ceiling_div is presumably a rounded-up division - TODO confirm).
+    static constexpr unsigned int item_scale = ::rocprim::detail::ceiling_div(sizeof(Sample), sizeof(int));
+    // kernel_config<256, N> with N = max(8 / Channels / item_scale, 1), clamped to at least 1.
+    using type = histogram_config<kernel_config<256, ::rocprim::max(8u / Channels / item_scale, 1u)>>;
+};
+// Histogram tuning used by default_histogram_config for target architecture 1030.
+// The values are currently identical to histogram_config_900.
+// ActiveChannels is not used by this struct.
+// TODO: We need to update these parameters
+template<class Sample, unsigned int Channels, unsigned int ActiveChannels>
+struct histogram_config_1030
+{
+    // Size of one Sample expressed in units of sizeof(int)
+    // (ceiling_div is presumably a rounded-up division - TODO confirm).
+    static constexpr unsigned int item_scale = ::rocprim::detail::ceiling_div(sizeof(Sample), sizeof(int));
+    // kernel_config<256, N> with N = max(8 / Channels / item_scale, 1), clamped to at least 1.
+    using type = histogram_config<kernel_config<256, ::rocprim::max(8u / Channels / item_scale, 1u)>>;
+};
+// Maps a target architecture id to one of the per-architecture histogram
+// configurations above by inheriting from select_arch.
+// Explicit cases exist for 803, 900, ROCPRIM_ARCH_90a and 1030; the trailing
+// histogram_config_900 argument is presumably the fallback used when TargetArch
+// matches none of them (select_arch is defined elsewhere - TODO confirm).
+template<unsigned int TargetArch, class Sample, unsigned int Channels, unsigned int ActiveChannels>
+struct default_histogram_config
+    : select_arch<
+        TargetArch,
+        select_arch_case<803, histogram_config_803<Sample, Channels, ActiveChannels> >,
+        select_arch_case<900, histogram_config_900<Sample, Channels, ActiveChannels> >,
+        select_arch_case<ROCPRIM_ARCH_90a, histogram_config_90a<Sample, Channels, ActiveChannels> >,
+        select_arch_case<1030, histogram_config_1030<Sample, Channels, ActiveChannels> >,
+        histogram_config_900<Sample, Channels, ActiveChannels>
+    > { };
+} // end namespace detail
+END_ROCPRIM_NAMESPACE
+/// @}
+// end of group primitivesmodule_deviceconfigs
+#endif // ROCPRIM_DEVICE_DEVICE_HISTOGRAM_CONFIG_HPP_
--- a/3rdparty/cub/rocprim/device/device_merge.hpp
+++ b/3rdparty/cub/rocprim/device/device_merge.hpp
+// Copyright (c) 2017-2022 Advanced Micro Devices, Inc. All rights reserved.
+//
+// Permission is hereby granted, free of charge, to any person obtaining a copy
+// of this software and associated documentation files (the "Software"), to deal
+// in the Software without restriction, including without limitation the rights
+// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+// copies of the Software, and to permit persons to whom the Software is
+// furnished to do so, subject to the following conditions:
+//
+// The above copyright notice and this permission notice shall be included in
+// all copies or substantial portions of the Software.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+// THE SOFTWARE.
+#ifndef ROCPRIM_DEVICE_DEVICE_MERGE_HPP_
+#define ROCPRIM_DEVICE_DEVICE_MERGE_HPP_
+#include <type_traits>
+#include <iterator>
+#include "../config.hpp"
+#include "../detail/various.hpp"
+#include "device_merge_config.hpp"
+#include "detail/device_merge.hpp"
+BEGIN_ROCPRIM_NAMESPACE
+/// \addtogroup devicemodule
+/// @{
+namespace detail
+{
+// Kernel entry point for the partitioning step of the device-level merge.
+// It is a pure forwarding shim: every argument is handed unchanged to
+// partition_kernel_impl. Launch bounds are fixed to ROCPRIM_DEFAULT_MAX_BLOCK_SIZE.
+template<class IndexIterator, class KeysInputIterator1, class KeysInputIterator2, class BinaryFunction>
+ROCPRIM_KERNEL
+__launch_bounds__(ROCPRIM_DEFAULT_MAX_BLOCK_SIZE)
+void partition_kernel(IndexIterator index,
+                      KeysInputIterator1 keys_input1,
+                      KeysInputIterator2 keys_input2,
+                      const size_t input1_size,
+                      const size_t input2_size,
+                      const unsigned int spacing,
+                      BinaryFunction compare_function)
+{
+    partition_kernel_impl(index,
+                          keys_input1,
+                          keys_input2,
+                          input1_size,
+                          input2_size,
+                          spacing,
+                          compare_function);
+}
+// Kernel entry point for the merging step of the device-level merge.
+// It is a pure forwarding shim: every argument is handed unchanged to
+// merge_kernel_impl<BlockSize, ItemsPerThread>. Launch bounds are BlockSize.
+template<unsigned int BlockSize,
+         unsigned int ItemsPerThread,
+         class IndexIterator,
+         class KeysInputIterator1,
+         class KeysInputIterator2,
+         class KeysOutputIterator,
+         class ValuesInputIterator1,
+         class ValuesInputIterator2,
+         class ValuesOutputIterator,
+         class BinaryFunction>
+ROCPRIM_KERNEL
+__launch_bounds__(BlockSize)
+void merge_kernel(IndexIterator index,
+                  KeysInputIterator1 keys_input1,
+                  KeysInputIterator2 keys_input2,
+                  KeysOutputIterator keys_output,
+                  ValuesInputIterator1 values_input1,
+                  ValuesInputIterator2 values_input2,
+                  ValuesOutputIterator values_output,
+                  const size_t input1_size,
+                  const size_t input2_size,
+                  BinaryFunction compare_function)
+{
+    merge_kernel_impl<BlockSize, ItemsPerThread>(index,
+                                                 keys_input1,
+                                                 keys_input2,
+                                                 keys_output,
+                                                 values_input1,
+                                                 values_input2,
+                                                 values_output,
+                                                 input1_size,
+                                                 input2_size,
+                                                 compare_function);
+}
+#define ROCPRIM_DETAIL_HIP_SYNC_AND_RETURN_ON_ERROR(name, size, start) \
+    { \
+        auto _error = cudaGetLastError(); \
+        if(_error != cudaSuccess) return _error; \
+        if(debug_synchronous) \
+        { \
+            std::cout << name << "(" << size << ")"; \
+            auto __error = cudaStreamSynchronize(stream); \
+            if(__error != cudaSuccess) return __error; \
+            auto _end = std::chrono::high_resolution_clock::now(); \
+            auto _d = std::chrono::duration_cast<std::chrono::duration<double>>(_end - start); \
+            std::cout << " " << _d.count() * 1000 << " ms" << '\n'; \
+        } \
+    }
+// Shared implementation behind both public merge() overloads.
+//
+// Works in the usual two-call fashion:
+//   * temporary_storage == nullptr: only writes the required number of bytes
+//     to storage_size and returns cudaSuccess; nothing is launched.
+//   * otherwise: uses temporary_storage as an array of (partitions + 1)
+//     unsigned int entries, launches partition_kernel to fill it and then
+//     merge_kernel to produce the output.
+//
+// Returns cudaSuccess, or the first error reported by cudaGetLastError() /
+// cudaStreamSynchronize() after a launch (see
+// ROCPRIM_DETAIL_HIP_SYNC_AND_RETURN_ON_ERROR).
+//
+// NOTE(review): when temporary_storage is non-null, storage_size is neither
+// read nor validated here; the caller must provide at least the size reported
+// by the query call.
+template<
+    class Config,
+    class KeysInputIterator1,
+    class KeysInputIterator2,
+    class KeysOutputIterator,
+    class ValuesInputIterator1,
+    class ValuesInputIterator2,
+    class ValuesOutputIterator,
+    class BinaryFunction
+>
+inline
+cudaError_t merge_impl(void * temporary_storage,
+                      size_t& storage_size,
+                      KeysInputIterator1 keys_input1,
+                      KeysInputIterator2 keys_input2,
+                      KeysOutputIterator keys_output,
+                      ValuesInputIterator1 values_input1,
+                      ValuesInputIterator2 values_input2,
+                      ValuesOutputIterator values_output,
+                      const size_t input1_size,
+                      const size_t input2_size,
+                      BinaryFunction compare_function,
+                      const cudaStream_t stream,
+                      bool debug_synchronous)
+{
+    // Key/value types are taken from the FIRST input range of each kind.
+    using key_type = typename std::iterator_traits<KeysInputIterator1>::value_type;
+    using value_type = typename std::iterator_traits<ValuesInputIterator1>::value_type;
+    // Get default config if Config is default_config
+    using config = detail::default_or_custom_config<
+        Config,
+        detail::default_merge_config<ROCPRIM_TARGET_ARCH, key_type, value_type>
+    >;
+    static constexpr unsigned int block_size = config::block_size;
+    // partition_kernel is launched with half-sized blocks.
+    static constexpr unsigned int half_block = block_size / 2;
+    static constexpr unsigned int items_per_thread = config::items_per_thread;
+    static constexpr auto items_per_block = block_size * items_per_thread;
+    // Number of output tiles: ceil((input1_size + input2_size) / items_per_block).
+    // NOTE(review): the size_t result is narrowed to unsigned int here.
+    const unsigned int partitions = ((input1_size + input2_size) + items_per_block - 1) / items_per_block;
+    // One unsigned int per partition boundary, hence partitions + 1 entries.
+    const size_t partition_bytes = (partitions + 1) * sizeof(unsigned int);
+    if(temporary_storage == nullptr)
+    {
+        // Size query only. Because of the "+ 1" above the reported
+        // storage_size is never zero, even for empty inputs.
+        storage_size = partition_bytes;
+        return cudaSuccess;
+    }
+    // Both inputs are empty: nothing to merge.
+    if( partitions == 0u )
+        return cudaSuccess;
+    // Start point for time measurements
+    std::chrono::high_resolution_clock::time_point start;
+    // merge_kernel uses one block per partition.
+    auto number_of_blocks = partitions;
+    if(debug_synchronous)
+    {
+        std::cout << "block_size " << block_size << '\n';
+        std::cout << "number of blocks " << number_of_blocks << '\n';
+        std::cout << "items_per_block " << items_per_block << '\n';
+    }
+    // The temporary storage is interpreted as the partition index array.
+    unsigned int * index = reinterpret_cast<unsigned int *>(temporary_storage);
+    // Enough half_block-sized blocks to provide at least (partitions + 1) threads
+    // (presumably one thread per index entry - see partition_kernel_impl).
+    const unsigned partition_blocks = ((partitions + 1) + half_block - 1) / half_block;
+    if(debug_synchronous) start = std::chrono::high_resolution_clock::now();
+    // Step 1: fill `index`; items_per_block is passed as the `spacing` argument.
+    detail::partition_kernel
+        <<<dim3(partition_blocks), dim3(half_block), 0, stream>>>(
+        index, keys_input1, keys_input2, input1_size, input2_size,
+        items_per_block, compare_function
+    );
+    // Returns from merge_impl on launch/synchronization error.
+    ROCPRIM_DETAIL_HIP_SYNC_AND_RETURN_ON_ERROR("partition_kernel", input1_size, start);
+    if(debug_synchronous) start = std::chrono::high_resolution_clock::now();
+    // Step 2: merge keys (and values) using the index array written in step 1.
+    detail::merge_kernel<block_size, items_per_thread>
+        <<<dim3(number_of_blocks), dim3(block_size), 0, stream>>>(
+        index, keys_input1, keys_input2, keys_output,
+        values_input1, values_input2, values_output,
+        input1_size, input2_size, compare_function
+    );
+    // Returns from merge_impl on launch/synchronization error.
+    ROCPRIM_DETAIL_HIP_SYNC_AND_RETURN_ON_ERROR("merge_kernel", input1_size, start);
+    return cudaSuccess;
+}
+#undef ROCPRIM_DETAIL_HIP_SYNC_AND_RETURN_ON_ERROR
+} // end of detail namespace
+/// \brief Parallel merge primitive for device level.
+///
+/// \p merge function performs a device-wide merge.
+/// Function merges two ordered sets of input values based on comparison function.
+///
+/// \par Overview
+/// * The contents of the inputs are not altered by the merging function.
+/// * Returns the required size of \p temporary_storage in \p storage_size
+/// if \p temporary_storage is a null pointer.
+/// * Accepts custom compare_functions for merging across the device.
+///
+/// \tparam Config - [optional] configuration of the primitive. It can be \p merge_config or
+/// a custom class with the same members.
+/// \tparam InputIterator1 - random-access iterator type of the first input range. Must meet the
+/// requirements of a C++ InputIterator concept. It can be a simple pointer type.
+/// \tparam InputIterator2 - random-access iterator type of the second input range. Must meet the
+/// requirements of a C++ InputIterator concept. It can be a simple pointer type.
+/// \tparam OutputIterator - random-access iterator type of the output range. Must meet the
+/// requirements of a C++ OutputIterator concept. It can be a simple pointer type.
+///
+/// \param [in] temporary_storage - pointer to a device-accessible temporary storage. When
+/// a null pointer is passed, the required allocation size (in bytes) is written to
+/// \p storage_size and function returns without performing the merge operation.
+/// \param [in,out] storage_size - reference to a size (in bytes) of \p temporary_storage.
+/// \param [in] input1 - iterator to the first element in the first range to merge.
+/// \param [in] input2 - iterator to the first element in the second range to merge.
+/// \param [out] output - iterator to the first element in the output range.
+/// \param [in] input1_size - number of elements in the first input range.
+/// \param [in] input2_size - number of elements in the second input range.
+/// \param [in] compare_function - binary operation function object that will be used for comparison.
+/// The signature of the function should be equivalent to the following:
+/// <tt>bool f(const T &a, const T &b);</tt>. The signature does not need to have
+/// <tt>const &</tt>, but function object must not modify the objects passed to it.
+/// The default value is \p BinaryFunction().
+/// \param [in] stream - [optional] HIP stream object. Default is \p 0 (default stream).
+/// \param [in] debug_synchronous - [optional] If true, synchronization after every kernel
+/// launch is forced in order to check for errors. Default value is \p false.
+///
+/// \returns \p cudaSuccess (\p 0) after successful merge; otherwise a HIP runtime error of
+/// type \p cudaError_t.
+///
+/// \par Example
+/// \parblock
+/// In this example a device-level ascending merge is performed on an array of
+/// \p int values.
+///
+/// \code{.cpp}
+/// #include <rocprim/rocprim.hpp>
+///
+/// // Prepare input and output (declare pointers, allocate device memory etc.)
+/// size_t input_size1;     // e.g., 4
+/// size_t input_size2;     // e.g., 4
+/// int * input1;           // e.g., [0, 1, 2, 3]
+/// int * input2;           // e.g., [0, 1, 2, 3]
+/// int * output;           // empty array of 8 elements
+///
+/// size_t temporary_storage_size_bytes;
+/// void * temporary_storage_ptr = nullptr;
+/// // Get required size of the temporary storage
+/// rocprim::merge(
+///     temporary_storage_ptr, temporary_storage_size_bytes,
+///     input1, input2, output, input_size1, input_size2
+/// );
+///
+/// // allocate temporary storage
+/// cudaMalloc(&temporary_storage_ptr, temporary_storage_size_bytes);
+///
+/// // perform merge
+/// rocprim::merge(
+///     temporary_storage_ptr, temporary_storage_size_bytes,
+///     input1, input2, output, input_size1, input_size2
+/// );
+/// // output: [0, 0, 1, 1, 2, 2, 3, 3]
+/// \endcode
+/// \endparblock
+template<class Config = default_config,
+         class InputIterator1,
+         class InputIterator2,
+         class OutputIterator,
+         class BinaryFunction = ::rocprim::less<typename std::iterator_traits<InputIterator1>::value_type>>
+inline
+cudaError_t merge(void * temporary_storage,
+                 size_t& storage_size,
+                 InputIterator1 input1,
+                 InputIterator2 input2,
+                 OutputIterator output,
+                 const size_t input1_size,
+                 const size_t input2_size,
+                 BinaryFunction compare_function = BinaryFunction(),
+                 const cudaStream_t stream = 0,
+                 bool debug_synchronous = false)
+{
+    // Keys-only merge: there are no values, so one null empty_type pointer
+    // stands in for both value inputs and the value output.
+    empty_type * no_values = nullptr;
+    return detail::merge_impl<Config>(temporary_storage,
+                                      storage_size,
+                                      input1,
+                                      input2,
+                                      output,
+                                      no_values,
+                                      no_values,
+                                      no_values,
+                                      input1_size,
+                                      input2_size,
+                                      compare_function,
+                                      stream,
+                                      debug_synchronous);
+}
+/// \brief Parallel merge primitive for device level.
+///
+/// \p merge function performs a device-wide merge of (key, value) pairs.
+/// Function merges two ordered sets of input keys and corresponding values
+/// based on key comparison function.
+///
+/// \par Overview
+/// * The contents of the inputs are not altered by the merging function.
+/// * Returns the required size of \p temporary_storage in \p storage_size
+/// if \p temporary_storage is a null pointer.
+/// * Accepts custom compare_functions for merging across the device.
+///
+/// \tparam Config - [optional] configuration of the primitive. It can be \p merge_config or
+/// a custom class with the same members.
+/// \tparam KeysInputIterator1 - random-access iterator type of the first keys input range. Must meet the
+/// requirements of a C++ InputIterator concept. It can be a simple pointer type.
+/// \tparam KeysInputIterator2 - random-access iterator type of the second keys input range. Must meet the
+/// requirements of a C++ InputIterator concept. It can be a simple pointer type.
+/// \tparam KeysOutputIterator - random-access iterator type of the keys output range. Must meet the
+/// requirements of a C++ OutputIterator concept. It can be a simple pointer type.
+/// \tparam ValuesInputIterator1 - random-access iterator type of the first values input range. Must meet the
+/// requirements of a C++ InputIterator concept. It can be a simple pointer type.
+/// \tparam ValuesInputIterator2 - random-access iterator type of the second values input range. Must meet the
+/// requirements of a C++ InputIterator concept. It can be a simple pointer type.
+/// \tparam ValuesOutputIterator - random-access iterator type of the values output range. Must meet the
+/// requirements of a C++ OutputIterator concept. It can be a simple pointer type.
+///
+/// \param [in] temporary_storage - pointer to a device-accessible temporary storage. When
+/// a null pointer is passed, the required allocation size (in bytes) is written to
+/// \p storage_size and function returns without performing the merge operation.
+/// \param [in,out] storage_size - reference to a size (in bytes) of \p temporary_storage.
+/// \param [in] keys_input1 - iterator to the first key in the first range to merge.
+/// \param [in] keys_input2 - iterator to the first key in the second range to merge.
+/// \param [out] keys_output - iterator to the first key in the output range.
+/// \param [in] values_input1 - iterator to the first value in the first range to merge.
+/// \param [in] values_input2 - iterator to the first value in the second range to merge.
+/// \param [out] values_output - iterator to the first value in the output range.
+/// \param [in] input1_size - number of elements in the first input range.
+/// \param [in] input2_size - number of elements in the second input range.
+/// \param [in] compare_function - binary operation function object that will be used for key comparison.
+/// The signature of the function should be equivalent to the following:
+/// <tt>bool f(const T &a, const T &b);</tt>. The signature does not need to have
+/// <tt>const &</tt>, but function object must not modify the objects passed to it.
+/// The default value is \p BinaryFunction().
+/// \param [in] stream - [optional] HIP stream object. Default is \p 0 (default stream).
+/// \param [in] debug_synchronous - [optional] If true, synchronization after every kernel
+/// launch is forced in order to check for errors. Default value is \p false.
+///
+/// \returns \p cudaSuccess (\p 0) after successful merge; otherwise a HIP runtime error of
+/// type \p cudaError_t.
+///
+/// \par Example
+/// \parblock
+/// In this example a device-level ascending merge is performed on an array of
+/// \p int values.
+///
+/// \code{.cpp}
+/// #include <rocprim/rocprim.hpp>
+///
+/// // Prepare input and output (declare pointers, allocate device memory etc.)
+/// size_t input_size1;     // e.g., 4
+/// size_t input_size2;     // e.g., 4
+/// int * keys_input1;      // e.g., [0, 1, 2, 3]
+/// int * keys_input2;      // e.g., [0, 1, 2, 3]
+/// int * keys_output;      // empty array of 8 elements
+/// int * values_input1;    // e.g., [10, 11, 12, 13]
+/// int * values_input2;    // e.g., [20, 21, 22, 23]
+/// int * values_output;    // empty array of 8 elements
+///
+/// size_t temporary_storage_size_bytes;
+/// void * temporary_storage_ptr = nullptr;
+/// // Get required size of the temporary storage
+/// rocprim::merge(
+///     temporary_storage_ptr, temporary_storage_size_bytes,
+///     keys_input1, keys_input2, keys_output,
+///     values_input1, values_input2, values_output,
+///     input_size1, input_size2
+/// );
+///
+/// // allocate temporary storage
+/// cudaMalloc(&temporary_storage_ptr, temporary_storage_size_bytes);
+///
+/// // perform merge
+/// rocprim::merge(
+///     temporary_storage_ptr, temporary_storage_size_bytes,
+///     keys_input1, keys_input2, keys_output,
+///     values_input1, values_input2, values_output,
+///     input_size1, input_size2
+/// );
+/// // keys_output: [0, 0, 1, 1, 2, 2, 3, 3]
+/// // values_output: [10, 20, 11, 21, 12, 22, 13, 23]
+/// \endcode
+/// \endparblock
+template<class Config = default_config,
+         class KeysInputIterator1,
+         class KeysInputIterator2,
+         class KeysOutputIterator,
+         class ValuesInputIterator1,
+         class ValuesInputIterator2,
+         class ValuesOutputIterator,
+         class BinaryFunction = ::rocprim::less<typename std::iterator_traits<KeysInputIterator1>::value_type>>
+inline
+cudaError_t merge(void * temporary_storage,
+                 size_t& storage_size,
+                 KeysInputIterator1 keys_input1,
+                 KeysInputIterator2 keys_input2,
+                 KeysOutputIterator keys_output,
+                 ValuesInputIterator1 values_input1,
+                 ValuesInputIterator2 values_input2,
+                 ValuesOutputIterator values_output,
+                 const size_t input1_size,
+                 const size_t input2_size,
+                 BinaryFunction compare_function = BinaryFunction(),
+                 const cudaStream_t stream = 0,
+                 bool debug_synchronous = false)
+{
+    // Key/value merge: all arguments are passed straight through to the
+    // shared implementation.
+    return detail::merge_impl<Config>(temporary_storage,
+                                      storage_size,
+                                      keys_input1,
+                                      keys_input2,
+                                      keys_output,
+                                      values_input1,
+                                      values_input2,
+                                      values_output,
+                                      input1_size,
+                                      input2_size,
+                                      compare_function,
+                                      stream,
+                                      debug_synchronous);
+}
+/// @}
+// end of group devicemodule
+END_ROCPRIM_NAMESPACE
+#endif // ROCPRIM_DEVICE_DEVICE_MERGE_HPP_
--- a/3rdparty/cub/rocprim/device/device_merge_config.hpp
+++ b/3rdparty/cub/rocprim/device/device_merge_config.hpp
+// Copyright (c) 2018-2019 Advanced Micro Devices, Inc. All rights reserved.
+//
+// Permission is hereby granted, free of charge, to any person obtaining a copy
+// of this software and associated documentation files (the "Software"), to deal
+// in the Software without restriction, including without limitation the rights
+// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+// copies of the Software, and to permit persons to whom the Software is
+// furnished to do so, subject to the following conditions:
+//
+// The above copyright notice and this permission notice shall be included in
+// all copies or substantial portions of the Software.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+// THE SOFTWARE.
+#ifndef ROCPRIM_DEVICE_DEVICE_MERGE_CONFIG_HPP_
+#define ROCPRIM_DEVICE_DEVICE_MERGE_CONFIG_HPP_
+#include <type_traits>
+#include "../config.hpp"
+#include "../detail/various.hpp"
+#include "config_types.hpp"
+/// \addtogroup primitivesmodule_deviceconfigs
+/// @{
+BEGIN_ROCPRIM_NAMESPACE
+/// \brief Configuration of device-level merge primitives.
+///
+/// This is an alias of \p kernel_config with the same two arguments.
+///
+/// \tparam BlockSize - number of threads in a block.
+/// \tparam ItemsPerThread - number of items processed by each thread.
+template<unsigned int BlockSize, unsigned int ItemsPerThread>
+using merge_config = kernel_config<BlockSize, ItemsPerThread>;
+namespace detail
+{
+// Merge tuning used by default_merge_config for target architecture 803
+// (key/value case).
+template<class Key, class Value>
+struct merge_config_803
+{
+    // Larger of sizeof(Key) and sizeof(Value), expressed in units of sizeof(int)
+    // (ceiling_div is presumably a rounded-up division - TODO confirm).
+    static constexpr unsigned int item_scale =
+        ::rocprim::detail::ceiling_div<unsigned int>(::rocprim::max(sizeof(Key), sizeof(Value)), sizeof(int));
+    // TODO Tune when merge-by-key is ready
+    // Block size 256; items per thread = max(1, 10 / item_scale).
+    using type = merge_config<256, ::rocprim::max(1u, 10u / item_scale)>;
+};
+// Keys-only specialization (Value is empty_type).
+template<class Key>
+struct merge_config_803<Key, empty_type>
+{
+    // sizeof(Key) expressed in units of sizeof(int).
+    static constexpr unsigned int item_scale =
+        ::rocprim::detail::ceiling_div<unsigned int>(sizeof(Key), sizeof(int));
+    // Items per thread by key size: <= 2 bytes -> 11, <= 4 bytes -> 10, <= 8 bytes -> 7,
+    // otherwise max(1, 10 / item_scale). select_type presumably picks the first case
+    // whose condition holds and falls back to the last argument - TODO confirm.
+    using type = select_type<
+        select_type_case<sizeof(Key) <= 2, merge_config<256, 11> >,
+        select_type_case<sizeof(Key) <= 4, merge_config<256, 10> >,
+        select_type_case<sizeof(Key) <= 8, merge_config<256, 7> >,
+        merge_config<256, ::rocprim::max(1u, 10u / item_scale)>
+    >;
+};
+// Merge tuning used by default_merge_config for target architecture 900
+// (key/value case); it is also the last argument passed to select_arch there.
+template<class Key, class Value>
+struct merge_config_900
+{
+    // Larger of sizeof(Key) and sizeof(Value), expressed in units of sizeof(int)
+    // (ceiling_div is presumably a rounded-up division - TODO confirm).
+    static constexpr unsigned int item_scale =
+        ::rocprim::detail::ceiling_div<unsigned int>(::rocprim::max(sizeof(Key), sizeof(Value)), sizeof(int));
+    // TODO Tune when merge-by-key is ready
+    // Block size 256; items per thread = max(1, 10 / item_scale).
+    using type = merge_config<256, ::rocprim::max(1u, 10u / item_scale)>;
+};
+// Keys-only specialization (Value is empty_type).
+template<class Key>
+struct merge_config_900<Key, empty_type>
+{
+    // sizeof(Key) expressed in units of sizeof(int).
+    static constexpr unsigned int item_scale =
+        ::rocprim::detail::ceiling_div<unsigned int>(sizeof(Key), sizeof(int));
+    // Items per thread by key size: <= 2 bytes -> 11, <= 4 bytes -> 10, <= 8 bytes -> 7,
+    // otherwise max(1, 10 / item_scale). select_type presumably picks the first case
+    // whose condition holds and falls back to the last argument - TODO confirm.
+    using type = select_type<
+        select_type_case<sizeof(Key) <= 2, merge_config<256, 11> >,
+        select_type_case<sizeof(Key) <= 4, merge_config<256, 10> >,
+        select_type_case<sizeof(Key) <= 8, merge_config<256, 7> >,
+        merge_config<256, ::rocprim::max(1u, 10u / item_scale)>
+    >;
+};
+// Merge tuning used by default_merge_config for ROCPRIM_ARCH_90a (key/value case).
+// The values are currently identical to merge_config_900.
+// TODO: We need to update these parameters
+template<class Key, class Value>
+struct merge_config_90a
+{
+    // Larger of sizeof(Key) and sizeof(Value), expressed in units of sizeof(int)
+    // (ceiling_div is presumably a rounded-up division - TODO confirm).
+    static constexpr unsigned int item_scale =
+        ::rocprim::detail::ceiling_div<unsigned int>(::rocprim::max(sizeof(Key), sizeof(Value)), sizeof(int));
+    // TODO Tune when merge-by-key is ready
+    // Block size 256; items per thread = max(1, 10 / item_scale).
+    using type = merge_config<256, ::rocprim::max(1u, 10u / item_scale)>;
+};
+// Keys-only specialization (Value is empty_type).
+template<class Key>
+struct merge_config_90a<Key, empty_type>
+{
+    // sizeof(Key) expressed in units of sizeof(int).
+    static constexpr unsigned int item_scale =
+        ::rocprim::detail::ceiling_div<unsigned int>(sizeof(Key), sizeof(int));
+    // Items per thread by key size: <= 2 bytes -> 11, <= 4 bytes -> 10, <= 8 bytes -> 7,
+    // otherwise max(1, 10 / item_scale). select_type presumably picks the first case
+    // whose condition holds and falls back to the last argument - TODO confirm.
+    using type = select_type<
+        select_type_case<sizeof(Key) <= 2, merge_config<256, 11> >,
+        select_type_case<sizeof(Key) <= 4, merge_config<256, 10> >,
+        select_type_case<sizeof(Key) <= 8, merge_config<256, 7> >,
+        merge_config<256, ::rocprim::max(1u, 10u / item_scale)>
+    >;
+};
+// Merge tuning used by default_merge_config for target architecture 1030
+// (key/value case). The values are currently identical to merge_config_900.
+// TODO: We need to update these parameters
+template<class Key, class Value>
+struct merge_config_1030
+{
+    // Larger of sizeof(Key) and sizeof(Value), expressed in units of sizeof(int)
+    // (ceiling_div is presumably a rounded-up division - TODO confirm).
+    static constexpr unsigned int item_scale =
+        ::rocprim::detail::ceiling_div<unsigned int>(::rocprim::max(sizeof(Key), sizeof(Value)), sizeof(int));
+    // TODO Tune when merge-by-key is ready
+    // Block size 256; items per thread = max(1, 10 / item_scale).
+    using type = merge_config<256, ::rocprim::max(1u, 10u / item_scale)>;
+};
+// Keys-only specialization (Value is empty_type).
+template<class Key>
+struct merge_config_1030<Key, empty_type>
+{
+    // sizeof(Key) expressed in units of sizeof(int).
+    static constexpr unsigned int item_scale =
+        ::rocprim::detail::ceiling_div<unsigned int>(sizeof(Key), sizeof(int));
+    // Items per thread by key size: <= 2 bytes -> 11, <= 4 bytes -> 10, <= 8 bytes -> 7,
+    // otherwise max(1, 10 / item_scale). select_type presumably picks the first case
+    // whose condition holds and falls back to the last argument - TODO confirm.
+    using type = select_type<
+        select_type_case<sizeof(Key) <= 2, merge_config<256, 11> >,
+        select_type_case<sizeof(Key) <= 4, merge_config<256, 10> >,
+        select_type_case<sizeof(Key) <= 8, merge_config<256, 7> >,
+        merge_config<256, ::rocprim::max(1u, 10u / item_scale)>
+    >;
+};
+// Maps a target architecture id to one of the per-architecture merge
+// configurations above by inheriting from select_arch.
+// Explicit cases exist for 803, 900, ROCPRIM_ARCH_90a and 1030; the trailing
+// merge_config_900 argument is presumably the fallback used when TargetArch
+// matches none of them (select_arch is defined elsewhere - TODO confirm).
+template<unsigned int TargetArch, class Key, class Value>
+struct default_merge_config
+    : select_arch<
+        TargetArch,
+        select_arch_case<803, merge_config_803<Key, Value>>,
+        select_arch_case<900, merge_config_900<Key, Value>>,
+        select_arch_case<ROCPRIM_ARCH_90a, merge_config_90a<Key, Value>>,
+        select_arch_case<1030, merge_config_1030<Key, Value>>,
+        merge_config_900<Key, Value>
+    > { };
+} // end namespace detail
+END_ROCPRIM_NAMESPACE
+/// @}
+// end of group primitivesmodule_deviceconfigs
+#endif // ROCPRIM_DEVICE_DEVICE_MERGE_CONFIG_HPP_
--- a/3rdparty/cub/rocprim/device/device_merge_sort.hpp
+++ b/3rdparty/cub/rocprim/device/device_merge_sort.hpp
+// Copyright (c) 2017-2022 Advanced Micro Devices, Inc. All rights reserved.
+//
+// Permission is hereby granted, free of charge, to any person obtaining a copy
+// of this software and associated documentation files (the "Software"), to deal
+// in the Software without restriction, including without limitation the rights
+// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+// copies of the Software, and to permit persons to whom the Software is
+// furnished to do so, subject to the following conditions:
+//
+// The above copyright notice and this permission notice shall be included in
+// all copies or substantial portions of the Software.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+// THE SOFTWARE.
+#ifndef ROCPRIM_DEVICE_DEVICE_SORT_HPP_
+#define ROCPRIM_DEVICE_DEVICE_SORT_HPP_
+#include <type_traits>
+#include <iterator>
+#include "../config.hpp"
+#include "../detail/various.hpp"
+#include "detail/device_merge.hpp"
+#include "detail/device_merge_sort.hpp"
+#include "detail/device_merge_sort_mergepath.hpp"
+#include "device_transform.hpp"
+#include "device_merge_sort_config.hpp"
+BEGIN_ROCPRIM_NAMESPACE
+/// \addtogroup devicemodule
+/// @{
+namespace detail
+{
+/// Kernel entry point of the block-sort step.
+///
+/// Forwards every argument unchanged to \p block_sort_kernel_impl, instantiated with the
+/// same \p BlockSize and \p ItemsPerThread. The launch bound equals \p BlockSize.
+template <unsigned int BlockSize,
+          unsigned int ItemsPerThread,
+          typename KeysInputIterator,
+          typename KeysOutputIterator,
+          typename ValuesInputIterator,
+          typename ValuesOutputIterator,
+          typename OffsetT,
+          typename BinaryFunction>
+ROCPRIM_KERNEL __launch_bounds__(BlockSize) void block_sort_kernel(
+    KeysInputIterator    keys_input,
+    KeysOutputIterator   keys_output,
+    ValuesInputIterator  values_input,
+    ValuesOutputIterator values_output,
+    const OffsetT        size,
+    BinaryFunction       compare_function)
+{
+    block_sort_kernel_impl<BlockSize, ItemsPerThread>(keys_input,
+                                                      keys_output,
+                                                      values_input,
+                                                      values_output,
+                                                      size,
+                                                      compare_function);
+}
+/// Kernel entry point of the block-merge step for the non-mergepath implementation.
+///
+/// Forwards every argument unchanged to the \p block_merge_kernel_impl overload that is
+/// parameterized by \p BlockSize only. The launch bound equals \p BlockSize.
+template <unsigned int BlockSize,
+          typename KeysInputIterator,
+          typename KeysOutputIterator,
+          typename ValuesInputIterator,
+          typename ValuesOutputIterator,
+          typename OffsetT,
+          typename BinaryFunction>
+ROCPRIM_KERNEL __launch_bounds__(BlockSize) void block_merge_kernel(
+    KeysInputIterator    keys_input,
+    KeysOutputIterator   keys_output,
+    ValuesInputIterator  values_input,
+    ValuesOutputIterator values_output,
+    const OffsetT        input_size,
+    const OffsetT        sorted_block_size,
+    BinaryFunction       compare_function)
+{
+    block_merge_kernel_impl<BlockSize>(
+        keys_input, keys_output, values_input, values_output,
+        input_size, sorted_block_size, compare_function
+    );
+}
+/// Kernel entry point of the block-merge step for the mergepath implementation.
+///
+/// Differs from the other \p block_merge_kernel overload by the extra \p ItemsPerThread
+/// template parameter and the trailing \p merge_partitions pointer. Forwards every argument
+/// unchanged to \p block_merge_kernel_impl<BlockSize, ItemsPerThread>.
+template <unsigned int BlockSize,
+          unsigned int ItemsPerThread,
+          typename KeysInputIterator,
+          typename KeysOutputIterator,
+          typename ValuesInputIterator,
+          typename ValuesOutputIterator,
+          typename OffsetT,
+          typename BinaryFunction>
+ROCPRIM_KERNEL __launch_bounds__(BlockSize) void block_merge_kernel(
+    KeysInputIterator    keys_input,
+    KeysOutputIterator   keys_output,
+    ValuesInputIterator  values_input,
+    ValuesOutputIterator values_output,
+    const OffsetT        input_size,
+    const OffsetT        sorted_block_size,
+    BinaryFunction       compare_function,
+    const OffsetT*       merge_partitions)
+{
+    block_merge_kernel_impl<BlockSize, ItemsPerThread>(
+        keys_input, keys_output, values_input, values_output,
+        input_size, sorted_block_size, compare_function, merge_partitions
+    );
+}
+// Debug helper. When `debug_synchronous` is true it prints `name(size)`, synchronizes
+// `stream`, returns the synchronization error from the enclosing function if there is one,
+// and otherwise prints the time elapsed since `start` in milliseconds.
+// Expects `debug_synchronous` and `stream` to be visible at the expansion site.
+// NOTE(review): this macro is not invoked anywhere between this definition and its #undef
+// below; only ROCPRIM_DETAIL_HIP_SYNC_AND_RETURN_ON_ERROR is used.
+#define ROCPRIM_DETAIL_HIP_SYNC(name, size, start) \
+    if(debug_synchronous) \
+    { \
+        std::cout << name << "(" << size << ")"; \
+        auto error = cudaStreamSynchronize(stream); \
+        if(error != cudaSuccess) return error; \
+        auto end = std::chrono::high_resolution_clock::now(); \
+        auto d = std::chrono::duration_cast<std::chrono::duration<double>>(end - start); \
+        std::cout << " " << d.count() * 1000 << " ms" << '\n'; \
+    }
+// Post-launch check used after every kernel launch in merge_sort_impl.
+// 1. Always queries cudaGetLastError() and returns it from the enclosing function when it
+//    is not cudaSuccess.
+// 2. Only when `debug_synchronous` is true: prints `name(size)`, synchronizes `stream`,
+//    returns a synchronization error if there is one, and prints the time elapsed since
+//    `start` in milliseconds.
+// The expansion is a braced block, so it is valid with or without a trailing semicolon.
+// Expects `debug_synchronous` and `stream` to be visible at the expansion site.
+// NOTE(review): `__error` contains a double underscore, which makes it a reserved
+// identifier in C++; consider renaming it.
+#define ROCPRIM_DETAIL_HIP_SYNC_AND_RETURN_ON_ERROR(name, size, start) \
+    { \
+        auto _error = cudaGetLastError(); \
+        if(_error != cudaSuccess) return _error; \
+        if(debug_synchronous) \
+        { \
+            std::cout << name << "(" << size << ")"; \
+            auto __error = cudaStreamSynchronize(stream); \
+            if(__error != cudaSuccess) return __error; \
+            auto _end = std::chrono::high_resolution_clock::now(); \
+            auto _d = std::chrono::duration_cast<std::chrono::duration<double>>(_end - start); \
+            std::cout << " " << _d.count() * 1000 << " ms" << '\n'; \
+        } \
+    }
+/// Computes one entry of \p merge_partitions per thread for the mergepath merge step.
+///
+/// Thread \c blockIdx.x * BlockSize + threadIdx.x handles the partition with the same id;
+/// threads whose id is not below \p num_partitions exit immediately.
+///
+/// \param keys - keys holding consecutive sorted runs of \p sorted_block_size items.
+/// \param input_size - total number of keys; all range bounds are clamped to it.
+/// \param num_partitions - number of entries to write to \p merge_partitions.
+/// \param merge_partitions - output array, one entry per partition id.
+/// \param compare_op - comparison passed on to \p merge_path.
+/// \param sorted_block_size - length of each already-sorted run being merged pairwise.
+template <unsigned int BlockSize, // BlockSize of the partition kernel
+          unsigned int ItemsPerTile, // ItemsPerTile of the block merge kernel
+          typename KeysInputIterator,
+          typename OffsetT,
+          typename CompareOpT>
+ROCPRIM_KERNEL
+__launch_bounds__(BlockSize)
+void device_mergepath_partition_kernel(KeysInputIterator keys,
+                             const OffsetT input_size,
+                             const unsigned int num_partitions,
+                             OffsetT *merge_partitions,
+                             const CompareOpT compare_op,
+                             const OffsetT sorted_block_size)
+{
+    const OffsetT partition_id = blockIdx.x * BlockSize + threadIdx.x;
+    if (partition_id >= num_partitions)
+    {
+        return;
+    }
+    // Number of tiles in one sorted run (integer division) and in the pair of runs being merged.
+    const unsigned int merged_tiles = sorted_block_size / ItemsPerTile;
+    const unsigned int target_merged_tiles = merged_tiles * 2;
+    // Used as a bitmask below, which assumes target_merged_tiles is a power of two - TODO confirm.
+    const unsigned int mask = target_merged_tiles - 1;
+    const unsigned int tilegroup_start_id = ~mask & partition_id; // id of the first tile in the current tile-group
+    const OffsetT tilegroup_start = ItemsPerTile * tilegroup_start_id; // index of the first item in the current tile-group
+    const unsigned int local_tile_id = mask & partition_id; // id of the current tile in the current tile-group
+    // [keys1_beg, keys1_end) is the first run and [keys2_beg, keys2_end) the second run of the
+    // tile-group; every bound is clamped to input_size.
+    const OffsetT keys1_beg = rocprim::min(input_size, tilegroup_start);
+    const OffsetT keys1_end = rocprim::min(input_size, tilegroup_start + sorted_block_size);
+    const OffsetT keys2_beg = keys1_end;
+    const OffsetT keys2_end = rocprim::min(input_size, keys2_beg + sorted_block_size);
+    // Offset of this tile within the merged pair, clamped to the combined length of both runs.
+    const OffsetT partition_at = rocprim::min<OffsetT>(keys2_end - keys1_beg, ItemsPerTile * local_tile_id);
+    // merge_path is defined elsewhere; presumably it returns how many items of the first run
+    // precede the split at partition_at - confirm against its definition.
+    const OffsetT partition_diag = ::rocprim::detail::merge_path(keys + keys1_beg,
+                                                                 keys + keys2_beg,
+                                                                 keys1_end - keys1_beg,
+                                                                 keys2_end - keys2_beg,
+                                                                 partition_at,
+                                                                 compare_op);
+    // Stored relative to the start of the whole key range, not to the tile-group.
+    merge_partitions[partition_id] = keys1_beg + partition_diag;
+}
+/// Host-side driver shared by both public \p merge_sort overloads.
+///
+/// Steps, in order:
+/// 1. If \p temporary_storage is null, writes the required byte count to \p storage_size
+///    (at least 4) and returns \p cudaSuccess without launching anything.
+/// 2. Returns \p cudaSuccess immediately when \p size is 0.
+/// 3. Launches \p block_sort_kernel, which writes into the temporary key/value buffers.
+/// 4. Repeatedly launches merge kernels with a doubling \p block size, alternating between
+///    the temporary buffers and the output ranges as source and destination.
+/// 5. If the last merge pass wrote into the temporary buffers (or no pass ran), transfers
+///    them to the output ranges through \p ::rocprim::transform with an identity functor.
+///
+/// \tparam Config - \p default_config or a custom configuration type.
+/// \param temporary_storage - device buffer, or null to query the required size.
+/// \param storage_size - receives the required size when \p temporary_storage is null.
+/// \param size - number of items; note that it is an <tt>unsigned int</tt> here.
+/// \param debug_synchronous - when true, prints the launch configuration and synchronizes
+/// and times every kernel launch.
+/// \returns \p cudaSuccess, or the first error reported by a launch, a synchronization or
+/// \p ::rocprim::transform.
+template<
+    class Config,
+    class KeysInputIterator,
+    class KeysOutputIterator,
+    class ValuesInputIterator,
+    class ValuesOutputIterator,
+    class BinaryFunction
+>
+inline
+cudaError_t merge_sort_impl(void * temporary_storage,
+                           size_t& storage_size,
+                           KeysInputIterator keys_input,
+                           KeysOutputIterator keys_output,
+                           ValuesInputIterator values_input,
+                           ValuesOutputIterator values_output,
+                           const unsigned int size,
+                           BinaryFunction compare_function,
+                           const cudaStream_t stream,
+                           bool debug_synchronous)
+{
+    using OffsetT = unsigned int;
+    using key_type = typename std::iterator_traits<KeysInputIterator>::value_type;
+    using value_type = typename std::iterator_traits<ValuesInputIterator>::value_type;
+    // Keys-only sorting is signalled by a value type of ::rocprim::empty_type.
+    constexpr bool with_values = !std::is_same<value_type, ::rocprim::empty_type>::value;
+    // Get default config if Config is default_config
+    using config = default_or_custom_config<
+        Config,
+        default_merge_sort_config<ROCPRIM_TARGET_ARCH, key_type, value_type>
+    >;
+    static constexpr unsigned int sort_block_size = config::sort_config::block_size;
+    static constexpr unsigned int sort_items_per_thread = config::sort_config::items_per_thread;
+    static constexpr unsigned int sort_items_per_block = sort_block_size * sort_items_per_thread;
+    static constexpr unsigned int merge_impl1_block_size = config::merge_impl1_config::block_size;
+    static constexpr unsigned int merge_impl1_items_per_thread = config::merge_impl1_config::items_per_thread;
+    static constexpr unsigned int merge_impl1_items_per_block = merge_impl1_block_size * merge_impl1_items_per_thread;
+    static constexpr unsigned int merge_partition_block_size = config::merge_mergepath_partition_config::block_size;
+    static constexpr unsigned int merge_mergepath_block_size = config::merge_mergepath_config::block_size;
+    static constexpr unsigned int merge_mergepath_items_per_thread = config::merge_mergepath_config::items_per_thread;
+    static constexpr unsigned int merge_mergepath_items_per_block = merge_mergepath_block_size * merge_mergepath_items_per_thread;
+    static_assert(merge_mergepath_items_per_block >= sort_items_per_block,
+                  "merge_mergepath_items_per_block must be greater than or equal to sort_items_per_block");
+    static_assert(sort_items_per_block % config::merge_impl1_config::block_size == 0,
+                  "Merge block size must be a divisor of the items per block of the sort step");
+    // Sizes of the temporary key/value buffers, rounded by align_size.
+    const size_t keys_bytes = ::rocprim::detail::align_size(size * sizeof(key_type));
+    const size_t values_bytes = with_values ? ::rocprim::detail::align_size(size * sizeof(value_type)) : 0;
+    const unsigned int sort_number_of_blocks = ceiling_div(size, sort_items_per_block);
+    const unsigned int merge_impl1_number_of_blocks = ceiling_div(size, merge_impl1_items_per_block);
+    const unsigned int merge_mergepath_number_of_blocks = ceiling_div(size, merge_mergepath_items_per_block);
+    // Mergepath is chosen only for inputs strictly larger than the configured threshold.
+    bool use_mergepath = size > config::min_input_size_mergepath;
+    // variables below used for mergepath
+    const unsigned int merge_num_partitions = merge_mergepath_number_of_blocks + 1;
+    const unsigned int merge_partition_number_of_blocks = ceiling_div(merge_num_partitions, merge_partition_block_size);
+    const size_t d_merge_partitions_bytes = use_mergepath ? merge_num_partitions * sizeof(OffsetT) : 0;
+    // Size query: report the storage requirement and do no work.
+    if(temporary_storage == nullptr)
+    {
+        storage_size = d_merge_partitions_bytes + keys_bytes + values_bytes;
+        // Make sure user won't try to allocate 0 bytes memory
+        storage_size = storage_size == 0 ? 4 : storage_size;
+        return cudaSuccess;
+    }
+    if( size == size_t(0) )
+        return cudaSuccess;
+    if(debug_synchronous)
+    {
+        std::cout << "-----" << '\n';
+        std::cout << "size: " << size << '\n';
+        std::cout << "sort_block_size: " << sort_block_size << '\n';
+        std::cout << "sort_items_per_thread: " << sort_items_per_thread << '\n';
+        std::cout << "sort_items_per_block: " << sort_items_per_block << '\n';
+        std::cout << "sort_number_of_blocks: " << sort_number_of_blocks << '\n';
+        std::cout << "merge_impl1_block_size: " << merge_impl1_block_size << '\n';
+        std::cout << "merge_impl1_number_of_blocks: " << merge_impl1_number_of_blocks << '\n';
+        std::cout << "merge_impl1_items_per_thread: " << merge_impl1_items_per_thread << '\n';
+        std::cout << "merge_impl1_items_per_block: " << merge_impl1_items_per_block << '\n';
+        std::cout << "merge_mergepath_block_size: " << merge_mergepath_block_size << '\n';
+        std::cout << "merge_mergepath_number_of_blocks: " << merge_mergepath_number_of_blocks << '\n';
+        std::cout << "merge_mergepath_items_per_thread: " << merge_mergepath_items_per_thread << '\n';
+        std::cout << "merge_mergepath_items_per_block: " << merge_mergepath_items_per_block << '\n';
+        std::cout << "num_partitions: " << merge_num_partitions << '\n';
+        std::cout << "merge_mergepath_partition_block_size: " << merge_partition_block_size << '\n';
+        std::cout << "merge_mergepath_partition_number_of_blocks: " << merge_partition_number_of_blocks << '\n';
+    }
+    // Carve the temporary storage: [merge partitions][keys buffer][values buffer].
+    char* ptr = reinterpret_cast<char*>(temporary_storage);
+    OffsetT* d_merge_partitions = reinterpret_cast<OffsetT*>(ptr);
+    ptr += d_merge_partitions_bytes;
+    key_type * keys_buffer = reinterpret_cast<key_type*>(ptr);
+    ptr += keys_bytes;
+    value_type * values_buffer = with_values ? reinterpret_cast<value_type*>(ptr) : nullptr;
+    // Start point for time measurements
+    std::chrono::high_resolution_clock::time_point start;
+    if(debug_synchronous) start = std::chrono::high_resolution_clock::now();
+    // Block-sort step: reads the inputs and writes into the temporary buffers.
+    block_sort_kernel<sort_block_size, sort_items_per_thread>
+        <<<dim3(sort_number_of_blocks), dim3(sort_block_size), 0, stream>>>(
+        keys_input, keys_buffer, values_input, values_buffer,
+        size, compare_function
+    );
+    ROCPRIM_DETAIL_HIP_SYNC_AND_RETURN_ON_ERROR("block_sort_kernel", size, start);
+    // true  -> the current data lives in the temporary buffers
+    // false -> the current data lives in the output ranges
+    bool temporary_store = true;
+    // NOTE(review): `block` is a 32-bit unsigned value; for `size` above 2^31 the doubling
+    // can wrap around before `block < size` becomes false - confirm such sizes never reach here.
+    for(OffsetT block = sort_items_per_block; block < size; block *= 2)
+    {
+        temporary_store = !temporary_store;
+        // One merge pass from (keys_input_, values_input_) to (keys_output_, values_output_),
+        // merging neighbouring sorted runs of `block` items.
+        const auto merge_step = [&](auto keys_input_,
+                                    auto keys_output_,
+                                    auto values_input_,
+                                    auto values_output_) -> cudaError_t {
+            if(use_mergepath)
+            {
+                if(debug_synchronous) start = std::chrono::high_resolution_clock::now();
+                // First fill d_merge_partitions, then merge using those partitions.
+                device_mergepath_partition_kernel<merge_partition_block_size, merge_mergepath_items_per_block>
+                                   <<<dim3(merge_partition_number_of_blocks), dim3(merge_partition_block_size), 0, stream>>>(
+                                   keys_input_, size, merge_num_partitions, d_merge_partitions,
+                                   compare_function, block);
+                ROCPRIM_DETAIL_HIP_SYNC_AND_RETURN_ON_ERROR("device_mergepath_partition_kernel", size, start);
+                if(debug_synchronous) start = std::chrono::high_resolution_clock::now();
+                block_merge_kernel<merge_mergepath_block_size, merge_mergepath_items_per_thread>
+                    <<<dim3(merge_mergepath_number_of_blocks), dim3(merge_mergepath_block_size), 0, stream>>>(
+                    keys_input_, keys_output_, values_input_, values_output_,
+                    size, block, compare_function, d_merge_partitions
+                );
+                ROCPRIM_DETAIL_HIP_SYNC_AND_RETURN_ON_ERROR("block_merge_kernel", size, start);
+            }
+            else
+            {
+                if(debug_synchronous) start = std::chrono::high_resolution_clock::now();
+                block_merge_kernel<merge_impl1_block_size>
+                    <<<dim3(merge_impl1_number_of_blocks), dim3(merge_impl1_block_size), 0, stream>>>(
+                    keys_input_, keys_output_, values_input_, values_output_,
+                    size, block, compare_function
+                );
+                ROCPRIM_DETAIL_HIP_SYNC_AND_RETURN_ON_ERROR("block_merge_kernel", size, start)
+            }
+            return cudaSuccess;
+        };
+        cudaError_t error;
+        // The flag was already flipped above, so it names the destination of this pass.
+        if(temporary_store)
+        {
+            error = merge_step(keys_output, keys_buffer, values_output, values_buffer);
+        }
+        else
+        {
+            error = merge_step(keys_buffer, keys_output, values_buffer, values_output);
+        }
+        if(error != cudaSuccess) return error;
+    }
+    // The result is still in the temporary buffers: move it to the output ranges.
+    if(temporary_store)
+    {
+        cudaError_t error = ::rocprim::transform(
+            keys_buffer, keys_output, size,
+            ::rocprim::identity<key_type>(), stream, debug_synchronous
+        );
+        if(error != cudaSuccess) return error;
+        if(with_values)
+        {
+            // NOTE(review): this `error` shadows the one declared a few lines above.
+            cudaError_t error = ::rocprim::transform(
+                values_buffer, values_output, size,
+                ::rocprim::identity<value_type>(), stream, debug_synchronous
+            );
+            if(error != cudaSuccess) return error;
+        }
+    }
+    return cudaSuccess;
+}
+#undef ROCPRIM_DETAIL_HIP_SYNC_AND_RETURN_ON_ERROR
+#undef ROCPRIM_DETAIL_HIP_SYNC
+} // end of detail namespace
+/// \brief Parallel merge sort primitive for device level.
+///
+/// \p merge_sort function performs a device-wide merge sort
+/// of keys. Function sorts input keys based on comparison function.
+///
+/// \par Overview
+/// * The contents of the inputs are not altered by the sorting function.
+/// * Returns the required size of \p temporary_storage in \p storage_size
+/// if \p temporary_storage is a null pointer.
+/// * Accepts custom compare_functions for sorting across the device.
+///
+/// \tparam KeysInputIterator - random-access iterator type of the input range. Must meet the
+/// requirements of a C++ InputIterator concept. It can be a simple pointer type.
+/// \tparam KeysOutputIterator - random-access iterator type of the output range. Must meet the
+/// requirements of a C++ OutputIterator concept. It can be a simple pointer type.
+///
+/// \param [in] temporary_storage - pointer to a device-accessible temporary storage. When
+/// a null pointer is passed, the required allocation size (in bytes) is written to
+/// \p storage_size and function returns without performing the sort operation.
+/// \param [in,out] storage_size - reference to a size (in bytes) of \p temporary_storage.
+/// \param [in] keys_input - pointer to the first element in the range to sort.
+/// \param [out] keys_output - pointer to the first element in the output range.
+/// \param [in] size - number of elements in the input range.
+/// \param [in] compare_function - binary operation function object that will be used for comparison.
+/// The signature of the function should be equivalent to the following:
+/// <tt>bool f(const T &a, const T &b);</tt>. The signature does not need to have
+/// <tt>const &</tt>, but function object must not modify the objects passed to it.
+/// The default value is \p BinaryFunction().
+/// \param [in] stream - [optional] HIP stream object. Default is \p 0 (default stream).
+/// \param [in] debug_synchronous - [optional] If true, synchronization after every kernel
+/// launch is forced in order to check for errors. Default value is \p false.
+///
+/// \returns \p cudaSuccess (\p 0) after successful sort; otherwise a HIP runtime error of
+/// type \p cudaError_t.
+///
+/// \par Example
+/// \parblock
+/// In this example a device-level ascending merge sort is performed on an array of
+/// \p float values.
+///
+/// \code{.cpp}
+/// #include <rocprim/rocprim.hpp>
+///
+/// // Prepare input and output (declare pointers, allocate device memory etc.)
+/// size_t input_size;      // e.g., 8
+/// float * input;          // e.g., [0.6, 0.3, 0.65, 0.4, 0.2, 0.08, 1, 0.7]
+/// float * output;         // empty array of 8 elements
+///
+/// size_t temporary_storage_size_bytes;
+/// void * temporary_storage_ptr = nullptr;
+/// // Get required size of the temporary storage
+/// rocprim::merge_sort(
+///     temporary_storage_ptr, temporary_storage_size_bytes,
+///     input, output, input_size
+/// );
+///
+/// // allocate temporary storage
+/// cudaMalloc(&temporary_storage_ptr, temporary_storage_size_bytes);
+///
+/// // perform sort
+/// rocprim::merge_sort(
+///     temporary_storage_ptr, temporary_storage_size_bytes,
+///     input, output, input_size
+/// );
+/// // keys_output: [0.08, 0.2, 0.3, 0.4, 0.6, 0.65, 0.7, 1]
+/// \endcode
+/// \endparblock
+template<
+    class Config = default_config,
+    class KeysInputIterator,
+    class KeysOutputIterator,
+    class BinaryFunction = ::rocprim::less<typename std::iterator_traits<KeysInputIterator>::value_type>
+>
+inline
+cudaError_t merge_sort(void * temporary_storage,
+                      size_t& storage_size,
+                      KeysInputIterator keys_input,
+                      KeysOutputIterator keys_output,
+                      const size_t size,
+                      BinaryFunction compare_function = BinaryFunction(),
+                      const cudaStream_t stream = 0,
+                      bool debug_synchronous = false)
+{
+    // detail::merge_sort_impl takes the element count as `unsigned int`. Reject counts that
+    // do not fit instead of letting the implicit conversion truncate them silently, which
+    // would under-report the temporary storage size and sort only part of the input.
+    if(size > static_cast<size_t>(~0u))
+    {
+        return cudaErrorInvalidValue;
+    }
+    // Keys-only sort: a null empty_type pointer stands in for both value ranges, which
+    // makes merge_sort_impl skip all value handling.
+    empty_type * values = nullptr;
+    return detail::merge_sort_impl<Config>(
+        temporary_storage, storage_size,
+        keys_input, keys_output, values, values,
+        static_cast<unsigned int>(size),
+        compare_function, stream, debug_synchronous
+    );
+}
+/// \brief Parallel ascending merge sort-by-key primitive for device level.
+///
+/// \p merge_sort function performs a device-wide merge sort
+/// of (key, value) pairs. Function sorts input pairs based on comparison function.
+///
+/// \par Overview
+/// * The contents of the inputs are not altered by the sorting function.
+/// * Returns the required size of \p temporary_storage in \p storage_size
+/// if \p temporary_storage is a null pointer.
+/// * Accepts custom compare_functions for sorting across the device.
+///
+/// \tparam KeysInputIterator - random-access iterator type of the input range. Must meet the
+/// requirements of a C++ InputIterator concept. It can be a simple pointer type.
+/// \tparam KeysOutputIterator - random-access iterator type of the output range. Must meet the
+/// requirements of a C++ OutputIterator concept. It can be a simple pointer type.
+/// \tparam ValuesInputIterator - random-access iterator type of the input range. Must meet the
+/// requirements of a C++ InputIterator concept. It can be a simple pointer type.
+/// \tparam ValuesOutputIterator - random-access iterator type of the output range. Must meet the
+/// requirements of a C++ OutputIterator concept. It can be a simple pointer type.
+///
+/// \param [in] temporary_storage - pointer to a device-accessible temporary storage. When
+/// a null pointer is passed, the required allocation size (in bytes) is written to
+/// \p storage_size and function returns without performing the sort operation.
+/// \param [in,out] storage_size - reference to a size (in bytes) of \p temporary_storage.
+/// \param [in] keys_input - pointer to the first element in the range to sort.
+/// \param [out] keys_output - pointer to the first element in the output range.
+/// \param [in] values_input - pointer to the first element in the range to sort.
+/// \param [out] values_output - pointer to the first element in the output range.
+/// \param [in] size - number of elements in the input range.
+/// \param [in] compare_function - binary operation function object that will be used for comparison.
+/// The signature of the function should be equivalent to the following:
+/// <tt>bool f(const T &a, const T &b);</tt>. The signature does not need to have
+/// <tt>const &</tt>, but function object must not modify the objects passed to it.
+/// The default value is \p BinaryFunction().
+/// \param [in] stream - [optional] HIP stream object. Default is \p 0 (default stream).
+/// \param [in] debug_synchronous - [optional] If true, synchronization after every kernel
+/// launch is forced in order to check for errors. Default value is \p false.
+///
+/// \returns \p cudaSuccess (\p 0) after successful sort; otherwise a HIP runtime error of
+/// type \p cudaError_t.
+///
+/// \par Example
+/// \parblock
+/// In this example a device-level ascending merge sort is performed where input keys are
+/// represented by an array of unsigned integers and input values by an array of <tt>double</tt>s.
+///
+/// \code{.cpp}
+/// #include <rocprim/rocprim.hpp>
+///
+/// // Prepare input and output (declare pointers, allocate device memory etc.)
+/// size_t input_size;          // e.g., 8
+/// unsigned int * keys_input;  // e.g., [ 6, 3,  5, 4,  1,  8,  2, 7]
+/// double * values_input;      // e.g., [-5, 2, -4, 3, -1, -8, -2, 7]
+/// unsigned int * keys_output; // empty array of 8 elements
+/// double * values_output;     // empty array of 8 elements
+///
+/// size_t temporary_storage_size_bytes;
+/// void * temporary_storage_ptr = nullptr;
+/// // Get required size of the temporary storage
+/// rocprim::merge_sort(
+///     temporary_storage_ptr, temporary_storage_size_bytes,
+///     keys_input, keys_output, values_input, values_output,
+///     input_size
+/// );
+///
+/// // allocate temporary storage
+/// cudaMalloc(&temporary_storage_ptr, temporary_storage_size_bytes);
+///
+/// // perform sort
+/// rocprim::merge_sort(
+///     temporary_storage_ptr, temporary_storage_size_bytes,
+///     keys_input, keys_output, values_input, values_output,
+///     input_size
+/// );
+/// // keys_output:   [ 1,  2, 3, 4,  5,  6, 7,  8]
+/// // values_output: [-1, -2, 2, 3, -4, -5, 7, -8]
+/// \endcode
+/// \endparblock
+template<
+    class Config = default_config,
+    class KeysInputIterator,
+    class KeysOutputIterator,
+    class ValuesInputIterator,
+    class ValuesOutputIterator,
+    class BinaryFunction = ::rocprim::less<typename std::iterator_traits<KeysInputIterator>::value_type>
+>
+inline
+cudaError_t merge_sort(void * temporary_storage,
+                      size_t& storage_size,
+                      KeysInputIterator keys_input,
+                      KeysOutputIterator keys_output,
+                      ValuesInputIterator values_input,
+                      ValuesOutputIterator values_output,
+                      const size_t size,
+                      BinaryFunction compare_function = BinaryFunction(),
+                      const cudaStream_t stream = 0,
+                      bool debug_synchronous = false)
+{
+    // detail::merge_sort_impl takes the element count as `unsigned int`. Reject counts that
+    // do not fit instead of letting the implicit conversion truncate them silently, which
+    // would under-report the temporary storage size and sort only part of the input.
+    if(size > static_cast<size_t>(~0u))
+    {
+        return cudaErrorInvalidValue;
+    }
+    return detail::merge_sort_impl<Config>(
+        temporary_storage, storage_size,
+        keys_input, keys_output, values_input, values_output,
+        static_cast<unsigned int>(size),
+        compare_function, stream, debug_synchronous
+    );
+}
+/// @}
+// end of group devicemodule
+END_ROCPRIM_NAMESPACE
+#endif // ROCPRIM_DEVICE_DEVICE_SORT_HPP_
--- a/3rdparty/cub/rocprim/device/device_merge_sort_config.hpp
+++ b/3rdparty/cub/rocprim/device/device_merge_sort_config.hpp
+// Copyright (c) 2018-2022 Advanced Micro Devices, Inc. All rights reserved.
+//
+// Permission is hereby granted, free of charge, to any person obtaining a copy
+// of this software and associated documentation files (the "Software"), to deal
+// in the Software without restriction, including without limitation the rights
+// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+// copies of the Software, and to permit persons to whom the Software is
+// furnished to do so, subject to the following conditions:
+//
+// The above copyright notice and this permission notice shall be included in
+// all copies or substantial portions of the Software.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+// THE SOFTWARE.
+#ifndef ROCPRIM_DEVICE_DEVICE_MERGE_SORT_CONFIG_HPP_
+#define ROCPRIM_DEVICE_DEVICE_MERGE_SORT_CONFIG_HPP_
+#include <type_traits>
+#include "../config.hpp"
+#include "../detail/various.hpp"
+#include "../functional.hpp"
+#include "config_types.hpp"
+/// \addtogroup primitivesmodule_deviceconfigs
+/// @{
+BEGIN_ROCPRIM_NAMESPACE
+namespace detail
+{
+    /// Bundles the kernel configurations of all merge sort steps into one type.
+    ///
+    /// Exposes one \p kernel_config alias per kernel plus the mergepath size threshold.
+    /// Both the impl1 merge kernel and the mergepath partition kernel are fixed to one
+    /// item per thread.
+    template <unsigned int SortBlockSize,
+              unsigned int SortItemsPerThread,
+              unsigned int MergeImpl1BlockSize,
+              unsigned int MergeImplMPPartitionBlockSize,
+              unsigned int MergeImplMPBlockSize,
+              unsigned int MergeImplMPItemsPerThread,
+              unsigned int MinInputSizeMergepath>
+    struct merge_sort_config_impl
+    {
+        // Block-sort step.
+        using sort_config                      = kernel_config<SortBlockSize, SortItemsPerThread>;
+        // Merge step, impl1 (non-mergepath).
+        using merge_impl1_config               = kernel_config<MergeImpl1BlockSize, 1>;
+        // Merge step, mergepath: partition kernel.
+        using merge_mergepath_partition_config = kernel_config<MergeImplMPPartitionBlockSize, 1>;
+        // Merge step, mergepath: merge kernel.
+        using merge_mergepath_config
+            = kernel_config<MergeImplMPBlockSize, MergeImplMPItemsPerThread>;
+        // Inputs strictly larger than this use the mergepath implementation.
+        static constexpr unsigned int min_input_size_mergepath = MinInputSizeMergepath;
+    };
+}
+/// \brief Configuration of device-level merge sort primitives.
+///
+/// \tparam SortBlockSize - block size in the block-sort step
+/// \tparam SortItemsPerThread - ItemsPerThread in the block-sort step
+/// \tparam MergeImpl1BlockSize - block size in the block merge step using impl1 (used when input_size <= MinInputSizeMergepath)
+/// \tparam MergeImplMPPartitionBlockSize - block size of the partition kernel in the block merge step using mergepath impl
+/// \tparam MergeImplMPBlockSize - block size in the block merge step using mergepath impl
+/// \tparam MergeImplMPItemsPerThread - ItemsPerThread in the block merge step using mergepath impl
+/// \tparam MinInputSizeMergepath - breakpoint of input-size to use mergepath impl for block merge step
+// Public alias over detail::merge_sort_config_impl.
+// The template parameter order here differs from merge_sort_config_impl:
+// MergeImpl1BlockSize comes first, so merge_sort_config<N> sets the impl1 merge block size
+// to N and, through the defaults below, the sort block size to N as well.
+// Defaults: one item per thread in the sort step, a mergepath block size of at most 128,
+// and mergepath items per thread chosen so that a mergepath block covers
+// SortBlockSize * SortItemsPerThread items whenever that product is divisible by
+// MergeImplMPBlockSize.
+// NOTE(review): std::min is declared in <algorithm>, which is not among the includes
+// visible in this file - presumably it arrives transitively; confirm.
+template<unsigned int     MergeImpl1BlockSize           = 512,
+         unsigned int     SortBlockSize                 = MergeImpl1BlockSize,
+         unsigned int     SortItemsPerThread            = 1,
+         unsigned int     MergeImplMPPartitionBlockSize = 128,
+         unsigned int     MergeImplMPBlockSize          = std::min(SortBlockSize, 128u),
+         unsigned int     MergeImplMPItemsPerThread
+         = SortBlockSize* SortItemsPerThread / MergeImplMPBlockSize,
+         unsigned int     MinInputSizeMergepath = 200000>
+using merge_sort_config = detail::merge_sort_config_impl<SortBlockSize,
+                                                         SortItemsPerThread,
+                                                         MergeImpl1BlockSize,
+                                                         MergeImplMPPartitionBlockSize,
+                                                         MergeImplMPBlockSize,
+                                                         MergeImplMPItemsPerThread,
+                                                         MinInputSizeMergepath>;
+namespace detail
+{
+// Merge sort configuration table for key/value sorting, selected by sizeof(Key) with
+// sizeof(Value) <= 8 (judging by the name, for the 803 architecture).
+// select_type is defined elsewhere; presumably it yields the first case whose condition is
+// true and otherwise the trailing fallback type - confirm against its definition.
+// merge_sort_config<N> sets both the impl1 merge block size and the sort block size to N.
+template<class Key, class Value>
+struct merge_sort_config_803
+{
+    using type = select_type<
+        select_type_case<
+            (sizeof(Key) == 1 && sizeof(Value) <= 8),
+            merge_sort_config<64U>
+        >,
+        select_type_case<
+            (sizeof(Key) == 2 && sizeof(Value) <= 8),
+            merge_sort_config<256U>
+        >,
+        select_type_case<
+            (sizeof(Key) == 4 && sizeof(Value) <= 8),
+            merge_sort_config<512U>
+        >,
+        select_type_case<
+            (sizeof(Key) == 8 && sizeof(Value) <= 8),
+            merge_sort_config<1024U>
+        >,
+        // Fallback: block size derived by limit_block_size from the combined key + value size.
+        merge_sort_config<limit_block_size<1024U, sizeof(Key) + sizeof(Value), ROCPRIM_WARP_SIZE_64>::value>
+    >;
+};
+// Partial specialization for rocprim::half keys with any value type: starts from a block
+// size of 256 and passes it through limit_block_size with the combined key + value size.
+template<class Value>
+struct merge_sort_config_803<rocprim::half, Value>
+{
+    using type = merge_sort_config<limit_block_size<256U, sizeof(rocprim::half) + sizeof(Value), ROCPRIM_WARP_SIZE_64>::value>;
+};
+// Partial specialization for keys-only sorting (Value is empty_type).
+// Unlike the primary template it inherits from select_type instead of declaring
+// `using type`, so `type` presumably comes from select_type itself - confirm.
+// NOTE(review): there is no trailing fallback type here, and key sizes 3, 5, 6 and 7
+// match none of the listed cases.
+template<class Key>
+struct merge_sort_config_803<Key, empty_type>
+    : select_type<
+        select_type_case<sizeof(Key) == 1, merge_sort_config<64U> >,
+        select_type_case<sizeof(Key) == 2, merge_sort_config<256U> >,
+        select_type_case<sizeof(Key) == 4, merge_sort_config<256U> >,
+        select_type_case<sizeof(Key) >= 8, merge_sort_config<limit_block_size<512U, sizeof(Key), ROCPRIM_WARP_SIZE_64>::value> >
+    > { };
+// Full specialization for rocprim::half keys without values. Both partial specializations
+// above (<rocprim::half, Value> and <Key, empty_type>) match this argument pair and neither
+// is more specialized, so this explicit specialization is what keeps the lookup unambiguous.
+template<>
+struct merge_sort_config_803<rocprim::half, empty_type>
+{
+    using type = merge_sort_config<256U>;
+};
+// Merge sort configuration table (judging by the name, for the 900 architecture).
+// The defaulted third parameter is is_scalar<Key>::value; this primary template serves the
+// case where it is true, while the `false` case has its own partial specialization.
+// The three arguments of merge_sort_config<A, B, C> are the impl1 merge block size, the sort
+// block size and the sort items per thread.
+// select_type presumably yields the first matching case, else the trailing fallback - confirm.
+template<class Key, class Value, bool = is_scalar<Key>::value>
+struct merge_sort_config_900
+{
+    using type = select_type<
+        // clang-format off
+        select_type_case<(sizeof(Key) == 1 && sizeof(Value) <= 16), merge_sort_config<512U, 512U, 2U>>,
+        select_type_case<(sizeof(Key) == 2 && sizeof(Value) <= 16), merge_sort_config<512U, 256U, 4U>>,
+        select_type_case<(sizeof(Key) == 4 && sizeof(Value) <= 16), merge_sort_config<512U, 256U, 4U>>,
+        select_type_case<(sizeof(Key) == 8 && sizeof(Value) <= 16), merge_sort_config<256U, 256U, 4U>>,
+        // clang-format on
+        // Fallback: block size limited by the larger of (key + unsigned int) and the value size.
+        merge_sort_config<
+            limit_block_size<1024U,
+                             ::rocprim::max(sizeof(Key) + sizeof(unsigned int), sizeof(Value)),
+                             ROCPRIM_WARP_SIZE_64>::value>>;
+};
+/// \brief Specialization of the 900 table used when the third parameter is \p false,
+/// i.e. when \p is_scalar<Key>::value is \p false for the defaulted argument.
+///
+/// Keys of 8 and 16 bytes with values of at most 16 bytes both select
+/// \p merge_sort_config<512U, 512U, 2U>; the final, unconditional entry derives a single
+/// argument from \p limit_block_size, starting at 512.
+template<class Key, class Value>
+struct merge_sort_config_900<Key, Value, false>
+{
+    // clang-format off
+    using type = select_type<
+        select_type_case<
+            (sizeof(Key) == 8 && sizeof(Value) <= 16),
+            merge_sort_config<512U, 512U, 2U>
+        >,
+        select_type_case<
+            (sizeof(Key) == 16 && sizeof(Value) <= 16),
+            merge_sort_config<512U, 512U, 2U>
+        >,
+        merge_sort_config<
+            limit_block_size<
+                512U,
+                ::rocprim::max(sizeof(Key) + sizeof(unsigned int), sizeof(Value)),
+                ROCPRIM_WARP_SIZE_64
+            >::value
+        >
+    >;
+    // clang-format on
+};
+// TODO: We need to update these parameters
+/// \brief Size-based merge sort configuration table named for architecture 1030.
+///
+/// The cases currently mirror the 803 table: keys of 1, 2, 4 and 8 bytes with values of
+/// at most 8 bytes select \p merge_sort_config<64U>, \p <256U>, \p <512U> and \p <1024U>.
+///
+/// NOTE(review): the unconditional entry passes \p ROCPRIM_WARP_SIZE_64 to
+/// \p limit_block_size, whereas the 1030 specializations pass \p ROCPRIM_WARP_SIZE_32.
+/// Confirm whether this difference is intended.
+template<class Key, class Value>
+struct merge_sort_config_1030
+{
+    // clang-format off
+    using type = select_type<
+        select_type_case<(sizeof(Key) == 1 && sizeof(Value) <= 8), merge_sort_config<64U>>,
+        select_type_case<(sizeof(Key) == 2 && sizeof(Value) <= 8), merge_sort_config<256U>>,
+        select_type_case<(sizeof(Key) == 4 && sizeof(Value) <= 8), merge_sort_config<512U>>,
+        select_type_case<(sizeof(Key) == 8 && sizeof(Value) <= 8), merge_sort_config<1024U>>,
+        merge_sort_config<
+            limit_block_size<1024U,
+                             sizeof(Key) + sizeof(Value),
+                             ROCPRIM_WARP_SIZE_64>::value>
+    >;
+    // clang-format on
+};
+/// \brief Specialization of the 1030 table for \p rocprim::half keys with any value type.
+///
+/// The configuration argument comes from \p limit_block_size, starting at 256 and using
+/// the combined size of a \p rocprim::half key and the value, with \p ROCPRIM_WARP_SIZE_32.
+template<class Value>
+struct merge_sort_config_1030<rocprim::half, Value>
+{
+    using type = merge_sort_config<
+        limit_block_size<256U,
+                         sizeof(rocprim::half) + sizeof(Value),
+                         ROCPRIM_WARP_SIZE_32>::value>;
+};
+/// \brief Specialization of the 1030 table for keys-only sorting (\p Value is \p empty_type).
+///
+/// This specialization derives from the \p select_type result instead of exposing it
+/// through a nested \p type alias. Only key sizes of 1, 2, 4 and 8 bytes or more have a
+/// case; there is no unconditional entry.
+template<class Key>
+struct merge_sort_config_1030<Key, empty_type>
+    // clang-format off
+    : select_type<
+          select_type_case<sizeof(Key) == 1, merge_sort_config<64U>>,
+          select_type_case<sizeof(Key) == 2, merge_sort_config<256U>>,
+          select_type_case<sizeof(Key) == 4, merge_sort_config<256U>>,
+          select_type_case<sizeof(Key) >= 8,
+                           merge_sort_config<limit_block_size<512U,
+                                                              sizeof(Key),
+                                                              ROCPRIM_WARP_SIZE_32>::value>>
+      >
+    // clang-format on
+{
+};
+/// \brief Full specialization of the 1030 table for keys-only sorting of \p rocprim::half.
+///
+/// Always yields \p merge_sort_config<256U>.
+template<>
+struct merge_sort_config_1030<rocprim::half, empty_type>
+{
+    using type = merge_sort_config<256U>;
+};
+/// \brief Picks the per-architecture merge sort configuration table for \p TargetArch.
+///
+/// Architectures 803, 900 and 1030 have explicit cases; the last argument,
+/// \p merge_sort_config_900, carries no architecture number of its own.
+template<unsigned int TargetArch, class Key, class Value>
+struct default_merge_sort_config
+    // clang-format off
+    : select_arch<TargetArch,
+                  select_arch_case<803,  merge_sort_config_803<Key, Value>>,
+                  select_arch_case<900,  merge_sort_config_900<Key, Value>>,
+                  select_arch_case<1030, merge_sort_config_1030<Key, Value>>,
+                  merge_sort_config_900<Key, Value>>
+    // clang-format on
+{
+};
+} // end namespace detail
+END_ROCPRIM_NAMESPACE
+/// @}
+// end of group primitivesmodule_deviceconfigs
+#endif // ROCPRIM_DEVICE_DEVICE_MERGE_SORT_CONFIG_HPP_
--- a/3rdparty/cub/rocprim/device/device_partition.hpp
+++ b/3rdparty/cub/rocprim/device/device_partition.hpp
+// Copyright (c) 2017-2022 Advanced Micro Devices, Inc. All rights reserved.
+//
+// Permission is hereby granted, free of charge, to any person obtaining a copy
+// of this software and associated documentation files (the "Software"), to deal
+// in the Software without restriction, including without limitation the rights
+// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+// copies of the Software, and to permit persons to whom the Software is
+// furnished to do so, subject to the following conditions:
+//
+// The above copyright notice and this permission notice shall be included in
+// all copies or substantial portions of the Software.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+// THE SOFTWARE.
+#ifndef ROCPRIM_DEVICE_DEVICE_PARTITION_HPP_
+#define ROCPRIM_DEVICE_DEVICE_PARTITION_HPP_
+#include <algorithm>
+#include <type_traits>
+#include <iterator>
+#include "../config.hpp"
+#include "../functional.hpp"
+#include "../types.hpp"
+#include "../type_traits.hpp"
+#include "../detail/various.hpp"
+#include "device_select_config.hpp"
+#include "detail/device_scan_common.hpp"
+#include "detail/device_partition.hpp"
+#include "device_transform.hpp"
+BEGIN_ROCPRIM_NAMESPACE
+/// \addtogroup devicemodule
+/// @{
+namespace detail
+{
+/// \brief Kernel entry point for the device-level partition.
+///
+/// Contains no logic of its own: every argument is handed, unchanged and in the same
+/// order, to \p partition_kernel_impl instantiated with \p SelectMethod, \p OnlySelected
+/// and \p Config. The launch bound is \p Config::block_size.
+template<select_method SelectMethod,
+         bool OnlySelected,
+         class Config,
+         class KeyIterator,
+         class ValueIterator,
+         class FlagIterator,
+         class OutputKeyIterator,
+         class OutputValueIterator,
+         class InequalityOp,
+         class OffsetLookbackScanState,
+         class... UnaryPredicates>
+ROCPRIM_KERNEL
+__launch_bounds__(Config::block_size)
+void partition_kernel(KeyIterator                    keys_input,
+                      ValueIterator                  values_input,
+                      FlagIterator                   flags,
+                      OutputKeyIterator              keys_output,
+                      OutputValueIterator            values_output,
+                      size_t*                        selected_count,
+                      size_t*                        prev_selected_count,
+                      const size_t                   size,
+                      InequalityOp                   inequality_op,
+                      OffsetLookbackScanState        offset_scan_state,
+                      const unsigned int             number_of_blocks,
+                      ordered_block_id<unsigned int> ordered_bid,
+                      UnaryPredicates... predicates)
+{
+    partition_kernel_impl<SelectMethod, OnlySelected, Config>(keys_input,
+                                                              values_input,
+                                                              flags,
+                                                              keys_output,
+                                                              values_output,
+                                                              selected_count,
+                                                              prev_selected_count,
+                                                              size,
+                                                              inequality_op,
+                                                              offset_scan_state,
+                                                              number_of_blocks,
+                                                              ordered_bid,
+                                                              predicates...);
+}
+// Debug helper: when the enclosing function's `debug_synchronous` is true, prints
+// `name(size)`, synchronizes the enclosing function's `stream`, returns the error from the
+// enclosing function if synchronization failed, and otherwise prints the time elapsed
+// since `start` in milliseconds.
+// The macro expands to a complete statement and is used without a trailing semicolon.
+// Macro arguments are parenthesized so that arbitrary expressions can be passed safely.
+#define ROCPRIM_DETAIL_HIP_SYNC(name, size, start) \
+    if(debug_synchronous) \
+    { \
+        std::cout << (name) << "(" << (size) << ")"; \
+        auto error = cudaStreamSynchronize(stream); \
+        if(error != cudaSuccess) return error; \
+        auto end = std::chrono::high_resolution_clock::now(); \
+        auto d = std::chrono::duration_cast<std::chrono::duration<double>>(end - (start)); \
+        std::cout << " " << d.count() * 1000 << " ms" << '\n'; \
+    }
+// Same as ROCPRIM_DETAIL_HIP_SYNC, but first checks cudaGetLastError() and returns that
+// error from the enclosing function regardless of `debug_synchronous`.
+// The locals use a single leading underscore: identifiers containing a double underscore
+// (such as the previous `__error`) are reserved for the implementation.
+#define ROCPRIM_DETAIL_HIP_SYNC_AND_RETURN_ON_ERROR(name, size, start) \
+    { \
+        auto _error = cudaGetLastError(); \
+        if(_error != cudaSuccess) return _error; \
+        if(debug_synchronous) \
+        { \
+            std::cout << (name) << "(" << (size) << ")"; \
+            auto _sync_error = cudaStreamSynchronize(stream); \
+            if(_sync_error != cudaSuccess) return _sync_error; \
+            auto _end = std::chrono::high_resolution_clock::now(); \
+            auto _d = std::chrono::duration_cast<std::chrono::duration<double>>(_end - (start)); \
+            std::cout << " " << _d.count() * 1000 << " ms" << '\n'; \
+        } \
+    }
+/// \brief Host-side implementation behind the device-level partition entry points.
+///
+/// When \p temporary_storage is a null pointer, only the required number of bytes is
+/// written to \p storage_size and the function returns \p cudaSuccess. Otherwise
+/// \p temporary_storage is split, in this order, into the look-back scan state, the
+/// ordered block id, and two groups of \p size_t counters (\p selected_count followed by
+/// \p prev_selected_count). Each group holds two counters when two predicates are passed
+/// (three-way partition) and one counter otherwise.
+///
+/// The input is processed in launches of at most \p aligned_size_limit items. Every launch
+/// runs \p init_lookback_scan_state_kernel and then \p partition_kernel, after which the
+/// two counter groups are swapped. Once all launches are queued, the counters are copied
+/// to \p selected_count_output with \p rocprim::transform.
+///
+/// \param [in] temporary_storage - temporary storage, or a null pointer to query its size.
+/// \param [in,out] storage_size - receives the required size when \p temporary_storage is null.
+/// \param [in] keys_input, values_input, flags - input ranges; each is advanced by the
+/// running offset for every launch.
+/// \param [out] keys_output, values_output - output ranges, passed unchanged to every launch.
+/// \param [out] selected_count_output - receives one counter, or two for a three-way partition.
+/// \param [in] size - number of input items.
+/// \param [in] inequality_op - passed through to \p partition_kernel.
+/// \param [in] stream - stream used for the memset, the kernel launches and the final copy.
+/// \param [in] debug_synchronous - if true, prints launch parameters and synchronizes
+/// after every kernel launch.
+/// \param [in] predicates - passed through to \p partition_kernel.
+/// \return the first error encountered, or the result of the final \p rocprim::transform.
+template<
+    // Method of selection: flag, predicate, unique
+    select_method SelectMethod,
+    // if true, it doesn't copy rejected values to output
+    bool OnlySelected,
+    class Config,
+    class OffsetT,
+    class KeyIterator,
+    class ValueIterator, // can be rocprim::empty_type* for key only
+    class FlagIterator,
+    class OutputKeyIterator,
+    class OutputValueIterator, // can be rocprim::empty_type* for key only
+    class InequalityOp,
+    class SelectedCountOutputIterator,
+    class... UnaryPredicates
+>
+inline
+cudaError_t partition_impl(void * temporary_storage,
+                          size_t& storage_size,
+                          KeyIterator keys_input,
+                          ValueIterator values_input,
+                          FlagIterator flags,
+                          OutputKeyIterator keys_output,
+                          OutputValueIterator values_output,
+                          SelectedCountOutputIterator selected_count_output,
+                          const size_t size,
+                          InequalityOp inequality_op,
+                          const cudaStream_t stream,
+                          bool debug_synchronous,
+                          UnaryPredicates... predicates)
+{
+    using offset_type = OffsetT;
+    using key_type = typename std::iterator_traits<KeyIterator>::value_type;
+    using value_type = typename std::iterator_traits<ValueIterator>::value_type;
+    // Get default config if Config is default_config
+    using config = default_or_custom_config<
+        Config,
+        default_select_config<ROCPRIM_TARGET_ARCH, key_type, value_type>
+    >;
+    using offset_scan_state_type = detail::lookback_scan_state<offset_type>;
+    using ordered_block_id_type = detail::ordered_block_id<unsigned int>;
+    static constexpr unsigned int block_size = config::block_size;
+    static constexpr unsigned int items_per_thread = config::items_per_thread;
+    static constexpr auto items_per_block = block_size * items_per_thread;
+    static constexpr bool is_three_way = sizeof...(UnaryPredicates) == 2;
+    // Number of size_t counters in each of the two counter groups
+    static constexpr size_t counters_per_group = is_three_way ? 2 : 1;
+    static constexpr size_t size_limit = config::size_limit;
+    // Largest multiple of items_per_block not exceeding size_limit, but at least one block
+    static constexpr size_t aligned_size_limit = ::rocprim::max<size_t>(size_limit - (size_limit % items_per_block), items_per_block);
+    const size_t limited_size = std::min<size_t>(size, aligned_size_limit);
+    const bool use_limited_size = limited_size == aligned_size_limit;
+    const unsigned int number_of_blocks =
+        static_cast<unsigned int>(::rocprim::detail::ceiling_div(limited_size, items_per_block));
+    // Calculate required temporary storage
+    size_t offset_scan_state_bytes = ::rocprim::detail::align_size(
+        offset_scan_state_type::get_storage_size(number_of_blocks)
+    );
+    size_t ordered_block_id_bytes = ::rocprim::detail::align_size(
+        ordered_block_id_type::get_storage_size(),
+        alignof(size_t)
+    );
+    if(temporary_storage == nullptr)
+    {
+        // storage_size is never zero
+        storage_size = offset_scan_state_bytes + ordered_block_id_bytes
+                       + (sizeof(size_t) * 2 * counters_per_group);
+        return cudaSuccess;
+    }
+    // Start point for time measurements
+    std::chrono::high_resolution_clock::time_point start;
+    // Create and initialize lookback_scan_state obj
+    auto offset_scan_state = offset_scan_state_type::create(
+        temporary_storage, number_of_blocks
+    );
+    // Create and initialize ordered_block_id obj
+    auto ptr = reinterpret_cast<char*>(temporary_storage);
+    auto ordered_bid = ordered_block_id_type::create(
+        reinterpret_cast<ordered_block_id_type::id_type*>(ptr + offset_scan_state_bytes)
+    );
+    // The two counter groups are adjacent: selected_count first, prev_selected_count second
+    size_t* selected_count = reinterpret_cast<size_t*>(ptr + offset_scan_state_bytes
+                                                       + ordered_block_id_bytes);
+    size_t* prev_selected_count
+        = reinterpret_cast<size_t*>(ptr + offset_scan_state_bytes + ordered_block_id_bytes
+                                    + counters_per_group * sizeof(size_t));
+    // Memset selected_count and prev_selected_count at once
+    const cudaError_t memset_error = cudaMemsetAsync(selected_count,
+                                                     0,
+                                                     sizeof(*selected_count) * 2 * counters_per_group,
+                                                     stream);
+    if(memset_error != cudaSuccess) return memset_error;
+    const size_t number_of_launches = ::rocprim::detail::ceiling_div(size, aligned_size_limit);
+    if(debug_synchronous)
+    {
+        std::cout << "use_limited_size " << use_limited_size << '\n';
+        std::cout << "aligned_size_limit " << aligned_size_limit << '\n';
+        std::cout << "number_of_launches " << number_of_launches << '\n';
+        std::cout << "size " << size << '\n';
+        std::cout << "block_size " << block_size << '\n';
+        std::cout << "number of blocks " << number_of_blocks << '\n';
+        std::cout << "items_per_block " << items_per_block << '\n';
+    }
+    for (size_t i = 0, offset = 0; i < number_of_launches; i++, offset+=limited_size)
+    {
+        // The last launch may cover fewer than limited_size items
+        const unsigned int current_size = static_cast<unsigned int>(std::min<size_t>(size - offset, limited_size));
+        const unsigned int current_number_of_blocks = ::rocprim::detail::ceiling_div(current_size, items_per_block);
+        // Grid for the init kernel is sized from number_of_blocks (the full-launch block count)
+        auto grid_size = ::rocprim::detail::ceiling_div(number_of_blocks, block_size);
+        if(debug_synchronous)
+        {
+            std::cout << "current size " << current_size << '\n';
+            std::cout << "current number of blocks " << current_number_of_blocks << '\n';
+            start = std::chrono::high_resolution_clock::now();
+        }
+        init_lookback_scan_state_kernel<offset_scan_state_type>
+            <<<dim3(grid_size), dim3(block_size), 0, stream>>>(
+            offset_scan_state, current_number_of_blocks, ordered_bid
+        );
+        ROCPRIM_DETAIL_HIP_SYNC_AND_RETURN_ON_ERROR("init_offset_scan_state_kernel", current_number_of_blocks, start)
+        if(debug_synchronous) start = std::chrono::high_resolution_clock::now();
+        grid_size = current_number_of_blocks;
+        partition_kernel<
+                SelectMethod, OnlySelected, config
+            >
+            <<<dim3(grid_size), dim3(block_size), 0, stream>>>(
+            keys_input + offset, values_input + offset, flags + offset, keys_output, values_output, selected_count, prev_selected_count,
+            current_size, inequality_op, offset_scan_state, current_number_of_blocks, ordered_bid, predicates...
+        );
+        ROCPRIM_DETAIL_HIP_SYNC_AND_RETURN_ON_ERROR("partition_kernel", size, start)
+        // The counters written by this launch become the "previous" counters of the next one
+        std::swap(selected_count, prev_selected_count);
+    }
+    // After the final swap prev_selected_count refers to the group passed as selected_count
+    // to the last launch (or to a zeroed group when no launch was made); copy it to the output.
+    return ::rocprim::transform(
+        prev_selected_count, selected_count_output, counters_per_group,
+        ::rocprim::identity<>{},
+        stream, debug_synchronous
+    );
+}
+// The debug-synchronization helper macros are only needed by the implementation above;
+// undefine them so they are not visible to code that follows or includes this header.
+#undef ROCPRIM_DETAIL_HIP_SYNC_AND_RETURN_ON_ERROR
+#undef ROCPRIM_DETAIL_HIP_SYNC
+} // end of detail namespace
+/// \brief Parallel select primitive for device level using range of flags.
+///
+/// Performs a device-wide partition based on input \p flags. Partition copies
+/// the values from \p input to \p output in such a way that all values for which the corresponding
+/// items from \p flags are \p true (or can be implicitly converted to \p true) precede
+/// the elements for which the corresponding items from \p flags are \p false.
+///
+/// \par Overview
+/// * Returns the required size of \p temporary_storage in \p storage_size
+/// if \p temporary_storage is a null pointer.
+/// * Ranges specified by \p input, \p flags and \p output must have at least \p size elements.
+/// * Range specified by \p selected_count_output must have at least 1 element.
+/// * Values of \p flag range should be implicitly convertible to `bool` type.
+/// * Relative order is preserved for the elements for which the corresponding values from \p flags
+/// are \p true. Other elements are copied in reverse order.
+///
+/// \tparam Config - [optional] configuration of the primitive. It can be \p select_config or
+/// a custom class with the same members.
+/// \tparam InputIterator - random-access iterator type of the input range. It can be
+/// a simple pointer type.
+/// \tparam FlagIterator - random-access iterator type of the flag range. It can be
+/// a simple pointer type.
+/// \tparam OutputIterator - random-access iterator type of the output range. It can be
+/// a simple pointer type.
+/// \tparam SelectedCountOutputIterator - random-access iterator type of the selected_count_output
+/// value. It can be a simple pointer type.
+///
+/// \param [in] temporary_storage - pointer to a device-accessible temporary storage. When
+/// a null pointer is passed, the required allocation size (in bytes) is written to
+/// \p storage_size and function returns without performing the select operation.
+/// \param [in,out] storage_size - reference to a size (in bytes) of \p temporary_storage.
+/// \param [in] input - iterator to the first element in the range to select values from.
+/// \param [in] flags - iterator to the selection flag corresponding to the first element from \p input range.
+/// \param [out] output - iterator to the first element in the output range.
+/// \param [out] selected_count_output - iterator to the total number of selected values (length of \p output).
+/// \param [in] size - number of elements in the input range.
+/// \param [in] stream - [optional] HIP stream object. The default is \p 0 (default stream).
+/// \param [in] debug_synchronous - [optional] If true, synchronization after every kernel
+/// launch is forced in order to check for errors. The default value is \p false.
+///
+/// \par Example
+/// \parblock
+/// In this example a device-level partition operation is performed on an array of
+/// integer values with array of <tt>char</tt>s used as flags.
+///
+/// \code{.cpp}
+/// #include <rocprim/rocprim.hpp>
+///
+/// // Prepare input and output (declare pointers, allocate device memory etc.)
+/// size_t input_size;     // e.g., 8
+/// int * input;           // e.g., [1, 2, 3, 4, 5, 6, 7, 8]
+/// char * flags;          // e.g., [0, 1, 1, 0, 0, 1, 0, 1]
+/// int * output;          // empty array of 8 elements
+/// size_t * output_count; // empty array of 1 element
+///
+/// size_t temporary_storage_size_bytes;
+/// void * temporary_storage_ptr = nullptr;
+/// // Get required size of the temporary storage
+/// rocprim::partition(
+///     temporary_storage_ptr, temporary_storage_size_bytes,
+///     input, flags,
+///     output, output_count,
+///     input_size
+/// );
+///
+/// // allocate temporary storage
+/// cudaMalloc(&temporary_storage_ptr, temporary_storage_size_bytes);
+///
+/// // perform partition
+/// rocprim::partition(
+///     temporary_storage_ptr, temporary_storage_size_bytes,
+///     input, flags,
+///     output, output_count,
+///     input_size
+/// );
+/// // output: [2, 3, 6, 8, 7, 5, 4, 1]
+/// // output_count: 4
+/// \endcode
+/// \endparblock
+template<class Config = default_config,
+         class InputIterator,
+         class FlagIterator,
+         class OutputIterator,
+         class SelectedCountOutputIterator>
+inline cudaError_t partition(void*                       temporary_storage,
+                             size_t&                     storage_size,
+                             InputIterator               input,
+                             FlagIterator                flags,
+                             OutputIterator              output,
+                             SelectedCountOutputIterator selected_count_output,
+                             const size_t                size,
+                             const cudaStream_t          stream            = 0,
+                             const bool                  debug_synchronous = false)
+{
+    // Key-only operation: both value ranges are represented by a typed null pointer.
+    rocprim::empty_type* const null_values = nullptr;
+    // Offsets are unsigned int. The flag method is given empty_type placeholders for the
+    // inequality operation and for its single unary predicate; OnlySelected is false.
+    return detail::partition_impl<detail::select_method::flag, false, Config, unsigned int>(
+        temporary_storage,
+        storage_size,
+        input,
+        null_values,
+        flags,
+        output,
+        null_values,
+        selected_count_output,
+        size,
+        ::rocprim::empty_type(),
+        stream,
+        debug_synchronous,
+        ::rocprim::empty_type());
+}
+/// \brief Parallel select primitive for device level using selection predicate.
+///
+/// Performs a device-wide partition using selection predicate. Partition copies
+/// the values from \p input to \p output  in such a way that all values for which
+/// the \p predicate returns \p true precede the elements for which it returns \p false.
+///
+/// \par Overview
+/// * Returns the required size of \p temporary_storage in \p storage_size
+/// if \p temporary_storage is a null pointer.
+/// * Ranges specified by \p input and \p output must have at least \p size elements.
+/// * Range specified by \p selected_count_output must have at least 1 element.
+/// * Relative order is preserved for the elements for which the \p predicate returns \p true. Other
+/// elements are copied in reverse order.
+///
+/// \tparam Config - [optional] configuration of the primitive. It can be \p select_config or
+/// a custom class with the same members.
+/// \tparam InputIterator - random-access iterator type of the input range. It can be
+/// a simple pointer type.
+/// \tparam OutputIterator - random-access iterator type of the output range. It can be
+/// a simple pointer type.
+/// \tparam SelectedCountOutputIterator - random-access iterator type of the selected_count_output
+/// value. It can be a simple pointer type.
+/// \tparam UnaryPredicate - type of a unary selection predicate.
+///
+/// \param [in] temporary_storage - pointer to a device-accessible temporary storage. When
+/// a null pointer is passed, the required allocation size (in bytes) is written to
+/// \p storage_size and function returns without performing the select operation.
+/// \param [in,out] storage_size - reference to a size (in bytes) of \p temporary_storage.
+/// \param [in] input - iterator to the first element in the range to select values from.
+/// \param [out] output - iterator to the first element in the output range.
+/// \param [out] selected_count_output - iterator to the total number of selected values (length of \p output).
+/// \param [in] size - number of elements in the input range.
+/// \param [in] predicate - unary function object which returns \p true if the element should be
+/// ordered before other elements.
+/// The signature of the function should be equivalent to the following:
+/// <tt>bool f(const T &a);</tt>. The signature does not need to have
+/// <tt>const &</tt>, but function object must not modify the object passed to it.
+/// \param [in] stream - [optional] HIP stream object. The default is \p 0 (default stream).
+/// \param [in] debug_synchronous - [optional] If true, synchronization after every kernel
+/// launch is forced in order to check for errors. The default value is \p false.
+///
+/// \par Example
+/// \parblock
+/// In this example a device-level partition operation is performed on an array of
+/// integer values, even values are copied before odd values.
+///
+/// \code{.cpp}
+/// #include <rocprim/rocprim.hpp>
+///
+/// auto predicate =
+///     [] __device__ (int a) -> bool
+///     {
+///         return (a%2) == 0;
+///     };
+///
+/// // Prepare input and output (declare pointers, allocate device memory etc.)
+/// size_t input_size;     // e.g., 8
+/// int * input;           // e.g., [1, 2, 3, 4, 5, 6, 7, 8]
+/// int * output;          // empty array of 8 elements
+/// size_t * output_count; // empty array of 1 element
+///
+/// size_t temporary_storage_size_bytes;
+/// void * temporary_storage_ptr = nullptr;
+/// // Get required size of the temporary storage
+/// rocprim::partition(
+///     temporary_storage_ptr, temporary_storage_size_bytes,
+///     input,
+///     output, output_count,
+///     input_size,
+///     predicate
+/// );
+///
+/// // allocate temporary storage
+/// cudaMalloc(&temporary_storage_ptr, temporary_storage_size_bytes);
+///
+/// // perform partition
+/// rocprim::partition(
+///     temporary_storage_ptr, temporary_storage_size_bytes,
+///     input,
+///     output, output_count,
+///     input_size,
+///     predicate
+/// );
+/// // output: [2, 4, 6, 8, 7, 5, 3, 1]
+/// // output_count: 4
+/// \endcode
+/// \endparblock
+template<class Config = default_config,
+         class InputIterator,
+         class OutputIterator,
+         class SelectedCountOutputIterator,
+         class UnaryPredicate>
+inline cudaError_t partition(void*                       temporary_storage,
+                             size_t&                     storage_size,
+                             InputIterator               input,
+                             OutputIterator              output,
+                             SelectedCountOutputIterator selected_count_output,
+                             const size_t                size,
+                             UnaryPredicate              predicate,
+                             const cudaStream_t          stream            = 0,
+                             const bool                  debug_synchronous = false)
+{
+    // The predicate method is given a null empty_type flag range.
+    ::rocprim::empty_type* no_flags = nullptr;
+    // Key-only operation: both value ranges are represented by a typed null pointer.
+    rocprim::empty_type* const null_values = nullptr;
+    // Offsets are unsigned int. The inequality operation is an empty_type placeholder;
+    // OnlySelected is false.
+    return detail::partition_impl<detail::select_method::predicate, false, Config, unsigned int>(
+        temporary_storage,
+        storage_size,
+        input,
+        null_values,
+        no_flags,
+        output,
+        null_values,
+        selected_count_output,
+        size,
+        ::rocprim::empty_type(),
+        stream,
+        debug_synchronous,
+        predicate);
+}
+/// \brief Parallel select primitive for device level using two selection predicates.
+///
+/// Performs a device-wide three-way partition using two selection predicates. Partition copies
+/// the values from \p input to either \p output_first_part or \p output_second_part or
+/// \p output_unselected according to the following criteria:
+/// The value is copied to \p output_first_part if the predicate \p select_first_part_op invoked
+/// with the value returns \p true. It is copied to \p output_second_part if \p select_first_part_op
+/// returns \p false and \p select_second_part_op returns \p true, and it is copied to
+/// \p output_unselected otherwise.
+///
+/// \par Overview
+/// * Returns the required size of \p temporary_storage in \p storage_size
+/// if \p temporary_storage is a null pointer.
+/// * Range specified by \p selected_count_output must have at least 2 elements.
+/// * Relative order is preserved for the elements.
+/// * The number of elements written to \p output_first_part is equal to the number of elements
+/// in the input for which \p select_first_part_op returned \p true.
+/// * The number of elements written to \p output_second_part is equal to the number of elements
+/// in the input for which \p select_first_part_op returned \p false and \p select_second_part_op
+/// returned \p true.
+/// * The number of elements written to \p output_unselected is equal to the number of input elements
+/// minus the number of elements written to \p output_first_part minus the number of elements written
+/// to \p output_second_part.
+///
+/// \tparam Config - [optional] configuration of the primitive. It can be \p select_config or
+/// a custom class with the same members.
+/// \tparam InputIterator - random-access iterator type of the input range. It can be
+/// a simple pointer type.
+/// \tparam FirstOutputIterator - random-access iterator type of the first output range. It can be
+/// a simple pointer type.
+/// \tparam SecondOutputIterator - random-access iterator type of the second output range. It can be
+/// a simple pointer type.
+/// \tparam UnselectedOutputIterator - random-access iterator type of the unselected output range.
+/// It can be a simple pointer type.
+/// \tparam SelectedCountOutputIterator - random-access iterator type of the selected_count_output
+/// value. It can be a simple pointer type.
+/// \tparam FirstUnaryPredicate - type of the first unary selection predicate.
+/// \tparam SecondUnaryPredicate - type of the second unary selection predicate.
+///
+/// \param [in] temporary_storage - pointer to a device-accessible temporary storage. When
+/// a null pointer is passed, the required allocation size (in bytes) is written to
+/// \p storage_size and function returns without performing the select operation.
+/// \param [in,out] storage_size - reference to a size (in bytes) of \p temporary_storage.
+/// \param [in] input - iterator to the first element in the range to select values from.
+/// \param [out] output_first_part - iterator to the first element in the first output range.
+/// \param [out] output_second_part - iterator to the first element in the second output range.
+/// \param [out] output_unselected - iterator to the first element in the unselected output range.
+/// \param [out] selected_count_output - iterator to the total number of selected values in
+/// \p output_first_part and \p output_second_part respectively.
+/// \param [in] size - number of elements in the input range.
+/// \param [in] select_first_part_op - unary function object which returns \p true if the element
+/// should be in \p output_first_part range
+/// The signature of the function should be equivalent to the following:
+/// <tt>bool f(const T &a);</tt>. The signature does not need to have
+/// <tt>const &</tt>, but function object must not modify the object passed to it.
+/// \param [in] select_second_part_op - unary function object which returns \p true if the element
+/// should be in \p output_second_part range (given that \p select_first_part_op returned \p false)
+/// The signature of the function should be equivalent to the following:
+/// <tt>bool f(const T &a);</tt>. The signature does not need to have
+/// <tt>const &</tt>, but function object must not modify the object passed to it.
+/// \param [in] stream - [optional] HIP stream object. The default is \p 0 (default stream).
+/// \param [in] debug_synchronous - [optional] If true, synchronization after every kernel
+/// launch is forced in order to check for errors. The default value is \p false.
+///
+/// \par Example
+/// \parblock
+/// In this example a device-level three-way partition operation is performed on an array of
+/// integer values, even values are copied to the first partition, odd and 3-divisible values
+/// are copied to the second partition, and the rest of the values are copied to the
+/// unselected partition
+///
+/// \code{.cpp}
+/// #include <rocprim/rocprim.hpp>
+///
+/// auto first_predicate =
+///     [] __device__ (int a) -> bool
+///     {
+///         return (a%2) == 0;
+///     };
+/// auto second_predicate =
+///     [] __device__ (int a) -> bool
+///     {
+///         return (a%3) == 0;
+///     };
+///
+/// // Prepare input and output (declare pointers, allocate device memory etc.)
+/// size_t input_size;          // e.g., 8
+/// int * input;                // e.g., [1, 2, 3, 4, 5, 6, 7, 8]
+/// int * output_first_part;    // array of 8 elements
+/// int * output_second_part;   // array of 8 elements
+/// int * output_unselected;    // array of 8 elements
+/// size_t * output_count;      // array of 2 elements
+///
+/// size_t temporary_storage_size_bytes;
+/// void * temporary_storage_ptr = nullptr;
+/// // Get required size of the temporary storage
+/// rocprim::partition_three_way(
+///     temporary_storage_ptr, temporary_storage_size_bytes,
+///     input,
+///     output_first_part, output_second_part, output_unselected,
+///     output_count,
+///     input_size,
+///     first_predicate,
+///     second_predicate
+/// );
+///
+/// // allocate temporary storage
+/// cudaMalloc(&temporary_storage_ptr, temporary_storage_size_bytes);
+///
+/// // perform partition
+/// rocprim::partition_three_way(
+///     temporary_storage_ptr, temporary_storage_size_bytes,
+///     input,
+///     output_first_part, output_second_part, output_unselected,
+///     output_count,
+///     input_size,
+///     first_predicate,
+///     second_predicate
+/// );
+/// // elements denoted by '*' were not modified
+/// // output_first_part:  [2, 4, 6, 8, *, *, *, *]
+/// // output_second_part: [3, *, *, *, *, *, *, *]
+/// // output_unselected:  [1, 5, 7, *, *, *, *, *]
+/// // output_count:       [4, 1]
+/// \endcode
+/// \endparblock
+template <
+    class Config = default_config,
+    typename InputIterator,
+    typename FirstOutputIterator,
+    typename SecondOutputIterator,
+    typename UnselectedOutputIterator,
+    typename SelectedCountOutputIterator,
+    typename FirstUnaryPredicate,
+    typename SecondUnaryPredicate>
+inline
+cudaError_t partition_three_way(void * temporary_storage,
+                               size_t& storage_size,
+                               InputIterator input,
+                               FirstOutputIterator output_first_part,
+                               SecondOutputIterator output_second_part,
+                               UnselectedOutputIterator output_unselected,
+                               SelectedCountOutputIterator selected_count_output,
+                               const size_t size,
+                               FirstUnaryPredicate select_first_part_op,
+                               SecondUnaryPredicate select_second_part_op,
+                               const cudaStream_t stream = 0,
+                               const bool debug_synchronous = false)
+{
+    // Only keys are partitioned here: the flag, inequality-operation and value
+    // arguments of partition_impl are filled with empty_type placeholders.
+    using placeholder_type = ::rocprim::empty_type;
+    using partition_offset_type = uint2;
+    using key_outputs_type = tuple<
+        FirstOutputIterator,
+        SecondOutputIterator,
+        UnselectedOutputIterator>;
+    using value_outputs_type
+        = tuple<placeholder_type*, placeholder_type*, placeholder_type*>;
+    // Placeholder flags pointer (never dereferenced by this function).
+    placeholder_type * unused_flags = nullptr;
+    // Key-only operation: there are no input or output values.
+    placeholder_type * const unused_input_values = nullptr;
+    const value_outputs_type unused_output_values {nullptr, nullptr, nullptr};
+    // The three destinations are handed over as a single tuple.
+    key_outputs_type key_outputs{ output_first_part, output_second_part, output_unselected };
+    return detail::partition_impl<
+            detail::select_method::predicate,
+            false,
+            Config,
+            partition_offset_type
+        >(
+        temporary_storage, storage_size,
+        input, unused_input_values, unused_flags,
+        key_outputs, unused_output_values,
+        selected_count_output, size,
+        placeholder_type(),
+        stream, debug_synchronous,
+        select_first_part_op, select_second_part_op
+    );
+}
+/// @}
+// end of group devicemodule
+END_ROCPRIM_NAMESPACE
+#endif // ROCPRIM_DEVICE_DEVICE_PARTITION_HPP_
--- a/3rdparty/cub/rocprim/device/device_radix_sort.hpp
+++ b/3rdparty/cub/rocprim/device/device_radix_sort.hpp
+// Copyright (c) 2017-2022 Advanced Micro Devices, Inc. All rights reserved.
+//
+// Permission is hereby granted, free of charge, to any person obtaining a copy
+// of this software and associated documentation files (the "Software"), to deal
+// in the Software without restriction, including without limitation the rights
+// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+// copies of the Software, and to permit persons to whom the Software is
+// furnished to do so, subject to the following conditions:
+//
+// The above copyright notice and this permission notice shall be included in
+// all copies or substantial portions of the Software.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+// THE SOFTWARE.
+#ifndef ROCPRIM_DEVICE_DEVICE_RADIX_SORT_HPP_
+#define ROCPRIM_DEVICE_DEVICE_RADIX_SORT_HPP_
+#include <iostream>
+#include <iterator>
+#include <type_traits>
+#include <utility>
+#include "../config.hpp"
+#include "../detail/various.hpp"
+#include "../detail/radix_sort.hpp"
+#include "../intrinsics.hpp"
+#include "../functional.hpp"
+#include "../types.hpp"
+#include "device_radix_sort_config.hpp"
+#include "device_transform.hpp"
+#include "detail/device_radix_sort.hpp"
+#include "specialization/device_radix_single_sort.hpp"
+#include "specialization/device_radix_merge_sort.hpp"
+/// \addtogroup devicemodule
+/// @{
+BEGIN_ROCPRIM_NAMESPACE
+namespace detail
+{
+// Kernel entry point: forwards every argument to fill_digit_counts, instantiated with
+// the same BlockSize, ItemsPerThread, RadixBits and Descending template arguments.
+template<
+    unsigned int BlockSize,
+    unsigned int ItemsPerThread,
+    unsigned int RadixBits,
+    bool Descending,
+    typename KeysInputIterator,
+    typename Offset
+>
+ROCPRIM_KERNEL
+__launch_bounds__(BlockSize)
+void fill_digit_counts_kernel(KeysInputIterator keys_input,
+                              Offset size,
+                              Offset * batch_digit_counts,
+                              unsigned int bit,
+                              unsigned int current_radix_bits,
+                              unsigned int blocks_per_full_batch,
+                              unsigned int full_batches)
+{
+    fill_digit_counts<BlockSize, ItemsPerThread, RadixBits, Descending>(
+        keys_input,
+        size,
+        batch_digit_counts,
+        bit,
+        current_radix_bits,
+        blocks_per_full_batch,
+        full_batches
+    );
+}
+// Kernel entry point: forwards its arguments to scan_batches, instantiated with the
+// same BlockSize, ItemsPerThread and RadixBits template arguments.
+template<
+    unsigned int BlockSize,
+    unsigned int ItemsPerThread,
+    unsigned int RadixBits,
+    typename Offset
+>
+ROCPRIM_KERNEL
+__launch_bounds__(BlockSize)
+void scan_batches_kernel(Offset * batch_digit_counts,
+                         Offset * digit_counts,
+                         unsigned int batches)
+{
+    scan_batches<BlockSize, ItemsPerThread, RadixBits>(
+        batch_digit_counts,
+        digit_counts,
+        batches
+    );
+}
+// Kernel entry point: forwards \p digit_counts to scan_digits<RadixBits>.
+template<
+    unsigned int RadixBits,
+    typename Offset
+>
+ROCPRIM_KERNEL
+__launch_bounds__(ROCPRIM_DEFAULT_MAX_BLOCK_SIZE)
+void scan_digits_kernel(Offset * digit_counts)
+{
+    scan_digits<RadixBits>(
+        digit_counts
+    );
+}
+// Kernel entry point: forwards every argument to sort_and_scatter, instantiated with
+// the same BlockSize, ItemsPerThread, RadixBits and Descending template arguments.
+template<
+    unsigned int BlockSize,
+    unsigned int ItemsPerThread,
+    unsigned int RadixBits,
+    bool Descending,
+    typename KeysInputIterator,
+    typename KeysOutputIterator,
+    typename ValuesInputIterator,
+    typename ValuesOutputIterator,
+    typename Offset
+>
+ROCPRIM_KERNEL
+__launch_bounds__(BlockSize)
+void sort_and_scatter_kernel(KeysInputIterator keys_input,
+                             KeysOutputIterator keys_output,
+                             ValuesInputIterator values_input,
+                             ValuesOutputIterator values_output,
+                             Offset size,
+                             const Offset * batch_digit_starts,
+                             const Offset * digit_starts,
+                             unsigned int bit,
+                             unsigned int current_radix_bits,
+                             unsigned int blocks_per_full_batch,
+                             unsigned int full_batches)
+{
+    sort_and_scatter<BlockSize, ItemsPerThread, RadixBits, Descending>(
+        keys_input,
+        keys_output,
+        values_input,
+        values_output,
+        size,
+        batch_digit_starts,
+        digit_starts,
+        bit,
+        current_radix_bits,
+        blocks_per_full_batch,
+        full_batches
+    );
+}
+#ifndef ROCPRIM_DETAIL_HIP_SYNC_AND_RETURN_ON_ERROR
+// Post-launch check meant to be placed right after a kernel launch.
+//  - Returns the value of cudaGetLastError() from the enclosing function if it is not
+//    cudaSuccess.
+//  - When debug_synchronous is true, prints "name(size)", synchronizes the stream
+//    (returning on failure) and prints the time elapsed since `start` in milliseconds.
+// The expansion relies on `debug_synchronous` and `stream` being visible at the call
+// site, and on the enclosing function returning the error type.
+// It is a plain braced block (not do/while) because call sites omit the semicolon.
+//   name  - label printed in debug mode
+//   size  - value printed in parentheses after the label
+//   start - std::chrono::high_resolution_clock time point taken before the launch
+#define ROCPRIM_DETAIL_HIP_SYNC_AND_RETURN_ON_ERROR(name, size, start) \
+    { \
+        const auto _launch_error = cudaGetLastError(); \
+        if(_launch_error != cudaSuccess) return _launch_error; \
+        if(debug_synchronous) \
+        { \
+            std::cout << (name) << "(" << (size) << ")"; \
+            const auto _sync_error = cudaStreamSynchronize(stream); \
+            if(_sync_error != cudaSuccess) return _sync_error; \
+            const auto _end = std::chrono::high_resolution_clock::now(); \
+            const auto _d = std::chrono::duration_cast<std::chrono::duration<double>>(_end - (start)); \
+            std::cout << " " << _d.count() * 1000 << " ms" << '\n'; \
+        } \
+    }
+#endif
+// Performs one radix pass for the digit that starts at \p bit by launching, in order,
+// the fill_digit_counts, scan_batches, scan_digits and sort_and_scatter kernels on
+// \p stream. \p from_input and \p to_output select which of the input, temporary and
+// output ranges is read and which one is written by this pass.
+// Returns the first error detected after a launch, otherwise cudaSuccess.
+template<
+    class Config,
+    unsigned int RadixBits,
+    bool Descending,
+    class KeysInputIterator,
+    class KeysOutputIterator,
+    class ValuesInputIterator,
+    class ValuesOutputIterator,
+    class Offset
+>
+inline
+cudaError_t radix_sort_iteration(KeysInputIterator keys_input,
+                                typename std::iterator_traits<KeysInputIterator>::value_type * keys_tmp,
+                                KeysOutputIterator keys_output,
+                                ValuesInputIterator values_input,
+                                typename std::iterator_traits<ValuesInputIterator>::value_type * values_tmp,
+                                ValuesOutputIterator values_output,
+                                Offset size,
+                                Offset * batch_digit_counts,
+                                Offset * digit_counts,
+                                bool from_input,
+                                bool to_output,
+                                unsigned int bit,
+                                unsigned int end_bit,
+                                unsigned int blocks_per_full_batch,
+                                unsigned int full_batches,
+                                unsigned int batches,
+                                cudaStream_t stream,
+                                bool debug_synchronous)
+{
+    using sort_config = typename Config::sort;
+    using scan_config = typename Config::scan;
+    constexpr unsigned int radix_size = 1 << RadixBits;
+    // The final pass uses a shorter mask when (end_bit - bit) is not divisible
+    // by RadixBits.
+    const unsigned int current_radix_bits = ::rocprim::min(RadixBits, end_bit - bit);
+    // Launch configuration shared by the counting and the scattering kernels.
+    const dim3 sort_grid(batches);
+    const dim3 sort_block(sort_config::block_size);
+    // The scattering kernel takes the scanned counters through pointers to const.
+    const Offset * const batch_digit_starts = batch_digit_counts;
+    const Offset * const digit_starts = digit_counts;
+    std::chrono::high_resolution_clock::time_point stage_start;
+    if(debug_synchronous)
+    {
+        std::cout << "RadixBits " << RadixBits << '\n';
+        std::cout << "bit " << bit << '\n';
+        std::cout << "current_radix_bits " << current_radix_bits << '\n';
+        stage_start = std::chrono::high_resolution_clock::now();
+    }
+    // Stage 1: count digits. The keys are read from the range that holds the
+    // current data: input on the first pass, otherwise the range opposite to the
+    // one this pass writes to.
+    if(from_input)
+    {
+        fill_digit_counts_kernel<
+                sort_config::block_size, sort_config::items_per_thread, RadixBits, Descending
+            >
+            <<<sort_grid, sort_block, 0, stream>>>(
+            keys_input, size, batch_digit_counts,
+            bit, current_radix_bits, blocks_per_full_batch, full_batches
+        );
+    }
+    else if(to_output)
+    {
+        fill_digit_counts_kernel<
+                sort_config::block_size, sort_config::items_per_thread, RadixBits, Descending
+            >
+            <<<sort_grid, sort_block, 0, stream>>>(
+            keys_tmp, size, batch_digit_counts,
+            bit, current_radix_bits, blocks_per_full_batch, full_batches
+        );
+    }
+    else
+    {
+        fill_digit_counts_kernel<
+                sort_config::block_size, sort_config::items_per_thread, RadixBits, Descending
+            >
+            <<<sort_grid, sort_block, 0, stream>>>(
+            keys_output, size, batch_digit_counts,
+            bit, current_radix_bits, blocks_per_full_batch, full_batches
+        );
+    }
+    ROCPRIM_DETAIL_HIP_SYNC_AND_RETURN_ON_ERROR("fill_digit_counts", size, stage_start)
+    // Stage 2: scan the per-batch counters, one block per digit value.
+    if(debug_synchronous)
+    {
+        stage_start = std::chrono::high_resolution_clock::now();
+    }
+    scan_batches_kernel<scan_config::block_size, scan_config::items_per_thread, RadixBits>
+        <<<dim3(radix_size), dim3(scan_config::block_size), 0, stream>>>(
+        batch_digit_counts, digit_counts, batches
+    );
+    ROCPRIM_DETAIL_HIP_SYNC_AND_RETURN_ON_ERROR("scan_batches", radix_size * scan_config::block_size, stage_start)
+    // Stage 3: scan the digit counters with a single block of radix_size threads.
+    if(debug_synchronous)
+    {
+        stage_start = std::chrono::high_resolution_clock::now();
+    }
+    scan_digits_kernel<RadixBits>
+        <<<dim3(1), dim3(radix_size), 0, stream>>>(
+        digit_counts
+    );
+    ROCPRIM_DETAIL_HIP_SYNC_AND_RETURN_ON_ERROR("scan_digits", radix_size, stage_start)
+    // Stage 4: sort and scatter from the current source range to the target range.
+    if(debug_synchronous)
+    {
+        stage_start = std::chrono::high_resolution_clock::now();
+    }
+    if(from_input && to_output)
+    {
+        // input -> output
+        sort_and_scatter_kernel<
+                sort_config::block_size, sort_config::items_per_thread, RadixBits, Descending
+            >
+            <<<sort_grid, sort_block, 0, stream>>>(
+            keys_input, keys_output, values_input, values_output, size,
+            batch_digit_starts, digit_starts,
+            bit, current_radix_bits, blocks_per_full_batch, full_batches
+        );
+    }
+    else if(from_input)
+    {
+        // input -> temporary
+        sort_and_scatter_kernel<
+                sort_config::block_size, sort_config::items_per_thread, RadixBits, Descending
+            >
+            <<<sort_grid, sort_block, 0, stream>>>(
+            keys_input, keys_tmp, values_input, values_tmp, size,
+            batch_digit_starts, digit_starts,
+            bit, current_radix_bits, blocks_per_full_batch, full_batches
+        );
+    }
+    else if(to_output)
+    {
+        // temporary -> output
+        sort_and_scatter_kernel<
+                sort_config::block_size, sort_config::items_per_thread, RadixBits, Descending
+            >
+            <<<sort_grid, sort_block, 0, stream>>>(
+            keys_tmp, keys_output, values_tmp, values_output, size,
+            batch_digit_starts, digit_starts,
+            bit, current_radix_bits, blocks_per_full_batch, full_batches
+        );
+    }
+    else
+    {
+        // output -> temporary
+        sort_and_scatter_kernel<
+                sort_config::block_size, sort_config::items_per_thread, RadixBits, Descending
+            >
+            <<<sort_grid, sort_block, 0, stream>>>(
+            keys_output, keys_tmp, values_output, values_tmp, size,
+            batch_digit_starts, digit_starts,
+            bit, current_radix_bits, blocks_per_full_batch, full_batches
+        );
+    }
+    ROCPRIM_DETAIL_HIP_SYNC_AND_RETURN_ON_ERROR("sort_and_scatter", size, stage_start)
+    return cudaSuccess;
+}
+// Sorts \p size items with radix_sort_single.
+// When \p temporary_storage is null only \p storage_size is written and nothing is
+// sorted. An empty input returns immediately. After a successful sort
+// \p is_result_in_output is set to true. Returns cudaSuccess or the first error.
+template<
+    class Config,
+    bool Descending,
+    class KeysInputIterator,
+    class KeysOutputIterator,
+    class ValuesInputIterator,
+    class ValuesOutputIterator
+>
+inline
+cudaError_t radix_sort_single_impl(void * temporary_storage,
+                                 size_t& storage_size,
+                                 KeysInputIterator keys_input,
+                                 KeysOutputIterator keys_output,
+                                 ValuesInputIterator values_input,
+                                 ValuesOutputIterator values_output,
+                                 unsigned int size,
+                                 bool& is_result_in_output,
+                                 unsigned int begin_bit,
+                                 unsigned int end_bit,
+                                 cudaStream_t stream,
+                                 bool debug_synchronous)
+{
+    using key_type = typename std::iterator_traits<KeysInputIterator>::value_type;
+    using value_type = typename std::iterator_traits<ValuesInputIterator>::value_type;
+    using config = default_or_custom_config<
+        Config,
+        default_radix_sort_config<ROCPRIM_TARGET_ARCH, key_type, value_type>
+    >;
+    // Size query: only report the required storage size.
+    if(temporary_storage == nullptr)
+    {
+        const size_t minimum_bytes = ::rocprim::detail::align_size(1);
+        storage_size = minimum_bytes;
+        return cudaSuccess;
+    }
+    // Nothing to sort.
+    if(size == 0u)
+    {
+        return cudaSuccess;
+    }
+    if(debug_synchronous)
+    {
+        std::cout << "temporary_storage " << temporary_storage << '\n';
+        const cudaError_t sync_status = cudaStreamSynchronize(stream);
+        if(sync_status != cudaSuccess)
+        {
+            return sync_status;
+        }
+    }
+    const cudaError_t sort_status = radix_sort_single<config, Descending>(
+        keys_input, keys_output, values_input, values_output,
+        size,
+        begin_bit, end_bit,
+        stream, debug_synchronous
+    );
+    // The output flag is only updated when the sort succeeded.
+    if(sort_status == cudaSuccess)
+    {
+        is_result_in_output = true;
+    }
+    return sort_status;
+}
+// Sorts \p size items with radix_sort_merge.
+// When \p temporary_storage is null only \p storage_size is written. If the caller
+// passes no temporary ranges (\p keys_tmp is null) they are placed inside
+// \p temporary_storage: keys first, then values. After a successful sort
+// \p is_result_in_output is set to true. Returns cudaSuccess or the first error.
+template<
+    class Config,
+    bool Descending,
+    class KeysInputIterator,
+    class KeysOutputIterator,
+    class ValuesInputIterator,
+    class ValuesOutputIterator
+>
+inline
+cudaError_t radix_sort_merge_impl(void * temporary_storage,
+                                 size_t& storage_size,
+                                 KeysInputIterator keys_input,
+                                 typename std::iterator_traits<KeysInputIterator>::value_type * keys_tmp,
+                                 KeysOutputIterator keys_output,
+                                 ValuesInputIterator values_input,
+                                 typename std::iterator_traits<ValuesInputIterator>::value_type * values_tmp,
+                                 ValuesOutputIterator values_output,
+                                 unsigned int size,
+                                 bool& is_result_in_output,
+                                 unsigned int begin_bit,
+                                 unsigned int end_bit,
+                                 cudaStream_t stream,
+                                 bool debug_synchronous)
+{
+    using key_type = typename std::iterator_traits<KeysInputIterator>::value_type;
+    using value_type = typename std::iterator_traits<ValuesInputIterator>::value_type;
+    using config = default_or_custom_config<
+        Config,
+        default_radix_sort_config<ROCPRIM_TARGET_ARCH, key_type, value_type>
+    >;
+    constexpr bool with_values = !std::is_same<value_type, ::rocprim::empty_type>::value;
+    // A non-null keys_tmp means the caller provides the temporary ranges.
+    const bool caller_provides_tmp = (keys_tmp != nullptr);
+    const size_t keys_bytes = ::rocprim::detail::align_size(size * sizeof(key_type));
+    const size_t values_bytes = with_values ? ::rocprim::detail::align_size(size * sizeof(value_type)) : 0;
+    const size_t minimum_bytes = ::rocprim::detail::align_size(1);
+    // Size query: only report the required storage size.
+    if(temporary_storage == nullptr)
+    {
+        if(caller_provides_tmp)
+        {
+            storage_size = minimum_bytes;
+        }
+        else
+        {
+            storage_size = keys_bytes + values_bytes;
+        }
+        return cudaSuccess;
+    }
+    if(debug_synchronous)
+    {
+        std::cout << "temporary_storage " << temporary_storage << '\n';
+        const cudaError_t sync_status = cudaStreamSynchronize(stream);
+        if(sync_status != cudaSuccess)
+        {
+            return sync_status;
+        }
+    }
+    if(!caller_provides_tmp)
+    {
+        // Layout of temporary_storage: [keys_bytes of keys][values, if any].
+        char * const storage_begin = reinterpret_cast<char *>(temporary_storage);
+        keys_tmp = reinterpret_cast<key_type *>(storage_begin);
+        values_tmp = with_values ? reinterpret_cast<value_type *>(storage_begin + keys_bytes) : nullptr;
+    }
+    const cudaError_t sort_status = radix_sort_merge<config, Descending>(
+        keys_input, keys_tmp, keys_output,
+        values_input, values_tmp, values_output,
+        size,
+        begin_bit, end_bit,
+        stream, debug_synchronous
+    );
+    // The output flag is only updated when the sort succeeded.
+    if(sort_status == cudaSuccess)
+    {
+        is_result_in_output = true;
+    }
+    return sort_status;
+}
+// Offset type selected for a problem-size type Size: size_t when Size is wider than
+// 4 bytes, unsigned int otherwise.
+template<typename Size>
+using offset_type_t = std::conditional_t<(sizeof(Size) > 4), size_t, unsigned int>;
+// Multi-pass radix sort: runs `long_iterations` passes with config::long_radix_bits
+// followed by `short_iterations` passes with config::short_radix_bits, each through
+// radix_sort_iteration.
+//
+// temporary_storage   - null to query the required size (written to \p storage_size,
+//                       nothing is sorted); otherwise the working storage.
+// storage_size        - receives the required size in the query case.
+// keys_input/output   - key ranges; values_input/output - value ranges.
+// keys_tmp/values_tmp - caller-provided temporary ranges; when \p keys_tmp is null the
+//                       temporaries are placed inside \p temporary_storage.
+// size                - number of items; an empty input returns immediately.
+// is_result_in_output - after every pass, set to whether that pass wrote to the
+//                       output ranges; left untouched if no pass runs.
+// begin_bit/end_bit   - bit range of the key that is sorted on.
+// stream              - stream all work is submitted to.
+// debug_synchronous   - print diagnostics and synchronize the stream.
+// Returns cudaSuccess or the first error encountered.
+template<
+    class Config,
+    bool Descending,
+    class KeysInputIterator,
+    class KeysOutputIterator,
+    class ValuesInputIterator,
+    class ValuesOutputIterator,
+    class Size
+>
+inline
+cudaError_t radix_sort_iterations_impl(void * temporary_storage,
+                                      size_t& storage_size,
+                                      KeysInputIterator keys_input,
+                                      typename std::iterator_traits<KeysInputIterator>::value_type * keys_tmp,
+                                      KeysOutputIterator keys_output,
+                                      ValuesInputIterator values_input,
+                                      typename std::iterator_traits<ValuesInputIterator>::value_type * values_tmp,
+                                      ValuesOutputIterator values_output,
+                                      Size size,
+                                      bool& is_result_in_output,
+                                      unsigned int begin_bit,
+                                      unsigned int end_bit,
+                                      cudaStream_t stream,
+                                      bool debug_synchronous)
+{
+    using key_type = typename std::iterator_traits<KeysInputIterator>::value_type;
+    using value_type = typename std::iterator_traits<ValuesInputIterator>::value_type;
+    using offset_type = offset_type_t<Size>;
+    using config = default_or_custom_config<
+        Config,
+        default_radix_sort_config<ROCPRIM_TARGET_ARCH, key_type, value_type>
+    >;
+    constexpr bool with_values = !std::is_same<value_type, ::rocprim::empty_type>::value;
+    constexpr unsigned int max_radix_size = 1 << config::long_radix_bits;
+    constexpr unsigned int scan_size = config::scan::block_size * config::scan::items_per_thread;
+    constexpr unsigned int sort_size = config::sort::block_size * config::sort::items_per_thread;
+    // Split the sort blocks into at most scan_size batches.
+    const unsigned int blocks = static_cast<unsigned int>(::rocprim::detail::ceiling_div(size, sort_size));
+    const unsigned int blocks_per_full_batch = ::rocprim::detail::ceiling_div(blocks, scan_size);
+    const unsigned int full_batches = blocks % scan_size != 0
+        ? blocks % scan_size
+        : scan_size;
+    const unsigned int batches = (blocks_per_full_batch == 1 ? full_batches : scan_size);
+    // A non-null keys_tmp means the caller provides the temporary ranges.
+    const bool with_double_buffer = keys_tmp != nullptr;
+    // Number of passes, and how many of them use the short radix instead of the long one.
+    const unsigned int bits = end_bit - begin_bit;
+    const unsigned int iterations = ::rocprim::detail::ceiling_div(bits, config::long_radix_bits);
+    const unsigned int radix_bits_diff = config::long_radix_bits - config::short_radix_bits;
+    const unsigned int short_iterations = radix_bits_diff != 0
+        ? ::rocprim::min(iterations, (config::long_radix_bits * iterations - bits) / std::max(1u, radix_bits_diff))
+        : 0;
+    const unsigned int long_iterations = iterations - short_iterations;
+    const size_t batch_digit_counts_bytes =
+        ::rocprim::detail::align_size(batches * max_radix_size * sizeof(offset_type));
+    const size_t digit_counts_bytes = ::rocprim::detail::align_size(max_radix_size * sizeof(offset_type));
+    const size_t keys_bytes = ::rocprim::detail::align_size(size * sizeof(key_type));
+    const size_t values_bytes = with_values ? ::rocprim::detail::align_size(size * sizeof(value_type)) : 0;
+    // Size query: counters always live in temporary_storage; the temporary key and
+    // value ranges only when the caller did not provide them.
+    if(temporary_storage == nullptr)
+    {
+        storage_size = batch_digit_counts_bytes + digit_counts_bytes;
+        if(!with_double_buffer)
+        {
+            storage_size += keys_bytes + values_bytes;
+        }
+        return cudaSuccess;
+    }
+    if( size == 0u )
+        return cudaSuccess;
+    if(debug_synchronous)
+    {
+        std::cout << "scan_size " << scan_size << '\n';
+        std::cout << "sort_size " << sort_size << '\n';
+        std::cout << "blocks " << blocks << '\n';
+        std::cout << "blocks_per_full_batch " << blocks_per_full_batch << '\n';
+        std::cout << "full_batches " << full_batches << '\n';
+        std::cout << "batches " << batches << '\n';
+        std::cout << "iterations " << iterations << '\n';
+        std::cout << "long_iterations " << long_iterations << '\n';
+        std::cout << "short_iterations " << short_iterations << '\n';
+        cudaError_t error = cudaStreamSynchronize(stream);
+        if(error != cudaSuccess) return error;
+    }
+    // Layout of temporary_storage:
+    // [batch_digit_counts][digit_counts][keys_tmp][values_tmp] (last two only
+    // without caller-provided temporaries).
+    char * ptr = reinterpret_cast<char *>(temporary_storage);
+    offset_type * batch_digit_counts = reinterpret_cast<offset_type *>(ptr);
+    ptr += batch_digit_counts_bytes;
+    offset_type * digit_counts = reinterpret_cast<offset_type *>(ptr);
+    ptr += digit_counts_bytes;
+    if(!with_double_buffer)
+    {
+        keys_tmp = reinterpret_cast<key_type *>(ptr);
+        ptr += keys_bytes;
+        values_tmp = with_values ? reinterpret_cast<value_type *>(ptr) : nullptr;
+    }
+    // Passes alternate between the output and the temporary ranges. Without
+    // caller-provided temporaries the first target is chosen from the parity of the
+    // pass count so that the last pass writes to the output ranges.
+    bool to_output = with_double_buffer || (iterations - 1) % 2 == 0;
+    bool from_input = true;
+    if(!with_double_buffer && to_output)
+    {
+        // Copy input keys and values if necessary (in-place sorting: input and output iterators are equal)
+        const bool keys_equal = ::rocprim::detail::are_iterators_equal(keys_input, keys_output);
+        const bool values_equal = with_values && ::rocprim::detail::are_iterators_equal(values_input, values_output);
+        if(keys_equal || values_equal)
+        {
+            cudaError_t error = ::rocprim::transform(
+                keys_input, keys_tmp, size,
+                ::rocprim::identity<key_type>(), stream, debug_synchronous
+            );
+            if(error != cudaSuccess) return error;
+            if(with_values)
+            {
+                // Reuse the status variable instead of declaring a second `error`
+                // that would shadow the one above.
+                error = ::rocprim::transform(
+                    values_input, values_tmp, size,
+                    ::rocprim::identity<value_type>(), stream, debug_synchronous
+                );
+                if(error != cudaSuccess) return error;
+            }
+            // The first pass now reads from the temporary copy.
+            from_input = false;
+        }
+    }
+    unsigned int bit = begin_bit;
+    // Passes with the long radix.
+    for(unsigned int i = 0; i < long_iterations; i++)
+    {
+        cudaError_t error = radix_sort_iteration<config, config::long_radix_bits, Descending>(
+            keys_input, keys_tmp, keys_output, values_input, values_tmp, values_output,
+            static_cast<offset_type>(size), batch_digit_counts, digit_counts,
+            from_input, to_output,
+            bit, end_bit,
+            blocks_per_full_batch, full_batches, batches,
+            stream, debug_synchronous
+        );
+        if(error != cudaSuccess) return error;
+        is_result_in_output = to_output;
+        from_input = false;
+        to_output = !to_output;
+        bit += config::long_radix_bits;
+    }
+    // Remaining passes with the short radix.
+    for(unsigned int i = 0; i < short_iterations; i++)
+    {
+        cudaError_t error = radix_sort_iteration<config, config::short_radix_bits, Descending>(
+            keys_input, keys_tmp, keys_output, values_input, values_tmp, values_output,
+            static_cast<offset_type>(size), batch_digit_counts, digit_counts,
+            from_input, to_output,
+            bit, end_bit,
+            blocks_per_full_batch, full_batches, batches,
+            stream, debug_synchronous
+        );
+        if(error != cudaSuccess) return error;
+        is_result_in_output = to_output;
+        from_input = false;
+        to_output = !to_output;
+        bit += config::short_radix_bits;
+    }
+    return cudaSuccess;
+}
+// Chooses the sort implementation from \p size:
+//  - up to single_sort_limit items: radix_sort_single_impl,
+//  - up to merge_sort_limit items:  radix_sort_merge_impl,
+//  - otherwise:                     radix_sort_iterations_impl.
+// All other arguments are passed through unchanged (the single-sort path takes no
+// temporary key/value ranges). Input and output iterators must share value types.
+template<
+    class Config,
+    bool Descending,
+    class KeysInputIterator,
+    class KeysOutputIterator,
+    class ValuesInputIterator,
+    class ValuesOutputIterator,
+    class Size
+>
+inline
+cudaError_t radix_sort_impl(void * temporary_storage,
+                           size_t& storage_size,
+                           KeysInputIterator keys_input,
+                           typename std::iterator_traits<KeysInputIterator>::value_type * keys_tmp,
+                           KeysOutputIterator keys_output,
+                           ValuesInputIterator values_input,
+                           typename std::iterator_traits<ValuesInputIterator>::value_type * values_tmp,
+                           ValuesOutputIterator values_output,
+                           Size size,
+                           bool& is_result_in_output,
+                           unsigned int begin_bit,
+                           unsigned int end_bit,
+                           cudaStream_t stream,
+                           bool debug_synchronous)
+{
+    using key_type = typename std::iterator_traits<KeysInputIterator>::value_type;
+    using value_type = typename std::iterator_traits<ValuesInputIterator>::value_type;
+    using output_key_type = typename std::iterator_traits<KeysOutputIterator>::value_type;
+    using output_value_type = typename std::iterator_traits<ValuesOutputIterator>::value_type;
+    static_assert(
+        std::is_same<key_type, output_key_type>::value,
+        "KeysInputIterator and KeysOutputIterator must have the same value_type"
+    );
+    static_assert(
+        std::is_same<value_type, output_value_type>::value,
+        "ValuesInputIterator and ValuesOutputIterator must have the same value_type"
+    );
+    using config = default_or_custom_config<
+        Config,
+        default_radix_sort_config<ROCPRIM_TARGET_ARCH, key_type, value_type>
+    >;
+    // Largest sizes handled by the single-sort and merge-sort paths.
+    constexpr unsigned int single_sort_limit = config::sort_single::block_size * config::sort_single::items_per_thread;
+    constexpr unsigned int merge_sort_limit = config::sort_merge::block_size * config::sort_merge::items_per_thread * config::merge_size_limit_blocks;
+    if( size <= single_sort_limit )
+    {
+        return radix_sort_single_impl<Config, Descending>(
+            temporary_storage, storage_size,
+            keys_input, keys_output,
+            values_input, values_output,
+            static_cast<unsigned int>(size),
+            is_result_in_output,
+            begin_bit, end_bit,
+            stream, debug_synchronous
+        );
+    }
+    if( size <= merge_sort_limit )
+    {
+        return radix_sort_merge_impl<Config, Descending>(
+            temporary_storage, storage_size,
+            keys_input, keys_tmp, keys_output,
+            values_input, values_tmp, values_output,
+            static_cast<unsigned int>(size),
+            is_result_in_output,
+            begin_bit, end_bit,
+            stream, debug_synchronous
+        );
+    }
+    // Everything larger goes through the multi-pass implementation.
+    return radix_sort_iterations_impl<Config, Descending>(
+        temporary_storage, storage_size,
+        keys_input, keys_tmp, keys_output,
+        values_input, values_tmp, values_output,
+        size,
+        is_result_in_output,
+        begin_bit, end_bit,
+        stream, debug_synchronous
+    );
+}
+#undef ROCPRIM_DETAIL_HIP_SYNC_AND_RETURN_ON_ERROR
+} // end namespace detail
+/// \brief Parallel ascending radix sort primitive for device level.
+///
+/// \p radix_sort_keys function performs a device-wide radix sort
+/// of keys. Function sorts input keys in ascending order.
+///
+/// \par Overview
+/// * The contents of the inputs are not altered by the sorting function.
+/// * Returns the required size of \p temporary_storage in \p storage_size
+/// if \p temporary_storage is a null pointer.
+/// * \p Key type (a \p value_type of \p KeysInputIterator and \p KeysOutputIterator) must be
+/// an arithmetic type (that is, an integral type or a floating-point type).
+/// * Ranges specified by \p keys_input and \p keys_output must have at least \p size elements.
+/// * If \p Key is an integer type and the range of keys is known in advance, the performance
+/// can be improved by setting \p begin_bit and \p end_bit, for example if all keys are in range
+/// [100, 10000], <tt>begin_bit = 0</tt> and <tt>end_bit = 14</tt> will cover the whole range.
+///
+/// \tparam Config - [optional] configuration of the primitive. It can be \p radix_sort_config or
+/// a custom class with the same members.
+/// \tparam KeysInputIterator - random-access iterator type of the input range. Must meet the
+/// requirements of a C++ InputIterator concept. It can be a simple pointer type.
+/// \tparam KeysOutputIterator - random-access iterator type of the output range. Must meet the
+/// requirements of a C++ OutputIterator concept. It can be a simple pointer type.
+/// \tparam Size - integral type that represents the problem size.
+///
+/// \param [in] temporary_storage - pointer to a device-accessible temporary storage. When
+/// a null pointer is passed, the required allocation size (in bytes) is written to
+/// \p storage_size and function returns without performing the sort operation.
+/// \param [in,out] storage_size - reference to a size (in bytes) of \p temporary_storage.
+/// \param [in] keys_input - pointer to the first element in the range to sort.
+/// \param [out] keys_output - pointer to the first element in the output range.
+/// \param [in] size - number of elements in the input range.
+/// \param [in] begin_bit - [optional] index of the first (least significant) bit used in
+/// key comparison. Must be in range <tt>[0; 8 * sizeof(Key))</tt>. Default value: \p 0.
+/// Non-default value not supported for floating-point key-types.
+/// \param [in] end_bit - [optional] past-the-end index (most significant) bit used in
+/// key comparison. Must be in range <tt>(begin_bit; 8 * sizeof(Key)]</tt>. Default
+/// value: \p <tt>8 * sizeof(Key)</tt>. Non-default value not supported for floating-point key-types.
+/// \param [in] stream - [optional] HIP stream object. Default is \p 0 (default stream).
+/// \param [in] debug_synchronous - [optional] If true, synchronization after every kernel
+/// launch is forced in order to check for errors. Default value is \p false.
+///
+/// \returns \p cudaSuccess (\p 0) after successful sort; otherwise a HIP runtime error of
+/// type \p cudaError_t.
+///
+/// \par Example
+/// \parblock
+/// In this example a device-level ascending radix sort is performed on an array of
+/// \p float values.
+///
+/// \code{.cpp}
+/// #include <rocprim/rocprim.hpp>
+///
+/// // Prepare input and output (declare pointers, allocate device memory etc.)
+/// size_t input_size;      // e.g., 8
+/// float * input;          // e.g., [0.6, 0.3, 0.65, 0.4, 0.2, 0.08, 1, 0.7]
+/// float * output;         // empty array of 8 elements
+///
+/// size_t temporary_storage_size_bytes;
+/// void * temporary_storage_ptr = nullptr;
+/// // Get required size of the temporary storage
+/// rocprim::radix_sort_keys(
+///     temporary_storage_ptr, temporary_storage_size_bytes,
+///     input, output, input_size
+/// );
+///
+/// // allocate temporary storage
+/// cudaMalloc(&temporary_storage_ptr, temporary_storage_size_bytes);
+///
+/// // perform sort
+/// rocprim::radix_sort_keys(
+///     temporary_storage_ptr, temporary_storage_size_bytes,
+///     input, output, input_size
+/// );
+/// // output: [0.08, 0.2, 0.3, 0.4, 0.6, 0.65, 0.7, 1]
+/// \endcode
+/// \endparblock
+template<class Config = default_config,
+         class KeysInputIterator,
+         class KeysOutputIterator,
+         class Size,
+         class Key = typename std::iterator_traits<KeysInputIterator>::value_type>
+inline cudaError_t radix_sort_keys(
+    void * temporary_storage,
+    size_t& storage_size,
+    KeysInputIterator keys_input,
+    KeysOutputIterator keys_output,
+    Size size,
+    unsigned int begin_bit = 0,
+    unsigned int end_bit = 8 * sizeof(Key),
+    cudaStream_t stream = 0,
+    bool debug_synchronous = false)
+{
+    static_assert(std::is_integral<Size>::value, "Size must be an integral type.");
+
+    // Keys-only sort: one null empty_type pointer stands in for both value ranges.
+    empty_type * no_values = nullptr;
+    // Out-flag required by radix_sort_impl; this overload never reads it back.
+    bool result_location_unused;
+
+    // Ascending order (Descending = false); no temporary key/value buffers are supplied.
+    const cudaError_t status = detail::radix_sort_impl<Config, false>(
+        temporary_storage,
+        storage_size,
+        keys_input,
+        nullptr,
+        keys_output,
+        no_values,
+        nullptr,
+        no_values,
+        size,
+        result_location_unused,
+        begin_bit,
+        end_bit,
+        stream,
+        debug_synchronous
+    );
+    return status;
+}
+/// \brief Parallel descending radix sort primitive for device level.
+///
+/// \p radix_sort_keys_desc function performs a device-wide radix sort
+/// of keys. Function sorts input keys in descending order.
+///
+/// \par Overview
+/// * The contents of the inputs are not altered by the sorting function.
+/// * Returns the required size of \p temporary_storage in \p storage_size
+/// if \p temporary_storage is a null pointer.
+/// * \p Key type (a \p value_type of \p KeysInputIterator and \p KeysOutputIterator) must be
+/// an arithmetic type (that is, an integral type or a floating-point type).
+/// * Ranges specified by \p keys_input and \p keys_output must have at least \p size elements.
+/// * If \p Key is an integer type and the range of keys is known in advance, the performance
+/// can be improved by setting \p begin_bit and \p end_bit, for example if all keys are in range
+/// [100, 10000], <tt>begin_bit = 0</tt> and <tt>end_bit = 14</tt> will cover the whole range.
+///
+/// \tparam Config - [optional] configuration of the primitive. It can be \p radix_sort_config or
+/// a custom class with the same members.
+/// \tparam KeysInputIterator - random-access iterator type of the input range. Must meet the
+/// requirements of a C++ InputIterator concept. It can be a simple pointer type.
+/// \tparam KeysOutputIterator - random-access iterator type of the output range. Must meet the
+/// requirements of a C++ OutputIterator concept. It can be a simple pointer type.
+/// \tparam Size - integral type that represents the problem size.
+///
+/// \param [in] temporary_storage - pointer to a device-accessible temporary storage. When
+/// a null pointer is passed, the required allocation size (in bytes) is written to
+/// \p storage_size and function returns without performing the sort operation.
+/// \param [in,out] storage_size - reference to a size (in bytes) of \p temporary_storage.
+/// \param [in] keys_input - pointer to the first element in the range to sort.
+/// \param [out] keys_output - pointer to the first element in the output range.
+/// \param [in] size - number of elements in the input range.
+/// \param [in] begin_bit - [optional] index of the first (least significant) bit used in
+/// key comparison. Must be in range <tt>[0; 8 * sizeof(Key))</tt>. Default value: \p 0.
+/// Non-default value not supported for floating-point key-types.
+/// \param [in] end_bit - [optional] past-the-end index (most significant) bit used in
+/// key comparison. Must be in range <tt>(begin_bit; 8 * sizeof(Key)]</tt>. Default
+/// value: \p <tt>8 * sizeof(Key)</tt>. Non-default value not supported for floating-point key-types.
+/// \param [in] stream - [optional] HIP stream object. Default is \p 0 (default stream).
+/// \param [in] debug_synchronous - [optional] If true, synchronization after every kernel
+/// launch is forced in order to check for errors. Default value is \p false.
+///
+/// \returns \p cudaSuccess (\p 0) after successful sort; otherwise a HIP runtime error of
+/// type \p cudaError_t.
+///
+/// \par Example
+/// \parblock
+/// In this example a device-level descending radix sort is performed on an array of
+/// integer values.
+///
+/// \code{.cpp}
+/// #include <rocprim/rocprim.hpp>
+///
+/// // Prepare input and output (declare pointers, allocate device memory etc.)
+/// size_t input_size;    // e.g., 8
+/// int * input;          // e.g., [6, 3, 5, 4, 2, 8, 1, 7]
+/// int * output;         // empty array of 8 elements
+///
+/// size_t temporary_storage_size_bytes;
+/// void * temporary_storage_ptr = nullptr;
+/// // Get required size of the temporary storage
+/// rocprim::radix_sort_keys_desc(
+///     temporary_storage_ptr, temporary_storage_size_bytes,
+///     input, output, input_size
+/// );
+///
+/// // allocate temporary storage
+/// cudaMalloc(&temporary_storage_ptr, temporary_storage_size_bytes);
+///
+/// // perform sort
+/// rocprim::radix_sort_keys_desc(
+///     temporary_storage_ptr, temporary_storage_size_bytes,
+///     input, output, input_size
+/// );
+/// // output: [8, 7, 6, 5, 4, 3, 2, 1]
+/// \endcode
+/// \endparblock
+template<class Config = default_config,
+         class KeysInputIterator,
+         class KeysOutputIterator,
+         class Size,
+         class Key = typename std::iterator_traits<KeysInputIterator>::value_type>
+inline cudaError_t radix_sort_keys_desc(
+    void * temporary_storage,
+    size_t& storage_size,
+    KeysInputIterator keys_input,
+    KeysOutputIterator keys_output,
+    Size size,
+    unsigned int begin_bit = 0,
+    unsigned int end_bit = 8 * sizeof(Key),
+    cudaStream_t stream = 0,
+    bool debug_synchronous = false)
+{
+    static_assert(std::is_integral<Size>::value, "Size must be an integral type.");
+
+    // Keys-only sort: one null empty_type pointer stands in for both value ranges.
+    empty_type * no_values = nullptr;
+    // Out-flag required by radix_sort_impl; this overload never reads it back.
+    bool result_location_unused;
+
+    // Descending order (Descending = true); no temporary key/value buffers are supplied.
+    const cudaError_t status = detail::radix_sort_impl<Config, true>(
+        temporary_storage,
+        storage_size,
+        keys_input,
+        nullptr,
+        keys_output,
+        no_values,
+        nullptr,
+        no_values,
+        size,
+        result_location_unused,
+        begin_bit,
+        end_bit,
+        stream,
+        debug_synchronous
+    );
+    return status;
+}
+/// \brief Parallel ascending radix sort-by-key primitive for device level.
+///
+/// \p radix_sort_pairs function performs a device-wide radix sort
+/// of (key, value) pairs. Function sorts input pairs in ascending order of keys.
+///
+/// \par Overview
+/// * The contents of the inputs are not altered by the sorting function.
+/// * Returns the required size of \p temporary_storage in \p storage_size
+/// if \p temporary_storage is a null pointer.
+/// * \p Key type (a \p value_type of \p KeysInputIterator and \p KeysOutputIterator) must be
+/// an arithmetic type (that is, an integral type or a floating-point type).
+/// * Ranges specified by \p keys_input, \p keys_output, \p values_input and \p values_output must
+/// have at least \p size elements.
+/// * If \p Key is an integer type and the range of keys is known in advance, the performance
+/// can be improved by setting \p begin_bit and \p end_bit, for example if all keys are in range
+/// [100, 10000], <tt>begin_bit = 0</tt> and <tt>end_bit = 14</tt> will cover the whole range.
+///
+/// \tparam Config - [optional] configuration of the primitive. It can be \p radix_sort_config or
+/// a custom class with the same members.
+/// \tparam KeysInputIterator - random-access iterator type of the input range. Must meet the
+/// requirements of a C++ InputIterator concept. It can be a simple pointer type.
+/// \tparam KeysOutputIterator - random-access iterator type of the output range. Must meet the
+/// requirements of a C++ OutputIterator concept. It can be a simple pointer type.
+/// \tparam ValuesInputIterator - random-access iterator type of the input range. Must meet the
+/// requirements of a C++ InputIterator concept. It can be a simple pointer type.
+/// \tparam ValuesOutputIterator - random-access iterator type of the output range. Must meet the
+/// requirements of a C++ OutputIterator concept. It can be a simple pointer type.
+/// \tparam Size - integral type that represents the problem size.
+///
+/// \param [in] temporary_storage - pointer to a device-accessible temporary storage. When
+/// a null pointer is passed, the required allocation size (in bytes) is written to
+/// \p storage_size and function returns without performing the sort operation.
+/// \param [in,out] storage_size - reference to a size (in bytes) of \p temporary_storage.
+/// \param [in] keys_input - pointer to the first element in the range to sort.
+/// \param [out] keys_output - pointer to the first element in the output range.
+/// \param [in] values_input - pointer to the first element in the range to sort.
+/// \param [out] values_output - pointer to the first element in the output range.
+/// \param [in] size - number of elements in the input range.
+/// \param [in] begin_bit - [optional] index of the first (least significant) bit used in
+/// key comparison. Must be in range <tt>[0; 8 * sizeof(Key))</tt>. Default value: \p 0.
+/// Non-default value not supported for floating-point key-types.
+/// \param [in] end_bit - [optional] past-the-end index (most significant) bit used in
+/// key comparison. Must be in range <tt>(begin_bit; 8 * sizeof(Key)]</tt>. Default
+/// value: \p <tt>8 * sizeof(Key)</tt>. Non-default value not supported for floating-point key-types.
+/// \param [in] stream - [optional] HIP stream object. Default is \p 0 (default stream).
+/// \param [in] debug_synchronous - [optional] If true, synchronization after every kernel
+/// launch is forced in order to check for errors. Default value is \p false.
+///
+/// \returns \p cudaSuccess (\p 0) after successful sort; otherwise a HIP runtime error of
+/// type \p cudaError_t.
+///
+/// \par Example
+/// \parblock
+/// In this example a device-level ascending radix sort is performed where input keys are
+/// represented by an array of unsigned integers and input values by an array of <tt>double</tt>s.
+///
+/// \code{.cpp}
+/// #include <rocprim/rocprim.hpp>
+///
+/// // Prepare input and output (declare pointers, allocate device memory etc.)
+/// size_t input_size;          // e.g., 8
+/// unsigned int * keys_input;  // e.g., [ 6, 3,  5, 4,  1,  8,  1, 7]
+/// double * values_input;      // e.g., [-5, 2, -4, 3, -1, -8, -2, 7]
+/// unsigned int * keys_output; // empty array of 8 elements
+/// double * values_output;     // empty array of 8 elements
+///
+/// // Keys are in range [0; 8], so we can limit compared bit to bits on indexes
+/// // 0, 1, 2, 3, and 4. In order to do this begin_bit is set to 0 and end_bit
+/// // is set to 5.
+///
+/// size_t temporary_storage_size_bytes;
+/// void * temporary_storage_ptr = nullptr;
+/// // Get required size of the temporary storage
+/// rocprim::radix_sort_pairs(
+///     temporary_storage_ptr, temporary_storage_size_bytes,
+///     keys_input, keys_output, values_input, values_output,
+///     input_size, 0, 5
+/// );
+///
+/// // allocate temporary storage
+/// cudaMalloc(&temporary_storage_ptr, temporary_storage_size_bytes);
+///
+/// // perform sort
+/// rocprim::radix_sort_pairs(
+///     temporary_storage_ptr, temporary_storage_size_bytes,
+///     keys_input, keys_output, values_input, values_output,
+///     input_size, 0, 5
+/// );
+/// // keys_output:   [ 1,  1, 3, 4,  5,  6, 7,  8]
+/// // values_output: [-1, -2, 2, 3, -4, -5, 7, -8]
+/// \endcode
+/// \endparblock
+template<class Config = default_config,
+         class KeysInputIterator,
+         class KeysOutputIterator,
+         class ValuesInputIterator,
+         class ValuesOutputIterator,
+         class Size,
+         class Key = typename std::iterator_traits<KeysInputIterator>::value_type>
+inline cudaError_t radix_sort_pairs(
+    void * temporary_storage,
+    size_t& storage_size,
+    KeysInputIterator keys_input,
+    KeysOutputIterator keys_output,
+    ValuesInputIterator values_input,
+    ValuesOutputIterator values_output,
+    Size size,
+    unsigned int begin_bit = 0,
+    unsigned int end_bit = 8 * sizeof(Key),
+    cudaStream_t stream = 0,
+    bool debug_synchronous = false)
+{
+    static_assert(std::is_integral<Size>::value, "Size must be an integral type.");
+
+    // Out-flag required by radix_sort_impl; this overload never reads it back.
+    bool result_location_unused;
+
+    // Ascending order (Descending = false); no temporary key/value buffers are supplied.
+    const cudaError_t status = detail::radix_sort_impl<Config, false>(
+        temporary_storage,
+        storage_size,
+        keys_input,
+        nullptr,
+        keys_output,
+        values_input,
+        nullptr,
+        values_output,
+        size,
+        result_location_unused,
+        begin_bit,
+        end_bit,
+        stream,
+        debug_synchronous
+    );
+    return status;
+}
+/// \brief Parallel descending radix sort-by-key primitive for device level.
+///
+/// \p radix_sort_pairs_desc function performs a device-wide radix sort
+/// of (key, value) pairs. Function sorts input pairs in descending order of keys.
+///
+/// \par Overview
+/// * The contents of the inputs are not altered by the sorting function.
+/// * Returns the required size of \p temporary_storage in \p storage_size
+/// if \p temporary_storage is a null pointer.
+/// * \p Key type (a \p value_type of \p KeysInputIterator and \p KeysOutputIterator) must be
+/// an arithmetic type (that is, an integral type or a floating-point type).
+/// * Ranges specified by \p keys_input, \p keys_output, \p values_input and \p values_output must
+/// have at least \p size elements.
+/// * If \p Key is an integer type and the range of keys is known in advance, the performance
+/// can be improved by setting \p begin_bit and \p end_bit, for example if all keys are in range
+/// [100, 10000], <tt>begin_bit = 0</tt> and <tt>end_bit = 14</tt> will cover the whole range.
+///
+/// \tparam Config - [optional] configuration of the primitive. It can be \p radix_sort_config or
+/// a custom class with the same members.
+/// \tparam KeysInputIterator - random-access iterator type of the input range. Must meet the
+/// requirements of a C++ InputIterator concept. It can be a simple pointer type.
+/// \tparam KeysOutputIterator - random-access iterator type of the output range. Must meet the
+/// requirements of a C++ OutputIterator concept. It can be a simple pointer type.
+/// \tparam ValuesInputIterator - random-access iterator type of the input range. Must meet the
+/// requirements of a C++ InputIterator concept. It can be a simple pointer type.
+/// \tparam ValuesOutputIterator - random-access iterator type of the output range. Must meet the
+/// requirements of a C++ OutputIterator concept. It can be a simple pointer type.
+/// \tparam Size - integral type that represents the problem size.
+///
+/// \param [in] temporary_storage - pointer to a device-accessible temporary storage. When
+/// a null pointer is passed, the required allocation size (in bytes) is written to
+/// \p storage_size and function returns without performing the sort operation.
+/// \param [in,out] storage_size - reference to a size (in bytes) of \p temporary_storage.
+/// \param [in] keys_input - pointer to the first element in the range to sort.
+/// \param [out] keys_output - pointer to the first element in the output range.
+/// \param [in] values_input - pointer to the first element in the range to sort.
+/// \param [out] values_output - pointer to the first element in the output range.
+/// \param [in] size - number of elements in the input range.
+/// \param [in] begin_bit - [optional] index of the first (least significant) bit used in
+/// key comparison. Must be in range <tt>[0; 8 * sizeof(Key))</tt>. Default value: \p 0.
+/// Non-default value not supported for floating-point key-types.
+/// \param [in] end_bit - [optional] past-the-end index (most significant) bit used in
+/// key comparison. Must be in range <tt>(begin_bit; 8 * sizeof(Key)]</tt>. Default
+/// value: \p <tt>8 * sizeof(Key)</tt>. Non-default value not supported for floating-point key-types.
+/// \param [in] stream - [optional] HIP stream object. Default is \p 0 (default stream).
+/// \param [in] debug_synchronous - [optional] If true, synchronization after every kernel
+/// launch is forced in order to check for errors. Default value is \p false.
+///
+/// \returns \p cudaSuccess (\p 0) after successful sort; otherwise a HIP runtime error of
+/// type \p cudaError_t.
+///
+/// \par Example
+/// \parblock
+/// In this example a device-level descending radix sort is performed where input keys are
+/// represented by an array of integers and input values by an array of <tt>double</tt>s.
+///
+/// \code{.cpp}
+/// #include <rocprim/rocprim.hpp>
+///
+/// // Prepare input and output (declare pointers, allocate device memory etc.)
+/// size_t input_size;       // e.g., 8
+/// int * keys_input;        // e.g., [ 6, 3,  5, 4,  1,  8,  1, 7]
+/// double * values_input;   // e.g., [-5, 2, -4, 3, -1, -8, -2, 7]
+/// int * keys_output;       // empty array of 8 elements
+/// double * values_output;  // empty array of 8 elements
+///
+/// size_t temporary_storage_size_bytes;
+/// void * temporary_storage_ptr = nullptr;
+/// // Get required size of the temporary storage
+/// rocprim::radix_sort_pairs_desc(
+///     temporary_storage_ptr, temporary_storage_size_bytes,
+///     keys_input, keys_output, values_input, values_output,
+///     input_size
+/// );
+///
+/// // allocate temporary storage
+/// cudaMalloc(&temporary_storage_ptr, temporary_storage_size_bytes);
+///
+/// // perform sort
+/// rocprim::radix_sort_pairs_desc(
+///     temporary_storage_ptr, temporary_storage_size_bytes,
+///     keys_input, keys_output, values_input, values_output,
+///     input_size
+/// );
+/// // keys_output:   [ 8, 7,  6,  5, 4, 3,  1,  1]
+/// // values_output: [-8, 7, -5, -4, 3, 2, -1, -2]
+/// \endcode
+/// \endparblock
+template<class Config = default_config,
+         class KeysInputIterator,
+         class KeysOutputIterator,
+         class ValuesInputIterator,
+         class ValuesOutputIterator,
+         class Size,
+         class Key = typename std::iterator_traits<KeysInputIterator>::value_type>
+inline cudaError_t radix_sort_pairs_desc(
+    void * temporary_storage,
+    size_t& storage_size,
+    KeysInputIterator keys_input,
+    KeysOutputIterator keys_output,
+    ValuesInputIterator values_input,
+    ValuesOutputIterator values_output,
+    Size size,
+    unsigned int begin_bit = 0,
+    unsigned int end_bit = 8 * sizeof(Key),
+    cudaStream_t stream = 0,
+    bool debug_synchronous = false)
+{
+    static_assert(std::is_integral<Size>::value, "Size must be an integral type.");
+
+    // Out-flag required by radix_sort_impl; this overload never reads it back.
+    bool result_location_unused;
+
+    // Descending order (Descending = true); no temporary key/value buffers are supplied.
+    const cudaError_t status = detail::radix_sort_impl<Config, true>(
+        temporary_storage,
+        storage_size,
+        keys_input,
+        nullptr,
+        keys_output,
+        values_input,
+        nullptr,
+        values_output,
+        size,
+        result_location_unused,
+        begin_bit,
+        end_bit,
+        stream,
+        debug_synchronous
+    );
+    return status;
+}
+/// \brief Parallel ascending radix sort primitive for device level.
+///
+/// \p radix_sort_keys function performs a device-wide radix sort
+/// of keys. Function sorts input keys in ascending order.
+///
+/// \par Overview
+/// * The contents of both buffers of \p keys may be altered by the sorting function.
+/// * \p current() of \p keys is used as the input.
+/// * The function will update \p current() of \p keys to point to the buffer
+/// that contains the output range.
+/// * Returns the required size of \p temporary_storage in \p storage_size
+/// if \p temporary_storage is a null pointer.
+/// * The function requires small \p temporary_storage as it does not need
+/// a temporary buffer of \p size elements.
+/// * \p Key type must be an arithmetic type (that is, an integral type or a floating-point
+/// type).
+/// * Buffers of \p keys must have at least \p size elements.
+/// * If \p Key is an integer type and the range of keys is known in advance, the performance
+/// can be improved by setting \p begin_bit and \p end_bit, for example if all keys are in range
+/// [100, 10000], <tt>begin_bit = 0</tt> and <tt>end_bit = 14</tt> will cover the whole range.
+///
+/// \tparam Config - [optional] configuration of the primitive. It can be \p radix_sort_config or
+/// a custom class with the same members.
+/// \tparam Key - key type. Must be an integral type or a floating-point type.
+/// \tparam Size - integral type that represents the problem size.
+///
+/// \param [in] temporary_storage - pointer to a device-accessible temporary storage. When
+/// a null pointer is passed, the required allocation size (in bytes) is written to
+/// \p storage_size and function returns without performing the sort operation.
+/// \param [in,out] storage_size - reference to a size (in bytes) of \p temporary_storage.
+/// \param [in,out] keys - reference to the double-buffer of keys, its \p current()
+/// contains the input range and will be updated to point to the output range.
+/// \param [in] size - number of elements in the input range.
+/// \param [in] begin_bit - [optional] index of the first (least significant) bit used in
+/// key comparison. Must be in range <tt>[0; 8 * sizeof(Key))</tt>. Default value: \p 0.
+/// Non-default value not supported for floating-point key-types.
+/// \param [in] end_bit - [optional] past-the-end index (most significant) bit used in
+/// key comparison. Must be in range <tt>(begin_bit; 8 * sizeof(Key)]</tt>. Default
+/// value: \p <tt>8 * sizeof(Key)</tt>. Non-default value not supported for floating-point key-types.
+/// \param [in] stream - [optional] HIP stream object. Default is \p 0 (default stream).
+/// \param [in] debug_synchronous - [optional] If true, synchronization after every kernel
+/// launch is forced in order to check for errors. Default value is \p false.
+///
+/// \returns \p cudaSuccess (\p 0) after successful sort; otherwise a HIP runtime error of
+/// type \p cudaError_t.
+///
+/// \par Example
+/// \parblock
+/// In this example a device-level ascending radix sort is performed on an array of
+/// \p float values.
+///
+/// \code{.cpp}
+/// #include <rocprim/rocprim.hpp>
+///
+/// // Prepare input and tmp (declare pointers, allocate device memory etc.)
+/// size_t input_size;  // e.g., 8
+/// float * input;      // e.g., [0.6, 0.3, 0.65, 0.4, 0.2, 0.08, 1, 0.7]
+/// float * tmp;        // empty array of 8 elements
+/// // Create double-buffer
+/// rocprim::double_buffer<float> keys(input, tmp);
+///
+/// size_t temporary_storage_size_bytes;
+/// void * temporary_storage_ptr = nullptr;
+/// // Get required size of the temporary storage
+/// rocprim::radix_sort_keys(
+///     temporary_storage_ptr, temporary_storage_size_bytes,
+///     keys, input_size
+/// );
+///
+/// // allocate temporary storage
+/// cudaMalloc(&temporary_storage_ptr, temporary_storage_size_bytes);
+///
+/// // perform sort
+/// rocprim::radix_sort_keys(
+///     temporary_storage_ptr, temporary_storage_size_bytes,
+///     keys, input_size
+/// );
+/// // keys.current(): [0.08, 0.2, 0.3, 0.4, 0.6, 0.65, 0.7, 1]
+/// \endcode
+/// \endparblock
+template<
+    class Config = default_config,
+    class Key,
+    class Size
+>
+inline
+cudaError_t radix_sort_keys(void * temporary_storage,
+                           size_t& storage_size,
+                           double_buffer<Key>& keys,
+                           Size size,
+                           unsigned int begin_bit = 0,
+                           unsigned int end_bit = 8 * sizeof(Key),
+                           cudaStream_t stream = 0,
+                           bool debug_synchronous = false)
+{
+    static_assert(std::is_integral<Size>::value, "Size must be an integral type.");
+    // Keys-only sort: one null empty_type pointer stands in for every value range.
+    empty_type * values = nullptr;
+    // Default to "no swap". The flag is read below, and nothing visible in this
+    // function guarantees radix_sort_impl assigns it on every return path
+    // (e.g. an early error return), so it must not be left indeterminate.
+    bool is_result_in_output = false;
+    // Ascending order (Descending = false). current() is passed as both the input
+    // and the temporary key range; alternate() is passed as the output range.
+    cudaError_t error = detail::radix_sort_impl<Config, false>(
+        temporary_storage, storage_size,
+        keys.current(), keys.current(), keys.alternate(),
+        values, values, values,
+        size, is_result_in_output,
+        begin_bit, end_bit,
+        stream, debug_synchronous
+    );
+    // A null temporary_storage is only a size query, so the buffers are left as they are.
+    // Otherwise, if the result was reported in the output (alternate) range, make it current().
+    if(temporary_storage != nullptr && is_result_in_output)
+    {
+        keys.swap();
+    }
+    return error;
+}
+/// \brief Parallel descending radix sort primitive for device level.
+///
+/// \p radix_sort_keys_desc function performs a device-wide radix sort
+/// of keys. Function sorts input keys in descending order.
+///
+/// \par Overview
+/// * The contents of both buffers of \p keys may be altered by the sorting function.
+/// * \p current() of \p keys is used as the input.
+/// * The function will update \p current() of \p keys to point to the buffer
+/// that contains the output range.
+/// * Returns the required size of \p temporary_storage in \p storage_size
+/// if \p temporary_storage is a null pointer.
+/// * The function requires small \p temporary_storage as it does not need
+/// a temporary buffer of \p size elements.
+/// * \p Key type must be an arithmetic type (that is, an integral type or a floating-point
+/// type).
+/// * Buffers of \p keys must have at least \p size elements.
+/// * If \p Key is an integer type and the range of keys is known in advance, the performance
+/// can be improved by setting \p begin_bit and \p end_bit, for example if all keys are in range
+/// [100, 10000], <tt>begin_bit = 0</tt> and <tt>end_bit = 14</tt> will cover the whole range.
+///
+/// \tparam Config - [optional] configuration of the primitive. It can be \p radix_sort_config or
+/// a custom class with the same members.
+/// \tparam Key - key type. Must be an integral type or a floating-point type.
+/// \tparam Size - integral type that represents the problem size.
+///
+/// \param [in] temporary_storage - pointer to a device-accessible temporary storage. When
+/// a null pointer is passed, the required allocation size (in bytes) is written to
+/// \p storage_size and function returns without performing the sort operation.
+/// \param [in,out] storage_size - reference to a size (in bytes) of \p temporary_storage.
+/// \param [in,out] keys - reference to the double-buffer of keys, its \p current()
+/// contains the input range and will be updated to point to the output range.
+/// \param [in] size - number of elements in the input range.
+/// \param [in] begin_bit - [optional] index of the first (least significant) bit used in
+/// key comparison. Must be in range <tt>[0; 8 * sizeof(Key))</tt>. Default value: \p 0.
+/// Non-default value not supported for floating-point key-types.
+/// \param [in] end_bit - [optional] past-the-end index (most significant) bit used in
+/// key comparison. Must be in range <tt>(begin_bit; 8 * sizeof(Key)]</tt>. Default
+/// value: \p <tt>8 * sizeof(Key)</tt>. Non-default value not supported for floating-point key-types.
+/// \param [in] stream - [optional] HIP stream object. Default is \p 0 (default stream).
+/// \param [in] debug_synchronous - [optional] If true, synchronization after every kernel
+/// launch is forced in order to check for errors. Default value is \p false.
+///
+/// \returns \p cudaSuccess (\p 0) after successful sort; otherwise a HIP runtime error of
+/// type \p cudaError_t.
+///
+/// \par Example
+/// \parblock
+/// In this example a device-level descending radix sort is performed on an array of
+/// integer values.
+///
+/// \code{.cpp}
+/// #include <rocprim/rocprim.hpp>
+///
+/// // Prepare input and tmp (declare pointers, allocate device memory etc.)
+/// size_t input_size;  // e.g., 8
+/// int * input;        // e.g., [6, 3, 5, 4, 2, 8, 1, 7]
+/// int * tmp;          // empty array of 8 elements
+/// // Create double-buffer
+/// rocprim::double_buffer<int> keys(input, tmp);
+///
+/// size_t temporary_storage_size_bytes;
+/// void * temporary_storage_ptr = nullptr;
+/// // Get required size of the temporary storage
+/// rocprim::radix_sort_keys_desc(
+///     temporary_storage_ptr, temporary_storage_size_bytes,
+///     keys, input_size
+/// );
+///
+/// // allocate temporary storage
+/// cudaMalloc(&temporary_storage_ptr, temporary_storage_size_bytes);
+///
+/// // perform sort
+/// rocprim::radix_sort_keys_desc(
+///     temporary_storage_ptr, temporary_storage_size_bytes,
+///     keys, input_size
+/// );
+/// // keys.current(): [8, 7, 6, 5, 4, 3, 2, 1]
+/// \endcode
+/// \endparblock
+template<
+    class Config = default_config,
+    class Key,
+    class Size
+>
+inline
+cudaError_t radix_sort_keys_desc(void * temporary_storage,
+                                size_t& storage_size,
+                                double_buffer<Key>& keys,
+                                Size size,
+                                unsigned int begin_bit = 0,
+                                unsigned int end_bit = 8 * sizeof(Key),
+                                cudaStream_t stream = 0,
+                                bool debug_synchronous = false)
+{
+    static_assert(std::is_integral<Size>::value, "Size must be an integral type.");
+    // Keys-only sort: a null empty_type pointer is passed for every values argument.
+    empty_type * values = nullptr;
+    // Initialized to false so that the swap check below never reads an
+    // indeterminate value if radix_sort_impl returns (e.g. with an error)
+    // before assigning the flag.
+    bool is_result_in_output = false;
+    // Second template argument is true for the descending variants and false
+    // for the ascending ones in this file.
+    cudaError_t error = detail::radix_sort_impl<Config, true>(
+        temporary_storage, storage_size,
+        keys.current(), keys.current(), keys.alternate(),
+        values, values, values,
+        size, is_result_in_output,
+        begin_bit, end_bit,
+        stream, debug_synchronous
+    );
+    // Skip the swap for the size-query call (null temporary_storage). Otherwise,
+    // when the flag is set, flip the double buffer so that keys.current()
+    // refers to the buffer holding the sorted output.
+    if(temporary_storage != nullptr && is_result_in_output)
+    {
+        keys.swap();
+    }
+    return error;
+}
+/// \brief Parallel ascending radix sort-by-key primitive for device level.
+///
+/// \p radix_sort_pairs function performs a device-wide radix sort
+/// of (key, value) pairs. Function sorts input pairs in ascending order of keys.
+///
+/// \par Overview
+/// * The contents of both buffers of \p keys and \p values may be altered by the sorting function.
+/// * \p current() of \p keys and \p values are used as the input.
+/// * The function will update \p current() of \p keys and \p values to point to buffers
+/// that contains the output range.
+/// * Returns the required size of \p temporary_storage in \p storage_size
+/// if \p temporary_storage is a null pointer.
+/// * The function requires small \p temporary_storage as it does not need
+/// a temporary buffer of \p size elements.
+/// * \p Key type must be an arithmetic type (that is, an integral type or a floating-point
+/// type).
+/// * Buffers of \p keys must have at least \p size elements.
+/// * If \p Key is an integer type and the range of keys is known in advance, the performance
+/// can be improved by setting \p begin_bit and \p end_bit, for example if all keys are in range
+/// [100, 10000], <tt>begin_bit = 0</tt> and <tt>end_bit = 14</tt> will cover the whole range.
+///
+/// \tparam Config - [optional] configuration of the primitive. It can be \p radix_sort_config or
+/// a custom class with the same members.
+/// \tparam Key - key type. Must be an integral type or a floating-point type.
+/// \tparam Value - value type.
+/// \tparam Size - integral type that represents the problem size.
+///
+/// \param [in] temporary_storage - pointer to a device-accessible temporary storage. When
+/// a null pointer is passed, the required allocation size (in bytes) is written to
+/// \p storage_size and function returns without performing the sort operation.
+/// \param [in,out] storage_size - reference to a size (in bytes) of \p temporary_storage.
+/// \param [in,out] keys - reference to the double-buffer of keys, its \p current()
+/// contains the input range and will be updated to point to the output range.
+/// \param [in,out] values - reference to the double-buffer of values, its \p current()
+/// contains the input range and will be updated to point to the output range.
+/// \param [in] size - number of elements in the input range.
+/// \param [in] begin_bit - [optional] index of the first (least significant) bit used in
+/// key comparison. Must be in range <tt>[0; 8 * sizeof(Key))</tt>. Default value: \p 0.
+/// Non-default value not supported for floating-point key-types.
+/// \param [in] end_bit - [optional] past-the-end index (most significant) bit used in
+/// key comparison. Must be in range <tt>(begin_bit; 8 * sizeof(Key)]</tt>. Default
+/// value: \p <tt>8 * sizeof(Key)</tt>. Non-default value not supported for floating-point key-types.
+/// \param [in] stream - [optional] HIP stream object. Default is \p 0 (default stream).
+/// \param [in] debug_synchronous - [optional] If true, synchronization after every kernel
+/// launch is forced in order to check for errors. Default value is \p false.
+///
+/// \returns \p cudaSuccess (\p 0) after successful sort; otherwise a HIP runtime error of
+/// type \p cudaError_t.
+///
+/// \par Example
+/// \parblock
+/// In this example a device-level ascending radix sort is performed where input keys are
+/// represented by an array of unsigned integers and input values by an array of <tt>double</tt>s.
+///
+/// \code{.cpp}
+/// #include <rocprim/rocprim.hpp>
+///
+/// // Prepare input and tmp (declare pointers, allocate device memory etc.)
+/// size_t input_size;          // e.g., 8
+/// unsigned int * keys_input;  // e.g., [ 6, 3,  5, 4,  1,  8,  1, 7]
+/// double * values_input;      // e.g., [-5, 2, -4, 3, -1, -8, -2, 7]
+/// unsigned int * keys_tmp;    // empty array of 8 elements
+/// double*  values_tmp;        // empty array of 8 elements
+/// // Create double-buffers
+/// rocprim::double_buffer<unsigned int> keys(keys_input, keys_tmp);
+/// rocprim::double_buffer<double> values(values_input, values_tmp);
+///
+/// // Keys are in range [0; 8], so we can limit compared bit to bits on indexes
+/// // 0, 1, 2, 3, and 4. In order to do this begin_bit is set to 0 and end_bit
+/// // is set to 5.
+///
+/// size_t temporary_storage_size_bytes;
+/// void * temporary_storage_ptr = nullptr;
+/// // Get required size of the temporary storage
+/// rocprim::radix_sort_pairs(
+///     temporary_storage_ptr, temporary_storage_size_bytes,
+///     keys, values, input_size,
+///     0, 5
+/// );
+///
+/// // allocate temporary storage
+/// cudaMalloc(&temporary_storage_ptr, temporary_storage_size_bytes);
+///
+/// // perform sort
+/// rocprim::radix_sort_pairs(
+///     temporary_storage_ptr, temporary_storage_size_bytes,
+///     keys, values, input_size,
+///     0, 5
+/// );
+/// // keys.current():   [ 1,  1, 3, 4,  5,  6, 7,  8]
+/// // values.current(): [-1, -2, 2, 3, -4, -5, 7, -8]
+/// \endcode
+/// \endparblock
+template<
+    class Config = default_config,
+    class Key,
+    class Value,
+    class Size
+>
+inline
+cudaError_t radix_sort_pairs(void * temporary_storage,
+                            size_t& storage_size,
+                            double_buffer<Key>& keys,
+                            double_buffer<Value>& values,
+                            Size size,
+                            unsigned int begin_bit = 0,
+                            unsigned int end_bit = 8 * sizeof(Key),
+                            cudaStream_t stream = 0,
+                            bool debug_synchronous = false)
+{
+    static_assert(std::is_integral<Size>::value, "Size must be an integral type.");
+    // Initialized to false so that the swap check below never reads an
+    // indeterminate value if radix_sort_impl returns (e.g. with an error)
+    // before assigning the flag.
+    bool is_result_in_output = false;
+    // Second template argument is false for the ascending variants and true
+    // for the descending ones in this file.
+    cudaError_t error = detail::radix_sort_impl<Config, false>(
+        temporary_storage, storage_size,
+        keys.current(), keys.current(), keys.alternate(),
+        values.current(), values.current(), values.alternate(),
+        size, is_result_in_output,
+        begin_bit, end_bit,
+        stream, debug_synchronous
+    );
+    // Skip the swap for the size-query call (null temporary_storage). Otherwise,
+    // when the flag is set, flip both double buffers together so that
+    // keys.current() and values.current() refer to the sorted output.
+    if(temporary_storage != nullptr && is_result_in_output)
+    {
+        keys.swap();
+        values.swap();
+    }
+    return error;
+}
+/// \brief Parallel descending radix sort-by-key primitive for device level.
+///
+/// \p radix_sort_pairs_desc function performs a device-wide radix sort
+/// of (key, value) pairs. Function sorts input pairs in descending order of keys.
+///
+/// \par Overview
+/// * The contents of both buffers of \p keys and \p values may be altered by the sorting function.
+/// * \p current() of \p keys and \p values are used as the input.
+/// * The function will update \p current() of \p keys and \p values to point to buffers
+/// that contains the output range.
+/// * Returns the required size of \p temporary_storage in \p storage_size
+/// if \p temporary_storage is a null pointer.
+/// * The function requires small \p temporary_storage as it does not need
+/// a temporary buffer of \p size elements.
+/// * \p Key type must be an arithmetic type (that is, an integral type or a floating-point
+/// type).
+/// * Buffers of \p keys must have at least \p size elements.
+/// * If \p Key is an integer type and the range of keys is known in advance, the performance
+/// can be improved by setting \p begin_bit and \p end_bit, for example if all keys are in range
+/// [100, 10000], <tt>begin_bit = 0</tt> and <tt>end_bit = 14</tt> will cover the whole range.
+///
+/// \tparam Config - [optional] configuration of the primitive. It can be \p radix_sort_config or
+/// a custom class with the same members.
+/// \tparam Key - key type. Must be an integral type or a floating-point type.
+/// \tparam Value - value type.
+/// \tparam Size - integral type that represents the problem size.
+///
+/// \param [in] temporary_storage - pointer to a device-accessible temporary storage. When
+/// a null pointer is passed, the required allocation size (in bytes) is written to
+/// \p storage_size and function returns without performing the sort operation.
+/// \param [in,out] storage_size - reference to a size (in bytes) of \p temporary_storage.
+/// \param [in,out] keys - reference to the double-buffer of keys, its \p current()
+/// contains the input range and will be updated to point to the output range.
+/// \param [in,out] values - reference to the double-buffer of values, its \p current()
+/// contains the input range and will be updated to point to the output range.
+/// \param [in] size - number of elements in the input range.
+/// \param [in] begin_bit - [optional] index of the first (least significant) bit used in
+/// key comparison. Must be in range <tt>[0; 8 * sizeof(Key))</tt>. Default value: \p 0.
+/// Non-default value not supported for floating-point key-types.
+/// \param [in] end_bit - [optional] past-the-end index (most significant) bit used in
+/// key comparison. Must be in range <tt>(begin_bit; 8 * sizeof(Key)]</tt>. Default
+/// value: \p <tt>8 * sizeof(Key)</tt>. Non-default value not supported for floating-point key-types.
+/// \param [in] stream - [optional] HIP stream object. Default is \p 0 (default stream).
+/// \param [in] debug_synchronous - [optional] If true, synchronization after every kernel
+/// launch is forced in order to check for errors. Default value is \p false.
+///
+/// \returns \p cudaSuccess (\p 0) after successful sort; otherwise a HIP runtime error of
+/// type \p cudaError_t.
+///
+/// \par Example
+/// \parblock
+/// In this example a device-level descending radix sort is performed where input keys are
+/// represented by an array of integers and input values by an array of <tt>double</tt>s.
+///
+/// \code{.cpp}
+/// #include <rocprim/rocprim.hpp>
+///
+/// // Prepare input and tmp (declare pointers, allocate device memory etc.)
+/// size_t input_size;       // e.g., 8
+/// int * keys_input;        // e.g., [ 6, 3,  5, 4,  1,  8,  1, 7]
+/// double * values_input;   // e.g., [-5, 2, -4, 3, -1, -8, -2, 7]
+/// int * keys_tmp;          // empty array of 8 elements
+/// double * values_tmp;     // empty array of 8 elements
+/// // Create double-buffers
+/// rocprim::double_buffer<int> keys(keys_input, keys_tmp);
+/// rocprim::double_buffer<double> values(values_input, values_tmp);
+///
+/// size_t temporary_storage_size_bytes;
+/// void * temporary_storage_ptr = nullptr;
+/// // Get required size of the temporary storage
+/// rocprim::radix_sort_pairs_desc(
+///     temporary_storage_ptr, temporary_storage_size_bytes,
+///     keys, values, input_size
+/// );
+///
+/// // allocate temporary storage
+/// cudaMalloc(&temporary_storage_ptr, temporary_storage_size_bytes);
+///
+/// // perform sort
+/// rocprim::radix_sort_pairs_desc(
+///     temporary_storage_ptr, temporary_storage_size_bytes,
+///     keys, values, input_size
+/// );
+/// // keys.current():   [ 8, 7,  6,  5, 4, 3,  1,  1]
+/// // values.current(): [-8, 7, -5, -4, 3, 2, -1, -2]
+/// \endcode
+/// \endparblock
+template<
+    class Config = default_config,
+    class Key,
+    class Value,
+    class Size
+>
+inline
+cudaError_t radix_sort_pairs_desc(void * temporary_storage,
+                                 size_t& storage_size,
+                                 double_buffer<Key>& keys,
+                                 double_buffer<Value>& values,
+                                 Size size,
+                                 unsigned int begin_bit = 0,
+                                 unsigned int end_bit = 8 * sizeof(Key),
+                                 cudaStream_t stream = 0,
+                                 bool debug_synchronous = false)
+{
+    static_assert(std::is_integral<Size>::value, "Size must be an integral type.");
+    // Initialized to false so that the swap check below never reads an
+    // indeterminate value if radix_sort_impl returns (e.g. with an error)
+    // before assigning the flag.
+    bool is_result_in_output = false;
+    // Second template argument is true for the descending variants and false
+    // for the ascending ones in this file.
+    cudaError_t error = detail::radix_sort_impl<Config, true>(
+        temporary_storage, storage_size,
+        keys.current(), keys.current(), keys.alternate(),
+        values.current(), values.current(), values.alternate(),
+        size, is_result_in_output,
+        begin_bit, end_bit,
+        stream, debug_synchronous
+    );
+    // Skip the swap for the size-query call (null temporary_storage). Otherwise,
+    // when the flag is set, flip both double buffers together so that
+    // keys.current() and values.current() refer to the sorted output.
+    if(temporary_storage != nullptr && is_result_in_output)
+    {
+        keys.swap();
+        values.swap();
+    }
+    return error;
+}
+END_ROCPRIM_NAMESPACE
+/// @}
+// end of group devicemodule
+#endif // ROCPRIM_DEVICE_DEVICE_RADIX_SORT_HPP_
--- a/3rdparty/cub/rocprim/device/device_radix_sort_config.hpp
+++ b/3rdparty/cub/rocprim/device/device_radix_sort_config.hpp
+// Copyright (c) 2018-2021 Advanced Micro Devices, Inc. All rights reserved.
+//
+// Permission is hereby granted, free of charge, to any person obtaining a copy
+// of this software and associated documentation files (the "Software"), to deal
+// in the Software without restriction, including without limitation the rights
+// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+// copies of the Software, and to permit persons to whom the Software is
+// furnished to do so, subject to the following conditions:
+//
+// The above copyright notice and this permission notice shall be included in
+// all copies or substantial portions of the Software.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+// THE SOFTWARE.
+#ifndef ROCPRIM_DEVICE_DEVICE_RADIX_SORT_CONFIG_HPP_
+#define ROCPRIM_DEVICE_DEVICE_RADIX_SORT_CONFIG_HPP_
+#include <type_traits>
+#include "../config.hpp"
+#include "../detail/various.hpp"
+#include "config_types.hpp"
+/// \addtogroup primitivesmodule_deviceconfigs
+/// @{
+BEGIN_ROCPRIM_NAMESPACE
+/// \brief Configuration of device-level radix sort operation.
+///
+/// Radix sort is executed in a single tile (at size < BlocksPerItem) or
+/// few iterations (passes) depending on total number of bits to be sorted
+/// (\p begin_bit and \p end_bit), each iteration sorts either \p LongRadixBits or \p ShortRadixBits bits
+/// chosen to cover the whole bit range in an optimal way.
+///
+/// For example, if \p LongRadixBits is 7, \p ShortRadixBits is 6, \p begin_bit is 0 and \p end_bit is 32
+/// there will be 5 iterations: 7 + 7 + 6 + 6 + 6 = 32 bits.
+///
+/// \tparam LongRadixBits - number of bits in long iterations.
+/// \tparam ShortRadixBits - number of bits in short iterations, must be equal to or less than \p LongRadixBits.
+/// \tparam ScanConfig - configuration of digits scan kernel. Must be \p kernel_config.
+/// \tparam SortConfig - configuration of radix sort kernel. Must be \p kernel_config.
+template<
+    unsigned int LongRadixBits,
+    unsigned int ShortRadixBits,
+    class ScanConfig,
+    class SortConfig,
+    class SortSingleConfig = kernel_config<256, 10>, // configuration of the single-kernel sort; exposed as sort_single
+    class SortMergeConfig = kernel_config<1024, 1>, // configuration of the merge kernel; exposed as sort_merge
+    unsigned int MergeSizeLimitBlocks = 1024U, // block-count limit for using the merge kernel; exposed as merge_size_limit_blocks
+    bool ForceSingleKernelConfig = false // exposed as force_single_kernel_config
+>
+struct radix_sort_config
+{
+    /// \brief Number of bits in long iterations (\p LongRadixBits).
+    static constexpr unsigned int long_radix_bits = LongRadixBits;
+    /// \brief Number of bits in short iterations (\p ShortRadixBits).
+    static constexpr unsigned int short_radix_bits = ShortRadixBits;
+    /// \brief Limit number of blocks to use merge kernel (\p MergeSizeLimitBlocks).
+    static constexpr unsigned int merge_size_limit_blocks = MergeSizeLimitBlocks;
+    /// \brief Configuration of digits scan kernel (\p ScanConfig).
+    using scan = ScanConfig;
+    /// \brief Configuration of radix sort kernel (\p SortConfig).
+    using sort = SortConfig;
+    /// \brief Configuration of radix sort single kernel (\p SortSingleConfig).
+    using sort_single = SortSingleConfig;
+    /// \brief Configuration of radix sort merge kernel (\p SortMergeConfig).
+    using sort_merge = SortMergeConfig;
+    /// \brief Force use radix sort single kernel configuration (\p ForceSingleKernelConfig).
+    static constexpr bool force_single_kernel_config = ForceSingleKernelConfig;
+};
+namespace detail
+{
+template<class Key, class Value> // tuning table for (key, value) sorts; selected for arch 803 by default_radix_sort_config
+struct radix_sort_config_803
+{
+    static constexpr unsigned int item_scale = // max(sizeof(Key), sizeof(Value)) in units of sizeof(int); presumably rounded up
+        ::rocprim::detail::ceiling_div<unsigned int>(::rocprim::max(sizeof(Key), sizeof(Value)), sizeof(int));
+    using scan = kernel_config<256, 2>; // passed as ScanConfig to every radix_sort_config below
+    using type = select_type< // one case per key size; presumably the first case whose condition holds is chosen
+        select_type_case<
+            (sizeof(Key) == 1 && sizeof(Value) <= 8),
+            radix_sort_config<
+                8, 7, scan, // LongRadixBits, ShortRadixBits, ScanConfig
+                kernel_config<256, 10>, kernel_config<256, 19> // SortConfig, SortSingleConfig
+            >
+        >,
+        select_type_case<
+            (sizeof(Key) == 2 && sizeof(Value) <= 8),
+            radix_sort_config<
+                8, 7, scan,
+                kernel_config<256, 10>, kernel_config<256, 17>
+            >
+        >,
+        select_type_case<
+            (sizeof(Key) == 4 && sizeof(Value) <= 8),
+            radix_sort_config<
+                7, 6, scan,
+                kernel_config<256, 15>, kernel_config<256, 13>
+            >
+        >,
+        select_type_case<
+            (sizeof(Key) == 8 && sizeof(Value) <= 8),
+            radix_sort_config<
+                7, 6, scan,
+                kernel_config<256, 13>, kernel_config<256, 10>
+            >
+        >,
+        radix_sort_config< // carries no condition; presumably the fallback when no case above matches
+            6, 4, scan,
+            kernel_config< // SortConfig
+                limit_block_size<256U, sizeof(Value), ROCPRIM_WARP_SIZE_64>::value,
+                ::rocprim::max(1u, 15u / item_scale) // scaled down for large types, clamped to at least 1
+            >,
+            kernel_config< // SortSingleConfig
+                limit_block_size<256U, sizeof(Value), ROCPRIM_WARP_SIZE_64>::value,
+                ::rocprim::max(1u, 10u / item_scale)
+            >,
+            kernel_config< // SortMergeConfig
+                limit_block_size<256U, sizeof(Value), ROCPRIM_WARP_SIZE_64>::value,
+                ::rocprim::max(1u, 10u / item_scale)
+            >
+        >
+    >;
+};
+template<class Key> // keys-only tuning for arch 803 (Value is empty_type)
+struct radix_sort_config_803<Key, empty_type>
+    : select_type< // no unconditional entry here; NOTE(review): behavior for other key sizes depends on select_type - confirm
+        select_type_case<sizeof(Key) == 1, radix_sort_config<8, 7, kernel_config<256, 2>, kernel_config<256, 10>, kernel_config<256, 19> > >,
+        select_type_case<sizeof(Key) == 2, radix_sort_config<8, 7, kernel_config<256, 2>, kernel_config<256, 10>, kernel_config<256, 16> > >,
+        select_type_case<sizeof(Key) == 4, radix_sort_config<7, 6, kernel_config<256, 2>, kernel_config<256, 9>, kernel_config<256, 15> > >,
+        select_type_case<sizeof(Key) == 8, radix_sort_config<7, 6, kernel_config<256, 2>, kernel_config<256, 7>, kernel_config<256, 12> > >
+    > { }; // argument order per case: LongRadixBits, ShortRadixBits, ScanConfig, SortConfig, SortSingleConfig
+template<class Key, class Value> // tuning table for (key, value) sorts; used for arch 900 and as the last entry of default_radix_sort_config
+struct radix_sort_config_900
+{
+    static constexpr unsigned int item_scale = // max(sizeof(Key), sizeof(Value)) in units of sizeof(int); presumably rounded up
+        ::rocprim::detail::ceiling_div<unsigned int>(::rocprim::max(sizeof(Key), sizeof(Value)), sizeof(int));
+    using scan = kernel_config<256, 2>; // passed as ScanConfig to every radix_sort_config below
+    using type = select_type< // one case per key size; presumably the first case whose condition holds is chosen
+        select_type_case<
+            (sizeof(Key) == 1 && sizeof(Value) <= 8),
+            radix_sort_config<4, 4, scan, // LongRadixBits, ShortRadixBits, ScanConfig
+            kernel_config<256, 10>, kernel_config<256, 19> > // SortConfig, SortSingleConfig
+        >,
+        select_type_case<
+            (sizeof(Key) == 2 && sizeof(Value) <= 8),
+            radix_sort_config<6, 5, scan,
+            kernel_config<256, 10>, kernel_config<256, 17> >
+        >,
+        select_type_case<
+            (sizeof(Key) == 4 && sizeof(Value) <= 8),
+            radix_sort_config<7, 6, scan,
+            kernel_config<256, 15>, kernel_config<256, 15> >
+        >,
+        select_type_case<
+            (sizeof(Key) == 8 && sizeof(Value) <= 8),
+            radix_sort_config<7, 6, scan,
+            kernel_config<256, 15>, kernel_config<256, 12> >
+        >,
+        radix_sort_config< // carries no condition; presumably the fallback when no case above matches
+            6, 4, scan,
+            kernel_config< // SortConfig
+                limit_block_size<256U, sizeof(Value), ROCPRIM_WARP_SIZE_64>::value,
+                ::rocprim::max(1u, 15u / item_scale) // scaled down for large types, clamped to at least 1
+            >,
+            kernel_config< // SortSingleConfig
+                limit_block_size<256U, sizeof(Value), ROCPRIM_WARP_SIZE_64>::value,
+                ::rocprim::max(1u, 10u / item_scale)
+            >,
+            kernel_config< // SortMergeConfig
+                limit_block_size<256U, sizeof(Value), ROCPRIM_WARP_SIZE_64>::value,
+                ::rocprim::max(1u, 10u / item_scale)
+            >
+        >
+    >;
+};
+template<class Key> // keys-only tuning for arch 900 (Value is empty_type)
+struct radix_sort_config_900<Key, empty_type>
+    : select_type< // no unconditional entry here; NOTE(review): behavior for other key sizes depends on select_type - confirm
+        select_type_case<sizeof(Key) == 1, radix_sort_config<4, 3, kernel_config<256, 2>, kernel_config<256, 10>, kernel_config<256, 19> > >,
+        select_type_case<sizeof(Key) == 2, radix_sort_config<6, 5, kernel_config<256, 2>, kernel_config<256, 10>, kernel_config<256, 16> > >,
+        select_type_case<sizeof(Key) == 4, radix_sort_config<7, 6, kernel_config<256, 2>, kernel_config<256, 17>, kernel_config<256, 15> > >,
+        select_type_case<sizeof(Key) == 8, radix_sort_config<7, 6, kernel_config<256, 2>, kernel_config<256, 15>, kernel_config<256, 12> > >
+    > { }; // argument order per case: LongRadixBits, ShortRadixBits, ScanConfig, SortConfig, SortSingleConfig
+template<class Key, class Value> // tuning table for (key, value) sorts; selected for arch 908 by default_radix_sort_config
+struct radix_sort_config_908
+{
+    static constexpr unsigned int item_scale = // max(sizeof(Key), sizeof(Value)) in units of sizeof(int); presumably rounded up
+        ::rocprim::detail::ceiling_div<unsigned int>(::rocprim::max(sizeof(Key), sizeof(Value)), sizeof(int));
+    using scan = kernel_config<256, 2>; // ScanConfig for the 1- and 2-byte key cases and the fallback only
+    using type = select_type< // one case per key size; presumably the first case whose condition holds is chosen
+        select_type_case<
+            (sizeof(Key) == 1 && sizeof(Value) <= 8),
+            radix_sort_config<4, 4, scan, // LongRadixBits, ShortRadixBits, ScanConfig
+            kernel_config<256, 10>, kernel_config<256, 19> > // SortConfig, SortSingleConfig
+        >,
+        select_type_case<
+            (sizeof(Key) == 2 && sizeof(Value) <= 8),
+            radix_sort_config<6, 5, scan,
+            kernel_config<256, 10>, kernel_config<256, 17> >
+        >,
+        select_type_case<
+            (sizeof(Key) == 4 && sizeof(Value) <= 8),
+            radix_sort_config<7, 6, kernel_config<256, 4>, // own ScanConfig instead of the `scan` alias
+            kernel_config<256, 15>, kernel_config<256, 15> >
+        >,
+        select_type_case<
+            (sizeof(Key) == 8 && sizeof(Value) <= 8),
+            radix_sort_config<7, 6, kernel_config<256, 4>, // own ScanConfig instead of the `scan` alias
+            kernel_config<256, 14>, kernel_config<256, 12> >
+        >,
+        radix_sort_config< // carries no condition; presumably the fallback when no case above matches
+            6, 4, scan,
+            kernel_config< // SortConfig
+                limit_block_size<256U, sizeof(Value), ROCPRIM_WARP_SIZE_64>::value,
+                ::rocprim::max(1u, 15u / item_scale) // scaled down for large types, clamped to at least 1
+            >,
+            kernel_config< // SortSingleConfig
+                limit_block_size<256U, sizeof(Value), ROCPRIM_WARP_SIZE_64>::value,
+                ::rocprim::max(1u, 10u / item_scale)
+            >,
+            kernel_config< // SortMergeConfig
+                limit_block_size<256U, sizeof(Value), ROCPRIM_WARP_SIZE_64>::value,
+                ::rocprim::max(1u, 10u / item_scale)
+            >
+        >
+    >;
+};
+template<class Key> // keys-only tuning for arch 908 (Value is empty_type)
+struct radix_sort_config_908<Key, empty_type>
+    : select_type< // no unconditional entry here; NOTE(review): behavior for other key sizes depends on select_type - confirm
+        select_type_case<sizeof(Key) == 1, radix_sort_config<4, 3, kernel_config<256, 2>, kernel_config<256, 10>, kernel_config<256, 19> > >,
+        select_type_case<sizeof(Key) == 2, radix_sort_config<6, 5, kernel_config<256, 2>, kernel_config<256, 10>, kernel_config<256, 17> > >,
+        select_type_case<sizeof(Key) == 4, radix_sort_config<7, 6, kernel_config<256, 4>, kernel_config<256, 17>, kernel_config<256, 15> > >,
+        select_type_case<sizeof(Key) == 8, radix_sort_config<7, 6, kernel_config<256, 4>, kernel_config<256, 15>, kernel_config<256, 12> > >
+    > { }; // argument order per case: LongRadixBits, ShortRadixBits, ScanConfig, SortConfig, SortSingleConfig
+// TODO: We need to update these parameters
+template<class Key, class Value> // tuning table for (key, value) sorts; selected for ROCPRIM_ARCH_90a by default_radix_sort_config
+struct radix_sort_config_90a
+{
+    static constexpr unsigned int item_scale = // max(sizeof(Key), sizeof(Value)) in units of sizeof(int); presumably rounded up
+        ::rocprim::detail::ceiling_div<unsigned int>(::rocprim::max(sizeof(Key), sizeof(Value)), sizeof(int));
+    using scan = kernel_config<256, 1>; // passed as ScanConfig to every radix_sort_config below
+    using type = select_type< // one case per key size; presumably the first case whose condition holds is chosen
+        select_type_case<
+            (sizeof(Key) == 1 && sizeof(Value) <= 8),
+            radix_sort_config<4, 4, scan, // LongRadixBits, ShortRadixBits, ScanConfig
+            kernel_config<256, 5>, kernel_config<256, 19> > // SortConfig, SortSingleConfig
+        >,
+        select_type_case<
+            (sizeof(Key) == 2 && sizeof(Value) <= 8),
+            radix_sort_config<6, 5, scan,
+            kernel_config<256, 5>, kernel_config<256, 17> >
+        >,
+        select_type_case<
+            (sizeof(Key) == 4 && sizeof(Value) <= 8),
+            radix_sort_config<7, 6, scan,
+            kernel_config<256, 7>, kernel_config<256, 15> >
+        >,
+        select_type_case<
+            (sizeof(Key) == 8 && sizeof(Value) <= 8),
+            radix_sort_config<7, 6, scan,
+            kernel_config<256, 7>, kernel_config<256, 14> >
+        >,
+        radix_sort_config< // carries no condition; presumably the fallback when no case above matches
+            6, 4, scan,
+            kernel_config< // SortConfig
+                limit_block_size<256U, sizeof(Value), ROCPRIM_WARP_SIZE_64>::value,
+                ::rocprim::max(1u, 15u / item_scale) // scaled down for large types, clamped to at least 1
+            >,
+            kernel_config< // SortSingleConfig
+                limit_block_size<256U, sizeof(Value), ROCPRIM_WARP_SIZE_64>::value,
+                ::rocprim::max(1u, 10u / item_scale)
+            >
+        > // NOTE(review): only two kernel configs here, so SortMergeConfig takes its default kernel_config<1024, 1>; the 803/900/908/1030 fallbacks pass a third, size-limited one - confirm intended
+    >;
+};
+template<class Key> // keys-only tuning for ROCPRIM_ARCH_90a (Value is empty_type)
+struct radix_sort_config_90a<Key, empty_type>
+    : select_type< // no unconditional entry here; NOTE(review): behavior for other key sizes depends on select_type - confirm
+        select_type_case<sizeof(Key) == 1, radix_sort_config<4, 3, kernel_config<256, 1>, kernel_config<256, 5>, kernel_config<256, 19> > >,
+        select_type_case<sizeof(Key) == 2, radix_sort_config<6, 5, kernel_config<256, 1>, kernel_config<256, 5>, kernel_config<256, 17> > >,
+        select_type_case<sizeof(Key) == 4, radix_sort_config<7, 6, kernel_config<256, 1>, kernel_config<256, 8>, kernel_config<256, 15> > >,
+        select_type_case<sizeof(Key) == 8, radix_sort_config<7, 6, kernel_config<256, 1>, kernel_config<256, 7>, kernel_config<256, 14> > >
+    > { }; // argument order per case: LongRadixBits, ShortRadixBits, ScanConfig, SortConfig, SortSingleConfig
+// TODO: We need to update these parameters
+template<class Key, class Value> // tuning table for (key, value) sorts; selected for arch 1030 by default_radix_sort_config
+struct radix_sort_config_1030
+{
+    static constexpr unsigned int item_scale = // max(sizeof(Key), sizeof(Value)) in units of sizeof(int); presumably rounded up
+        ::rocprim::detail::ceiling_div<unsigned int>(::rocprim::max(sizeof(Key), sizeof(Value)), sizeof(int));
+    using scan = kernel_config<256, 2>; // passed as ScanConfig to every radix_sort_config below
+    using type = select_type< // one case per key size; presumably the first case whose condition holds is chosen
+        select_type_case<
+            (sizeof(Key) == 1 && sizeof(Value) <= 8),
+            radix_sort_config<4, 4, scan, // LongRadixBits, ShortRadixBits, ScanConfig
+            kernel_config<256, 10>, kernel_config<256, 19> > // SortConfig, SortSingleConfig
+        >,
+        select_type_case<
+            (sizeof(Key) == 2 && sizeof(Value) <= 8),
+            radix_sort_config<6, 5, scan,
+            kernel_config<256, 10>, kernel_config<256, 17> >
+        >,
+        select_type_case<
+            (sizeof(Key) == 4 && sizeof(Value) <= 8),
+            radix_sort_config<7, 6, scan,
+            kernel_config<256, 15>, kernel_config<256, 15> >
+        >,
+        select_type_case<
+            (sizeof(Key) == 8 && sizeof(Value) <= 8),
+            radix_sort_config<7, 6, scan,
+            kernel_config<256, 15>, kernel_config<256, 14> >
+        >,
+        radix_sort_config< // carries no condition; presumably the fallback when no case above matches
+            6, 4, scan,
+            kernel_config< // SortConfig
+                limit_block_size<256U, sizeof(Value), ROCPRIM_WARP_SIZE_32>::value, // NOTE(review): WARP_SIZE_32 here but WARP_SIZE_64 in the two configs below - confirm the mix is intended
+                ::rocprim::max(1u, 15u / item_scale) // scaled down for large types, clamped to at least 1
+            >,
+            kernel_config< // SortSingleConfig
+                limit_block_size<256U, sizeof(Value), ROCPRIM_WARP_SIZE_64>::value,
+                ::rocprim::max(1u, 10u / item_scale)
+            >,
+            kernel_config< // SortMergeConfig
+                limit_block_size<256U, sizeof(Value), ROCPRIM_WARP_SIZE_64>::value,
+                ::rocprim::max(1u, 10u / item_scale)
+            >
+        >
+    >;
+};
+template<class Key> // keys-only tuning for arch 1030 (Value is empty_type)
+struct radix_sort_config_1030<Key, empty_type>
+    : select_type< // no unconditional entry here; NOTE(review): behavior for other key sizes depends on select_type - confirm
+        select_type_case<sizeof(Key) == 1, radix_sort_config<4, 3, kernel_config<256, 2>, kernel_config<256, 10>, kernel_config<256, 19> > >,
+        select_type_case<sizeof(Key) == 2, radix_sort_config<6, 5, kernel_config<256, 2>, kernel_config<256, 10>, kernel_config<256, 19> > >,
+        select_type_case<sizeof(Key) == 4, radix_sort_config<7, 6, kernel_config<256, 2>, kernel_config<256, 17>, kernel_config<256, 17> > >,
+        select_type_case<sizeof(Key) == 8, radix_sort_config<7, 6, kernel_config<256, 2>, kernel_config<256, 15>, kernel_config<256, 15> > >
+    > { }; // argument order per case: LongRadixBits, ShortRadixBits, ScanConfig, SortConfig, SortSingleConfig
+template<unsigned int TargetArch, class Key, class Value> // maps a target architecture to its radix sort tuning table
+struct default_radix_sort_config
+    : select_arch< // inherits from whatever select_arch yields for TargetArch
+        TargetArch,
+        select_arch_case<803, radix_sort_config_803<Key, Value> >,
+        select_arch_case<900, radix_sort_config_900<Key, Value> >,
+        select_arch_case<908, radix_sort_config_908<Key, Value> >,
+        select_arch_case<ROCPRIM_ARCH_90a, radix_sort_config_90a<Key, Value> >,
+        select_arch_case<1030, radix_sort_config_1030<Key, Value> >,
+        radix_sort_config_900<Key, Value> // not wrapped in select_arch_case; presumably the default for unlisted architectures
+    > { };
+} // end namespace detail
+END_ROCPRIM_NAMESPACE
+/// @}
+// end of group primitivesmodule_deviceconfigs
+#endif // ROCPRIM_DEVICE_DEVICE_RADIX_SORT_CONFIG_HPP_
--- a/3rdparty/cub/rocprim/device/device_reduce.hpp
+++ b/3rdparty/cub/rocprim/device/device_reduce.hpp
+// Copyright (c) 2017-2022 Advanced Micro Devices, Inc. All rights reserved.
+//
+// Permission is hereby granted, free of charge, to any person obtaining a copy
+// of this software and associated documentation files (the "Software"), to deal
+// in the Software without restriction, including without limitation the rights
+// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+// copies of the Software, and to permit persons to whom the Software is
+// furnished to do so, subject to the following conditions:
+//
+// The above copyright notice and this permission notice shall be included in
+// all copies or substantial portions of the Software.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+// THE SOFTWARE.
+#ifndef ROCPRIM_DEVICE_DEVICE_REDUCE_HPP_
+#define ROCPRIM_DEVICE_DEVICE_REDUCE_HPP_
+#include <type_traits>
+#include <iterator>
+#include <algorithm>
+#include <chrono>
+#include "../config.hpp"
+#include "../detail/various.hpp"
+#include "../detail/match_result_type.hpp"
+#include "device_reduce_config.hpp"
+#include "detail/device_reduce.hpp"
+BEGIN_ROCPRIM_NAMESPACE
+/// \addtogroup devicemodule
+/// @{
+namespace detail
+{
+// Kernel entry point for a single reduction pass. All work is delegated to
+// block_reduce_kernel_impl; this wrapper only attaches the kernel and
+// launch-bounds attributes. WithInitialValue, Config and ResultType are
+// forwarded as explicit template arguments to the implementation.
+template<bool WithInitialValue,
+         class Config,
+         class ResultType,
+         class InputIterator,
+         class OutputIterator,
+         class InitValueType,
+         class BinaryFunction>
+ROCPRIM_KERNEL __launch_bounds__(ROCPRIM_DEFAULT_MAX_BLOCK_SIZE)
+void block_reduce_kernel(InputIterator  input,
+                         const size_t   size,
+                         OutputIterator output,
+                         InitValueType  initial_value,
+                         BinaryFunction reduce_op)
+{
+    block_reduce_kernel_impl<WithInitialValue, Config, ResultType>(input,
+                                                                   size,
+                                                                   output,
+                                                                   initial_value,
+                                                                   reduce_op);
+}
+#define ROCPRIM_DETAIL_HIP_SYNC(name, size, start) \
+    if(debug_synchronous) \
+    { \
+        std::cout << name << "(" << size << ")"; \
+        auto _error = cudaStreamSynchronize(stream); \
+        if(_error != cudaSuccess) return _error; \
+        auto _end = std::chrono::high_resolution_clock::now(); \
+        auto _d = std::chrono::duration_cast<std::chrono::duration<double>>(_end - start); \
+        std::cout << " " << _d.count() * 1000 << " ms" << '\n'; \
+    }
+// Post-launch check. Returns from the enclosing function if cudaGetLastError()
+// reports a failure. When debug_synchronous is set it additionally prints
+// `name(size)`, synchronizes `stream` (returning on error) and prints the time
+// elapsed since `start` in milliseconds. Expects `debug_synchronous` and
+// `stream` to be visible at the expansion site.
+// The inner status variable deliberately avoids a double underscore: such
+// identifiers are reserved for the implementation in C++.
+#define ROCPRIM_DETAIL_HIP_SYNC_AND_RETURN_ON_ERROR(name, size, start) \
+    { \
+        auto _error = cudaGetLastError(); \
+        if(_error != cudaSuccess) return _error; \
+        if(debug_synchronous) \
+        { \
+            std::cout << name << "(" << size << ")"; \
+            auto _sync_error = cudaStreamSynchronize(stream); \
+            if(_sync_error != cudaSuccess) return _sync_error; \
+            auto _end = std::chrono::high_resolution_clock::now(); \
+            auto _d = std::chrono::duration_cast<std::chrono::duration<double>>(_end - start); \
+            std::cout << " " << _d.count() * 1000 << " ms" << '\n'; \
+        } \
+    }
+// Host-side driver shared by the rocprim::reduce overloads.
+//
+// With a null temporary_storage it only writes the required scratch size to
+// storage_size. Otherwise it either reduces the whole input with one block, or
+// writes one partial result per block into temporary_storage and calls itself
+// on those partials.
+//
+// WithInitialValue is forwarded only to the launch that writes `output`; the
+// launches that produce per-block partials are instantiated with `false`.
+template<bool WithInitialValue, // true when initial_value should be used in reduction
+         class Config,
+         class InputIterator,
+         class OutputIterator,
+         class InitValueType,
+         class BinaryFunction>
+inline cudaError_t reduce_impl(void*               temporary_storage,
+                               size_t&             storage_size,
+                               InputIterator       input,
+                               OutputIterator      output,
+                               const InitValueType initial_value,
+                               const size_t        size,
+                               BinaryFunction      reduce_op,
+                               const cudaStream_t  stream,
+                               bool                debug_synchronous)
+{
+    using input_type  = typename std::iterator_traits<InputIterator>::value_type;
+    using result_type =
+        typename ::rocprim::detail::match_result_type<input_type, BinaryFunction>::type;
+    // Resolve default_config to the architecture/type specific defaults
+    using config = default_or_custom_config<
+        Config,
+        default_reduce_config<ROCPRIM_TARGET_ARCH, result_type>>;
+    constexpr unsigned int block_size       = config::block_size;
+    constexpr unsigned int items_per_thread = config::items_per_thread;
+    constexpr auto         items_per_block  = block_size * items_per_thread;
+    // Size query: report the scratch requirement and launch nothing
+    if(temporary_storage == nullptr)
+    {
+        const size_t required_bytes
+            = reduce_get_temporary_storage_bytes<result_type>(size, items_per_block);
+        // Never report 0 so the caller does not attempt a zero-byte allocation
+        storage_size = (required_bytes == 0) ? 4 : required_bytes;
+        return cudaSuccess;
+    }
+    // Reference point for the debug timings printed by the sync macros
+    std::chrono::high_resolution_clock::time_point timer_start;
+    static constexpr auto size_limit = config::size_limit;
+    static constexpr auto number_of_blocks_limit
+        = ::rocprim::max<size_t>(size_limit / items_per_block, 1);
+    const auto number_of_blocks = (size + items_per_block - 1) / items_per_block;
+    if(debug_synchronous)
+    {
+        std::cout << "block_size " << block_size << '\n'
+                  << "number of blocks " << number_of_blocks << '\n'
+                  << "number of blocks limit " << number_of_blocks_limit << '\n'
+                  << "items_per_block " << items_per_block << '\n';
+    }
+    if(number_of_blocks <= 1)
+    {
+        // Everything fits into one block: reduce straight into `output`
+        if(debug_synchronous) timer_start = std::chrono::high_resolution_clock::now();
+        detail::block_reduce_kernel<WithInitialValue, config, result_type>
+            <<<dim3(1), dim3(block_size), 0, stream>>>(
+                input, size, output, initial_value, reduce_op);
+        ROCPRIM_DETAIL_HIP_SYNC_AND_RETURN_ON_ERROR("block_reduce_kernel", size, timer_start);
+        return cudaSuccess;
+    }
+    // Per-block partial results are stored at the front of the temporary storage
+    result_type* const partials = static_cast<result_type*>(temporary_storage);
+    static constexpr auto aligned_size_limit = number_of_blocks_limit * items_per_block;
+    // The input is consumed in slices of at most aligned_size_limit items,
+    // one kernel launch per slice
+    const auto launch_count = (size + aligned_size_limit - 1) / aligned_size_limit;
+    size_t     slice_begin  = 0;
+    for(size_t launch = 0; launch < launch_count; ++launch)
+    {
+        const auto slice_size   = std::min<size_t>(size - slice_begin, aligned_size_limit);
+        const auto slice_blocks = (slice_size + items_per_block - 1) / items_per_block;
+        if(debug_synchronous) timer_start = std::chrono::high_resolution_clock::now();
+        detail::block_reduce_kernel<false, config, result_type>
+            <<<dim3(slice_blocks), dim3(block_size), 0, stream>>>(
+                input + slice_begin,
+                slice_size,
+                partials + launch * number_of_blocks_limit,
+                initial_value,
+                reduce_op);
+        ROCPRIM_DETAIL_HIP_SYNC_AND_RETURN_ON_ERROR("block_reduce_kernel", slice_size, timer_start);
+        slice_begin += aligned_size_limit;
+    }
+    // The storage behind the partials becomes the scratch space of the nested call
+    void* nested_storage       = static_cast<void*>(partials + number_of_blocks);
+    auto  nested_storage_bytes = storage_size - (number_of_blocks * sizeof(result_type));
+    if(debug_synchronous) timer_start = std::chrono::high_resolution_clock::now();
+    const auto nested_status = reduce_impl<WithInitialValue, config>(
+        nested_storage,
+        nested_storage_bytes,
+        partials,         // input
+        output,           // output
+        initial_value,
+        number_of_blocks, // input size
+        reduce_op,
+        stream,
+        debug_synchronous);
+    if(nested_status != cudaSuccess) return nested_status;
+    ROCPRIM_DETAIL_HIP_SYNC("nested_device_reduce", number_of_blocks, timer_start);
+    return cudaSuccess;
+}
+#undef ROCPRIM_DETAIL_HIP_SYNC_AND_RETURN_ON_ERROR
+#undef ROCPRIM_DETAIL_HIP_SYNC
+} // end of detail namespace
+/// \brief Parallel reduction primitive for device level.
+///
+/// reduce function performs a device-wide reduction operation
+/// using binary \p reduce_op operator.
+///
+/// \par Overview
+/// * Does not support non-commutative reduction operators. Reduction operator should also be
+/// associative. When used with non-associative functions the results may be non-deterministic
+/// and/or vary in precision.
+/// * Returns the required size of \p temporary_storage in \p storage_size
+/// if \p temporary_storage is a null pointer.
+/// * Ranges specified by \p input must have at least \p size elements, while \p output
+/// only needs one element.
+/// * By default, the input type is used for accumulation. A custom type
+/// can be specified using <tt>rocprim::transform_iterator</tt>, see the example below.
+///
+/// \tparam Config - [optional] configuration of the primitive. It can be \p reduce_config or
+/// a custom class with the same members.
+/// \tparam InputIterator - random-access iterator type of the input range. Must meet the
+/// requirements of a C++ InputIterator concept. It can be a simple pointer type.
+/// \tparam OutputIterator - random-access iterator type of the output range. Must meet the
+/// requirements of a C++ OutputIterator concept. It can be a simple pointer type.
+/// \tparam InitValueType - type of the initial value.
+/// \tparam BinaryFunction - type of binary function used for reduction. Default type
+/// is \p rocprim::plus<T>, where \p T is a \p value_type of \p InputIterator.
+///
+/// \param [in] temporary_storage - pointer to a device-accessible temporary storage. When
+/// a null pointer is passed, the required allocation size (in bytes) is written to
+/// \p storage_size and function returns without performing the reduction operation.
+/// \param [in,out] storage_size - reference to a size (in bytes) of \p temporary_storage.
+/// \param [in] input - iterator to the first element in the range to reduce.
+/// \param [out] output - iterator to the first element in the output range. It can be
+/// same as \p input.
+/// \param [in] initial_value - initial value to start the reduction.
+/// \param [in] size - number of elements in the input range.
+/// \param [in] reduce_op - binary operation function object that will be used for reduction.
+/// The signature of the function should be equivalent to the following:
+/// <tt>T f(const T &a, const T &b);</tt>. The signature does not need to have
+/// <tt>const &</tt>, but function object must not modify the objects passed to it.
+/// The default value is \p BinaryFunction().
+/// \param [in] stream - [optional] HIP stream object. The default is \p 0 (default stream).
+/// \param [in] debug_synchronous - [optional] If true, synchronization after every kernel
+/// launch is forced in order to check for errors. The default value is \p false.
+///
+/// \returns \p cudaSuccess (\p 0) after successful reduction; otherwise a HIP runtime error of
+/// type \p cudaError_t.
+///
+/// \par Example
+/// \parblock
+/// In this example a device-level min-reduction operation is performed on an array of
+/// integer values (<tt>short</tt>s are reduced into <tt>int</tt>s) using custom operator.
+///
+/// \code{.cpp}
+/// #include <rocprim/rocprim.hpp>
+///
+/// // custom reduce function
+/// auto min_op =
+///     [] __device__ (int a, int b) -> int
+///     {
+///         return a < b ? a : b;
+///     };
+///
+/// // Prepare input and output (declare pointers, allocate device memory etc.)
+/// size_t input_size;    // e.g., 8
+/// short * input;        // e.g., [4, 7, 6, 2, 5, 1, 3, 8]
+/// int * output;         // empty array of 1 element
+/// int start_value;      // e.g., 9
+///
+/// size_t temporary_storage_size_bytes;
+/// void * temporary_storage_ptr = nullptr;
+/// // Get required size of the temporary storage
+/// rocprim::reduce(
+///     temporary_storage_ptr, temporary_storage_size_bytes,
+///     input, output, start_value, input_size, min_op
+/// );
+///
+/// // allocate temporary storage
+/// cudaMalloc(&temporary_storage_ptr, temporary_storage_size_bytes);
+///
+/// // perform reduce
+/// rocprim::reduce(
+///     temporary_storage_ptr, temporary_storage_size_bytes,
+///     input, output, start_value, input_size, min_op
+/// );
+/// // output: [1]
+/// \endcode
+///
+/// The same example as above, but now a custom accumulator type is specified.
+///
+/// \code{.cpp}
+/// #include <rocprim/rocprim.hpp>
+///
+/// auto min_op =
+///     [] __device__ (int a, int b) -> int
+///     {
+///         return a < b ? a : b;
+///     };
+///
+/// size_t input_size;
+/// short * input;
+/// int * output;
+/// int start_value;
+///
+/// // Use a transform iterator to specify a custom accumulator type
+/// auto input_iterator = rocprim::make_transform_iterator(
+///     input, [] __device__ (short in) { return static_cast<int>(in); });
+///
+/// size_t temporary_storage_size_bytes;
+/// void * temporary_storage_ptr = nullptr;
+/// // Use the transform iterator
+/// rocprim::reduce(
+///     temporary_storage_ptr, temporary_storage_size_bytes,
+///     input_iterator, output, start_value, input_size, min_op
+/// );
+///
+/// cudaMalloc(&temporary_storage_ptr, temporary_storage_size_bytes);
+///
+/// rocprim::reduce(
+///     temporary_storage_ptr, temporary_storage_size_bytes,
+///     input_iterator, output, start_value, input_size, min_op
+/// );
+/// \endcode
+/// \endparblock
+template<class Config = default_config,
+         class InputIterator,
+         class OutputIterator,
+         class InitValueType,
+         class BinaryFunction
+         = ::rocprim::plus<typename std::iterator_traits<InputIterator>::value_type>>
+inline cudaError_t reduce(void*               temporary_storage,
+                          size_t&             storage_size,
+                          InputIterator       input,
+                          OutputIterator      output,
+                          const InitValueType initial_value,
+                          const size_t        size,
+                          BinaryFunction      reduce_op         = BinaryFunction(),
+                          const cudaStream_t  stream            = 0,
+                          bool                debug_synchronous = false)
+{
+    // This overload receives an initial value from the caller, so the shared
+    // implementation is instantiated with WithInitialValue = true.
+    constexpr bool with_initial_value = true;
+    return detail::reduce_impl<with_initial_value, Config>(temporary_storage,
+                                                           storage_size,
+                                                           input,
+                                                           output,
+                                                           initial_value,
+                                                           size,
+                                                           reduce_op,
+                                                           stream,
+                                                           debug_synchronous);
+}
+/// \brief Parallel reduce primitive for device level.
+///
+/// reduce function performs a device-wide reduction operation
+/// using binary \p reduce_op operator.
+///
+/// \par Overview
+/// * Does not support non-commutative reduction operators. Reduction operator should also be
+/// associative. When used with non-associative functions the results may be non-deterministic
+/// and/or vary in precision.
+/// * Returns the required size of \p temporary_storage in \p storage_size
+/// if \p temporary_storage is a null pointer.
+/// * Ranges specified by \p input must have at least \p size elements, while \p output
+/// only needs one element.
+/// * By default, the input type is used for accumulation. A custom type
+/// can be specified using <tt>rocprim::transform_iterator</tt>, see the example below.
+///
+/// \tparam Config - [optional] configuration of the primitive. It can be \p reduce_config or
+/// a custom class with the same members.
+/// \tparam InputIterator - random-access iterator type of the input range. Must meet the
+/// requirements of a C++ InputIterator concept. It can be a simple pointer type.
+/// \tparam OutputIterator - random-access iterator type of the output range. Must meet the
+/// requirements of a C++ OutputIterator concept. It can be a simple pointer type.
+/// \tparam BinaryFunction - type of binary function used for reduction. Default type
+/// is \p rocprim::plus<T>, where \p T is a \p value_type of \p InputIterator.
+///
+/// \param [in] temporary_storage - pointer to a device-accessible temporary storage. When
+/// a null pointer is passed, the required allocation size (in bytes) is written to
+/// \p storage_size and function returns without performing the reduction operation.
+/// \param [in,out] storage_size - reference to a size (in bytes) of \p temporary_storage.
+/// \param [in] input - iterator to the first element in the range to reduce.
+/// \param [out] output - iterator to the first element in the output range. It can be
+/// same as \p input.
+/// \param [in] size - number of elements in the input range.
+/// \param [in] reduce_op - binary operation function object that will be used for reduction.
+/// The signature of the function should be equivalent to the following:
+/// <tt>T f(const T &a, const T &b);</tt>. The signature does not need to have
+/// <tt>const &</tt>, but function object must not modify the objects passed to it.
+/// Default is BinaryFunction().
+/// \param [in] stream - [optional] HIP stream object. Default is \p 0 (default stream).
+/// \param [in] debug_synchronous - [optional] If true, synchronization after every kernel
+/// launch is forced in order to check for errors. Default value is \p false.
+///
+/// \returns \p cudaSuccess (\p 0) after successful reduction; otherwise a HIP runtime error of
+/// type \p cudaError_t.
+///
+/// \par Example
+/// \parblock
+/// In this example a device-level sum operation is performed on an array of
+/// integer values (<tt>short</tt>s are reduced into <tt>int</tt>s).
+///
+/// \code{.cpp}
+/// #include <rocprim/rocprim.hpp>
+///
+/// // Prepare input and output (declare pointers, allocate device memory etc.)
+/// size_t input_size;    // e.g., 8
+/// short * input;        // e.g., [1, 2, 3, 4, 5, 6, 7, 8]
+/// int * output;         // empty array of 1 element
+///
+/// size_t temporary_storage_size_bytes;
+/// void * temporary_storage_ptr = nullptr;
+/// // Get required size of the temporary storage
+/// rocprim::reduce(
+///     temporary_storage_ptr, temporary_storage_size_bytes,
+///     input, output, input_size, rocprim::plus<int>()
+/// );
+///
+/// // allocate temporary storage
+/// cudaMalloc(&temporary_storage_ptr, temporary_storage_size_bytes);
+///
+/// // perform reduce
+/// rocprim::reduce(
+///     temporary_storage_ptr, temporary_storage_size_bytes,
+///     input, output, input_size, rocprim::plus<int>()
+/// );
+/// // output: [36]
+/// \endcode
+///
+/// The same example as above, but now a custom accumulator type is specified.
+///
+/// \code{.cpp}
+/// #include <rocprim/rocprim.hpp>
+///
+/// size_t input_size;
+/// short * input;
+/// int * output;
+///
+/// // Use a transform iterator to specify a custom accumulator type
+/// auto input_iterator = rocprim::make_transform_iterator(
+///     input, [] __device__ (short in) { return static_cast<int>(in); });
+///
+/// size_t temporary_storage_size_bytes;
+/// void * temporary_storage_ptr = nullptr;
+/// // Use the transform iterator
+/// rocprim::reduce(
+///     temporary_storage_ptr, temporary_storage_size_bytes,
+///     input_iterator, output, input_size, rocprim::plus<int>()
+/// );
+///
+/// cudaMalloc(&temporary_storage_ptr, temporary_storage_size_bytes);
+///
+/// rocprim::reduce(
+///     temporary_storage_ptr, temporary_storage_size_bytes,
+///     input_iterator, output, input_size, rocprim::plus<int>()
+/// );
+/// \endcode
+/// \endparblock
+template<class Config = default_config,
+         class InputIterator,
+         class OutputIterator,
+         class BinaryFunction
+         = ::rocprim::plus<typename std::iterator_traits<InputIterator>::value_type>>
+inline cudaError_t reduce(void*              temporary_storage,
+                          size_t&            storage_size,
+                          InputIterator      input,
+                          OutputIterator     output,
+                          const size_t       size,
+                          BinaryFunction     reduce_op         = BinaryFunction(),
+                          const cudaStream_t stream            = 0,
+                          bool               debug_synchronous = false)
+{
+    // No initial value is supplied by the caller: the shared implementation is
+    // instantiated with WithInitialValue = false and handed a value-initialized
+    // object of the input's value type to fill the initial_value parameter.
+    using value_type = typename std::iterator_traits<InputIterator>::value_type;
+    constexpr bool with_initial_value = false;
+    return detail::reduce_impl<with_initial_value, Config>(temporary_storage,
+                                                           storage_size,
+                                                           input,
+                                                           output,
+                                                           value_type(),
+                                                           size,
+                                                           reduce_op,
+                                                           stream,
+                                                           debug_synchronous);
+}
+/// @}
+// end of group devicemodule
+END_ROCPRIM_NAMESPACE
+#endif // ROCPRIM_DEVICE_DEVICE_REDUCE_HPP_
--- a/3rdparty/cub/rocprim/device/device_reduce_by_key.hpp
+++ b/3rdparty/cub/rocprim/device/device_reduce_by_key.hpp
+// Copyright (c) 2017-2022 Advanced Micro Devices, Inc. All rights reserved.
+//
+// Permission is hereby granted, free of charge, to any person obtaining a copy
+// of this software and associated documentation files (the "Software"), to deal
+// in the Software without restriction, including without limitation the rights
+// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+// copies of the Software, and to permit persons to whom the Software is
+// furnished to do so, subject to the following conditions:
+//
+// The above copyright notice and this permission notice shall be included in
+// all copies or substantial portions of the Software.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+// THE SOFTWARE.
+#ifndef ROCPRIM_DEVICE_DEVICE_REDUCE_BY_KEY_HPP_
+#define ROCPRIM_DEVICE_DEVICE_REDUCE_BY_KEY_HPP_
+#include <iterator>
+#include <iostream>
+#include <chrono>
+#include "../config.hpp"
+#include "../detail/various.hpp"
+#include "../detail/match_result_type.hpp"
+#include "../functional.hpp"
+#include "device_reduce_by_key_config.hpp"
+#include "detail/device_reduce_by_key.hpp"
+BEGIN_ROCPRIM_NAMESPACE
+/// \addtogroup devicemodule
+/// @{
+namespace detail
+{
+// Kernel entry point that forwards every argument unchanged to
+// fill_unique_counts<BlockSize, ItemsPerThread>. Compiled with
+// __launch_bounds__(BlockSize).
+template<unsigned int BlockSize,
+         unsigned int ItemsPerThread,
+         class KeysInputIterator,
+         class KeyCompareFunction>
+ROCPRIM_KERNEL __launch_bounds__(BlockSize)
+void fill_unique_counts_kernel(KeysInputIterator  keys_input,
+                               unsigned int       size,
+                               unsigned int*      unique_counts,
+                               KeyCompareFunction key_compare_op,
+                               unsigned int       blocks_per_full_batch,
+                               unsigned int       full_batches)
+{
+    fill_unique_counts<BlockSize, ItemsPerThread>(keys_input,
+                                                  size,
+                                                  unique_counts,
+                                                  key_compare_op,
+                                                  blocks_per_full_batch,
+                                                  full_batches);
+}
+// Kernel entry point that forwards every argument unchanged to
+// scan_unique_counts<BlockSize, ItemsPerThread>. Compiled with
+// __launch_bounds__(BlockSize).
+template<unsigned int BlockSize,
+         unsigned int ItemsPerThread,
+         class UniqueCountOutputIterator>
+ROCPRIM_KERNEL __launch_bounds__(BlockSize)
+void scan_unique_counts_kernel(unsigned int*             unique_counts,
+                               UniqueCountOutputIterator unique_count_output,
+                               unsigned int              batches)
+{
+    scan_unique_counts<BlockSize, ItemsPerThread>(unique_counts,
+                                                  unique_count_output,
+                                                  batches);
+}
+// Kernel entry point that forwards every argument unchanged to
+// reduce_by_key<BlockSize, ItemsPerThread>. Compiled with
+// __launch_bounds__(BlockSize).
+template<unsigned int BlockSize,
+         unsigned int ItemsPerThread,
+         class KeysInputIterator,
+         class ValuesInputIterator,
+         class Result,
+         class UniqueOutputIterator,
+         class AggregatesOutputIterator,
+         class KeyCompareFunction,
+         class BinaryFunction>
+ROCPRIM_KERNEL __launch_bounds__(BlockSize)
+void reduce_by_key_kernel(KeysInputIterator        keys_input,
+                          ValuesInputIterator      values_input,
+                          unsigned int             size,
+                          const unsigned int*      unique_starts,
+                          carry_out<Result>*       carry_outs,
+                          Result*                  leading_aggregates,
+                          UniqueOutputIterator     unique_output,
+                          AggregatesOutputIterator aggregates_output,
+                          KeyCompareFunction       key_compare_op,
+                          BinaryFunction           reduce_op,
+                          unsigned int             blocks_per_full_batch,
+                          unsigned int             full_batches)
+{
+    reduce_by_key<BlockSize, ItemsPerThread>(keys_input,
+                                             values_input,
+                                             size,
+                                             unique_starts,
+                                             carry_outs,
+                                             leading_aggregates,
+                                             unique_output,
+                                             aggregates_output,
+                                             key_compare_op,
+                                             reduce_op,
+                                             blocks_per_full_batch,
+                                             full_batches);
+}
+// Kernel entry point that forwards every argument unchanged to
+// scan_and_scatter_carry_outs<BlockSize, ItemsPerThread>. Compiled with
+// __launch_bounds__(BlockSize).
+template<unsigned int BlockSize,
+         unsigned int ItemsPerThread,
+         class Result,
+         class AggregatesOutputIterator,
+         class BinaryFunction>
+ROCPRIM_KERNEL __launch_bounds__(BlockSize)
+void scan_and_scatter_carry_outs_kernel(const carry_out<Result>* carry_outs,
+                                        const Result*            leading_aggregates,
+                                        AggregatesOutputIterator aggregates_output,
+                                        BinaryFunction           reduce_op,
+                                        unsigned int             batches)
+{
+    scan_and_scatter_carry_outs<BlockSize, ItemsPerThread>(carry_outs,
+                                                           leading_aggregates,
+                                                           aggregates_output,
+                                                           reduce_op,
+                                                           batches);
+}
+// Post-launch check. Returns from the enclosing function if cudaGetLastError()
+// reports a failure. When debug_synchronous is set it additionally prints
+// `name(size)`, synchronizes `stream` (returning on error) and prints the time
+// elapsed since `start` in milliseconds. Expects `debug_synchronous` and
+// `stream` to be visible at the expansion site.
+// Expands to a plain brace block, so it is valid with or without a trailing
+// semicolon at the call site.
+// The inner status variable deliberately avoids a double underscore: such
+// identifiers are reserved for the implementation in C++.
+#define ROCPRIM_DETAIL_HIP_SYNC_AND_RETURN_ON_ERROR(name, size, start) \
+    { \
+        auto _error = cudaGetLastError(); \
+        if(_error != cudaSuccess) return _error; \
+        if(debug_synchronous) \
+        { \
+            std::cout << name << "(" << size << ")"; \
+            auto _sync_error = cudaStreamSynchronize(stream); \
+            if(_sync_error != cudaSuccess) return _sync_error; \
+            auto _end = std::chrono::high_resolution_clock::now(); \
+            auto _d = std::chrono::duration_cast<std::chrono::duration<double>>(_end - start); \
+            std::cout << " " << _d.count() * 1000 << " ms" << '\n'; \
+        } \
+    }
+// Host-side driver for reduce_by_key.
+//
+// A null temporary_storage turns the call into a size query: the required byte
+// count is written to storage_size and nothing is launched. Otherwise the
+// storage is split into three consecutive regions (unique counts, carry-outs,
+// leading aggregates) and four kernels are enqueued on `stream` in this order:
+// fill_unique_counts, scan_unique_counts, reduce_by_key,
+// scan_and_scatter_carry_outs. The first launch error (or, with
+// debug_synchronous, synchronization error) is returned immediately.
+template<class Config,
+         class KeysInputIterator,
+         class ValuesInputIterator,
+         class UniqueOutputIterator,
+         class AggregatesOutputIterator,
+         class UniqueCountOutputIterator,
+         class BinaryFunction,
+         class KeyCompareFunction>
+inline cudaError_t reduce_by_key_impl(void*                     temporary_storage,
+                                      size_t&                   storage_size,
+                                      KeysInputIterator         keys_input,
+                                      ValuesInputIterator       values_input,
+                                      const unsigned int        size,
+                                      UniqueOutputIterator      unique_output,
+                                      AggregatesOutputIterator  aggregates_output,
+                                      UniqueCountOutputIterator unique_count_output,
+                                      BinaryFunction            reduce_op,
+                                      KeyCompareFunction        key_compare_op,
+                                      const cudaStream_t        stream,
+                                      const bool                debug_synchronous)
+{
+    using key_type    = typename std::iterator_traits<KeysInputIterator>::value_type;
+    using value_type  = typename std::iterator_traits<ValuesInputIterator>::value_type;
+    using result_type =
+        typename ::rocprim::detail::match_result_type<value_type, BinaryFunction>::type;
+    using carry_out_type = carry_out<result_type>;
+    // Resolve default_config to the architecture/type specific defaults
+    using config = default_or_custom_config<
+        Config,
+        default_reduce_by_key_config<ROCPRIM_TARGET_ARCH, key_type, result_type>>;
+    constexpr auto reduce_block_size       = config::reduce::block_size;
+    constexpr auto reduce_items_per_thread = config::reduce::items_per_thread;
+    constexpr auto scan_block_size         = config::scan::block_size;
+    constexpr auto scan_items_per_thread   = config::scan::items_per_thread;
+    constexpr unsigned int items_per_block      = reduce_block_size * reduce_items_per_thread;
+    constexpr unsigned int scan_items_per_block = scan_block_size * scan_items_per_thread;
+    // At least one block is always scheduled, even for an empty input
+    const unsigned int blocks
+        = std::max(1u, ::rocprim::detail::ceiling_div(size, items_per_block));
+    const unsigned int blocks_per_full_batch
+        = ::rocprim::detail::ceiling_div(blocks, scan_items_per_block);
+    const unsigned int leftover_blocks = blocks % scan_items_per_block;
+    const unsigned int full_batches
+        = (leftover_blocks == 0) ? scan_items_per_block : leftover_blocks;
+    const unsigned int batches
+        = (blocks_per_full_batch == 1) ? full_batches : scan_items_per_block;
+    // Byte sizes of the three regions inside the temporary storage
+    const size_t unique_counts_bytes
+        = ::rocprim::detail::align_size(batches * sizeof(unsigned int));
+    const size_t carry_outs_bytes
+        = ::rocprim::detail::align_size(batches * sizeof(carry_out_type));
+    const size_t leading_aggregates_bytes
+        = ::rocprim::detail::align_size(batches * sizeof(result_type));
+    // Size query: report the scratch requirement and launch nothing
+    if(temporary_storage == nullptr)
+    {
+        storage_size = unique_counts_bytes + carry_outs_bytes + leading_aggregates_bytes;
+        return cudaSuccess;
+    }
+    if(debug_synchronous)
+    {
+        std::cout << "blocks " << blocks << '\n'
+                  << "blocks_per_full_batch " << blocks_per_full_batch << '\n'
+                  << "full_batches " << full_batches << '\n'
+                  << "batches " << batches << '\n'
+                  << "storage_size " << storage_size << '\n';
+        const cudaError_t sync_status = cudaStreamSynchronize(stream);
+        if(sync_status != cudaSuccess) return sync_status;
+    }
+    // Lay the three regions out back to back, starting at temporary_storage
+    char* const         storage_base  = static_cast<char*>(temporary_storage);
+    unsigned int* const unique_counts = reinterpret_cast<unsigned int*>(storage_base);
+    carry_out_type* const carry_outs
+        = reinterpret_cast<carry_out_type*>(storage_base + unique_counts_bytes);
+    result_type* const leading_aggregates
+        = reinterpret_cast<result_type*>(storage_base + unique_counts_bytes + carry_outs_bytes);
+    // Reference point for the debug timings printed by the sync macro
+    std::chrono::high_resolution_clock::time_point timer_start;
+    if(debug_synchronous) timer_start = std::chrono::high_resolution_clock::now();
+    fill_unique_counts_kernel<reduce_block_size, reduce_items_per_thread>
+        <<<dim3(batches), dim3(reduce_block_size), 0, stream>>>(keys_input,
+                                                                 size,
+                                                                 unique_counts,
+                                                                 key_compare_op,
+                                                                 blocks_per_full_batch,
+                                                                 full_batches);
+    ROCPRIM_DETAIL_HIP_SYNC_AND_RETURN_ON_ERROR("fill_unique_counts", size, timer_start)
+    if(debug_synchronous) timer_start = std::chrono::high_resolution_clock::now();
+    scan_unique_counts_kernel<scan_block_size, scan_items_per_thread>
+        <<<dim3(1), dim3(scan_block_size), 0, stream>>>(unique_counts,
+                                                         unique_count_output,
+                                                         batches);
+    ROCPRIM_DETAIL_HIP_SYNC_AND_RETURN_ON_ERROR("scan_unique_counts", scan_block_size, timer_start)
+    // The remaining kernels only read the buffers they take through these views
+    const unsigned int* const   unique_starts          = unique_counts;
+    const carry_out_type* const carry_outs_view        = carry_outs;
+    const result_type* const    leading_aggregates_view = leading_aggregates;
+    if(debug_synchronous) timer_start = std::chrono::high_resolution_clock::now();
+    reduce_by_key_kernel<reduce_block_size, reduce_items_per_thread>
+        <<<dim3(batches), dim3(reduce_block_size), 0, stream>>>(keys_input,
+                                                                 values_input,
+                                                                 size,
+                                                                 unique_starts,
+                                                                 carry_outs,
+                                                                 leading_aggregates,
+                                                                 unique_output,
+                                                                 aggregates_output,
+                                                                 key_compare_op,
+                                                                 reduce_op,
+                                                                 blocks_per_full_batch,
+                                                                 full_batches);
+    ROCPRIM_DETAIL_HIP_SYNC_AND_RETURN_ON_ERROR("reduce_by_key", size, timer_start)
+    if(debug_synchronous) timer_start = std::chrono::high_resolution_clock::now();
+    scan_and_scatter_carry_outs_kernel<scan_block_size, scan_items_per_thread>
+        <<<dim3(1), dim3(scan_block_size), 0, stream>>>(carry_outs_view,
+                                                         leading_aggregates_view,
+                                                         aggregates_output,
+                                                         reduce_op,
+                                                         batches);
+    ROCPRIM_DETAIL_HIP_SYNC_AND_RETURN_ON_ERROR("scan_and_scatter_carry_outs", scan_block_size, timer_start)
+    return cudaSuccess;
+}
+#undef ROCPRIM_DETAIL_HIP_SYNC_AND_RETURN_ON_ERROR
+} // end of detail namespace
+/// \brief Parallel reduce-by-key primitive for device level.
+///
+/// reduce_by_key function performs a device-wide reduction operation of groups
+/// of consecutive values having the same key using binary \p reduce_op operator. The first key of each group
+/// is copied to \p unique_output and reduction of the group is written to \p aggregates_output.
+/// The total number of groups is written to \p unique_count_output.
+///
+/// \par Overview
+/// * Supports non-commutative reduction operators. However, a reduction operator should be
+/// associative. When used with non-associative functions the results may be non-deterministic
+/// and/or vary in precision.
+/// * Returns the required size of \p temporary_storage in \p storage_size
+/// if \p temporary_storage is a null pointer.
+/// * Ranges specified by \p keys_input and \p values_input must have at least \p size elements.
+/// * Range specified by \p unique_count_output must have at least 1 element.
+/// * Ranges specified by \p unique_output and \p aggregates_output must have at least
+/// <tt>*unique_count_output</tt> (i.e. the number of unique keys) elements.
+///
+/// \tparam Config - [optional] configuration of the primitive. It can be \p reduce_by_key_config or
+/// a custom class with the same members.
+/// \tparam KeysInputIterator - random-access iterator type of the input range. Must meet the
+/// requirements of a C++ InputIterator concept. It can be a simple pointer type.
+/// \tparam ValuesInputIterator - random-access iterator type of the input range. Must meet the
+/// requirements of a C++ InputIterator concept. It can be a simple pointer type.
+/// \tparam UniqueOutputIterator - random-access iterator type of the output range. Must meet the
+/// requirements of a C++ OutputIterator concept. It can be a simple pointer type.
+/// \tparam AggregatesOutputIterator - random-access iterator type of the output range. Must meet the
+/// requirements of a C++ OutputIterator concept. It can be a simple pointer type.
+/// \tparam UniqueCountOutputIterator - random-access iterator type of the output range. Must meet the
+/// requirements of a C++ OutputIterator concept. It can be a simple pointer type.
+/// \tparam BinaryFunction - type of binary function used for reduction. Default type
+/// is \p rocprim::plus<T>, where \p T is a \p value_type of \p ValuesInputIterator.
+/// \tparam KeyCompareFunction - type of binary function used to determine keys equality. Default type
+/// is \p rocprim::equal_to<T>, where \p T is a \p value_type of \p KeysInputIterator.
+///
+/// \param [in] temporary_storage - pointer to a device-accessible temporary storage. When
+/// a null pointer is passed, the required allocation size (in bytes) is written to
+/// \p storage_size and function returns without performing the reduction operation.
+/// \param [in,out] storage_size - reference to a size (in bytes) of \p temporary_storage.
+/// \param [in] keys_input - iterator to the first element in the range of keys.
+/// \param [in] values_input - iterator to the first element in the range of values to reduce.
+/// \param [in] size - number of elements in the input range.
+/// \param [out] unique_output - iterator to the first element in the output range of unique keys.
+/// \param [out] aggregates_output - iterator to the first element in the output range of reductions.
+/// \param [out] unique_count_output - iterator to total number of groups.
+/// \param [in] reduce_op - binary operation function object that will be used for reduction.
+/// The signature of the function should be equivalent to the following:
+/// <tt>T f(const T &a, const T &b);</tt>. The signature does not need to have
+/// <tt>const &</tt>, but function object must not modify the objects passed to it.
+/// Default is BinaryFunction().
+/// \param [in] key_compare_op - binary operation function object that will be used to determine keys equality.
+/// The signature of the function should be equivalent to the following:
+/// <tt>bool f(const T &a, const T &b);</tt>. The signature does not need to have
+/// <tt>const &</tt>, but function object must not modify the objects passed to it.
+/// Default is KeyCompareFunction().
+/// \param [in] stream - [optional] HIP stream object. Default is \p 0 (default stream).
+/// \param [in] debug_synchronous - [optional] If true, synchronization after every kernel
+/// launch is forced in order to check for errors. Default value is \p false.
+///
+/// \returns \p cudaSuccess (\p 0) after successful reduction; otherwise a HIP runtime error of
+/// type \p cudaError_t.
+///
+/// \par Example
+/// \parblock
+/// In this example a device-level sum operation is performed on an array of
+/// integer values and integer keys.
+///
+/// \code{.cpp}
+/// #include <rocprim/rocprim.hpp>
+///
+/// // Prepare input and output (declare pointers, allocate device memory etc.)
+/// size_t input_size;          // e.g., 8
+/// int * keys_input;           // e.g., [1, 1, 1, 2, 10, 10, 10, 88]
+/// int * values_input;         // e.g., [1, 2, 3, 4,  5,  6,  7,  8]
+/// int * unique_output;        // empty array of at least 4 elements
+/// int * aggregates_output;    // empty array of at least 4 elements
+/// int * unique_count_output;  // empty array of 1 element
+///
+/// size_t temporary_storage_size_bytes;
+/// void * temporary_storage_ptr = nullptr;
+/// // Get required size of the temporary storage
+/// rocprim::reduce_by_key(
+///     temporary_storage_ptr, temporary_storage_size_bytes,
+///     keys_input, values_input, input_size,
+///     unique_output, aggregates_output, unique_count_output
+/// );
+///
+/// // allocate temporary storage
+/// cudaMalloc(&temporary_storage_ptr, temporary_storage_size_bytes);
+///
+/// // perform reduction
+/// rocprim::reduce_by_key(
+///     temporary_storage_ptr, temporary_storage_size_bytes,
+///     keys_input, values_input, input_size,
+///     unique_output, aggregates_output, unique_count_output
+/// );
+/// // unique_output:       [1, 2, 10, 88]
+/// // aggregates_output:   [6, 4, 18,  8]
+/// // unique_count_output: [4]
+/// \endcode
+/// \endparblock
+template<
+    typename Config = default_config,
+    typename KeysInputIterator,
+    typename ValuesInputIterator,
+    typename UniqueOutputIterator,
+    typename AggregatesOutputIterator,
+    typename UniqueCountOutputIterator,
+    typename BinaryFunction = ::rocprim::plus<typename std::iterator_traits<ValuesInputIterator>::value_type>,
+    typename KeyCompareFunction = ::rocprim::equal_to<typename std::iterator_traits<KeysInputIterator>::value_type>
+>
+inline cudaError_t reduce_by_key(
+    void * temporary_storage,
+    size_t& storage_size,
+    KeysInputIterator keys_input,
+    ValuesInputIterator values_input,
+    unsigned int size,
+    UniqueOutputIterator unique_output,
+    AggregatesOutputIterator aggregates_output,
+    UniqueCountOutputIterator unique_count_output,
+    BinaryFunction reduce_op = BinaryFunction(),
+    KeyCompareFunction key_compare_op = KeyCompareFunction(),
+    cudaStream_t stream = 0,
+    bool debug_synchronous = false)
+{
+    // This public entry point adds no logic of its own: every argument is passed,
+    // unchanged and in the same order, to detail::reduce_by_key_impl instantiated
+    // with the caller-selected Config, and the resulting status is returned as-is.
+    const cudaError_t status = detail::reduce_by_key_impl<Config>(
+        temporary_storage,
+        storage_size,
+        keys_input,
+        values_input,
+        size,
+        unique_output,
+        aggregates_output,
+        unique_count_output,
+        reduce_op,
+        key_compare_op,
+        stream,
+        debug_synchronous
+    );
+    return status;
+}
+/// @}
+// end of group devicemodule
+END_ROCPRIM_NAMESPACE
+#endif // ROCPRIM_DEVICE_DEVICE_REDUCE_BY_KEY_HPP_
--- a/3rdparty/cub/rocprim/device/device_reduce_by_key_config.hpp
+++ b/3rdparty/cub/rocprim/device/device_reduce_by_key_config.hpp
+// Copyright (c) 2018-2019 Advanced Micro Devices, Inc. All rights reserved.
+//
+// Permission is hereby granted, free of charge, to any person obtaining a copy
+// of this software and associated documentation files (the "Software"), to deal
+// in the Software without restriction, including without limitation the rights
+// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+// copies of the Software, and to permit persons to whom the Software is
+// furnished to do so, subject to the following conditions:
+//
+// The above copyright notice and this permission notice shall be included in
+// all copies or substantial portions of the Software.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+// THE SOFTWARE.
+#ifndef ROCPRIM_DEVICE_DEVICE_REDUCE_BY_KEY_CONFIG_HPP_
+#define ROCPRIM_DEVICE_DEVICE_REDUCE_BY_KEY_CONFIG_HPP_
+#include <type_traits>
+#include "../config.hpp"
+#include "../detail/various.hpp"
+#include "config_types.hpp"
+/// \addtogroup primitivesmodule_deviceconfigs
+/// @{
+BEGIN_ROCPRIM_NAMESPACE
+/// \brief Tuning configuration of the device-level reduce-by-key operation.
+///
+/// Bundles the two kernel configurations given as template arguments and
+/// re-exports them under the member names \p scan and \p reduce.
+///
+/// \tparam ScanConfig - configuration of carry-outs scan kernel. Must be \p kernel_config.
+/// \tparam ReduceConfig - configuration of the main reduce-by-key kernel. Must be \p kernel_config.
+template<class ScanConfig, class ReduceConfig>
+struct reduce_by_key_config
+{
+    /// \brief Kernel configuration of the main reduce-by-key kernel.
+    using reduce = ReduceConfig;
+    /// \brief Kernel configuration of the carry-outs scan kernel.
+    using scan = ScanConfig;
+};
+namespace detail
+{
+// Reduce-by-key tuning that default_reduce_by_key_config associates with
+// architecture tag 803. `type` resolves to one of two reduce_by_key_config
+// variants depending on the sizes of Key and Value.
+template<class Key, class Value>
+struct reduce_by_key_config_803
+{
+    // ceiling_div of the combined key + value size by the size of two ints.
+    static constexpr unsigned int item_scale =
+        ::rocprim::detail::ceiling_div<unsigned int>(sizeof(Key) + sizeof(Value), 2 * sizeof(int));
+    // Scan kernel configuration shared by both variants
+    // (presumably <block size, items per thread> - confirm against kernel_config).
+    using scan = kernel_config<256, 4>;
+private:
+    // True when neither the key type nor the value type is larger than 8 bytes.
+    static constexpr bool small_key_and_value = (sizeof(Key) <= 8) && (sizeof(Value) <= 8);
+    // Reduce kernel configuration offered for small keys and values.
+    using small_reduce = kernel_config<256, 7>;
+    // Reduce kernel configuration offered otherwise: first argument comes from
+    // limit_block_size, second shrinks as item_scale grows but never drops below 1.
+    using large_reduce = kernel_config<
+        limit_block_size<256U, sizeof(Key) + sizeof(Value), ROCPRIM_WARP_SIZE_64>::value,
+        ::rocprim::max(1u, 15u / item_scale)
+    >;
+public:
+    // Resulting configuration, chosen through select_type.
+    using type = select_type<
+        select_type_case<small_key_and_value, reduce_by_key_config<scan, small_reduce> >,
+        reduce_by_key_config<scan, large_reduce>
+    >;
+};
+// Reduce-by-key tuning that default_reduce_by_key_config associates with
+// architecture tag 900 and also lists as its final, untagged entry. `type`
+// resolves to one of two reduce_by_key_config variants depending on the sizes
+// of Key and Value.
+template<class Key, class Value>
+struct reduce_by_key_config_900
+{
+    // ceiling_div of the combined key + value size by the size of two ints.
+    static constexpr unsigned int item_scale =
+        ::rocprim::detail::ceiling_div<unsigned int>(sizeof(Key) + sizeof(Value), 2 * sizeof(int));
+    // Scan kernel configuration shared by both variants
+    // (presumably <block size, items per thread> - confirm against kernel_config).
+    using scan = kernel_config<256, 2>;
+private:
+    // True when neither the key type nor the value type is larger than 8 bytes.
+    static constexpr bool small_key_and_value = (sizeof(Key) <= 8) && (sizeof(Value) <= 8);
+    // Reduce kernel configuration offered for small keys and values.
+    using small_reduce = kernel_config<256, 10>;
+    // Reduce kernel configuration offered otherwise: first argument comes from
+    // limit_block_size, second shrinks as item_scale grows but never drops below 1.
+    using large_reduce = kernel_config<
+        limit_block_size<256U, sizeof(Key) + sizeof(Value), ROCPRIM_WARP_SIZE_64>::value,
+        ::rocprim::max(1u, 15u / item_scale)
+    >;
+public:
+    // Resulting configuration, chosen through select_type.
+    using type = select_type<
+        select_type_case<small_key_and_value, reduce_by_key_config<scan, small_reduce> >,
+        reduce_by_key_config<scan, large_reduce>
+    >;
+};
+// TODO: We need to update these parameters
+// Reduce-by-key tuning that default_reduce_by_key_config associates with
+// ROCPRIM_ARCH_90a. The values currently equal those of reduce_by_key_config_900.
+template<class Key, class Value>
+struct reduce_by_key_config_90a
+{
+    // ceiling_div of the combined key + value size by the size of two ints.
+    static constexpr unsigned int item_scale =
+        ::rocprim::detail::ceiling_div<unsigned int>(sizeof(Key) + sizeof(Value), 2 * sizeof(int));
+    // Scan kernel configuration shared by both variants
+    // (presumably <block size, items per thread> - confirm against kernel_config).
+    using scan = kernel_config<256, 2>;
+private:
+    // True when neither the key type nor the value type is larger than 8 bytes.
+    static constexpr bool small_key_and_value = (sizeof(Key) <= 8) && (sizeof(Value) <= 8);
+    // Reduce kernel configuration offered for small keys and values.
+    using small_reduce = kernel_config<256, 10>;
+    // Reduce kernel configuration offered otherwise: first argument comes from
+    // limit_block_size, second shrinks as item_scale grows but never drops below 1.
+    using large_reduce = kernel_config<
+        limit_block_size<256U, sizeof(Key) + sizeof(Value), ROCPRIM_WARP_SIZE_64>::value,
+        ::rocprim::max(1u, 15u / item_scale)
+    >;
+public:
+    // Resulting configuration, chosen through select_type.
+    using type = select_type<
+        select_type_case<small_key_and_value, reduce_by_key_config<scan, small_reduce> >,
+        reduce_by_key_config<scan, large_reduce>
+    >;
+};
+// TODO: We need to update these parameters
+// Reduce-by-key tuning that default_reduce_by_key_config associates with
+// architecture tag 1030. Unlike the other variants in this file, the block size
+// limit is computed with ROCPRIM_WARP_SIZE_32.
+template<class Key, class Value>
+struct reduce_by_key_config_1030
+{
+    // ceiling_div of the combined key + value size by the size of two ints.
+    static constexpr unsigned int item_scale =
+        ::rocprim::detail::ceiling_div<unsigned int>(sizeof(Key) + sizeof(Value), 2 * sizeof(int));
+    // Scan kernel configuration shared by both variants
+    // (presumably <block size, items per thread> - confirm against kernel_config).
+    using scan = kernel_config<256, 2>;
+private:
+    // True when neither the key type nor the value type is larger than 8 bytes.
+    static constexpr bool small_key_and_value = (sizeof(Key) <= 8) && (sizeof(Value) <= 8);
+    // Reduce kernel configuration offered for small keys and values.
+    using small_reduce = kernel_config<256, 10>;
+    // Reduce kernel configuration offered otherwise: first argument comes from
+    // limit_block_size, second shrinks as item_scale grows but never drops below 1.
+    using large_reduce = kernel_config<
+        limit_block_size<256U, sizeof(Key) + sizeof(Value), ROCPRIM_WARP_SIZE_32>::value,
+        ::rocprim::max(1u, 15u / item_scale)
+    >;
+public:
+    // Resulting configuration, chosen through select_type.
+    using type = select_type<
+        select_type_case<small_key_and_value, reduce_by_key_config<scan, small_reduce> >,
+        reduce_by_key_config<scan, large_reduce>
+    >;
+};
+// Default reduce-by-key configuration for a given target architecture.
+// Architecture tags 803, 900, ROCPRIM_ARCH_90a and 1030 are each paired with a
+// dedicated tuning struct; the last, untagged argument is
+// reduce_by_key_config_900 (presumably the fallback for any other TargetArch -
+// confirm against select_arch).
+template<unsigned int TargetArch, class Key, class Value>
+struct default_reduce_by_key_config
+    : public select_arch<
+          TargetArch,
+          select_arch_case<803, reduce_by_key_config_803<Key, Value> >,
+          select_arch_case<900, reduce_by_key_config_900<Key, Value> >,
+          select_arch_case<ROCPRIM_ARCH_90a, reduce_by_key_config_90a<Key, Value> >,
+          select_arch_case<1030, reduce_by_key_config_1030<Key, Value> >,
+          reduce_by_key_config_900<Key, Value>
+      >
+{
+};
+} // end namespace detail
+END_ROCPRIM_NAMESPACE
+/// @}
+// end of group primitivesmodule_deviceconfigs
+#endif // ROCPRIM_DEVICE_DEVICE_REDUCE_BY_KEY_CONFIG_HPP_
--- a/3rdparty/cub/rocprim/device/device_reduce_config.hpp
+++ b/3rdparty/cub/rocprim/device/device_reduce_config.hpp
+// Copyright (c) 2018-2019 Advanced Micro Devices, Inc. All rights reserved.
+//
+// Permission is hereby granted, free of charge, to any person obtaining a copy
+// of this software and associated documentation files (the "Software"), to deal
+// in the Software without restriction, including without limitation the rights
+// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+// copies of the Software, and to permit persons to whom the Software is
+// furnished to do so, subject to the following conditions:
+//
+// The above copyright notice and this permission notice shall be included in
+// all copies or substantial portions of the Software.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+// THE SOFTWARE.
+#ifndef ROCPRIM_DEVICE_DEVICE_REDUCE_CONFIG_HPP_
+#define ROCPRIM_DEVICE_DEVICE_REDUCE_CONFIG_HPP_
+#include <type_traits>
+#include "../config.hpp"
+#include "../detail/various.hpp"
+#include "../block/block_reduce.hpp"
+#include "config_types.hpp"
+#include "detail/device_config_helper.hpp"
+/// \addtogroup primitivesmodule_deviceconfigs
+/// @{
+BEGIN_ROCPRIM_NAMESPACE
+namespace detail
+{
+// Reduce tuning that default_reduce_config associates with architecture tag 803.
+template<class Value>
+struct reduce_config_803
+{
+    // ceiling_div of the value size by the size of an int.
+    static constexpr unsigned int item_scale =
+        ::rocprim::detail::ceiling_div<unsigned int>(sizeof(Value), sizeof(int));
+private:
+    // First reduce_config argument: the value reported by limit_block_size.
+    static constexpr auto limited_block_size =
+        limit_block_size<256U, sizeof(Value), ROCPRIM_WARP_SIZE_64>::value;
+    // Second reduce_config argument: 16 divided by item_scale, but never below 1.
+    static constexpr auto scaled_items = ::rocprim::max(1u, 16u / item_scale);
+public:
+    // Resulting configuration; the block reduction algorithm is using_warp_reduce.
+    using type = reduce_config<
+        limited_block_size,
+        scaled_items,
+        ::rocprim::block_reduce_algorithm::using_warp_reduce
+    >;
+};
+// Reduce tuning that default_reduce_config associates with architecture tag 900
+// and also lists as its final, untagged entry.
+template<class Value>
+struct reduce_config_900
+{
+    // ceiling_div of the value size by the size of an int.
+    static constexpr unsigned int item_scale =
+        ::rocprim::detail::ceiling_div<unsigned int>(sizeof(Value), sizeof(int));
+private:
+    // First reduce_config argument: the value reported by limit_block_size.
+    static constexpr auto limited_block_size =
+        limit_block_size<256U, sizeof(Value), ROCPRIM_WARP_SIZE_64>::value;
+    // Second reduce_config argument: 16 divided by item_scale, but never below 1.
+    static constexpr auto scaled_items = ::rocprim::max(1u, 16u / item_scale);
+public:
+    // Resulting configuration; the block reduction algorithm is using_warp_reduce.
+    using type = reduce_config<
+        limited_block_size,
+        scaled_items,
+        ::rocprim::block_reduce_algorithm::using_warp_reduce
+    >;
+};
+// TODO: We need to update these parameters
+// Reduce tuning that default_reduce_config associates with ROCPRIM_ARCH_90a.
+// The values currently equal those of reduce_config_900.
+template<class Value>
+struct reduce_config_90a
+{
+    // ceiling_div of the value size by the size of an int.
+    static constexpr unsigned int item_scale =
+        ::rocprim::detail::ceiling_div<unsigned int>(sizeof(Value), sizeof(int));
+private:
+    // First reduce_config argument: the value reported by limit_block_size.
+    static constexpr auto limited_block_size =
+        limit_block_size<256U, sizeof(Value), ROCPRIM_WARP_SIZE_64>::value;
+    // Second reduce_config argument: 16 divided by item_scale, but never below 1.
+    static constexpr auto scaled_items = ::rocprim::max(1u, 16u / item_scale);
+public:
+    // Resulting configuration; the block reduction algorithm is using_warp_reduce.
+    using type = reduce_config<
+        limited_block_size,
+        scaled_items,
+        ::rocprim::block_reduce_algorithm::using_warp_reduce
+    >;
+};
+// TODO: We need to update these parameters
+// Reduce tuning that default_reduce_config associates with architecture tag 1030.
+// Unlike the other variants in this file, the block size limit is computed with
+// ROCPRIM_WARP_SIZE_32.
+template<class Value>
+struct reduce_config_1030
+{
+    // ceiling_div of the value size by the size of an int.
+    static constexpr unsigned int item_scale =
+        ::rocprim::detail::ceiling_div<unsigned int>(sizeof(Value), sizeof(int));
+private:
+    // First reduce_config argument: the value reported by limit_block_size.
+    static constexpr auto limited_block_size =
+        limit_block_size<256U, sizeof(Value), ROCPRIM_WARP_SIZE_32>::value;
+    // Second reduce_config argument: 16 divided by item_scale, but never below 1.
+    static constexpr auto scaled_items = ::rocprim::max(1u, 16u / item_scale);
+public:
+    // Resulting configuration; the block reduction algorithm is using_warp_reduce.
+    using type = reduce_config<
+        limited_block_size,
+        scaled_items,
+        ::rocprim::block_reduce_algorithm::using_warp_reduce
+    >;
+};
+// Default reduce configuration for a given target architecture.
+// Architecture tags 803, 900, ROCPRIM_ARCH_90a and 1030 are each paired with a
+// dedicated tuning struct; the last, untagged argument is reduce_config_900
+// (presumably the fallback for any other TargetArch - confirm against select_arch).
+template<unsigned int TargetArch, class Value>
+struct default_reduce_config
+    : public select_arch<
+          TargetArch,
+          select_arch_case<803, reduce_config_803<Value>>,
+          select_arch_case<900, reduce_config_900<Value>>,
+          select_arch_case<ROCPRIM_ARCH_90a, reduce_config_90a<Value>>,
+          select_arch_case<1030, reduce_config_1030<Value>>,
+          reduce_config_900<Value>
+      >
+{
+};
+} // end namespace detail
+END_ROCPRIM_NAMESPACE
+/// @}
+// end of group primitivesmodule_deviceconfigs
+#endif // ROCPRIM_DEVICE_DEVICE_REDUCE_CONFIG_HPP_
--- a/3rdparty/cub/rocprim/device/device_run_length_encode.hpp
+++ b/3rdparty/cub/rocprim/device/device_run_length_encode.hpp
+// Copyright (c) 2018-2020 Advanced Micro Devices, Inc. All rights reserved.
+//
+// Permission is hereby granted, free of charge, to any person obtaining a copy
+// of this software and associated documentation files (the "Software"), to deal
+// in the Software without restriction, including without limitation the rights
+// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+// copies of the Software, and to permit persons to whom the Software is
+// furnished to do so, subject to the following conditions:
+//
+// The above copyright notice and this permission notice shall be included in
+// all copies or substantial portions of the Software.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+// THE SOFTWARE.
+#ifndef ROCPRIM_DEVICE_DEVICE_RUN_LENGTH_ENCODE_HPP_
+#define ROCPRIM_DEVICE_DEVICE_RUN_LENGTH_ENCODE_HPP_
+#include <type_traits>
+#include <iterator>
+#include "../config.hpp"
+#include "../detail/various.hpp"
+#include "../iterator/constant_iterator.hpp"
+#include "../iterator/counting_iterator.hpp"
+#include "../iterator/discard_iterator.hpp"
+#include "../iterator/zip_iterator.hpp"
+#include "device_run_length_encode_config.hpp"
+#include "device_reduce_by_key.hpp"
+#include "device_select.hpp"
+BEGIN_ROCPRIM_NAMESPACE
+/// \addtogroup devicemodule
+/// @{
+namespace detail
+{
+#define ROCPRIM_DETAIL_HIP_SYNC_AND_RETURN_ON_ERROR(name, size, start) \
+    { \
+        if(error != cudaSuccess) return error; \
+        if(debug_synchronous) \
+        { \
+            std::cout << name << "(" << size << ")"; \
+            auto error = cudaStreamSynchronize(stream); \
+            if(error != cudaSuccess) return error; \
+            auto end = std::chrono::high_resolution_clock::now(); \
+            auto d = std::chrono::duration_cast<std::chrono::duration<double>>(end - start); \
+            std::cout << " " << d.count() * 1000 << " ms" << '\n'; \
+        } \
+    }
+} // end detail namespace
+/// \brief Parallel run-length encoding for device level.
+///
+/// run_length_encode function performs a device-wide run-length encoding of runs (groups)
+/// of consecutive values. The first value of each run is copied to \p unique_output and
+/// the length of the run is written to \p counts_output.
+/// The total number of runs is written to \p runs_count_output.
+///
+/// \par Overview
+/// * Returns the required size of \p temporary_storage in \p storage_size
+/// if \p temporary_storage is a null pointer.
+/// * Range specified by \p input must have at least \p size elements.
+/// * Range specified by \p runs_count_output must have at least 1 element.
+/// * Ranges specified by \p unique_output and \p counts_output must have at least
+/// <tt>*runs_count_output</tt> (i.e. the number of runs) elements.
+///
+/// \tparam Config - [optional] configuration of the primitive. It can be \p run_length_encode_config or
+/// a custom class with the same members.
+/// \tparam InputIterator - random-access iterator type of the input range. Must meet the
+/// requirements of a C++ InputIterator concept. It can be a simple pointer type.
+/// \tparam UniqueOutputIterator - random-access iterator type of the output range. Must meet the
+/// requirements of a C++ OutputIterator concept. It can be a simple pointer type.
+/// \tparam CountsOutputIterator - random-access iterator type of the output range. Must meet the
+/// requirements of a C++ OutputIterator concept. It can be a simple pointer type.
+/// \tparam RunsCountOutputIterator - random-access iterator type of the output range. Must meet the
+/// requirements of a C++ OutputIterator concept. It can be a simple pointer type.
+///
+/// \param [in] temporary_storage - pointer to a device-accessible temporary storage. When
+/// a null pointer is passed, the required allocation size (in bytes) is written to
+/// \p storage_size and function returns without performing the operation.
+/// \param [in,out] storage_size - reference to a size (in bytes) of \p temporary_storage.
+/// \param [in] input - iterator to the first element in the range of values.
+/// \param [in] size - number of elements in the input range.
+/// \param [out] unique_output - iterator to the first element in the output range of unique values.
+/// \param [out] counts_output - iterator to the first element in the output range of lengths.
+/// \param [out] runs_count_output - iterator to total number of runs.
+/// \param [in] stream - [optional] HIP stream object. Default is \p 0 (default stream).
+/// \param [in] debug_synchronous - [optional] If true, synchronization after every kernel
+/// launch is forced in order to check for errors. Default value is \p false.
+///
+/// \returns \p cudaSuccess (\p 0) after successful operation; otherwise a HIP runtime error of
+/// type \p cudaError_t.
+///
+/// \par Example
+/// \parblock
+/// In this example a device-level run-length encoding operation is performed on an array of
+/// integer values.
+///
+/// \code{.cpp}
+/// #include <rocprim/rocprim.hpp>
+///
+/// // Prepare input and output (declare pointers, allocate device memory etc.)
+/// size_t input_size;          // e.g., 8
+/// int * input;                // e.g., [1, 1, 1, 2, 10, 10, 10, 88]
+/// int * unique_output;        // empty array of at least 4 elements
+/// int * counts_output;        // empty array of at least 4 elements
+/// int * runs_count_output;    // empty array of 1 element
+///
+/// size_t temporary_storage_size_bytes;
+/// void * temporary_storage_ptr = nullptr;
+/// // Get required size of the temporary storage
+/// rocprim::run_length_encode(
+///     temporary_storage_ptr, temporary_storage_size_bytes,
+///     input, input_size,
+///     unique_output, counts_output, runs_count_output
+/// );
+///
+/// // allocate temporary storage
+/// cudaMalloc(&temporary_storage_ptr, temporary_storage_size_bytes);
+///
+/// // perform encoding
+/// rocprim::run_length_encode(
+///     temporary_storage_ptr, temporary_storage_size_bytes,
+///     input, input_size,
+///     unique_output, counts_output, runs_count_output
+/// );
+/// // unique_output:     [1, 2, 10, 88]
+/// // counts_output:     [3, 1,  3,  1]
+/// // runs_count_output: [4]
+/// \endcode
+/// \endparblock
+template<
+    typename Config = default_config,
+    typename InputIterator,
+    typename UniqueOutputIterator,
+    typename CountsOutputIterator,
+    typename RunsCountOutputIterator
+>
+inline cudaError_t run_length_encode(
+    void * temporary_storage,
+    size_t& storage_size,
+    InputIterator input,
+    unsigned int size,
+    UniqueOutputIterator unique_output,
+    CountsOutputIterator counts_output,
+    RunsCountOutputIterator runs_count_output,
+    cudaStream_t stream = 0,
+    bool debug_synchronous = false)
+{
+    // Run lengths are accumulated as unsigned int; input values are compared
+    // using their own value type.
+    using run_length_type = unsigned int;
+    using element_type = typename std::iterator_traits<InputIterator>::value_type;
+    // Either the caller-provided Config or the library default for run-length encoding.
+    using selected_config = detail::default_or_custom_config<
+        Config,
+        detail::default_run_length_encode_config
+    >;
+    // The encoding is expressed as a reduce-by-key: the input values act as keys,
+    // every element contributes the constant 1, and the per-key sums written to
+    // counts_output are therefore the run lengths.
+    const auto unit_lengths = make_constant_iterator<run_length_type>(1);
+    return ::rocprim::reduce_by_key<typename selected_config::reduce_by_key>(
+        temporary_storage,
+        storage_size,
+        input,
+        unit_lengths,
+        size,
+        unique_output,
+        counts_output,
+        runs_count_output,
+        ::rocprim::plus<run_length_type>(),
+        ::rocprim::equal_to<element_type>(),
+        stream,
+        debug_synchronous
+    );
+}
+/// \brief Parallel run-length encoding of non-trivial runs for device level.
+///
+/// run_length_encode_non_trivial_runs function performs a device-wide run-length encoding of
+/// non-trivial runs (groups) of consecutive values (groups of more than one element).
+/// The offset of the first value of each non-trivial run is copied to \p offsets_output and
+/// the length of the run (the count of elements) is written to \p counts_output.
+/// The total number of non-trivial runs is written to \p runs_count_output.
+///
+/// \par Overview
+/// * Returns the required size of \p temporary_storage in \p storage_size
+/// if \p temporary_storage is a null pointer.
+/// * Range specified by \p input must have at least \p size elements.
+/// * Range specified by \p runs_count_output must have at least 1 element.
+/// * Ranges specified by \p offsets_output and \p counts_output must have at least
+/// <tt>*runs_count_output</tt> (i.e. the number of non-trivial runs) elements.
+///
+/// \tparam Config - [optional] configuration of the primitive. It can be \p run_length_encode_config or
+/// a custom class with the same members.
+/// \tparam InputIterator - random-access iterator type of the input range. Must meet the
+/// requirements of a C++ InputIterator concept. It can be a simple pointer type.
+/// \tparam OffsetsOutputIterator - random-access iterator type of the output range. Must meet the
+/// requirements of a C++ OutputIterator concept. It can be a simple pointer type.
+/// \tparam CountsOutputIterator - random-access iterator type of the output range. Must meet the
+/// requirements of a C++ OutputIterator concept. It can be a simple pointer type.
+/// \tparam RunsCountOutputIterator - random-access iterator type of the output range. Must meet the
+/// requirements of a C++ OutputIterator concept. It can be a simple pointer type.
+///
+/// \param [in] temporary_storage - pointer to a device-accessible temporary storage. When
+/// a null pointer is passed, the required allocation size (in bytes) is written to
+/// \p storage_size and function returns without performing the operation.
+/// \param [in,out] storage_size - reference to a size (in bytes) of \p temporary_storage.
+/// \param [in] input - iterator to the first element in the range of values.
+/// \param [in] size - number of elements in the input range.
+/// \param [out] offsets_output - iterator to the first element in the output range of offsets.
+/// \param [out] counts_output - iterator to the first element in the output range of lengths.
+/// \param [out] runs_count_output - iterator to total number of runs.
+/// \param [in] stream - [optional] HIP stream object. Default is \p 0 (default stream).
+/// \param [in] debug_synchronous - [optional] If true, synchronization after every kernel
+/// launch is forced in order to check for errors. Default value is \p false.
+///
+/// \returns \p cudaSuccess (\p 0) after successful operation; otherwise a HIP runtime error of
+/// type \p cudaError_t.
+///
+/// \par Example
+/// \parblock
+/// In this example a device-level run-length encoding of non-trivial runs is performed on an array of
+/// integer values.
+///
+/// \code{.cpp}
+/// #include <rocprim/rocprim.hpp>
+///
+/// // Prepare input and output (declare pointers, allocate device memory etc.)
+/// size_t input_size;          // e.g., 8
+/// int * input;                // e.g., [1, 1, 1, 2, 10, 10, 10, 88]
+/// int * offsets_output;       // empty array of at least 2 elements
+/// int * counts_output;        // empty array of at least 2 elements
+/// int * runs_count_output;    // empty array of 1 element
+///
+/// size_t temporary_storage_size_bytes;
+/// void * temporary_storage_ptr = nullptr;
+/// // Get required size of the temporary storage
+/// rocprim::run_length_encode_non_trivial_runs(
+///     temporary_storage_ptr, temporary_storage_size_bytes,
+///     input, input_size,
+///     offsets_output, counts_output, runs_count_output
+/// );
+///
+/// // allocate temporary storage
+/// cudaMalloc(&temporary_storage_ptr, temporary_storage_size_bytes);
+///
+/// // perform encoding
+/// rocprim::run_length_encode_non_trivial_runs(
+///     temporary_storage_ptr, temporary_storage_size_bytes,
+///     input, input_size,
+///     offsets_output, counts_output, runs_count_output
+/// );
+/// // offsets_output:    [0, 4]
+/// // counts_output:     [3, 3]
+/// // runs_count_output: [2]
+/// \endcode
+/// \endparblock
+template<
+    class Config = default_config,
+    class InputIterator,
+    class OffsetsOutputIterator,
+    class CountsOutputIterator,
+    class RunsCountOutputIterator
+>
+inline
+cudaError_t run_length_encode_non_trivial_runs(void * temporary_storage,
+                                              size_t& storage_size,
+                                              InputIterator input,
+                                              unsigned int size,
+                                              OffsetsOutputIterator offsets_output,
+                                              CountsOutputIterator counts_output,
+                                              RunsCountOutputIterator runs_count_output,
+                                              cudaStream_t stream = 0,
+                                              bool debug_synchronous = false)
+{
+    using input_type = typename std::iterator_traits<InputIterator>::value_type;
+    using offset_type = unsigned int;
+    using count_type = unsigned int;
+    using offset_count_pair = typename ::rocprim::tuple<offset_type, count_type>;
+    using config = detail::default_or_custom_config<
+        Config,
+        detail::default_run_length_encode_config
+    >;
+    cudaError_t error;
+    auto reduce_op = [] __device__ (const offset_count_pair& a, const offset_count_pair& b)
+    {
+        return offset_count_pair(
+            ::rocprim::get<0>(a), // Always use offset of the first item of the run
+            ::rocprim::get<1>(a) + ::rocprim::get<1>(b) // Number of items in the run
+        );
+    };
+    auto non_trivial_runs_select_op = [] __device__ (const offset_count_pair& a)
+    {
+        return ::rocprim::get<1>(a) > 1;
+    };
+    offset_type * offsets_tmp = nullptr;
+    count_type * counts_tmp = nullptr;
+    count_type * all_runs_count_tmp = nullptr;
+    // Calculate size of temporary storage for reduce_by_key operation
+    size_t reduce_by_key_bytes;
+    error = ::rocprim::reduce_by_key<typename config::reduce_by_key>(
+        nullptr, reduce_by_key_bytes,
+        input,
+        ::rocprim::make_zip_iterator(
+            ::rocprim::make_tuple(
+                ::rocprim::make_counting_iterator<offset_type>(0),
+                ::rocprim::make_constant_iterator<count_type>(1)
+            )
+        ),
+        size,
+        ::rocprim::make_discard_iterator(),
+        ::rocprim::make_zip_iterator(::rocprim::make_tuple(offsets_tmp, counts_tmp)),
+        all_runs_count_tmp,
+        reduce_op, ::rocprim::equal_to<input_type>(),
+        stream, debug_synchronous
+    );
+    if(error != cudaSuccess) return error;
+    reduce_by_key_bytes = ::rocprim::detail::align_size(reduce_by_key_bytes);
+    // Calculate size of temporary storage for select operation
+    size_t select_bytes;
+    error = ::rocprim::select<typename config::select>(
+        nullptr, select_bytes,
+        ::rocprim::make_zip_iterator(::rocprim::make_tuple(offsets_tmp, counts_tmp)),
+        ::rocprim::make_zip_iterator(::rocprim::make_tuple(offsets_output, counts_output)),
+        runs_count_output,
+        size,
+        non_trivial_runs_select_op,
+        stream, debug_synchronous
+    );
+    if(error != cudaSuccess) return error;
+    select_bytes = ::rocprim::detail::align_size(select_bytes);
+    const size_t offsets_tmp_bytes = ::rocprim::detail::align_size(size * sizeof(offset_type));
+    const size_t counts_tmp_bytes = ::rocprim::detail::align_size(size * sizeof(count_type));
+    const size_t all_runs_count_tmp_bytes = sizeof(count_type);
+    if(temporary_storage == nullptr)
+    {
+        storage_size = ::rocprim::max(reduce_by_key_bytes, select_bytes) +
+            offsets_tmp_bytes + counts_tmp_bytes + all_runs_count_tmp_bytes;
+        return cudaSuccess;
+    }
+    char * ptr = reinterpret_cast<char *>(temporary_storage);
+    ptr += ::rocprim::max(reduce_by_key_bytes, select_bytes);
+    offsets_tmp = reinterpret_cast<offset_type *>(ptr);
+    ptr += offsets_tmp_bytes;
+    counts_tmp = reinterpret_cast<count_type *>(ptr);
+    ptr += counts_tmp_bytes;
+    all_runs_count_tmp = reinterpret_cast<count_type *>(ptr);
+    std::chrono::high_resolution_clock::time_point start;
+    if(debug_synchronous) start = std::chrono::high_resolution_clock::now();
+    error = ::rocprim::reduce_by_key<typename config::reduce_by_key>(
+        temporary_storage, reduce_by_key_bytes,
+        input,
+        ::rocprim::make_zip_iterator(
+            ::rocprim::make_tuple(
+                ::rocprim::make_counting_iterator<offset_type>(0),
+                ::rocprim::make_constant_iterator<count_type>(1)
+            )
+        ),
+        size,
+        ::rocprim::make_discard_iterator(), // Ignore unique output
+        ::rocprim::make_zip_iterator(rocprim::make_tuple(offsets_tmp, counts_tmp)),
+        all_runs_count_tmp,
+        reduce_op, ::rocprim::equal_to<input_type>(),
+        stream, debug_synchronous
+    );
+    ROCPRIM_DETAIL_HIP_SYNC_AND_RETURN_ON_ERROR("rocprim::reduce_by_key", size, start)
+    // Read count of all runs (including trivial runs)
+    count_type all_runs_count;
+    // cudaMemcpyWithStream is only supported on rocm 3.1 and above
+    error = cudaMemcpyAsync(&all_runs_count, all_runs_count_tmp, sizeof(count_type), cudaMemcpyDeviceToHost, stream);
+    if(error != cudaSuccess) return error;
+    error = cudaStreamSynchronize(stream);
+    // Select non-trivial runs
+    if(debug_synchronous) start = std::chrono::high_resolution_clock::now();
+    error = ::rocprim::select<typename config::select>(
+        temporary_storage, select_bytes,
+        ::rocprim::make_zip_iterator(::rocprim::make_tuple(offsets_tmp, counts_tmp)),
+        ::rocprim::make_zip_iterator(::rocprim::make_tuple(offsets_output, counts_output)),
+        runs_count_output,
+        all_runs_count,
+        non_trivial_runs_select_op,
+        stream, debug_synchronous
+    );
+    ROCPRIM_DETAIL_HIP_SYNC_AND_RETURN_ON_ERROR("rocprim::select", all_runs_count, start)
+    return cudaSuccess;
+}
+#undef ROCPRIM_DETAIL_HIP_SYNC_AND_RETURN_ON_ERROR
+/// @}
+// end of group devicemodule
+END_ROCPRIM_NAMESPACE
+#endif // ROCPRIM_DEVICE_DEVICE_RUN_LENGTH_ENCODE_HPP_
--- a/3rdparty/cub/rocprim/device/device_run_length_encode_config.hpp
+++ b/3rdparty/cub/rocprim/device/device_run_length_encode_config.hpp
+// Copyright (c) 2018-2022 Advanced Micro Devices, Inc. All rights reserved.
+//
+// Permission is hereby granted, free of charge, to any person obtaining a copy
+// of this software and associated documentation files (the "Software"), to deal
+// in the Software without restriction, including without limitation the rights
+// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+// copies of the Software, and to permit persons to whom the Software is
+// furnished to do so, subject to the following conditions:
+//
+// The above copyright notice and this permission notice shall be included in
+// all copies or substantial portions of the Software.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+// THE SOFTWARE.
+#ifndef ROCPRIM_DEVICE_DEVICE_RUN_LENGTH_ENCODE_CONFIG_HPP_
+#define ROCPRIM_DEVICE_DEVICE_RUN_LENGTH_ENCODE_CONFIG_HPP_
+#include <type_traits>
+#include "../config.hpp"
+#include "../detail/various.hpp"
+#include "config_types.hpp"
+/// \addtogroup primitivesmodule_deviceconfigs
+/// @{
+BEGIN_ROCPRIM_NAMESPACE
+/// \brief Bundles the sub-configurations used by the device-level run-length encoding operation.
+///
+/// The struct holds no data; it only re-exports its two template arguments as
+/// member type aliases.
+///
+/// \tparam ReduceByKeyConfig - configuration of the device-level reduce-by-key step.
+/// Must be \p reduce_by_key_config or \p default_config.
+/// \tparam SelectConfig - configuration of the device-level select step.
+/// Must be \p select_config or \p default_config (which is also the default argument).
+template<class ReduceByKeyConfig, class SelectConfig = default_config>
+struct run_length_encode_config
+{
+    /// \brief Configuration used for the device-level select step.
+    using select = SelectConfig;
+    /// \brief Configuration used for the device-level reduce-by-key step.
+    using reduce_by_key = ReduceByKeyConfig;
+};
+namespace detail
+{
+// Run-length-encode configuration in which both the reduce-by-key step and
+// the select step are left at default_config.
+using default_run_length_encode_config
+    = run_length_encode_config<default_config, default_config>;
+} // end namespace detail
+END_ROCPRIM_NAMESPACE
+/// @}
+// end of group primitivesmodule_deviceconfigs
+#endif // ROCPRIM_DEVICE_DEVICE_RUN_LENGTH_ENCODE_CONFIG_HPP_
--- a/3rdparty/cub/rocprim/device/device_scan.hpp
+++ b/3rdparty/cub/rocprim/device/device_scan.hpp
+// Copyright (c) 2017-2022 Advanced Micro Devices, Inc. All rights reserved.
+//
+// Permission is hereby granted, free of charge, to any person obtaining a copy
+// of this software and associated documentation files (the "Software"), to deal
+// in the Software without restriction, including without limitation the rights
+// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+// copies of the Software, and to permit persons to whom the Software is
+// furnished to do so, subject to the following conditions:
+//
+// The above copyright notice and this permission notice shall be included in
+// all copies or substantial portions of the Software.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+// THE SOFTWARE.
+#ifndef ROCPRIM_DEVICE_DEVICE_SCAN_HPP_
+#define ROCPRIM_DEVICE_DEVICE_SCAN_HPP_
+#include <type_traits>
+#include <iterator>
+#include "../config.hpp"
+#include "../functional.hpp"
+#include "../type_traits.hpp"
+#include "../types/future_value.hpp"
+#include "../detail/various.hpp"
+#include "device_scan_config.hpp"
+#include "device_transform.hpp"
+#include "detail/device_scan_common.hpp"
+#include "detail/device_scan_lookback.hpp"
+#include "detail/device_scan_reduce_then_scan.hpp"
+BEGIN_ROCPRIM_NAMESPACE
+/// \addtogroup devicemodule
+/// @{
+namespace detail
+{
+// Single-block scan kernel.
+// Thin kernel entry point: unwraps initial_value with get_input_value() and
+// forwards all arguments to single_scan_kernel_impl. The host code in this
+// file launches it with a grid of exactly one block.
+template<bool Exclusive,
+         class Config,
+         class InputIterator,
+         class OutputIterator,
+         class BinaryFunction,
+         class InitValueType>
+ROCPRIM_KERNEL
+__launch_bounds__(ROCPRIM_DEFAULT_MAX_BLOCK_SIZE)
+void single_scan_kernel(InputIterator input,
+                        const size_t size,
+                        const InitValueType initial_value,
+                        OutputIterator output,
+                        BinaryFunction scan_op)
+{
+    single_scan_kernel_impl<Exclusive, Config>(input,
+                                               size,
+                                               get_input_value(initial_value),
+                                               output,
+                                               scan_op);
+}
+// Reduce-then-scan, first stage.
+// Kernel entry point that forwards to block_reduce_kernel_impl, which fills
+// block_prefixes with the per-block values later consumed by
+// final_scan_kernel when it performs the block scan operations.
+template<class Config,
+         class InputIterator,
+         class BinaryFunction,
+         class ResultType>
+ROCPRIM_KERNEL
+__launch_bounds__(ROCPRIM_DEFAULT_MAX_BLOCK_SIZE)
+void block_reduce_kernel(InputIterator input,
+                         BinaryFunction scan_op,
+                         ResultType * block_prefixes)
+{
+    block_reduce_kernel_impl<Config>(input, scan_op, block_prefixes);
+}
+// Reduce-then-scan, final stage.
+// Kernel entry point that unwraps initial_value with get_input_value() and
+// forwards to final_scan_kernel_impl. The four trailing parameters are
+// optional; the host code passes them when the input is processed in more
+// than one launch (previous/new last element and the two flags that tell the
+// implementation whether to use/store them).
+template<bool Exclusive,
+         class Config,
+         class InputIterator,
+         class OutputIterator,
+         class BinaryFunction,
+         class InitValueType>
+ROCPRIM_KERNEL
+__launch_bounds__(ROCPRIM_DEFAULT_MAX_BLOCK_SIZE)
+void final_scan_kernel(InputIterator input,
+                       const size_t size,
+                       OutputIterator output,
+                       const InitValueType initial_value,
+                       BinaryFunction scan_op,
+                       input_type_t<InitValueType>* block_prefixes,
+                       input_type_t<InitValueType>* previous_last_element = nullptr,
+                       input_type_t<InitValueType>* new_last_element = nullptr,
+                       bool override_first_value = false,
+                       bool save_last_value = false)
+{
+    final_scan_kernel_impl<Exclusive, Config>(input,
+                                              size,
+                                              output,
+                                              get_input_value(initial_value),
+                                              scan_op,
+                                              block_prefixes,
+                                              previous_last_element,
+                                              new_last_element,
+                                              override_first_value,
+                                              save_last_value);
+}
+// Single-pass scan (look-back kernel).
+// Kernel entry point that unwraps initial_value with get_input_value() and
+// forwards to lookback_scan_kernel_impl. The four trailing parameters are
+// optional; the host code passes them when the input is processed in more
+// than one launch.
+template<bool Exclusive,
+         class Config,
+         class InputIterator,
+         class OutputIterator,
+         class BinaryFunction,
+         class InitValueType,
+         class LookBackScanState>
+ROCPRIM_KERNEL
+__launch_bounds__(ROCPRIM_DEFAULT_MAX_BLOCK_SIZE)
+void lookback_scan_kernel(InputIterator input,
+                          OutputIterator output,
+                          const size_t size,
+                          const InitValueType initial_value,
+                          BinaryFunction scan_op,
+                          LookBackScanState lookback_scan_state,
+                          const unsigned int number_of_blocks,
+                          ordered_block_id<unsigned int> ordered_bid,
+                          input_type_t<InitValueType>* previous_last_element = nullptr,
+                          input_type_t<InitValueType>* new_last_element = nullptr,
+                          bool override_first_value = false,
+                          bool save_last_value = false)
+{
+    lookback_scan_kernel_impl<Exclusive, Config>(input,
+                                                 output,
+                                                 size,
+                                                 get_input_value(initial_value),
+                                                 scan_op,
+                                                 lookback_scan_state,
+                                                 number_of_blocks,
+                                                 ordered_bid,
+                                                 previous_last_element,
+                                                 new_last_element,
+                                                 override_first_value,
+                                                 save_last_value);
+}
+// Debug helper used after a nested device call.
+// When debug_synchronous is true it prints "name(size)", waits for `stream`
+// to finish, returns the synchronization error from the ENCLOSING function if
+// there is one, and finally prints the time elapsed since `start` in ms.
+// The expansion is a bare if-statement and reads `debug_synchronous` and
+// `stream` from the scope where the macro is used.
+#define ROCPRIM_DETAIL_HIP_SYNC(name, size, start) \
+    if(debug_synchronous) \
+    { \
+        std::cout << name << "(" << size << ")"; \
+        auto error = cudaStreamSynchronize(stream); \
+        if(error != cudaSuccess) return error; \
+        auto end = std::chrono::high_resolution_clock::now(); \
+        auto d = std::chrono::duration_cast<std::chrono::duration<double>>(end - start); \
+        std::cout << " " << d.count() * 1000 << " ms" << '\n'; \
+    }
+// Error check used directly after a kernel launch.
+// Always queries cudaGetLastError() and returns that error from the ENCLOSING
+// function if it is not cudaSuccess. When debug_synchronous is true it
+// additionally prints "name(size)", synchronizes `stream` (returning on
+// failure) and prints the time elapsed since `start` in ms.
+// The expansion is a braced block, so it is valid with or without a trailing
+// semicolon; it reads `debug_synchronous` and `stream` from the use site.
+#define ROCPRIM_DETAIL_HIP_SYNC_AND_RETURN_ON_ERROR(name, size, start) \
+    { \
+        auto _error = cudaGetLastError(); \
+        if(_error != cudaSuccess) return _error; \
+        if(debug_synchronous) \
+        { \
+            std::cout << name << "(" << size << ")"; \
+            auto __error = cudaStreamSynchronize(stream); \
+            if(__error != cudaSuccess) return __error; \
+            auto _end = std::chrono::high_resolution_clock::now(); \
+            auto _d = std::chrono::duration_cast<std::chrono::duration<double>>(_end - start); \
+            std::cout << " " << _d.count() * 1000 << " ms" << '\n'; \
+        } \
+    }
+// Device-wide scan, reduce-then-scan variant (overload enabled when
+// Config::use_lookback is false).
+//
+// Behaviour:
+//  * temporary_storage == nullptr: only writes the required number of bytes
+//    to storage_size (never 0) and returns cudaSuccess; nothing is launched.
+//  * size == 0: returns cudaSuccess without launching anything.
+//  * size fits into one block: one single_scan_kernel launch.
+//  * otherwise the input is processed in chunks of at most limited_size
+//    items. For every chunk block_reduce_kernel computes the per-block values
+//    (all blocks except the last), a nested inclusive scan_impl scans them in
+//    place, and final_scan_kernel writes the output of the chunk.
+//
+// Temporary storage layout: block prefixes first, directly followed by the
+// storage handed to the nested scan; when use_limited_size is set,
+// previous_last_element and new_last_element are placed at byte offset
+// nested_prefixes_size_bytes.
+//
+// Returns cudaSuccess, or the first error reported by a kernel launch, a
+// nested call or (with debug_synchronous) a stream synchronization.
+template<
+    bool Exclusive,
+    class Config,
+    class InputIterator,
+    class OutputIterator,
+    class InitValueType,
+    class BinaryFunction
+>
+inline
+auto scan_impl(void * temporary_storage,
+               size_t& storage_size,
+               InputIterator input,
+               OutputIterator output,
+               const InitValueType initial_value,
+               const size_t size,
+               BinaryFunction scan_op,
+               const cudaStream_t stream,
+               bool debug_synchronous)
+    -> typename std::enable_if<!Config::use_lookback, cudaError_t>::type
+{
+    using config = Config;
+    using real_init_value_type = input_type_t<InitValueType>;
+    constexpr unsigned int block_size = config::block_size;
+    constexpr unsigned int items_per_thread = config::items_per_thread;
+    constexpr auto items_per_block = block_size * items_per_thread;
+    // Largest number of items handled by one launch: size_limit rounded down
+    // to a multiple of items_per_block, but never less than one block.
+    static constexpr size_t size_limit = config::size_limit;
+    static constexpr size_t aligned_size_limit = ::rocprim::max<size_t>(size_limit - size_limit % items_per_block, items_per_block);
+    size_t limited_size = std::min<size_t>(size, aligned_size_limit);
+    const bool use_limited_size = limited_size == aligned_size_limit;
+    size_t nested_prefixes_size_bytes = scan_get_temporary_storage_bytes<real_init_value_type>(limited_size, items_per_block);
+    // Calculate required temporary storage
+    if(temporary_storage == nullptr)
+    {
+        storage_size = nested_prefixes_size_bytes;
+        // Extra room for the values carried from one launch to the next
+        if(use_limited_size)
+            storage_size += 4 * sizeof(real_init_value_type);
+        // Make sure user won't try to allocate 0 bytes memory, because
+        // cudaMalloc will return nullptr when size is zero.
+        storage_size = storage_size == 0 ? 4 : storage_size;
+        return cudaSuccess;
+    }
+    // Start point for time measurements
+    std::chrono::high_resolution_clock::time_point start;
+    auto number_of_blocks = (size + items_per_block - 1)/items_per_block;
+    // Empty input: nothing to do
+    if( number_of_blocks == 0u )
+        return cudaSuccess;
+    if(number_of_blocks > 1)
+    {
+        // size_t (not unsigned int) to avoid narrowing and to match the
+        // look-back overload of scan_impl.
+        const size_t number_of_launch = (size + limited_size - 1)/limited_size;
+        for (size_t i = 0, offset = 0; i < number_of_launch; i++, offset+=limited_size )
+        {
+            // The last chunk may be shorter than limited_size
+            size_t current_size = std::min<size_t>(size - offset, limited_size);
+            number_of_blocks = (current_size + items_per_block - 1)/items_per_block;
+            if(debug_synchronous)
+            {
+                std::cout << "use_limited_size " << use_limited_size << '\n';
+                std::cout << "number_of_launch " << number_of_launch << '\n';
+                std::cout << "index " << i << '\n';
+                std::cout << "aligned_size_limit " << aligned_size_limit << '\n';
+                std::cout << "size " << current_size << '\n';
+                std::cout << "block_size " << block_size << '\n';
+                std::cout << "number of blocks " << number_of_blocks << '\n';
+                std::cout << "items_per_block " << items_per_block << '\n';
+                std::cout.flush();
+            }
+            // Pointer to array with block_prefixes
+            char * ptr = reinterpret_cast<char *>(temporary_storage);
+            real_init_value_type* block_prefixes = reinterpret_cast<real_init_value_type*>(ptr);
+            real_init_value_type* previous_last_element = nullptr;
+            real_init_value_type* new_last_element = nullptr;
+            if(use_limited_size)
+            {
+                // The carried values live right behind the prefix storage
+                ptr += nested_prefixes_size_bytes;
+                previous_last_element = reinterpret_cast<real_init_value_type*>(ptr);
+                ptr += sizeof(real_init_value_type);
+                new_last_element = reinterpret_cast<real_init_value_type*>(ptr);
+            }
+            // Grid size for block_reduce_kernel, we don't need to calculate reduction
+            // of the last block as it will never be used as prefix for other blocks
+            auto grid_size = number_of_blocks - 1;
+            if( grid_size != 0 )
+            {
+                if(debug_synchronous) start = std::chrono::high_resolution_clock::now();
+                detail::block_reduce_kernel<
+                        config, InputIterator, BinaryFunction, real_init_value_type
+                    >
+                    <<<dim3(grid_size), dim3(block_size), 0, stream>>>(
+                    input + offset, scan_op, block_prefixes
+                );
+                ROCPRIM_DETAIL_HIP_SYNC_AND_RETURN_ON_ERROR("block_reduce_kernel", current_size, start)
+                if( !Exclusive && i > 0 )
+                {
+                    // Inclusive scan, not the first chunk: combine the value
+                    // carried over from the previous launch with the first
+                    // block prefix via scan_op (one element, in place).
+                    cudaError_t error = ::rocprim::transform(
+                        previous_last_element, block_prefixes, block_prefixes, 1,
+                        scan_op, stream, debug_synchronous
+                    );
+                    if(error != cudaSuccess) return error;
+                }
+                // TODO: Performance may increase if for (number_of_blocks < 8192) (or some other
+                // threshold) we would just use CPU to calculate prefixes.
+                // Calculate size of temporary storage for nested device scan operation
+                void * nested_temp_storage = static_cast<void*>(block_prefixes + number_of_blocks);
+                auto nested_temp_storage_size = storage_size - (number_of_blocks * sizeof(real_init_value_type));
+                if(debug_synchronous) start = std::chrono::high_resolution_clock::now();
+                // In-place inclusive scan of the block prefixes
+                auto error = scan_impl<false, config>(
+                    nested_temp_storage,
+                    nested_temp_storage_size,
+                    block_prefixes, // input
+                    block_prefixes, // output
+                    real_init_value_type(), // dummy initial value
+                    number_of_blocks, // input size
+                    scan_op,
+                    stream,
+                    debug_synchronous
+                );
+                if(error != cudaSuccess) return error;
+                ROCPRIM_DETAIL_HIP_SYNC("nested_device_scan", number_of_blocks, start);
+            }
+            // Grid size for final_scan_kernel
+            grid_size = number_of_blocks;
+            if(debug_synchronous) start = std::chrono::high_resolution_clock::now();
+            detail::final_scan_kernel<
+                    Exclusive, // flag for exclusive scan operation
+                    config, // kernel configuration (block size, ipt)
+                    InputIterator, OutputIterator,
+                    BinaryFunction, InitValueType
+                >
+                <<<dim3(grid_size), dim3(block_size), 0, stream>>>(
+                input + offset,
+                current_size,
+                output + offset,
+                initial_value,
+                scan_op,
+                block_prefixes,
+                previous_last_element,
+                new_last_element,
+                // override_first_value
+                i != size_t(0) && ((!Exclusive && number_of_blocks == 1) || Exclusive),
+                // save_last_value
+                number_of_launch > 1
+            );
+            // Report the size of this launch, as done for block_reduce_kernel
+            ROCPRIM_DETAIL_HIP_SYNC_AND_RETURN_ON_ERROR("final_scan_kernel", current_size, start);
+            // Swap the last_elements if it's necessary
+            if(number_of_launch > 1)
+            {
+                // Copy new_last_element into previous_last_element for the next launch
+                cudaError_t error = ::rocprim::transform(
+                    new_last_element, previous_last_element, 1,
+                    ::rocprim::identity<real_init_value_type>(),
+                    stream, debug_synchronous
+                );
+                if(error != cudaSuccess) return error;
+            }
+        }
+    }
+    else
+    {
+        // Whole input fits into a single block
+        if(debug_synchronous)
+        {
+            std::cout << "block_size " << block_size << '\n';
+            std::cout << "number of blocks " << number_of_blocks << '\n';
+            std::cout << "items_per_block " << items_per_block << '\n';
+        }
+        if(debug_synchronous) start = std::chrono::high_resolution_clock::now();
+        detail::single_scan_kernel<
+                Exclusive, // flag for exclusive scan operation
+                config, // kernel configuration (block size, ipt)
+                InputIterator, OutputIterator, BinaryFunction
+            >
+            <<<dim3(1), dim3(block_size), 0, stream>>>(
+            input, size, initial_value, output, scan_op
+        );
+        ROCPRIM_DETAIL_HIP_SYNC_AND_RETURN_ON_ERROR("single_scan_kernel", size, start);
+    }
+    return cudaSuccess;
+}
+// Device-wide scan, single-pass look-back variant (overload enabled when
+// Config::use_lookback is true).
+//
+// Behaviour:
+//  * temporary_storage == nullptr: only writes the required number of bytes
+//    to storage_size and returns cudaSuccess; nothing is launched.
+//  * size == 0: returns cudaSuccess without launching anything.
+//  * one block and no size limiting: one single_scan_kernel launch.
+//  * otherwise the input is processed in chunks of at most limited_size
+//    items; every chunk launches init_lookback_scan_state_kernel followed by
+//    lookback_scan_kernel.
+//
+// Temporary storage layout: look-back scan state (scan_state_bytes), then the
+// ordered block id; when use_limited_size is set, previous_last_element and
+// new_last_element occupy the last two real_init_value_type slots, located
+// from storage_size. The caller is therefore expected to pass back the
+// storage_size reported by the size query.
+//
+// Returns cudaSuccess, or the first error reported by a kernel launch, a
+// nested call or (with debug_synchronous) a stream synchronization.
+template<
+    bool Exclusive,
+    class Config,
+    class InputIterator,
+    class OutputIterator,
+    class InitValueType,
+    class BinaryFunction
+>
+inline
+auto scan_impl(void * temporary_storage,
+               size_t& storage_size,
+               InputIterator input,
+               OutputIterator output,
+               const InitValueType initial_value,
+               const size_t size,
+               BinaryFunction scan_op,
+               const cudaStream_t stream,
+               bool debug_synchronous)
+    -> typename std::enable_if<Config::use_lookback, cudaError_t>::type
+{
+    using config = Config;
+    using real_init_value_type = input_type_t<InitValueType>;
+    using scan_state_type = detail::lookback_scan_state<real_init_value_type>;
+    using scan_state_with_sleep_type = detail::lookback_scan_state<real_init_value_type, true>;
+    using ordered_block_id_type = detail::ordered_block_id<unsigned int>;
+    constexpr unsigned int block_size = config::block_size;
+    constexpr unsigned int items_per_thread = config::items_per_thread;
+    constexpr auto items_per_block = block_size * items_per_thread;
+    // Largest number of items handled by one launch: size_limit rounded down
+    // to a multiple of items_per_block, but never less than one block.
+    static constexpr size_t size_limit = config::size_limit;
+    static constexpr size_t aligned_size_limit = ::rocprim::max<size_t>(size_limit - size_limit % items_per_block, items_per_block);
+    size_t limited_size = std::min<size_t>(size, aligned_size_limit);
+    const bool use_limited_size = limited_size == aligned_size_limit;
+    unsigned int number_of_blocks = (limited_size + items_per_block - 1)/items_per_block;
+    // Calculate required temporary storage
+    size_t scan_state_bytes = ::rocprim::detail::align_size(
+        // This is valid even with scan_state_with_sleep_type
+        scan_state_type::get_storage_size(number_of_blocks)
+    );
+    size_t ordered_block_id_bytes = ordered_block_id_type::get_storage_size();
+    if(temporary_storage == nullptr)
+    {
+        // storage_size is never zero
+        storage_size = scan_state_bytes + ordered_block_id_bytes;
+        // Room for previous_last_element and new_last_element
+        if(use_limited_size)
+            storage_size += 2 * sizeof(real_init_value_type);
+        return cudaSuccess;
+    }
+    // Start point for time measurements
+    std::chrono::high_resolution_clock::time_point start;
+    // Empty input: nothing to do
+    if( number_of_blocks == 0u )
+        return cudaSuccess;
+    if(number_of_blocks > 1 || use_limited_size)
+    {
+        // Create and initialize lookback_scan_state obj
+        auto scan_state = scan_state_type::create(temporary_storage, number_of_blocks);
+        // NOTE(review): scan_state_with_sleep is never used below; it is kept
+        // because create() is not visible here - confirm it has no side
+        // effects before removing it.
+        auto scan_state_with_sleep = scan_state_with_sleep_type::create(temporary_storage, number_of_blocks);
+        // Create and initialize ordered_block_id obj
+        auto ptr = reinterpret_cast<char*>(temporary_storage);
+        auto ordered_bid = ordered_block_id_type::create(
+            reinterpret_cast<ordered_block_id_type::id_type*>(ptr + scan_state_bytes)
+        );
+        // The last element
+        real_init_value_type* previous_last_element = nullptr;
+        real_init_value_type* new_last_element = nullptr;
+        if(use_limited_size)
+        {
+            // The two carried values sit at the very end of the storage
+            ptr += storage_size - sizeof(real_init_value_type);
+            new_last_element = reinterpret_cast<real_init_value_type*>(ptr);
+            ptr -= sizeof(real_init_value_type);
+            previous_last_element = reinterpret_cast<real_init_value_type*>(ptr);
+        }
+        // The device-property query that used to be here was removed: its
+        // results (prop, deviceId, asicRevision) were never read.
+        size_t number_of_launch = (size + limited_size - 1)/limited_size;
+        for (size_t i = 0, offset = 0; i < number_of_launch; i++, offset+=limited_size )
+        {
+            // The last chunk may be shorter than limited_size
+            size_t current_size = std::min<size_t>(size - offset, limited_size);
+            number_of_blocks = (current_size + items_per_block - 1)/items_per_block;
+            // The init kernel is launched with one thread per scan block
+            auto grid_size = (number_of_blocks + block_size - 1)/block_size;
+            if(debug_synchronous)
+            {
+                std::cout << "use_limited_size " << use_limited_size << '\n';
+                std::cout << "aligned_size_limit " << aligned_size_limit << '\n';
+                std::cout << "number_of_launch " << number_of_launch << '\n';
+                std::cout << "index " << i << '\n';
+                std::cout << "size " << current_size << '\n';
+                std::cout << "block_size " << block_size << '\n';
+                std::cout << "number of blocks " << number_of_blocks << '\n';
+                std::cout << "items_per_block " << items_per_block << '\n';
+            }
+            // Restart the timer for every launch so the reported time of the
+            // init kernel does not include the previous chunk.
+            if(debug_synchronous) start = std::chrono::high_resolution_clock::now();
+            init_lookback_scan_state_kernel<scan_state_type>
+                <<<dim3(grid_size), dim3(block_size), 0, stream>>>(
+                scan_state, number_of_blocks, ordered_bid
+            );
+            ROCPRIM_DETAIL_HIP_SYNC_AND_RETURN_ON_ERROR("init_lookback_scan_state_kernel", number_of_blocks, start)
+            if(debug_synchronous) start = std::chrono::high_resolution_clock::now();
+            // Grid size for lookback_scan_kernel
+            grid_size = number_of_blocks;
+            if(debug_synchronous)
+            {
+                std::cout << "use_limited_size " << use_limited_size << '\n';
+                std::cout << "aligned_size_limit " << aligned_size_limit << '\n';
+                std::cout << "size " << current_size << '\n';
+                std::cout << "block_size " << block_size << '\n';
+                std::cout << "number of blocks " << number_of_blocks << '\n';
+                std::cout << "items_per_block " << items_per_block << '\n';
+            }
+            lookback_scan_kernel<
+                    Exclusive, // flag for exclusive scan operation
+                    config, // kernel configuration (block size, ipt)
+                    InputIterator, OutputIterator,
+                    BinaryFunction, InitValueType, scan_state_type
+                >
+                <<<dim3(grid_size), dim3(block_size), 0, stream>>>(
+                input + offset, output + offset, current_size, initial_value,
+                scan_op, scan_state, number_of_blocks, ordered_bid,
+                previous_last_element, new_last_element,
+                // override_first_value, save_last_value
+                i != size_t(0), number_of_launch > 1
+            );
+            ROCPRIM_DETAIL_HIP_SYNC_AND_RETURN_ON_ERROR("lookback_scan_kernel", current_size, start)
+            // Swap the last_elements
+            if(number_of_launch > 1)
+            {
+                // Copy new_last_element into previous_last_element for the next launch
+                cudaError_t error = ::rocprim::transform(
+                    new_last_element, previous_last_element, 1,
+                    ::rocprim::identity<real_init_value_type>(),
+                    stream, debug_synchronous
+                );
+                if(error != cudaSuccess) return error;
+            }
+        }
+    }
+    else
+    {
+        // Whole input fits into a single block
+        if(debug_synchronous)
+        {
+            std::cout << "size " << size << '\n';
+            std::cout << "block_size " << block_size << '\n';
+            std::cout << "number of blocks " << number_of_blocks << '\n';
+            std::cout << "items_per_block " << items_per_block << '\n';
+        }
+        if(debug_synchronous) start = std::chrono::high_resolution_clock::now();
+        single_scan_kernel<
+                Exclusive, // flag for exclusive scan operation
+                config, // kernel configuration (block size, ipt)
+                InputIterator, OutputIterator, BinaryFunction
+            >
+            <<<dim3(1), dim3(block_size), 0, stream>>>(
+            input, size, initial_value, output, scan_op
+        );
+        ROCPRIM_DETAIL_HIP_SYNC_AND_RETURN_ON_ERROR("single_scan_kernel", size, start);
+    }
+    return cudaSuccess;
+}
+#undef ROCPRIM_DETAIL_HIP_SYNC_AND_RETURN_ON_ERROR
+#undef ROCPRIM_DETAIL_HIP_SYNC
+} // end of detail namespace
+/// \brief Parallel inclusive scan primitive for device level.
+///
+/// inclusive_scan function performs a device-wide inclusive prefix scan operation
+/// using binary \p scan_op operator.
+///
+/// \par Overview
+/// * Supports non-commutative scan operators. However, a scan operator should be
+/// associative. When used with non-associative functions the results may be non-deterministic
+/// and/or vary in precision.
+/// * Returns the required size of \p temporary_storage in \p storage_size
+/// if \p temporary_storage is a null pointer.
+/// * Ranges specified by \p input and \p output must have at least \p size elements.
+/// * By default, the input type is used for accumulation. A custom type
+/// can be specified using <tt>rocprim::transform_iterator</tt>, see the example below.
+///
+/// \tparam Config - [optional] configuration of the primitive. It can be \p scan_config or
+/// a custom class with the same members.
+/// \tparam InputIterator - random-access iterator type of the input range. Must meet the
+/// requirements of a C++ InputIterator concept. It can be a simple pointer type.
+/// \tparam OutputIterator - random-access iterator type of the output range. Must meet the
+/// requirements of a C++ OutputIterator concept. It can be a simple pointer type.
+/// \tparam BinaryFunction - type of binary function used for scan. Default type
+/// is \p rocprim::plus<T>, where \p T is a \p value_type of \p InputIterator.
+///
+/// \param [in] temporary_storage - pointer to a device-accessible temporary storage. When
+/// a null pointer is passed, the required allocation size (in bytes) is written to
+/// \p storage_size and function returns without performing the scan operation.
+/// \param [in,out] storage_size - reference to a size (in bytes) of \p temporary_storage.
+/// \param [in] input - iterator to the first element in the range to scan.
+/// \param [out] output - iterator to the first element in the output range. It can be
+/// same as \p input.
+/// \param [in] size - number of elements in the input range.
+/// \param [in] scan_op - binary operation function object that will be used for scan.
+/// The signature of the function should be equivalent to the following:
+/// <tt>T f(const T &a, const T &b);</tt>. The signature does not need to have
+/// <tt>const &</tt>, but function object must not modify the objects passed to it.
+/// Default is BinaryFunction().
+/// \param [in] stream - [optional] HIP stream object. Default is \p 0 (default stream).
+/// \param [in] debug_synchronous - [optional] If true, synchronization after every kernel
+/// launch is forced in order to check for errors. Default value is \p false.
+///
+/// \returns \p cudaSuccess (\p 0) after successful scan; otherwise a HIP runtime error of
+/// type \p cudaError_t.
+///
+/// \par Example
+/// \parblock
+/// In this example a device-level inclusive sum operation is performed on an array of
+/// integer values (<tt>short</tt>s are scanned into <tt>int</tt>s).
+///
+/// \code{.cpp}
+/// #include <rocprim/rocprim.hpp>
+///
+/// // Prepare input and output (declare pointers, allocate device memory etc.)
+/// size_t input_size;    // e.g., 8
+/// short * input;        // e.g., [1, 2, 3, 4, 5, 6, 7, 8]
+/// int * output;         // empty array of 8 elements
+///
+/// size_t temporary_storage_size_bytes;
+/// void * temporary_storage_ptr = nullptr;
+/// // Get required size of the temporary storage
+/// rocprim::inclusive_scan(
+///     temporary_storage_ptr, temporary_storage_size_bytes,
+///     input, output, input_size, rocprim::plus<int>()
+/// );
+///
+/// // allocate temporary storage
+/// cudaMalloc(&temporary_storage_ptr, temporary_storage_size_bytes);
+///
+/// // perform scan
+/// rocprim::inclusive_scan(
+///     temporary_storage_ptr, temporary_storage_size_bytes,
+///     input, output, input_size, rocprim::plus<int>()
+/// );
+/// // output: [1, 3, 6, 10, 15, 21, 28, 36]
+/// \endcode
+///
+/// The same example as above, but now a custom accumulator type is specified.
+///
+/// \code{.cpp}
+/// #include <rocprim/rocprim.hpp>
+///
+/// size_t input_size;
+/// short * input;
+/// int * output;
+///
+/// // Use a transform iterator to specify a custom accumulator type
+/// auto input_iterator = rocprim::make_transform_iterator(
+///     input, [] __device__ (short in) { return static_cast<int>(in); });
+///
+/// size_t temporary_storage_size_bytes;
+/// void * temporary_storage_ptr = nullptr;
+/// // Use the transform iterator
+/// rocprim::inclusive_scan(
+///     temporary_storage_ptr, temporary_storage_size_bytes,
+///     input_iterator, output, input_size, rocprim::plus<int>()
+/// );
+///
+/// cudaMalloc(&temporary_storage_ptr, temporary_storage_size_bytes);
+///
+/// rocprim::inclusive_scan(
+///     temporary_storage_ptr, temporary_storage_size_bytes,
+///     input_iterator, output, input_size, rocprim::plus<int>()
+/// );
+/// \endcode
+/// \endparblock
+template<class Config = default_config,
+         class InputIterator,
+         class OutputIterator,
+         class BinaryFunction
+         = ::rocprim::plus<typename std::iterator_traits<InputIterator>::value_type>>
+inline cudaError_t inclusive_scan(void*              temporary_storage,
+                                  size_t&            storage_size,
+                                  InputIterator      input,
+                                  OutputIterator     output,
+                                  const size_t       size,
+                                  BinaryFunction     scan_op           = BinaryFunction(),
+                                  const cudaStream_t stream            = 0,
+                                  bool               debug_synchronous = false)
+{
+    // Element type of the input range; it picks the default tuning below.
+    using value_type = typename std::iterator_traits<InputIterator>::value_type;
+    // Architecture default for value_type; default_or_custom_config falls back to it
+    // when Config is default_config.
+    using fallback_config = detail::default_scan_config<ROCPRIM_TARGET_ARCH, value_type>;
+    using selected_config = detail::default_or_custom_config<Config, fallback_config>;
+    // scan_impl always takes an initial value; value_type() is only a placeholder
+    // here because it is not used.
+    return detail::scan_impl<false, selected_config>(temporary_storage,
+                                                     storage_size,
+                                                     input,
+                                                     output,
+                                                     value_type(),
+                                                     size,
+                                                     scan_op,
+                                                     stream,
+                                                     debug_synchronous);
+}
+/// \brief Parallel exclusive scan primitive for device level.
+///
+/// exclusive_scan function performs a device-wide exclusive prefix scan operation
+/// using binary \p scan_op operator.
+///
+/// \par Overview
+/// * Supports non-commutative scan operators. However, a scan operator should be
+/// associative. When used with non-associative functions the results may be non-deterministic
+/// and/or vary in precision.
+/// * Returns the required size of \p temporary_storage in \p storage_size
+/// if \p temporary_storage is a null pointer.
+/// * Ranges specified by \p input and \p output must have at least \p size elements.
+///
+/// \tparam Config - [optional] configuration of the primitive. It can be \p scan_config or
+/// a custom class with the same members.
+/// \tparam InputIterator - random-access iterator type of the input range. Must meet the
+/// requirements of a C++ InputIterator concept. It can be a simple pointer type.
+/// \tparam OutputIterator - random-access iterator type of the output range. Must meet the
+/// requirements of a C++ OutputIterator concept. It can be a simple pointer type.
+/// \tparam InitValueType - type of the initial value.
+/// \tparam BinaryFunction - type of binary function used for scan. Default type
+/// is \p rocprim::plus<T>, where \p T is a \p value_type of \p InputIterator.
+///
+/// \param [in] temporary_storage - pointer to a device-accessible temporary storage. When
+/// a null pointer is passed, the required allocation size (in bytes) is written to
+/// \p storage_size and function returns without performing the scan operation.
+/// \param [in,out] storage_size - reference to a size (in bytes) of \p temporary_storage.
+/// \param [in] input - iterator to the first element in the range to scan.
+/// \param [out] output - iterator to the first element in the output range. It can be
+/// same as \p input.
+/// \param [in] initial_value - initial value to start the scan.
+/// A rocprim::future_value may be passed to use a value that will be later computed.
+/// \param [in] size - number of elements in the input range.
+/// \param [in] scan_op - binary operation function object that will be used for scan.
+/// The signature of the function should be equivalent to the following:
+/// <tt>T f(const T &a, const T &b);</tt>. The signature does not need to have
+/// <tt>const &</tt>, but function object must not modify the objects passed to it.
+/// The default value is \p BinaryFunction().
+/// \param [in] stream - [optional] HIP stream object. The default is \p 0 (default stream).
+/// \param [in] debug_synchronous - [optional] If true, synchronization after every kernel
+/// launch is forced in order to check for errors. The default value is \p false.
+///
+/// \returns \p cudaSuccess (\p 0) after successful scan; otherwise a HIP runtime error of
+/// type \p cudaError_t.
+///
+/// \par Example
+/// \parblock
+/// In this example a device-level exclusive min-scan operation is performed on an array of
+/// integer values (<tt>short</tt>s are scanned into <tt>int</tt>s) using custom operator.
+///
+/// \code{.cpp}
+/// #include <rocprim/rocprim.hpp>
+///
+/// // custom scan function
+/// auto min_op =
+///     [] __device__ (int a, int b) -> int
+///     {
+///         return a < b ? a : b;
+///     };
+///
+/// // Prepare input and output (declare pointers, allocate device memory etc.)
+/// size_t input_size;    // e.g., 8
+/// short * input;        // e.g., [4, 7, 6, 2, 5, 1, 3, 8]
+/// int * output;         // empty array of 8 elements
+/// int start_value;      // e.g., 9
+///
+/// size_t temporary_storage_size_bytes;
+/// void * temporary_storage_ptr = nullptr;
+/// // Get required size of the temporary storage
+/// rocprim::exclusive_scan(
+///     temporary_storage_ptr, temporary_storage_size_bytes,
+///     input, output, start_value, input_size, min_op
+/// );
+///
+/// // allocate temporary storage
+/// cudaMalloc(&temporary_storage_ptr, temporary_storage_size_bytes);
+///
+/// // perform scan
+/// rocprim::exclusive_scan(
+///     temporary_storage_ptr, temporary_storage_size_bytes,
+///     input, output, start_value, input_size, min_op
+/// );
+/// // output: [9, 4, 4, 4, 2, 2, 1, 1]
+/// \endcode
+/// \endparblock
+template<class Config = default_config,
+         class InputIterator,
+         class OutputIterator,
+         class InitValueType,
+         class BinaryFunction
+         = ::rocprim::plus<typename std::iterator_traits<InputIterator>::value_type>>
+inline cudaError_t exclusive_scan(void*               temporary_storage,
+                                  size_t&             storage_size,
+                                  InputIterator       input,
+                                  OutputIterator      output,
+                                  const InitValueType initial_value,
+                                  const size_t        size,
+                                  BinaryFunction      scan_op           = BinaryFunction(),
+                                  const cudaStream_t  stream            = 0,
+                                  bool                debug_synchronous = false)
+{
+    // Type that detail::input_type_t reports for InitValueType (presumably the
+    // underlying value type when a future_value is passed - confirm); it picks the
+    // default tuning below.
+    using init_type = detail::input_type_t<InitValueType>;
+    // Architecture default for init_type; default_or_custom_config falls back to it
+    // when Config is default_config.
+    using fallback_config = detail::default_scan_config<ROCPRIM_TARGET_ARCH, init_type>;
+    using selected_config = detail::default_or_custom_config<Config, fallback_config>;
+    // initial_value is handed over as received, not converted to init_type.
+    return detail::scan_impl<true, selected_config>(temporary_storage,
+                                                    storage_size,
+                                                    input,
+                                                    output,
+                                                    initial_value,
+                                                    size,
+                                                    scan_op,
+                                                    stream,
+                                                    debug_synchronous);
+}
+/// @}
+// end of group devicemodule
+END_ROCPRIM_NAMESPACE
+#endif // ROCPRIM_DEVICE_DEVICE_SCAN_HPP_
--- a/3rdparty/cub/rocprim/device/device_scan_by_key.hpp
+++ b/3rdparty/cub/rocprim/device/device_scan_by_key.hpp
+// Copyright (c) 2017-2022 Advanced Micro Devices, Inc. All rights reserved.
+//
+// Permission is hereby granted, free of charge, to any person obtaining a copy
+// of this software and associated documentation files (the "Software"), to deal
+// in the Software without restriction, including without limitation the rights
+// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+// copies of the Software, and to permit persons to whom the Software is
+// furnished to do so, subject to the following conditions:
+//
+// The above copyright notice and this permission notice shall be included in
+// all copies or substantial portions of the Software.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+// THE SOFTWARE.
+#ifndef ROCPRIM_DEVICE_DEVICE_SCAN_BY_KEY_HPP_
+#define ROCPRIM_DEVICE_DEVICE_SCAN_BY_KEY_HPP_
+#include "detail/device_scan_by_key.hpp"
+#include "detail/lookback_scan_state.hpp"
+#include "detail/ordered_block_id.hpp"
+#include "config_types.hpp"
+#include "device_scan_by_key_config.hpp"
+#include "../config.hpp"
+#include "../detail/various.hpp"
+#include "../functional.hpp"
+#include "../types/future_value.hpp"
+#include "../types/tuple.hpp"
+#include <cuda_runtime.h>
+#include <iostream>
+#include <iterator>
+#include <type_traits>
+BEGIN_ROCPRIM_NAMESPACE
+namespace detail
+{
+    // Kernel entry point of the scan-by-key. It unwraps the initial value with
+    // get_input_value() and hands every other argument, unchanged and in the same
+    // order, to device_scan_by_key_kernel_impl<Exclusive, Config>.
+    // The launch bound is Config::block_size threads per block.
+    template <bool Exclusive,
+              typename Config,
+              typename KeyInputIterator,
+              typename InputIterator,
+              typename OutputIterator,
+              typename InitialValueType,
+              typename CompareFunction,
+              typename BinaryFunction,
+              typename LookbackScanState,
+              typename ResultType>
+    void __global__ __launch_bounds__(Config::block_size)
+        device_scan_by_key_kernel(const KeyInputIterator               keys,
+                                  const InputIterator                  values,
+                                  const OutputIterator                 output,
+                                  const InitialValueType               initial_value,
+                                  const CompareFunction                compare,
+                                  const BinaryFunction                 scan_op,
+                                  const LookbackScanState              scan_state,
+                                  const size_t                         size,
+                                  const size_t                         starting_block,
+                                  const size_t                         number_of_blocks,
+                                  const ordered_block_id<unsigned int> ordered_bid,
+                                  const ::rocprim::tuple<ResultType, bool>* const
+                                      previous_last_value)
+    {
+        device_scan_by_key_kernel_impl<Exclusive, Config>(
+            keys, values, output,
+            get_input_value(initial_value),
+            compare, scan_op, scan_state,
+            size, starting_block, number_of_blocks,
+            ordered_bid, previous_last_value);
+    }
+// Post-launch check used by scan_by_key_impl. It must be expanded inside a function
+// that returns cudaError_t and has `stream` and `debug_synchronous` in scope.
+//  - Returns the result of cudaGetLastError() from the enclosing function if it is
+//    not cudaSuccess.
+//  - When debug_synchronous is true it additionally prints "name(size)",
+//    synchronizes `stream` (returning any error) and prints the time elapsed since
+//    `start` in milliseconds.
+// name  - text printed before the size.
+// size  - value printed in parentheses.
+// start - std::chrono::high_resolution_clock time point taken before the launch.
+// Arguments are parenthesized so that expression arguments expand safely, and no
+// local uses a double underscore because such identifiers are reserved.
+#define ROCPRIM_DETAIL_HIP_SYNC_AND_RETURN_ON_ERROR(name, size, start)                           \
+    do                                                                                           \
+    {                                                                                            \
+        const auto _launch_error = cudaGetLastError();                                           \
+        if(_launch_error != cudaSuccess)                                                         \
+            return _launch_error;                                                                \
+        if(debug_synchronous)                                                                    \
+        {                                                                                        \
+            std::cout << (name) << "(" << (size) << ")";                                         \
+            const auto _sync_error = cudaStreamSynchronize(stream);                              \
+            if(_sync_error != cudaSuccess)                                                       \
+                return _sync_error;                                                              \
+            const auto _end = std::chrono::high_resolution_clock::now();                         \
+            const auto _d                                                                        \
+                = std::chrono::duration_cast<std::chrono::duration<double>>(_end - (start));     \
+            std::cout << " " << _d.count() * 1000 << " ms" << '\n';                              \
+        }                                                                                        \
+    } while(false)
+    // Host-side driver shared by the scan-by-key entry points.
+    //
+    // With a null temporary_storage it only writes the required byte count to
+    // storage_size and returns cudaSuccess. Otherwise it scans the input in one or
+    // more launches of at most aligned_size_limit items; every launch runs
+    // init_lookback_scan_state_kernel followed by device_scan_by_key_kernel on stream.
+    //
+    // Temporary storage layout, in order: the look-back scan state, the ordered block
+    // id and, only when the input reaches aligned_size_limit, one wrapped_type that is
+    // handed to both kernels as previous_last_value from the second launch onwards.
+    //
+    // Exclusive         - forwarded to device_scan_by_key_kernel.
+    // temporary_storage - device scratch buffer, or nullptr to query the size.
+    // storage_size      - written only by the size query; not read otherwise.
+    // keys/input/output - iterators, advanced by the launch offset for every launch.
+    // initial_value     - forwarded unchanged to the scan kernel.
+    // size              - total number of items.
+    // scan_op, compare  - forwarded unchanged to the scan kernel.
+    // stream            - stream that receives every kernel launch.
+    // debug_synchronous - when true, prints the launch parameters and timings and
+    //                     synchronizes the stream after each kernel.
+    //
+    // Returns cudaSuccess, or the first error reported by is_sleep_scan_state_used,
+    // cudaGetLastError or (debug only) cudaStreamSynchronize.
+    template <bool Exclusive,
+              typename Config,
+              typename KeysInputIterator,
+              typename InputIterator,
+              typename OutputIterator,
+              typename InitValueType,
+              typename BinaryFunction,
+              typename CompareFunction>
+    inline cudaError_t scan_by_key_impl(void* const           temporary_storage,
+                                       size_t&               storage_size,
+                                       KeysInputIterator     keys,
+                                       InputIterator         input,
+                                       OutputIterator        output,
+                                       const InitValueType   initial_value,
+                                       const size_t          size,
+                                       const BinaryFunction  scan_op,
+                                       const CompareFunction compare,
+                                       const cudaStream_t     stream,
+                                       const bool            debug_synchronous)
+    {
+        using config               = Config;
+        using real_init_value_type = input_type_t<InitValueType>;
+        // Element kept in the look-back state: a value paired with a bool.
+        using wrapped_type = ::rocprim::tuple<real_init_value_type, bool>;
+        using scan_state_type            = detail::lookback_scan_state<wrapped_type>;
+        using scan_state_with_sleep_type = detail::lookback_scan_state<wrapped_type, true>;
+        using ordered_block_id_type      = detail::ordered_block_id<unsigned int>;
+        constexpr unsigned int block_size       = config::block_size;
+        constexpr unsigned int items_per_thread = config::items_per_thread;
+        constexpr auto         items_per_block  = block_size * items_per_thread;
+        static constexpr size_t size_limit = config::size_limit;
+        // size_limit rounded down to whole blocks, but never less than one block.
+        static constexpr size_t aligned_size_limit
+            = ::rocprim::max<size_t>(size_limit - size_limit % items_per_block, items_per_block);
+        const size_t limited_size     = std::min<size_t>(size, aligned_size_limit);
+        // True when size >= aligned_size_limit; only then is the extra wrapped_type
+        // slot part of the temporary storage.
+        const bool   use_limited_size = limited_size == aligned_size_limit;
+        // Number of blocks in a single launch (or the only launch if it fits)
+        const unsigned int number_of_blocks = ceiling_div(limited_size, items_per_block);
+        // Calculate required temporary storage, this is valid even with scan_state_with_sleep_type
+        const size_t scan_state_bytes
+            = align_size(scan_state_type::get_storage_size(number_of_blocks));
+        // Computed up front because both the size query and the placement of
+        // previous_last_value below depend on it.
+        const size_t ordered_block_id_bytes
+            = align_size(ordered_block_id_type::get_storage_size(), alignof(wrapped_type));
+        if(temporary_storage == nullptr)
+        {
+            // storage_size is never zero
+            storage_size = scan_state_bytes + ordered_block_id_bytes
+                           + (use_limited_size ? sizeof(wrapped_type) : 0);
+            return cudaSuccess;
+        }
+        // Nothing to scan (size == 0); this also keeps the division by limited_size
+        // below from dividing by zero.
+        if(number_of_blocks == 0u)
+        {
+            return cudaSuccess;
+        }
+        bool use_sleep;
+        if(const cudaError_t error = is_sleep_scan_state_used(use_sleep))
+        {
+            return error;
+        }
+        // Call the provided function with either scan_state or scan_state_with_sleep based on
+        // the value of use_sleep_scan_state
+        auto with_scan_state
+            = [use_sleep,
+               scan_state            = scan_state_type::create(temporary_storage, number_of_blocks),
+               scan_state_with_sleep = scan_state_with_sleep_type::create(
+                   temporary_storage, number_of_blocks)](auto&& func) mutable -> decltype(auto) {
+            if(use_sleep)
+            {
+                return func(scan_state_with_sleep);
+            }
+            else
+            {
+                return func(scan_state);
+            }
+        };
+        // Create and initialize ordered_block_id obj
+        auto* const ptr         = static_cast<char*>(temporary_storage);
+        const auto  ordered_bid = ordered_block_id_type::create(
+             reinterpret_cast<ordered_block_id_type::id_type*>(ptr + scan_state_bytes));
+        // The last element. It sits right after the ordered block id, i.e. at the
+        // offset the size query reserves for it. The address is derived from the
+        // layout rather than from the caller's storage_size, so passing the size of
+        // a larger buffer cannot move or misalign it.
+        auto* const previous_last_value
+            = use_limited_size
+                  ? reinterpret_cast<wrapped_type*>(ptr + scan_state_bytes
+                                                    + ordered_block_id_bytes)
+                  : nullptr;
+        // Total number of blocks in all launches
+        const auto   total_number_of_blocks = ceiling_div(size, items_per_block);
+        const size_t number_of_launch       = ceiling_div(size, limited_size);
+        if(debug_synchronous)
+        {
+            std::cout << "----------------------------------\n";
+            std::cout << "size:               " << size << '\n';
+            std::cout << "aligned_size_limit: " << aligned_size_limit << '\n';
+            std::cout << "use_limited_size:   " << std::boolalpha << use_limited_size << '\n';
+            std::cout << "number_of_launch:   " << number_of_launch << '\n';
+            std::cout << "block_size:         " << block_size << '\n';
+            std::cout << "items_per_block:    " << items_per_block << '\n';
+            std::cout << "----------------------------------\n";
+        }
+        // One iteration per launch; offset is the index of the first item it handles.
+        for(size_t i = 0, offset = 0; i < number_of_launch; i++, offset += limited_size)
+        {
+            const size_t current_size = std::min<size_t>(size - offset, limited_size);
+            const auto   scan_blocks  = ceiling_div(current_size, items_per_block);
+            // Grid of block_size-thread blocks with at least scan_blocks threads in total.
+            const auto init_grid_size = ceiling_div(scan_blocks, block_size);
+            // Start point for time measurements
+            std::chrono::high_resolution_clock::time_point start;
+            if(debug_synchronous)
+            {
+                std::cout << "index:            " << i << '\n';
+                std::cout << "current_size:     " << current_size << '\n';
+                std::cout << "number of blocks: " << scan_blocks << '\n';
+                start = std::chrono::high_resolution_clock::now();
+            }
+            // previous_last_value is only passed from the second launch onwards.
+            with_scan_state([&](const auto scan_state) {
+                init_lookback_scan_state_kernel<<<
+                                   dim3(init_grid_size),
+                                   dim3(block_size),
+                                   0,
+                                   stream>>>(
+                                   scan_state,
+                                   scan_blocks,
+                                   ordered_bid,
+                                   number_of_blocks - 1,
+                                   i > 0 ? previous_last_value : nullptr);
+            });
+            ROCPRIM_DETAIL_HIP_SYNC_AND_RETURN_ON_ERROR(
+                "init_lookback_scan_state_kernel", scan_blocks, start);
+            if(debug_synchronous)
+            {
+                start = std::chrono::high_resolution_clock::now();
+            }
+            // The kernel receives the total size together with the index of its first
+            // block (i * number_of_blocks) and the block count over all launches.
+            with_scan_state([&](auto& scan_state) {
+                device_scan_by_key_kernel<Exclusive, config><<<
+                                   dim3(scan_blocks),
+                                   dim3(block_size),
+                                   0,
+                                   stream>>>(
+                                   keys + offset,
+                                   input + offset,
+                                   output + offset,
+                                   initial_value,
+                                   compare,
+                                   scan_op,
+                                   scan_state,
+                                   size,
+                                   i * number_of_blocks,
+                                   total_number_of_blocks,
+                                   ordered_bid,
+                                   i > 0 ? previous_last_value : nullptr);
+            });
+            ROCPRIM_DETAIL_HIP_SYNC_AND_RETURN_ON_ERROR(
+                "device_scan_by_key_kernel", current_size, start);
+        }
+        return cudaSuccess;
+    }
+#undef ROCPRIM_DETAIL_HIP_SYNC_AND_RETURN_ON_ERROR
+}
+/// \addtogroup devicemodule
+/// @{
+/// \brief Parallel inclusive scan-by-key primitive for device level.
+///
+/// inclusive_scan_by_key function performs a device-wide inclusive prefix scan-by-key
+/// operation using binary \p scan_op operator.
+///
+/// \par Overview
+/// * Supports non-commutative scan operators. However, a scan operator should be
+/// associative. When used with non-associative functions the results may be non-deterministic
+/// and/or vary in precision.
+/// * Returns the required size of \p temporary_storage in \p storage_size
+/// if \p temporary_storage is a null pointer.
+/// * Ranges specified by \p keys_input, \p values_input, and \p values_output must have
+/// at least \p size elements.
+///
+/// \tparam Config - [optional] configuration of the primitive. It can be \p scan_config or
+/// a custom class with the same members.
+/// \tparam KeysInputIterator - random-access iterator type of the input range. It can be
+/// a simple pointer type.
+/// \tparam ValuesInputIterator - random-access iterator type of the input range. It can be
+/// a simple pointer type.
+/// \tparam ValuesOutputIterator - random-access iterator type of the output range. It can be
+/// a simple pointer type.
+/// \tparam BinaryFunction - type of binary function used for scan. Default type
+/// is \p rocprim::plus<T>, where \p T is a \p value_type of \p ValuesInputIterator.
+/// \tparam KeyCompareFunction - type of binary function used to determine keys equality. Default type
+/// is \p rocprim::equal_to<T>, where \p T is a \p value_type of \p KeysInputIterator.
+///
+/// \param [in] temporary_storage - pointer to a device-accessible temporary storage. When
+/// a null pointer is passed, the required allocation size (in bytes) is written to
+/// \p storage_size and function returns without performing the scan operation.
+/// \param [in,out] storage_size - reference to a size (in bytes) of \p temporary_storage.
+/// \param [in] keys_input - iterator to the first element in the range of keys.
+/// \param [in] values_input - iterator to the first element in the range of values to scan.
+/// \param [out] values_output - iterator to the first element in the output value range.
+/// \param [in] size - number of elements in the input range.
+/// \param [in] scan_op - binary operation function object that will be used for scanning
+/// input values.
+/// The signature of the function should be equivalent to the following:
+/// <tt>T f(const T &a, const T &b);</tt>. The signature does not need to have
+/// <tt>const &</tt>, but function object must not modify the objects passed to it.
+/// Default is BinaryFunction().
+/// \param [in] key_compare_op - binary operation function object that will be used to determine keys equality.
+/// The signature of the function should be equivalent to the following:
+/// <tt>bool f(const T &a, const T &b);</tt>. The signature does not need to have
+/// <tt>const &</tt>, but function object must not modify the objects passed to it.
+/// Default is KeyCompareFunction().
+/// \param [in] stream - [optional] HIP stream object. Default is \p 0 (default stream).
+/// \param [in] debug_synchronous - [optional] If true, synchronization after every kernel
+/// launch is forced in order to check for errors. Default value is \p false.
+///
+/// \returns \p cudaSuccess (\p 0) after successful scan; otherwise a HIP runtime error of
+/// type \p cudaError_t.
+///
+/// \par Example
+/// \parblock
+/// In this example a device-level inclusive sum-by-key operation is performed on an array of
+/// integer values (<tt>short</tt>s are scanned into <tt>int</tt>s).
+///
+/// \code{.cpp}
+/// #include <rocprim/rocprim.hpp>
+///
+/// // Prepare input and output (declare pointers, allocate device memory etc.)
+/// size_t size;           // e.g., 8
+/// int *   keys_input;    // e.g., [1, 1, 2, 2, 3, 3, 3, 5]
+/// short * values_input;  // e.g., [1, 2, 3, 4, 5, 6, 7, 8]
+/// int *   values_output; // empty array of 8 elements
+///
+/// size_t temporary_storage_size_bytes;
+/// void * temporary_storage_ptr = nullptr;
+/// // Get required size of the temporary storage
+/// rocprim::inclusive_scan_by_key(
+///     temporary_storage_ptr, temporary_storage_size_bytes,
+///     keys_input, values_input,
+///     values_output, size,
+///     rocprim::plus<int>()
+/// );
+///
+/// // allocate temporary storage
+/// cudaMalloc(&temporary_storage_ptr, temporary_storage_size_bytes);
+///
+/// // perform scan-by-key
+/// rocprim::inclusive_scan_by_key(
+///     temporary_storage_ptr, temporary_storage_size_bytes,
+///     keys_input, values_input,
+///     values_output, size,
+///     rocprim::plus<int>()
+/// );
+/// // values_output: [1, 3, 3, 7, 5, 11, 18, 8]
+/// \endcode
+/// \endparblock
+template <typename Config = default_config,
+          typename KeysInputIterator,
+          typename ValuesInputIterator,
+          typename ValuesOutputIterator,
+          typename BinaryFunction
+          = ::rocprim::plus<typename std::iterator_traits<ValuesInputIterator>::value_type>,
+          typename KeyCompareFunction
+          = ::rocprim::equal_to<typename std::iterator_traits<KeysInputIterator>::value_type>>
+inline cudaError_t
+    inclusive_scan_by_key(void* const                temporary_storage,
+                          size_t&                    storage_size,
+                          const KeysInputIterator    keys_input,
+                          const ValuesInputIterator  values_input,
+                          const ValuesOutputIterator values_output,
+                          const size_t               size,
+                          const BinaryFunction       scan_op           = BinaryFunction(),
+                          const KeyCompareFunction   key_compare_op    = KeyCompareFunction(),
+                          const cudaStream_t         stream            = 0,
+                          const bool                 debug_synchronous = false)
+{
+    // Element types of the two input ranges; together they pick the default tuning.
+    using keys_value_type   = typename std::iterator_traits<KeysInputIterator>::value_type;
+    using values_value_type = typename std::iterator_traits<ValuesInputIterator>::value_type;
+    // Architecture default for this key/value pair; default_or_custom_config falls
+    // back to it when Config is default_config.
+    using fallback_config = detail::
+        default_scan_by_key_config<ROCPRIM_TARGET_ARCH, keys_value_type, values_value_type>;
+    using selected_config = detail::default_or_custom_config<Config, fallback_config>;
+    // scan_by_key_impl always takes an initial value; this overload has none to
+    // offer, so a value-initialized values_value_type is passed.
+    return detail::scan_by_key_impl<false, selected_config>(
+        temporary_storage, storage_size,
+        keys_input, values_input, values_output,
+        values_value_type(),
+        size,
+        scan_op, key_compare_op,
+        stream, debug_synchronous);
+}
+/// \brief Parallel exclusive scan-by-key primitive for device level.
+///
+/// exclusive_scan_by_key function performs a device-wide exclusive prefix scan-by-key
+/// operation using binary \p scan_op operator.
+///
+/// \par Overview
+/// * Supports non-commutative scan operators. However, a scan operator should be
+/// associative. When used with non-associative functions the results may be non-deterministic
+/// and/or vary in precision.
+/// * Returns the required size of \p temporary_storage in \p storage_size
+/// if \p temporary_storage is a null pointer.
+/// * Ranges specified by \p keys_input, \p values_input, and \p values_output must have
+/// at least \p size elements.
+///
+/// \tparam Config - [optional] configuration of the primitive. It can be \p scan_config or
+/// a custom class with the same members.
+/// \tparam KeysInputIterator - random-access iterator type of the input range. It can be
+/// a simple pointer type.
+/// \tparam ValuesInputIterator - random-access iterator type of the input range. It can be
+/// a simple pointer type.
+/// \tparam ValuesOutputIterator - random-access iterator type of the output range. It can be
+/// a simple pointer type.
+/// \tparam InitValueType - type of the initial value.
+/// \tparam BinaryFunction - type of binary function used for scan. Default type
+/// is \p rocprim::plus<T>, where \p T is a \p value_type of \p ValuesInputIterator.
+/// \tparam KeyCompareFunction - type of binary function used to determine keys equality. Default type
+/// is \p rocprim::equal_to<T>, where \p T is a \p value_type of \p KeysInputIterator.
+///
+/// \param [in] temporary_storage - pointer to a device-accessible temporary storage. When
+/// a null pointer is passed, the required allocation size (in bytes) is written to
+/// \p storage_size and function returns without performing the scan operation.
+/// \param [in,out] storage_size - reference to a size (in bytes) of \p temporary_storage.
+/// \param [in] keys_input - iterator to the first element in the range of keys.
+/// \param [in] values_input - iterator to the first element in the range of values to scan.
+/// \param [out] values_output - iterator to the first element in the output value range.
+/// \param [in] initial_value - initial value to start the scan.
+/// A rocprim::future_value may be passed to use a value that will be later computed.
+/// \param [in] size - number of elements in the input range.
+/// \param [in] scan_op - binary operation function object that will be used for scanning
+/// input values.
+/// The signature of the function should be equivalent to the following:
+/// <tt>T f(const T &a, const T &b);</tt>. The signature does not need to have
+/// <tt>const &</tt>, but function object must not modify the objects passed to it.
+/// Default is BinaryFunction().
+/// \param [in] key_compare_op - binary operation function object that will be used to determine keys equality.
+/// The signature of the function should be equivalent to the following:
+/// <tt>bool f(const T &a, const T &b);</tt>. The signature does not need to have
+/// <tt>const &</tt>, but function object must not modify the objects passed to it.
+/// Default is KeyCompareFunction().
+/// \param [in] stream - [optional] HIP stream object. Default is \p 0 (default stream).
+/// \param [in] debug_synchronous - [optional] If true, synchronization after every kernel
+/// launch is forced in order to check for errors. Default value is \p false.
+///
+/// \returns \p cudaSuccess (\p 0) after successful scan; otherwise a HIP runtime error of
+/// type \p cudaError_t.
+///
+/// \par Example
+/// \parblock
+/// In this example a device-level exclusive sum-by-key operation is performed on an array of
+/// integer values (<tt>short</tt>s are scanned into <tt>int</tt>s).
+///
+/// \code{.cpp}
+/// #include <rocprim/rocprim.hpp>
+///
+/// // Prepare input and output (declare pointers, allocate device memory etc.)
+/// size_t size;           // e.g., 8
+/// int *   keys_input;    // e.g., [1, 1, 1, 2, 2, 3, 3, 4]
+/// short * values_input;  // e.g., [1, 2, 3, 4, 5, 6, 7, 8]
+/// int start_value;       // e.g., 9
+/// int *   values_output; // empty array of 8 elements
+///
+/// size_t temporary_storage_size_bytes;
+/// void * temporary_storage_ptr = nullptr;
+/// // Get required size of the temporary storage
+/// rocprim::exclusive_scan_by_key(
+///     temporary_storage_ptr, temporary_storage_size_bytes,
+///     keys_input, values_input,
+///     values_output, start_value,
+///     size,rocprim::plus<int>()
+/// );
+///
+/// // allocate temporary storage
+/// cudaMalloc(&temporary_storage_ptr, temporary_storage_size_bytes);
+///
+/// // perform scan-by-key
+/// rocprim::exclusive_scan_by_key(
+///     temporary_storage_ptr, temporary_storage_size_bytes,
+///     keys_input, values_input,
+///     values_output, start_value,
+///     size,rocprim::plus<int>()
+/// );
+/// // values_output: [9, 10, 12, 9, 13, 9, 15, 9]
+/// \endcode
+/// \endparblock
+template <typename Config = default_config,
+          typename KeysInputIterator,
+          typename ValuesInputIterator,
+          typename ValuesOutputIterator,
+          typename InitialValueType,
+          typename BinaryFunction
+          = ::rocprim::plus<typename std::iterator_traits<ValuesInputIterator>::value_type>,
+          typename KeyCompareFunction
+          = ::rocprim::equal_to<typename std::iterator_traits<KeysInputIterator>::value_type>>
+inline cudaError_t exclusive_scan_by_key(void* const                temporary_storage,
+                                        size_t&                    storage_size,
+                                        const KeysInputIterator    keys_input,
+                                        const ValuesInputIterator  values_input,
+                                        const ValuesOutputIterator values_output,
+                                        const InitialValueType     initial_value,
+                                        const size_t               size,
+                                        const BinaryFunction       scan_op = BinaryFunction(),
+                                        const KeyCompareFunction   key_compare_op
+                                        = KeyCompareFunction(),
+                                        const cudaStream_t stream            = 0,
+                                        const bool        debug_synchronous = false)
+{
+    // Element types that parameterize the default tuning configuration.
+    using keys_value_type = typename std::iterator_traits<KeysInputIterator>::value_type;
+    using init_value_type = detail::input_type_t<InitialValueType>;
+    // Configuration used when the caller leaves Config as default_config.
+    using fallback_config = detail::
+        default_scan_by_key_config<ROCPRIM_TARGET_ARCH, keys_value_type, init_value_type>;
+    using selected_config = detail::default_or_custom_config<Config, fallback_config>;
+    // All runtime arguments are forwarded untouched; the leading `true` template
+    // argument is presumably the "exclusive" switch of the shared implementation.
+    const cudaError_t status
+        = detail::scan_by_key_impl<true, selected_config>(temporary_storage,
+                                                          storage_size,
+                                                          keys_input,
+                                                          values_input,
+                                                          values_output,
+                                                          initial_value,
+                                                          size,
+                                                          scan_op,
+                                                          key_compare_op,
+                                                          stream,
+                                                          debug_synchronous);
+    return status;
+}
+/// @}
+// end of group devicemodule
+END_ROCPRIM_NAMESPACE
+#endif // ROCPRIM_DEVICE_DEVICE_SCAN_BY_KEY_HPP_
--- a/3rdparty/cub/rocprim/device/device_scan_by_key_config.hpp
+++ b/3rdparty/cub/rocprim/device/device_scan_by_key_config.hpp
+// Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved.
+//
+// Permission is hereby granted, free of charge, to any person obtaining a copy
+// of this software and associated documentation files (the "Software"), to deal
+// in the Software without restriction, including without limitation the rights
+// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+// copies of the Software, and to permit persons to whom the Software is
+// furnished to do so, subject to the following conditions:
+//
+// The above copyright notice and this permission notice shall be included in
+// all copies or substantial portions of the Software.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+// THE SOFTWARE.
+#ifndef ROCPRIM_DEVICE_DEVICE_SCAN_BY_KEY_CONFIG_HPP_
+#define ROCPRIM_DEVICE_DEVICE_SCAN_BY_KEY_CONFIG_HPP_
+#include <type_traits>
+#include "../config.hpp"
+#include "../detail/various.hpp"
+#include "config_types.hpp"
+/// \addtogroup primitivesmodule_deviceconfigs
+/// @{
+BEGIN_ROCPRIM_NAMESPACE
+/// \brief Configuration of device-level scan-by-key operation.
+///
+/// Every template argument is exposed unchanged as a static constexpr member.
+///
+/// \tparam BlockSize - number of threads in a block.
+/// \tparam ItemsPerThread - number of items processed by each thread.
+/// \tparam UseLookback - whether to use lookback scan or reduce-then-scan algorithm.
+/// \tparam BlockLoadMethod - method for loading input values.
+/// \tparam BlockStoreMethod - method for storing values.
+/// \tparam BlockScanMethod - algorithm for block scan.
+/// \tparam SizeLimit - limit on the number of items for a single scan kernel launch.
+/// Defaults to \p ROCPRIM_GRID_SIZE_LIMIT.
+template<
+    unsigned int BlockSize,
+    unsigned int ItemsPerThread,
+    bool UseLookback,
+    ::rocprim::block_load_method BlockLoadMethod,
+    ::rocprim::block_store_method BlockStoreMethod,
+    ::rocprim::block_scan_algorithm BlockScanMethod,
+    unsigned int SizeLimit = ROCPRIM_GRID_SIZE_LIMIT
+>
+struct scan_by_key_config
+{
+    /// \brief Number of threads in a block.
+    static constexpr unsigned int block_size = BlockSize;
+    /// \brief Number of items processed by each thread.
+    static constexpr unsigned int items_per_thread = ItemsPerThread;
+    /// \brief Whether to use lookback scan or reduce-then-scan algorithm.
+    static constexpr bool use_lookback = UseLookback;
+    /// \brief Method for loading input values.
+    static constexpr ::rocprim::block_load_method block_load_method = BlockLoadMethod;
+    /// \brief Method for storing values.
+    static constexpr ::rocprim::block_store_method block_store_method = BlockStoreMethod;
+    /// \brief Algorithm for block scan.
+    static constexpr ::rocprim::block_scan_algorithm block_scan_method = BlockScanMethod;
+    /// \brief Limit on the number of items for a single scan kernel launch.
+    static constexpr unsigned int size_limit = SizeLimit;
+};
+namespace detail
+{
+/// \brief Scan-by-key tuning parameters listed for architecture id 900.
+///
+/// \p item_scale is <tt>sizeof(Key) + sizeof(Value)</tt> divided by
+/// <tt>2 * sizeof(int)</tt> via \p ceiling_div. The items-per-thread argument is
+/// <tt>16 / item_scale</tt>, but never less than 1.
+///
+/// NOTE(review): \p type aliases \p scan_config, not the \p scan_by_key_config declared
+/// in this header - confirm this is intentional.
+template<class Key, class Value>
+struct scan_by_key_config_900
+{
+    static constexpr unsigned int item_scale =
+        ::rocprim::detail::ceiling_div<unsigned int>(sizeof(Key) + sizeof(Value), 2 * sizeof(int));
+    using type = scan_config<
+        limit_block_size<256U, sizeof(Key) + sizeof(Value), ROCPRIM_WARP_SIZE_64>::value,
+        ::rocprim::max(1u, 16u / item_scale),
+        ROCPRIM_DETAIL_USE_LOOKBACK_SCAN,
+        ::rocprim::block_load_method::block_load_transpose,
+        ::rocprim::block_store_method::block_store_transpose,
+        ::rocprim::block_scan_algorithm::using_warp_scan
+    >;
+};
+/// \brief Scan-by-key tuning parameters listed for \p ROCPRIM_ARCH_90a.
+///
+/// \p item_scale is <tt>sizeof(Key) + sizeof(Value)</tt> divided by
+/// <tt>2 * sizeof(int)</tt> via \p ceiling_div. The items-per-thread argument is
+/// <tt>16 / item_scale</tt>, but never less than 1.
+///
+/// NOTE(review): \p type aliases \p scan_config, not the \p scan_by_key_config declared
+/// in this header - confirm this is intentional.
+template<class Key, class Value>
+struct scan_by_key_config_90a
+{
+    static constexpr unsigned int item_scale =
+        ::rocprim::detail::ceiling_div<unsigned int>(sizeof(Key) + sizeof(Value), 2 * sizeof(int));
+    using type = scan_config<
+        limit_block_size<256U, sizeof(Key) + sizeof(Value), ROCPRIM_WARP_SIZE_64>::value,
+        ::rocprim::max(1u, 16u / item_scale),
+        ROCPRIM_DETAIL_USE_LOOKBACK_SCAN,
+        ::rocprim::block_load_method::block_load_transpose,
+        ::rocprim::block_store_method::block_store_transpose,
+        ::rocprim::block_scan_algorithm::using_warp_scan
+    >;
+};
+/// \brief Scan-by-key tuning parameters listed for architecture id 908.
+///
+/// \p item_scale is <tt>sizeof(Key) + sizeof(Value)</tt> divided by
+/// <tt>2 * sizeof(int)</tt> via \p ceiling_div. The items-per-thread argument is
+/// <tt>20 / item_scale</tt>, but never less than 1.
+///
+/// NOTE(review): \p type aliases \p scan_config, not the \p scan_by_key_config declared
+/// in this header - confirm this is intentional.
+template<class Key, class Value>
+struct scan_by_key_config_908
+{
+    static constexpr unsigned int item_scale =
+        ::rocprim::detail::ceiling_div<unsigned int>(sizeof(Key) + sizeof(Value), 2 * sizeof(int));
+    using type = scan_config<
+        limit_block_size<256U, sizeof(Key) + sizeof(Value), ROCPRIM_WARP_SIZE_64>::value,
+        ::rocprim::max(1u, 20u / item_scale),
+        ROCPRIM_DETAIL_USE_LOOKBACK_SCAN,
+        ::rocprim::block_load_method::block_load_transpose,
+        ::rocprim::block_store_method::block_store_transpose,
+        ::rocprim::block_scan_algorithm::using_warp_scan
+    >;
+};
+// TODO: We need to update these parameters
+/// \brief Scan-by-key tuning parameters listed for architecture id 1030.
+///
+/// Unlike the other scan-by-key tuning structs in this header, the block size is limited
+/// with \p ROCPRIM_WARP_SIZE_32. \p item_scale is <tt>sizeof(Key) + sizeof(Value)</tt>
+/// divided by <tt>2 * sizeof(int)</tt> via \p ceiling_div, and the items-per-thread
+/// argument is <tt>15 / item_scale</tt>, but never less than 1.
+///
+/// NOTE(review): \p type aliases \p scan_config, not the \p scan_by_key_config declared
+/// in this header - confirm this is intentional.
+template<class Key, class Value>
+struct scan_by_key_config_1030
+{
+    static constexpr unsigned int item_scale =
+        ::rocprim::detail::ceiling_div<unsigned int>(sizeof(Key) + sizeof(Value), 2 * sizeof(int));
+    using type = scan_config<
+        limit_block_size<256U, sizeof(Key) + sizeof(Value), ROCPRIM_WARP_SIZE_32>::value,
+        ::rocprim::max(1u, 15u / item_scale),
+        ROCPRIM_DETAIL_USE_LOOKBACK_SCAN,
+        ::rocprim::block_load_method::block_load_transpose,
+        ::rocprim::block_store_method::block_store_transpose,
+        ::rocprim::block_scan_algorithm::using_warp_scan
+    >;
+};
+/// \brief Chooses the scan-by-key tuning struct for \p TargetArch through \p select_arch.
+///
+/// Explicit cases exist for 900, \p ROCPRIM_ARCH_90a, 908 and 1030. The trailing
+/// \p scan_by_key_config_900 argument is presumably the fallback when no case matches
+/// (\p select_arch is defined elsewhere - confirm).
+///
+/// \tparam TargetArch - architecture id to look up.
+/// \tparam Key - key type forwarded to the tuning structs.
+/// \tparam Value - value type forwarded to the tuning structs.
+template<unsigned int TargetArch, class Key, class Value>
+struct default_scan_by_key_config
+    : select_arch<
+        TargetArch,
+        select_arch_case<900, scan_by_key_config_900<Key, Value>>,
+        select_arch_case<ROCPRIM_ARCH_90a, scan_by_key_config_90a<Key, Value>>,
+        select_arch_case<908, scan_by_key_config_908<Key, Value>>,
+        select_arch_case<1030, scan_by_key_config_1030<Key, Value>>,
+        scan_by_key_config_900<Key, Value>
+    > { };
+} // end namespace detail
+END_ROCPRIM_NAMESPACE
+/// @}
+// end of group primitivesmodule_deviceconfigs
+#endif // ROCPRIM_DEVICE_DEVICE_SCAN_BY_KEY_CONFIG_HPP_
--- a/3rdparty/cub/rocprim/device/device_scan_config.hpp
+++ b/3rdparty/cub/rocprim/device/device_scan_config.hpp
+// Copyright (c) 2018-2022 Advanced Micro Devices, Inc. All rights reserved.
+//
+// Permission is hereby granted, free of charge, to any person obtaining a copy
+// of this software and associated documentation files (the "Software"), to deal
+// in the Software without restriction, including without limitation the rights
+// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+// copies of the Software, and to permit persons to whom the Software is
+// furnished to do so, subject to the following conditions:
+//
+// The above copyright notice and this permission notice shall be included in
+// all copies or substantial portions of the Software.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+// THE SOFTWARE.
+#ifndef ROCPRIM_DEVICE_DEVICE_SCAN_CONFIG_HPP_
+#define ROCPRIM_DEVICE_DEVICE_SCAN_CONFIG_HPP_
+#include <type_traits>
+#include "../config.hpp"
+#include "../detail/various.hpp"
+#include "../block/block_load.hpp"
+#include "../block/block_store.hpp"
+#include "../block/block_scan.hpp"
+#include "config_types.hpp"
+/// \addtogroup primitivesmodule_deviceconfigs
+/// @{
+BEGIN_ROCPRIM_NAMESPACE
+/// \brief Configuration of device-level scan primitives.
+///
+/// Every template argument is exposed unchanged as a static constexpr member.
+///
+/// \tparam BlockSize - number of threads in a block.
+/// \tparam ItemsPerThread - number of items processed by each thread.
+/// \tparam UseLookback - whether to use lookback scan or reduce-then-scan algorithm.
+/// \tparam BlockLoadMethod - method for loading input values.
+/// \tparam BlockStoreMethod - method for storing values.
+/// \tparam BlockScanMethod - algorithm for block scan.
+/// \tparam SizeLimit - limit on the number of items for a single scan kernel launch.
+/// Defaults to \p ROCPRIM_GRID_SIZE_LIMIT.
+template<
+    unsigned int BlockSize,
+    unsigned int ItemsPerThread,
+    bool UseLookback,
+    ::rocprim::block_load_method BlockLoadMethod,
+    ::rocprim::block_store_method BlockStoreMethod,
+    ::rocprim::block_scan_algorithm BlockScanMethod,
+    unsigned int SizeLimit = ROCPRIM_GRID_SIZE_LIMIT
+>
+struct scan_config
+{
+    /// \brief Number of threads in a block.
+    static constexpr unsigned int block_size = BlockSize;
+    /// \brief Number of items processed by each thread.
+    static constexpr unsigned int items_per_thread = ItemsPerThread;
+    /// \brief Whether to use lookback scan or reduce-then-scan algorithm.
+    static constexpr bool use_lookback = UseLookback;
+    /// \brief Method for loading input values.
+    static constexpr ::rocprim::block_load_method block_load_method = BlockLoadMethod;
+    /// \brief Method for storing values.
+    static constexpr ::rocprim::block_store_method block_store_method = BlockStoreMethod;
+    /// \brief Algorithm for block scan.
+    static constexpr ::rocprim::block_scan_algorithm block_scan_method = BlockScanMethod;
+    /// \brief Limit on the number of items for a single scan kernel launch.
+    static constexpr unsigned int size_limit = SizeLimit;
+};
+namespace detail
+{
+/// \brief Scan tuning parameters listed for architecture id 803.
+///
+/// \p item_scale is <tt>sizeof(Value)</tt> divided by <tt>sizeof(int)</tt> via
+/// \p ceiling_div. The items-per-thread argument is <tt>16 / item_scale</tt>, but never
+/// less than 1.
+template<class Value>
+struct scan_config_803
+{
+    static constexpr unsigned int item_scale =
+        ::rocprim::detail::ceiling_div<unsigned int>(sizeof(Value), sizeof(int));
+    using type = scan_config<
+        limit_block_size<256U, sizeof(Value), ROCPRIM_WARP_SIZE_64>::value,
+        ::rocprim::max(1u, 16u / item_scale),
+        ROCPRIM_DETAIL_USE_LOOKBACK_SCAN,
+        ::rocprim::block_load_method::block_load_transpose,
+        ::rocprim::block_store_method::block_store_transpose,
+        ::rocprim::block_scan_algorithm::using_warp_scan
+    >;
+};
+/// \brief Scan tuning parameters listed for architecture id 900; also passed as the
+/// trailing argument of \p default_scan_config.
+///
+/// \p item_scale is <tt>sizeof(Value)</tt> divided by <tt>sizeof(int)</tt> via
+/// \p ceiling_div. The items-per-thread argument is <tt>16 / item_scale</tt>, but never
+/// less than 1.
+template<class Value>
+struct scan_config_900
+{
+    static constexpr unsigned int item_scale =
+        ::rocprim::detail::ceiling_div<unsigned int>(sizeof(Value), sizeof(int));
+    using type = scan_config<
+        limit_block_size<256U, sizeof(Value), ROCPRIM_WARP_SIZE_64>::value,
+        ::rocprim::max(1u, 16u / item_scale),
+        ROCPRIM_DETAIL_USE_LOOKBACK_SCAN,
+        ::rocprim::block_load_method::block_load_transpose,
+        ::rocprim::block_store_method::block_store_transpose,
+        ::rocprim::block_scan_algorithm::using_warp_scan
+    >;
+};
+// TODO: We need to update these parameters
+/// \brief Scan tuning parameters listed for \p ROCPRIM_ARCH_90a.
+///
+/// The arguments are currently the same as those of \p scan_config_900:
+/// \p item_scale is <tt>sizeof(Value)</tt> divided by <tt>sizeof(int)</tt> via
+/// \p ceiling_div, and the items-per-thread argument is <tt>16 / item_scale</tt>, but
+/// never less than 1.
+template<class Value>
+struct scan_config_90a
+{
+    static constexpr unsigned int item_scale =
+        ::rocprim::detail::ceiling_div<unsigned int>(sizeof(Value), sizeof(int));
+    using type = scan_config<
+        limit_block_size<256U, sizeof(Value), ROCPRIM_WARP_SIZE_64>::value,
+        ::rocprim::max(1u, 16u / item_scale),
+        ROCPRIM_DETAIL_USE_LOOKBACK_SCAN,
+        ::rocprim::block_load_method::block_load_transpose,
+        ::rocprim::block_store_method::block_store_transpose,
+        ::rocprim::block_scan_algorithm::using_warp_scan
+    >;
+};
+/// \brief Scan tuning parameters listed for architecture id 908.
+///
+/// \p item_scale is <tt>sizeof(Value)</tt> divided by <tt>sizeof(int)</tt> via
+/// \p ceiling_div. The items-per-thread argument is <tt>20 / item_scale</tt>, but never
+/// less than 1.
+template<class Value>
+struct scan_config_908
+{
+    static constexpr unsigned int item_scale =
+        ::rocprim::detail::ceiling_div<unsigned int>(sizeof(Value), sizeof(int));
+    using type = scan_config<
+        limit_block_size<256U, sizeof(Value), ROCPRIM_WARP_SIZE_64>::value,
+        ::rocprim::max(1u, 20u / item_scale),
+        ROCPRIM_DETAIL_USE_LOOKBACK_SCAN,
+        ::rocprim::block_load_method::block_load_transpose,
+        ::rocprim::block_store_method::block_store_transpose,
+        ::rocprim::block_scan_algorithm::using_warp_scan
+    >;
+};
+// TODO: We need to update these parameters
+/// \brief Scan tuning parameters listed for architecture id 1030.
+///
+/// Unlike the other scan tuning structs in this header, the block size is limited with
+/// \p ROCPRIM_WARP_SIZE_32. \p item_scale is <tt>sizeof(Value)</tt> divided by
+/// <tt>sizeof(int)</tt> via \p ceiling_div, and the items-per-thread argument is
+/// <tt>15 / item_scale</tt>, but never less than 1.
+template<class Value>
+struct scan_config_1030
+{
+    static constexpr unsigned int item_scale =
+        ::rocprim::detail::ceiling_div<unsigned int>(sizeof(Value), sizeof(int));
+    using type = scan_config<
+        limit_block_size<256U, sizeof(Value), ROCPRIM_WARP_SIZE_32>::value,
+        ::rocprim::max(1u, 15u / item_scale),
+        ROCPRIM_DETAIL_USE_LOOKBACK_SCAN,
+        ::rocprim::block_load_method::block_load_transpose,
+        ::rocprim::block_store_method::block_store_transpose,
+        ::rocprim::block_scan_algorithm::using_warp_scan
+    >;
+};
+/// \brief Chooses the scan tuning struct for \p TargetArch through \p select_arch.
+///
+/// Explicit cases exist for 803, 900, \p ROCPRIM_ARCH_90a, 908 and 1030. The trailing
+/// \p scan_config_900 argument is presumably the fallback when no case matches
+/// (\p select_arch is defined elsewhere - confirm).
+///
+/// \tparam TargetArch - architecture id to look up.
+/// \tparam Value - value type forwarded to the tuning structs.
+template<unsigned int TargetArch, class Value>
+struct default_scan_config
+    : select_arch<
+        TargetArch,
+        select_arch_case<803, scan_config_803<Value>>,
+        select_arch_case<900, scan_config_900<Value>>,
+        select_arch_case<ROCPRIM_ARCH_90a, scan_config_90a<Value>>,
+        select_arch_case<908, scan_config_908<Value>>,
+        select_arch_case<1030, scan_config_1030<Value>>,
+        scan_config_900<Value>
+    > { };
+} // end namespace detail
+END_ROCPRIM_NAMESPACE
+/// @}
+// end of group primitivesmodule_deviceconfigs
+#endif // ROCPRIM_DEVICE_DEVICE_SCAN_CONFIG_HPP_
--- a/3rdparty/cub/rocprim/device/device_segmented_radix_sort.hpp
+++ b/3rdparty/cub/rocprim/device/device_segmented_radix_sort.hpp
+// Copyright (c) 2017-2022 Advanced Micro Devices, Inc. All rights reserved.
+//
+// Permission is hereby granted, free of charge, to any person obtaining a copy
+// of this software and associated documentation files (the "Software"), to deal
+// in the Software without restriction, including without limitation the rights
+// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+// copies of the Software, and to permit persons to whom the Software is
+// furnished to do so, subject to the following conditions:
+//
+// The above copyright notice and this permission notice shall be included in
+// all copies or substantial portions of the Software.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+// THE SOFTWARE.
+#ifndef ROCPRIM_DEVICE_DEVICE_SEGMENTED_RADIX_SORT_HPP_
+#define ROCPRIM_DEVICE_DEVICE_SEGMENTED_RADIX_SORT_HPP_
+#include <chrono>
+#include <iostream>
+#include <iterator>
+#include <type_traits>
+#include <utility>
+#include "../config.hpp"
+#include "../detail/various.hpp"
+#include "../detail/radix_sort.hpp"
+#include "../intrinsics.hpp"
+#include "../functional.hpp"
+#include "../types.hpp"
+#include "../block/block_load.hpp"
+#include "../iterator/counting_iterator.hpp"
+#include "../iterator/reverse_iterator.hpp"
+#include "detail/device_segmented_radix_sort.hpp"
+#include "device_partition.hpp"
+#include "device_segmented_radix_sort_config.hpp"
+/// \addtogroup devicemodule
+/// @{
+BEGIN_ROCPRIM_NAMESPACE
+namespace detail
+{
+/// \brief Kernel entry point that forwards every argument, in order, to
+/// \p segmented_sort<Config, Descending>.
+///
+/// \p BlockSize is used only in \p __launch_bounds__; it is not passed on.
+template<
+    class Config,
+    bool Descending,
+    unsigned int BlockSize,
+    class KeysInputIterator,
+    class KeysOutputIterator,
+    class ValuesInputIterator,
+    class ValuesOutputIterator,
+    class OffsetIterator
+>
+ROCPRIM_KERNEL
+__launch_bounds__(BlockSize)
+void segmented_sort_kernel(KeysInputIterator keys_input,
+                           typename std::iterator_traits<KeysInputIterator>::value_type * keys_tmp,
+                           KeysOutputIterator keys_output,
+                           ValuesInputIterator values_input,
+                           typename std::iterator_traits<ValuesInputIterator>::value_type * values_tmp,
+                           ValuesOutputIterator values_output,
+                           bool to_output,
+                           OffsetIterator begin_offsets,
+                           OffsetIterator end_offsets,
+                           unsigned int long_iterations,
+                           unsigned int short_iterations,
+                           unsigned int begin_bit,
+                           unsigned int end_bit)
+{
+    segmented_sort<Config, Descending>(
+        keys_input, keys_tmp, keys_output, values_input, values_tmp, values_output,
+        to_output,
+        begin_offsets, end_offsets,
+        long_iterations, short_iterations,
+        begin_bit, end_bit
+    );
+}
+/// \brief Kernel entry point that forwards every argument, in order, to
+/// \p segmented_sort_large<Config, Descending>.
+///
+/// Compared with \p segmented_sort_kernel it takes an additional \p segment_indices
+/// iterator. \p BlockSize is used only in \p __launch_bounds__; it is not passed on.
+template<
+    class Config,
+    bool Descending,
+    unsigned int BlockSize,
+    class KeysInputIterator,
+    class KeysOutputIterator,
+    class ValuesInputIterator,
+    class ValuesOutputIterator,
+    class SegmentIndexIterator,
+    class OffsetIterator
+>
+ROCPRIM_KERNEL
+__launch_bounds__(BlockSize)
+void segmented_sort_large_kernel(KeysInputIterator keys_input,
+                                 typename std::iterator_traits<KeysInputIterator>::value_type * keys_tmp,
+                                 KeysOutputIterator keys_output,
+                                 ValuesInputIterator values_input,
+                                 typename std::iterator_traits<ValuesInputIterator>::value_type * values_tmp,
+                                 ValuesOutputIterator values_output,
+                                 bool to_output,
+                                 SegmentIndexIterator segment_indices,
+                                 OffsetIterator begin_offsets,
+                                 OffsetIterator end_offsets,
+                                 unsigned int long_iterations,
+                                 unsigned int short_iterations,
+                                 unsigned int begin_bit,
+                                 unsigned int end_bit)
+{
+    segmented_sort_large<Config, Descending>(
+        keys_input, keys_tmp, keys_output, values_input, values_tmp, values_output,
+        to_output, segment_indices,
+        begin_offsets, end_offsets,
+        long_iterations, short_iterations,
+        begin_bit, end_bit
+    );
+}
+/// \brief Kernel entry point that forwards every argument, in order, to
+/// \p segmented_sort_small<Config, Descending>.
+///
+/// It takes \p num_segments and \p segment_indices and has no long/short iteration
+/// counts. \p BlockSize is used only in \p __launch_bounds__; it is not passed on.
+template<class Config,
+         bool         Descending,
+         unsigned int BlockSize,
+         class KeysInputIterator,
+         class KeysOutputIterator,
+         class ValuesInputIterator,
+         class ValuesOutputIterator,
+         class SegmentIndexIterator,
+         class OffsetIterator>
+ROCPRIM_KERNEL __launch_bounds__(BlockSize) void segmented_sort_small_or_medium_kernel(
+    KeysInputIterator                                               keys_input,
+    typename std::iterator_traits<KeysInputIterator>::value_type*   keys_tmp,
+    KeysOutputIterator                                              keys_output,
+    ValuesInputIterator                                             values_input,
+    typename std::iterator_traits<ValuesInputIterator>::value_type* values_tmp,
+    ValuesOutputIterator                                            values_output,
+    bool                                                            to_output,
+    unsigned int                                                    num_segments,
+    SegmentIndexIterator                                            segment_indices,
+    OffsetIterator                                                  begin_offsets,
+    OffsetIterator                                                  end_offsets,
+    unsigned int                                                    begin_bit,
+    unsigned int                                                    end_bit)
+{
+    segmented_sort_small<Config, Descending>(
+        keys_input, keys_tmp, keys_output, values_input, values_tmp, values_output,
+        to_output, num_segments, segment_indices,
+        begin_offsets, end_offsets,
+        begin_bit, end_bit
+    );
+}
+#define ROCPRIM_DETAIL_HIP_SYNC_AND_RETURN_ON_ERROR(name, size, start) \
+    { \
+        auto _error = cudaGetLastError(); \
+        if(_error != cudaSuccess) return _error; \
+        if(debug_synchronous) \
+        { \
+            std::cout << name << "(" << size << ")"; \
+            auto __error = cudaStreamSynchronize(stream); \
+            if(__error != cudaSuccess) return __error; \
+            auto _end = std::chrono::high_resolution_clock::now(); \
+            auto _d = std::chrono::duration_cast<std::chrono::duration<double>>(_end - start); \
+            std::cout << " " << _d.count() * 1000 << " ms" << '\n'; \
+        } \
+    }
+/// \brief Function object with the same call signature as \p ThreeWayPartitioner that
+/// performs a two-way split by calling \p partition.
+///
+/// The second-part output, the unselected output and the second predicate are accepted
+/// but ignored, so both partitioners can be invoked with an identical argument list.
+struct TwoWayPartitioner
+{
+    /// \brief Forwards storage, input, first output, count output, size, first predicate,
+    /// stream and debug flag to \p partition and returns its result.
+    template<typename InputIterator,
+             typename FirstOutputIterator,
+             typename SecondOutputIterator,
+             typename UnselectedOutputIterator,
+             typename SelectedCountOutputIterator,
+             typename FirstUnaryPredicate,
+             typename SecondUnaryPredicate>
+    cudaError_t operator()(void*               temporary_storage,
+                          size_t&             storage_size,
+                          InputIterator       input,
+                          FirstOutputIterator output_first_part,
+                          SecondOutputIterator /*output_second_part*/,
+                          UnselectedOutputIterator /*output_unselected*/,
+                          SelectedCountOutputIterator selected_count_output,
+                          const size_t                size,
+                          FirstUnaryPredicate         select_first_part_op,
+                          SecondUnaryPredicate /*select_second_part_op*/,
+                          const cudaStream_t stream,
+                          const bool        debug_synchronous)
+    {
+        return partition(temporary_storage,
+                         storage_size,
+                         input,
+                         output_first_part,
+                         selected_count_output,
+                         size,
+                         select_first_part_op,
+                         stream,
+                         debug_synchronous);
+    }
+};
+/// \brief Function object that performs a three-way split by calling
+/// \p partition_three_way with all of its arguments.
+struct ThreeWayPartitioner
+{
+    /// \brief Forwards every argument, in order, to \p partition_three_way and returns
+    /// its result.
+    template<typename InputIterator,
+             typename FirstOutputIterator,
+             typename SecondOutputIterator,
+             typename UnselectedOutputIterator,
+             typename SelectedCountOutputIterator,
+             typename FirstUnaryPredicate,
+             typename SecondUnaryPredicate>
+    cudaError_t operator()(void*                       temporary_storage,
+                          size_t&                     storage_size,
+                          InputIterator               input,
+                          FirstOutputIterator         output_first_part,
+                          SecondOutputIterator        output_second_part,
+                          UnselectedOutputIterator    output_unselected,
+                          SelectedCountOutputIterator selected_count_output,
+                          const size_t                size,
+                          FirstUnaryPredicate         select_first_part_op,
+                          SecondUnaryPredicate        select_second_part_op,
+                          const cudaStream_t           stream,
+                          const bool                  debug_synchronous)
+    {
+        return partition_three_way(temporary_storage,
+                                   storage_size,
+                                   input,
+                                   output_first_part,
+                                   output_second_part,
+                                   output_unselected,
+                                   selected_count_output,
+                                   size,
+                                   select_first_part_op,
+                                   select_second_part_op,
+                                   stream,
+                                   debug_synchronous);
+    }
+};
+template<
+    class Config,
+    bool Descending,
+    class KeysInputIterator,
+    class KeysOutputIterator,
+    class ValuesInputIterator,
+    class ValuesOutputIterator,
+    class OffsetIterator
+>
+inline
+cudaError_t segmented_radix_sort_impl(void * temporary_storage,
+                                     size_t& storage_size,
+                                     KeysInputIterator keys_input,
+                                     typename std::iterator_traits<KeysInputIterator>::value_type * keys_tmp,
+                                     KeysOutputIterator keys_output,
+                                     ValuesInputIterator values_input,
+                                     typename std::iterator_traits<ValuesInputIterator>::value_type * values_tmp,
+                                     ValuesOutputIterator values_output,
+                                     unsigned int size,
+                                     bool& is_result_in_output,
+                                     unsigned int segments,
+                                     OffsetIterator begin_offsets,
+                                     OffsetIterator end_offsets,
+                                     unsigned int begin_bit,
+                                     unsigned int end_bit,
+                                     cudaStream_t stream,
+                                     bool debug_synchronous)
+{
+    using key_type = typename std::iterator_traits<KeysInputIterator>::value_type;
+    using value_type = typename std::iterator_traits<ValuesInputIterator>::value_type;
+    using segment_index_type = unsigned int;
+    using segment_index_iterator = counting_iterator<segment_index_type>;
+    static_assert(
+        std::is_same<key_type, typename std::iterator_traits<KeysOutputIterator>::value_type>::value,
+        "KeysInputIterator and KeysOutputIterator must have the same value_type"
+    );
+    static_assert(
+        std::is_same<value_type, typename std::iterator_traits<ValuesOutputIterator>::value_type>::value,
+        "ValuesInputIterator and ValuesOutputIterator must have the same value_type"
+    );
+    using config = default_or_custom_config<
+        Config,
+        default_segmented_radix_sort_config<ROCPRIM_TARGET_ARCH, key_type, value_type>
+    >;
+    static constexpr bool with_values = !std::is_same<value_type, ::rocprim::empty_type>::value;
+    static constexpr bool partitioning_allowed =
+        !std::is_same<typename config::warp_sort_config, DisabledWarpSortConfig>::value;
+    static constexpr unsigned int max_small_segment_length
+        = config::warp_sort_config::items_per_thread_small
+          * config::warp_sort_config::logical_warp_size_small;
+    static constexpr unsigned int small_segments_per_block
+        = config::warp_sort_config::block_size_small
+          / config::warp_sort_config::logical_warp_size_small;
+    static constexpr unsigned int max_medium_segment_length
+        = config::warp_sort_config::items_per_thread_medium
+          * config::warp_sort_config::logical_warp_size_medium;
+    static constexpr unsigned int medium_segments_per_block
+        = config::warp_sort_config::block_size_medium
+          / config::warp_sort_config::logical_warp_size_medium;
+    static_assert(
+        max_small_segment_length <= max_medium_segment_length,
+        "The max length of small segments cannot be higher than the max length of medium segments");
+    // Don't waste cycles on 3-way partitioning, if the small and medium segments are equal length
+    static constexpr bool three_way_partitioning
+        = max_small_segment_length < max_medium_segment_length;
+    using partitioner_type
+        = std::conditional_t<three_way_partitioning, ThreeWayPartitioner, TwoWayPartitioner>;
+    partitioner_type partitioner;
+    const auto large_segment_selector = [=](const unsigned int segment_index) mutable -> bool
+    {
+        const unsigned int segment_length
+            = end_offsets[segment_index] - begin_offsets[segment_index];
+        return segment_length > max_medium_segment_length;
+    };
+    const auto medium_segment_selector = [=](const unsigned int segment_index) mutable -> bool
+    {
+        const unsigned int segment_length = end_offsets[segment_index] - begin_offsets[segment_index];
+        return segment_length > max_small_segment_length;
+    };
+    const bool with_double_buffer = keys_tmp != nullptr;
+    const unsigned int bits = end_bit - begin_bit;
+    const unsigned int iterations = ::rocprim::detail::ceiling_div(bits, config::long_radix_bits);
+    const bool to_output = with_double_buffer || (iterations - 1) % 2 == 0;
+    is_result_in_output = (iterations % 2 == 0) != to_output;
+    const unsigned int radix_bits_diff = config::long_radix_bits - config::short_radix_bits;
+    const unsigned int short_iterations = radix_bits_diff != 0
+        ? ::rocprim::min(iterations, (config::long_radix_bits * iterations - bits) / radix_bits_diff)
+        : 0;
+    const unsigned int long_iterations = iterations - short_iterations;
+    const bool do_partitioning = partitioning_allowed
+        && segments >= config::warp_sort_config::partitioning_threshold;
+    const size_t keys_bytes = ::rocprim::detail::align_size(size * sizeof(key_type));
+    const size_t values_bytes = with_values ? ::rocprim::detail::align_size(size * sizeof(value_type)) : 0;
+    const size_t large_and_small_segment_indices_bytes
+        = ::rocprim::detail::align_size(segments * sizeof(segment_index_type));
+    const size_t medium_segment_indices_bytes
+        = three_way_partitioning
+              ? ::rocprim::detail::align_size(segments * sizeof(segment_index_type))
+              : 0;
+    static constexpr size_t segment_count_output_size = three_way_partitioning ? 2 : 1;
+    const size_t            segment_count_output_bytes
+        = ::rocprim::detail::align_size(segment_count_output_size * sizeof(segment_index_type));
+    segment_index_type* large_segment_indices_output{};
+    // The total number of large and small segments is not above the number of segments
+    // The same buffer is filled with the large and small indices from both directions
+    auto small_segment_indices_output
+        = make_reverse_iterator(large_segment_indices_output + segments);
+    segment_index_type* medium_segment_indices_output{};
+    segment_index_type* segment_count_output{};
+    size_t              partition_storage_size{};
+    void*               partition_temporary_storage{};
+    if(temporary_storage == nullptr)
+    {
+        storage_size = with_double_buffer ? 0 : (keys_bytes + values_bytes);
+        if(do_partitioning)
+        {
+            storage_size += large_and_small_segment_indices_bytes;
+            storage_size += medium_segment_indices_bytes;
+            storage_size += segment_count_output_bytes;
+            const auto partition_result = partitioner(partition_temporary_storage,
+                                                      partition_storage_size,
+                                                      segment_index_iterator{},
+                                                      large_segment_indices_output,
+                                                      medium_segment_indices_output,
+                                                      small_segment_indices_output,
+                                                      segment_count_output,
+                                                      segments,
+                                                      large_segment_selector,
+                                                      medium_segment_selector,
+                                                      stream,
+                                                      debug_synchronous);
+            if(cudaSuccess != partition_result)
+            {
+                return partition_result;
+            }
+            storage_size += partition_storage_size;
+        }
+        // Make sure user won't try to allocate 0 bytes memory, otherwise
+        // user may again pass nullptr as temporary_storage
+        storage_size = storage_size == 0 ? 4 : storage_size;
+        return cudaSuccess;
+    }
+    if(segments == 0u)
+    {
+        return cudaSuccess;
+    }
+    if(debug_synchronous)
+    {
+        std::cout << "begin_bit " << begin_bit << '\n';
+        std::cout << "end_bit " << end_bit << '\n';
+        std::cout << "bits " << bits << '\n';
+        std::cout << "segments " << segments << '\n';
+        std::cout << "radix_bits_diff " << radix_bits_diff << '\n';
+        std::cout << "storage_size " << storage_size << '\n';
+        std::cout << "iterations " << iterations << '\n';
+        std::cout << "long_iterations " << long_iterations << '\n';
+        std::cout << "short_iterations " << short_iterations << '\n';
+        std::cout << "do_partitioning " << do_partitioning << '\n';
+        std::cout << "config::sort::block_size: " << config::sort::block_size << '\n';
+        std::cout << "config::sort::items_per_thread: " << config::sort::items_per_thread << '\n';
+        cudaError_t error = cudaStreamSynchronize(stream);
+        if(error != cudaSuccess) return error;
+    }
+    char* ptr = reinterpret_cast<char*>(temporary_storage);
+    if(!with_double_buffer)
+    {
+        keys_tmp = reinterpret_cast<key_type*>(ptr);
+        ptr += keys_bytes;
+        values_tmp = with_values ? reinterpret_cast<value_type*>(ptr) : nullptr;
+        ptr += values_bytes;
+    }
+    large_segment_indices_output = reinterpret_cast<segment_index_type*>(ptr);
+    ptr += large_and_small_segment_indices_bytes;
+    medium_segment_indices_output = reinterpret_cast<segment_index_type*>(ptr);
+    ptr += medium_segment_indices_bytes;
+    small_segment_indices_output = make_reverse_iterator(large_segment_indices_output + segments);
+    segment_count_output         = reinterpret_cast<segment_index_type*>(ptr);
+    ptr += segment_count_output_bytes;
+    partition_temporary_storage = ptr;
+    ptr += partition_storage_size;
+    if(do_partitioning)
+    {
+        cudaError_t result = partitioner(partition_temporary_storage,
+                                        partition_storage_size,
+                                        segment_index_iterator{},
+                                        large_segment_indices_output,
+                                        medium_segment_indices_output,
+                                        small_segment_indices_output,
+                                        segment_count_output,
+                                        segments,
+                                        large_segment_selector,
+                                        medium_segment_selector,
+                                        stream,
+                                        debug_synchronous);
+        if(cudaSuccess != result)
+        {
+            return result;
+        }
+        segment_index_type segment_counts[segment_count_output_size]{};
+        result = cudaMemcpyAsync(&segment_counts,
+                                segment_count_output,
+                                segment_count_output_bytes,
+                                cudaMemcpyDeviceToHost,
+                                stream);
+        if(cudaSuccess != result)
+        {
+            return result;
+        }
+        result = cudaStreamSynchronize(stream);
+        if(cudaSuccess != result)
+        {
+            return result;
+        }
+        const auto large_segment_count  = segment_counts[0];
+        const auto medium_segment_count = three_way_partitioning ? segment_counts[1] : 0;
+        const auto small_segment_count  = segments - large_segment_count - medium_segment_count;
+        if(debug_synchronous)
+        {
+            std::cout << "large_segment_count " << large_segment_count << '\n';
+            std::cout << "medium_segment_count " << medium_segment_count << '\n';
+            std::cout << "small_segment_count " << small_segment_count << '\n';
+        }
+        if(large_segment_count > 0)
+        {
+            std::chrono::high_resolution_clock::time_point start;
+            if(debug_synchronous) start = std::chrono::high_resolution_clock::now();
+            segmented_sort_large_kernel<config, Descending, config::sort::block_size>
+                <<<dim3(large_segment_count), dim3(config::sort::block_size), 0, stream>>>(
+                keys_input, keys_tmp, keys_output, values_input, values_tmp, values_output,
+                to_output, large_segment_indices_output,
+                begin_offsets, end_offsets,
+                long_iterations, short_iterations,
+                begin_bit, end_bit
+            );
+            ROCPRIM_DETAIL_HIP_SYNC_AND_RETURN_ON_ERROR("segmented_sort:large_segments",
+                                                        large_segment_count,
+                                                        start)
+        }
+        if(three_way_partitioning && medium_segment_count > 0)
+        {
+            const auto medium_segment_grid_size
+                = ::rocprim::detail::ceiling_div(medium_segment_count, medium_segments_per_block);
+            std::chrono::high_resolution_clock::time_point start;
+            if(debug_synchronous)
+                start = std::chrono::high_resolution_clock::now();
+                    segmented_sort_small_or_medium_kernel<
+                        select_warp_sort_helper_config_medium_t<typename config::warp_sort_config>,
+                        Descending,
+                        config::warp_sort_config::block_size_medium>
+                <<<dim3(medium_segment_grid_size),
+                dim3(config::warp_sort_config::block_size_medium),
+                0,
+                stream>>>(
+                keys_input,
+                keys_tmp,
+                keys_output,
+                values_input,
+                values_tmp,
+                values_output,
+                is_result_in_output,
+                medium_segment_count,
+                medium_segment_indices_output,
+                begin_offsets,
+                end_offsets,
+                begin_bit,
+                end_bit);
+            ROCPRIM_DETAIL_HIP_SYNC_AND_RETURN_ON_ERROR("segmented_sort:medium_segments",
+                                                        medium_segment_count,
+                                                        start)
+        }
+        if(small_segment_count > 0)
+        {
+            const auto small_segment_grid_size = ::rocprim::detail::ceiling_div(small_segment_count,
+                                                                                small_segments_per_block);
+            std::chrono::high_resolution_clock::time_point start;
+            if(debug_synchronous) start = std::chrono::high_resolution_clock::now();
+                    segmented_sort_small_or_medium_kernel<
+                        select_warp_sort_helper_config_small_t<typename config::warp_sort_config>,
+                        Descending,
+                        config::warp_sort_config::block_size_small>
+                <<<dim3(small_segment_grid_size),
+                dim3(config::warp_sort_config::block_size_small),
+                0,
+                stream>>>(
+                keys_input,
+                keys_tmp,
+                keys_output,
+                values_input,
+                values_tmp,
+                values_output,
+                is_result_in_output,
+                small_segment_count,
+                small_segment_indices_output,
+                begin_offsets,
+                end_offsets,
+                begin_bit,
+                end_bit);
+            ROCPRIM_DETAIL_HIP_SYNC_AND_RETURN_ON_ERROR("segmented_sort:small_segments",
+                                                        small_segment_count,
+                                                        start)
+        }
+    }
+    else
+    {
+        std::chrono::high_resolution_clock::time_point start;
+        if(debug_synchronous) start = std::chrono::high_resolution_clock::now();
+        segmented_sort_kernel<config, Descending, config::sort::block_size>
+            <<<dim3(segments), dim3(config::sort::block_size), 0, stream>>>(
+            keys_input, keys_tmp, keys_output, values_input, values_tmp, values_output,
+            to_output,
+            begin_offsets, end_offsets,
+            long_iterations, short_iterations,
+            begin_bit, end_bit
+        );
+        ROCPRIM_DETAIL_HIP_SYNC_AND_RETURN_ON_ERROR("segmented_sort", segments, start)
+    }
+    return cudaSuccess;
+}
+#undef ROCPRIM_DETAIL_HIP_SYNC_AND_RETURN_ON_ERROR
+} // end namespace detail
+/// \brief Parallel ascending radix sort primitive for device level.
+///
+/// \p segmented_radix_sort_keys function performs a device-wide radix sort across multiple,
+/// non-overlapping sequences of keys. Function sorts input keys in ascending order.
+///
+/// \par Overview
+/// * The contents of the inputs are not altered by the sorting function.
+/// * Returns the required size of \p temporary_storage in \p storage_size
+/// if \p temporary_storage is a null pointer.
+/// * \p Key type (a \p value_type of \p KeysInputIterator and \p KeysOutputIterator) must be
+/// an arithmetic type (that is, an integral type or a floating-point type).
+/// * Ranges specified by \p keys_input and \p keys_output must have at least \p size elements.
+/// * Ranges specified by \p begin_offsets and \p end_offsets must have
+/// at least \p segments elements. They may use the same sequence <tt>offsets</tt> of at least
+/// <tt>segments + 1</tt> elements: <tt>offsets</tt> for \p begin_offsets and
+/// <tt>offsets + 1</tt> for \p end_offsets.
+/// * If \p Key is an integer type and the range of keys is known in advance, the performance
+/// can be improved by setting \p begin_bit and \p end_bit, for example if all keys are in range
+/// [100, 10000], <tt>begin_bit = 0</tt> and <tt>end_bit = 14</tt> will cover the whole range.
+///
+/// \tparam Config - [optional] configuration of the primitive. It can be
+/// \p segmented_radix_sort_config or a custom class with the same members.
+/// \tparam KeysInputIterator - random-access iterator type of the input range. Must meet the
+/// requirements of a C++ InputIterator concept. It can be a simple pointer type.
+/// \tparam KeysOutputIterator - random-access iterator type of the output range. Must meet the
+/// requirements of a C++ OutputIterator concept. It can be a simple pointer type.
+/// \tparam OffsetIterator - random-access iterator type of segment offsets. Must meet the
+/// requirements of a C++ InputIterator concept. It can be a simple pointer type.
+///
+/// \param [in] temporary_storage - pointer to a device-accessible temporary storage. When
+/// a null pointer is passed, the required allocation size (in bytes) is written to
+/// \p storage_size and function returns without performing the sort operation.
+/// \param [in,out] storage_size - reference to a size (in bytes) of \p temporary_storage.
+/// \param [in] keys_input - pointer to the first element in the range to sort.
+/// \param [out] keys_output - pointer to the first element in the output range.
+/// \param [in] size - number of elements in the input range.
+/// \param [in] segments - number of segments in the input range.
+/// \param [in] begin_offsets - iterator to the first element in the range of beginning offsets.
+/// \param [in] end_offsets - iterator to the first element in the range of ending offsets.
+/// \param [in] begin_bit - [optional] index of the first (least significant) bit used in
+/// key comparison. Must be in range <tt>[0; 8 * sizeof(Key))</tt>. Default value: \p 0.
+/// Non-default value not supported for floating-point key-types.
+/// \param [in] end_bit - [optional] past-the-end index (most significant) bit used in
+/// key comparison. Must be in range <tt>(begin_bit; 8 * sizeof(Key)]</tt>. Default
+/// value: \p <tt>8 * sizeof(Key)</tt>. Non-default value not supported for floating-point key-types.
+/// \param [in] stream - [optional] HIP stream object. Default is \p 0 (default stream).
+/// \param [in] debug_synchronous - [optional] If true, synchronization after every kernel
+/// launch is forced in order to check for errors. Default value is \p false.
+///
+/// \returns \p cudaSuccess (\p 0) after successful sort; otherwise a HIP runtime error of
+/// type \p cudaError_t.
+///
+/// \par Example
+/// \parblock
+/// In this example a device-level ascending radix sort is performed on an array of
+/// \p float values.
+///
+/// \code{.cpp}
+/// #include <rocprim/rocprim.hpp>
+///
+/// // Prepare input and output (declare pointers, allocate device memory etc.)
+/// size_t input_size;      // e.g., 8
+/// float * input;          // e.g., [0.6, 0.3, 0.65, 0.4, 0.2, 0.08, 1, 0.7]
+/// float * output;         // empty array of 8 elements
+/// unsigned int segments;  // e.g., 3
+/// int * offsets;          // e.g. [0, 2, 3, 8]
+///
+/// size_t temporary_storage_size_bytes;
+/// void * temporary_storage_ptr = nullptr;
+/// // Get required size of the temporary storage
+/// rocprim::segmented_radix_sort_keys(
+///     temporary_storage_ptr, temporary_storage_size_bytes,
+///     input, output, input_size,
+///     segments, offsets, offsets + 1
+/// );
+///
+/// // allocate temporary storage
+/// cudaMalloc(&temporary_storage_ptr, temporary_storage_size_bytes);
+///
+/// // perform sort
+/// rocprim::segmented_radix_sort_keys(
+///     temporary_storage_ptr, temporary_storage_size_bytes,
+///     input, output, input_size,
+///     segments, offsets, offsets + 1
+/// );
+/// // output: [0.3, 0.6, 0.65, 0.08, 0.2, 0.4, 0.7, 1]
+/// \endcode
+/// \endparblock
+template<class Config = default_config,
+         class KeysInputIterator,
+         class KeysOutputIterator,
+         class OffsetIterator,
+         class Key = typename std::iterator_traits<KeysInputIterator>::value_type>
+inline cudaError_t segmented_radix_sort_keys(void*              temporary_storage,
+                                             size_t&            storage_size,
+                                             KeysInputIterator  keys_input,
+                                             KeysOutputIterator keys_output,
+                                             unsigned int       size,
+                                             unsigned int       segments,
+                                             OffsetIterator     begin_offsets,
+                                             OffsetIterator     end_offsets,
+                                             unsigned int       begin_bit         = 0,
+                                             unsigned int       end_bit           = 8 * sizeof(Key),
+                                             cudaStream_t       stream            = 0,
+                                             bool               debug_synchronous = false)
+{
+    // Keys-only sort: no values accompany the keys, so one null empty_type pointer is
+    // passed as both the values input and the values output.
+    empty_type* no_values = nullptr;
+    // Bound to the bool parameter that follows `size`; its value is never read here.
+    bool unused_flag;
+    // `false` as the second template argument is what distinguishes this ascending
+    // overload from segmented_radix_sort_keys_desc, which passes `true`.
+    // NOTE(review): the two nullptr arguments appear to fill the impl's keys_tmp /
+    // values_tmp slots (i.e. no caller-provided double buffer) - confirm against the
+    // impl signature.
+    const cudaError_t status = detail::segmented_radix_sort_impl<Config, false>(
+        temporary_storage,
+        storage_size,
+        keys_input,
+        nullptr,
+        keys_output,
+        no_values,
+        nullptr,
+        no_values,
+        size,
+        unused_flag,
+        segments,
+        begin_offsets,
+        end_offsets,
+        begin_bit,
+        end_bit,
+        stream,
+        debug_synchronous);
+    return status;
+}
+/// \brief Parallel descending radix sort primitive for device level.
+///
+/// \p segmented_radix_sort_keys_desc function performs a device-wide radix sort across multiple,
+/// non-overlapping sequences of keys. Function sorts input keys in descending order.
+///
+/// \par Overview
+/// * The contents of the inputs are not altered by the sorting function.
+/// * Returns the required size of \p temporary_storage in \p storage_size
+/// if \p temporary_storage is a null pointer.
+/// * \p Key type (a \p value_type of \p KeysInputIterator and \p KeysOutputIterator) must be
+/// an arithmetic type (that is, an integral type or a floating-point type).
+/// * Ranges specified by \p keys_input and \p keys_output must have at least \p size elements.
+/// * Ranges specified by \p begin_offsets and \p end_offsets must have
+/// at least \p segments elements. They may use the same sequence <tt>offsets</tt> of at least
+/// <tt>segments + 1</tt> elements: <tt>offsets</tt> for \p begin_offsets and
+/// <tt>offsets + 1</tt> for \p end_offsets.
+/// * If \p Key is an integer type and the range of keys is known in advance, the performance
+/// can be improved by setting \p begin_bit and \p end_bit, for example if all keys are in range
+/// [100, 10000], <tt>begin_bit = 0</tt> and <tt>end_bit = 14</tt> will cover the whole range.
+///
+/// \tparam Config - [optional] configuration of the primitive. It can be
+/// \p segmented_radix_sort_config or a custom class with the same members.
+/// \tparam KeysInputIterator - random-access iterator type of the input range. Must meet the
+/// requirements of a C++ InputIterator concept. It can be a simple pointer type.
+/// \tparam KeysOutputIterator - random-access iterator type of the output range. Must meet the
+/// requirements of a C++ OutputIterator concept. It can be a simple pointer type.
+/// \tparam OffsetIterator - random-access iterator type of segment offsets. Must meet the
+/// requirements of a C++ InputIterator concept. It can be a simple pointer type.
+///
+/// \param [in] temporary_storage - pointer to a device-accessible temporary storage. When
+/// a null pointer is passed, the required allocation size (in bytes) is written to
+/// \p storage_size and function returns without performing the sort operation.
+/// \param [in,out] storage_size - reference to a size (in bytes) of \p temporary_storage.
+/// \param [in] keys_input - pointer to the first element in the range to sort.
+/// \param [out] keys_output - pointer to the first element in the output range.
+/// \param [in] size - number of elements in the input range.
+/// \param [in] segments - number of segments in the input range.
+/// \param [in] begin_offsets - iterator to the first element in the range of beginning offsets.
+/// \param [in] end_offsets - iterator to the first element in the range of ending offsets.
+/// \param [in] begin_bit - [optional] index of the first (least significant) bit used in
+/// key comparison. Must be in range <tt>[0; 8 * sizeof(Key))</tt>. Default value: \p 0.
+/// Non-default value not supported for floating-point key-types.
+/// \param [in] end_bit - [optional] past-the-end index (most significant) bit used in
+/// key comparison. Must be in range <tt>(begin_bit; 8 * sizeof(Key)]</tt>. Default
+/// value: \p <tt>8 * sizeof(Key)</tt>. Non-default value not supported for floating-point key-types.
+/// \param [in] stream - [optional] HIP stream object. Default is \p 0 (default stream).
+/// \param [in] debug_synchronous - [optional] If true, synchronization after every kernel
+/// launch is forced in order to check for errors. Default value is \p false.
+///
+/// \returns \p cudaSuccess (\p 0) after successful sort; otherwise a HIP runtime error of
+/// type \p cudaError_t.
+///
+/// \par Example
+/// \parblock
+/// In this example a device-level descending radix sort is performed on an array of
+/// integer values.
+///
+/// \code{.cpp}
+/// #include <rocprim/rocprim.hpp>
+///
+/// // Prepare input and output (declare pointers, allocate device memory etc.)
+/// size_t input_size;      // e.g., 8
+/// int * input;            // e.g., [6, 3, 5, 4, 2, 8, 1, 7]
+/// int * output;           // empty array of 8 elements
+/// unsigned int segments;  // e.g., 3
+/// int * offsets;          // e.g. [0, 2, 3, 8]
+///
+/// size_t temporary_storage_size_bytes;
+/// void * temporary_storage_ptr = nullptr;
+/// // Get required size of the temporary storage
+/// rocprim::segmented_radix_sort_keys_desc(
+///     temporary_storage_ptr, temporary_storage_size_bytes,
+///     input, output, input_size,
+///     segments, offsets, offsets + 1
+/// );
+///
+/// // allocate temporary storage
+/// cudaMalloc(&temporary_storage_ptr, temporary_storage_size_bytes);
+///
+/// // perform sort
+/// rocprim::segmented_radix_sort_keys_desc(
+///     temporary_storage_ptr, temporary_storage_size_bytes,
+///     input, output, input_size,
+///     segments, offsets, offsets + 1
+/// );
+/// // output: [6, 3, 5, 8, 7, 4, 2, 1]
+/// \endcode
+/// \endparblock
+template<class Config = default_config,
+         class KeysInputIterator,
+         class KeysOutputIterator,
+         class OffsetIterator,
+         class Key = typename std::iterator_traits<KeysInputIterator>::value_type>
+inline cudaError_t segmented_radix_sort_keys_desc(void*              temporary_storage,
+                                                  size_t&            storage_size,
+                                                  KeysInputIterator  keys_input,
+                                                  KeysOutputIterator keys_output,
+                                                  unsigned int       size,
+                                                  unsigned int       segments,
+                                                  OffsetIterator     begin_offsets,
+                                                  OffsetIterator     end_offsets,
+                                                  unsigned int       begin_bit         = 0,
+                                                  unsigned int       end_bit           = 8 * sizeof(Key),
+                                                  cudaStream_t       stream            = 0,
+                                                  bool               debug_synchronous = false)
+{
+    // Keys-only sort: no values accompany the keys, so one null empty_type pointer is
+    // passed as both the values input and the values output.
+    empty_type* no_values = nullptr;
+    // Bound to the bool parameter that follows `size`; its value is never read here.
+    bool unused_flag;
+    // `true` as the second template argument is what distinguishes this descending
+    // overload from segmented_radix_sort_keys, which passes `false`.
+    // NOTE(review): the two nullptr arguments appear to fill the impl's keys_tmp /
+    // values_tmp slots (i.e. no caller-provided double buffer) - confirm against the
+    // impl signature.
+    const cudaError_t status = detail::segmented_radix_sort_impl<Config, true>(
+        temporary_storage,
+        storage_size,
+        keys_input,
+        nullptr,
+        keys_output,
+        no_values,
+        nullptr,
+        no_values,
+        size,
+        unused_flag,
+        segments,
+        begin_offsets,
+        end_offsets,
+        begin_bit,
+        end_bit,
+        stream,
+        debug_synchronous);
+    return status;
+}
+/// \brief Parallel ascending radix sort-by-key primitive for device level.
+///
+/// \p segmented_radix_sort_pairs function performs a device-wide radix sort across multiple,
+/// non-overlapping sequences of (key, value) pairs. Function sorts input pairs in ascending order of keys.
+///
+/// \par Overview
+/// * The contents of the inputs are not altered by the sorting function.
+/// * Returns the required size of \p temporary_storage in \p storage_size
+/// if \p temporary_storage is a null pointer.
+/// * \p Key type (a \p value_type of \p KeysInputIterator and \p KeysOutputIterator) must be
+/// an arithmetic type (that is, an integral type or a floating-point type).
+/// * Ranges specified by \p keys_input, \p keys_output, \p values_input and \p values_output must
+/// have at least \p size elements.
+/// * Ranges specified by \p begin_offsets and \p end_offsets must have
+/// at least \p segments elements. They may use the same sequence <tt>offsets</tt> of at least
+/// <tt>segments + 1</tt> elements: <tt>offsets</tt> for \p begin_offsets and
+/// <tt>offsets + 1</tt> for \p end_offsets.
+/// * If \p Key is an integer type and the range of keys is known in advance, the performance
+/// can be improved by setting \p begin_bit and \p end_bit, for example if all keys are in range
+/// [100, 10000], <tt>begin_bit = 0</tt> and <tt>end_bit = 14</tt> will cover the whole range.
+///
+/// \tparam Config - [optional] configuration of the primitive. It can be
+/// \p segmented_radix_sort_config or a custom class with the same members.
+/// \tparam KeysInputIterator - random-access iterator type of the input range. Must meet the
+/// requirements of a C++ InputIterator concept. It can be a simple pointer type.
+/// \tparam KeysOutputIterator - random-access iterator type of the output range. Must meet the
+/// requirements of a C++ OutputIterator concept. It can be a simple pointer type.
+/// \tparam ValuesInputIterator - random-access iterator type of the input range. Must meet the
+/// requirements of a C++ InputIterator concept. It can be a simple pointer type.
+/// \tparam ValuesOutputIterator - random-access iterator type of the output range. Must meet the
+/// requirements of a C++ OutputIterator concept. It can be a simple pointer type.
+/// \tparam OffsetIterator - random-access iterator type of segment offsets. Must meet the
+/// requirements of a C++ InputIterator concept. It can be a simple pointer type.
+///
+/// \param [in] temporary_storage - pointer to a device-accessible temporary storage. When
+/// a null pointer is passed, the required allocation size (in bytes) is written to
+/// \p storage_size and function returns without performing the sort operation.
+/// \param [in,out] storage_size - reference to a size (in bytes) of \p temporary_storage.
+/// \param [in] keys_input - pointer to the first element in the range to sort.
+/// \param [out] keys_output - pointer to the first element in the output range.
+/// \param [in] values_input - pointer to the first element in the range to sort.
+/// \param [out] values_output - pointer to the first element in the output range.
+/// \param [in] size - number of elements in the input range.
+/// \param [in] segments - number of segments in the input range.
+/// \param [in] begin_offsets - iterator to the first element in the range of beginning offsets.
+/// \param [in] end_offsets - iterator to the first element in the range of ending offsets.
+/// \param [in] begin_bit - [optional] index of the first (least significant) bit used in
+/// key comparison. Must be in range <tt>[0; 8 * sizeof(Key))</tt>. Default value: \p 0.
+/// Non-default value not supported for floating-point key-types.
+/// \param [in] end_bit - [optional] past-the-end index (most significant) bit used in
+/// key comparison. Must be in range <tt>(begin_bit; 8 * sizeof(Key)]</tt>. Default
+/// value: \p <tt>8 * sizeof(Key)</tt>. Non-default value not supported for floating-point key-types.
+/// \param [in] stream - [optional] HIP stream object. Default is \p 0 (default stream).
+/// \param [in] debug_synchronous - [optional] If true, synchronization after every kernel
+/// launch is forced in order to check for errors. Default value is \p false.
+///
+/// \returns \p cudaSuccess (\p 0) after successful sort; otherwise a HIP runtime error of
+/// type \p cudaError_t.
+///
+/// \par Example
+/// \parblock
+/// In this example a device-level ascending radix sort is performed where input keys are
+/// represented by an array of unsigned integers and input values by an array of <tt>double</tt>s.
+///
+/// \code{.cpp}
+/// #include <rocprim/rocprim.hpp>
+///
+/// // Prepare input and output (declare pointers, allocate device memory etc.)
+/// size_t input_size;          // e.g., 8
+/// unsigned int * keys_input;  // e.g., [ 6, 3,  5, 4,  1,  8,  1, 7]
+/// double * values_input;      // e.g., [-5, 2, -4, 3, -1, -8, -2, 7]
+/// unsigned int * keys_output; // empty array of 8 elements
+/// double * values_output;     // empty array of 8 elements
+/// unsigned int segments;      // e.g., 3
+/// int * offsets;              // e.g. [0, 2, 3, 8]
+///
+/// // Keys are in range [0; 8], so we can limit the compared bits to those at indexes
+/// // 0, 1, 2, 3, and 4. In order to do this, begin_bit is set to 0 and end_bit
+/// // is set to 5.
+///
+/// size_t temporary_storage_size_bytes;
+/// void * temporary_storage_ptr = nullptr;
+/// // Get required size of the temporary storage
+/// rocprim::segmented_radix_sort_pairs(
+///     temporary_storage_ptr, temporary_storage_size_bytes,
+///     keys_input, keys_output, values_input, values_output, input_size,
+///     segments, offsets, offsets + 1,
+///     0, 5
+/// );
+///
+/// // allocate temporary storage
+/// cudaMalloc(&temporary_storage_ptr, temporary_storage_size_bytes);
+///
+/// // perform sort
+/// rocprim::segmented_radix_sort_pairs(
+///     temporary_storage_ptr, temporary_storage_size_bytes,
+///     keys_input, keys_output, values_input, values_output, input_size,
+///     segments, offsets, offsets + 1,
+///     0, 5
+/// );
+/// // keys_output:   [3,  6,  5,  1,  1, 4, 7,  8]
+/// // values_output: [2, -5, -4, -1, -2, 3, 7, -8]
+/// \endcode
+/// \endparblock
+template<class Config = default_config,
+         class KeysInputIterator,
+         class KeysOutputIterator,
+         class ValuesInputIterator,
+         class ValuesOutputIterator,
+         class OffsetIterator,
+         class Key = typename std::iterator_traits<KeysInputIterator>::value_type>
+inline cudaError_t segmented_radix_sort_pairs(
+    void * temporary_storage,
+    size_t& storage_size,
+    KeysInputIterator keys_input,
+    KeysOutputIterator keys_output,
+    ValuesInputIterator values_input,
+    ValuesOutputIterator values_output,
+    unsigned int size,
+    unsigned int segments,
+    OffsetIterator begin_offsets,
+    OffsetIterator end_offsets,
+    unsigned int begin_bit = 0,
+    unsigned int end_bit = 8 * sizeof(Key),
+    cudaStream_t stream = 0,
+    bool debug_synchronous = false)
+{
+    // Ascending overload with separate input and output ranges: the shared
+    // implementation is instantiated with `false` as its second template
+    // argument. No double_buffer is involved, so nullptr is passed in the
+    // slots between each input and output iterator.
+    // The implementation reports a flag through a reference; this overload
+    // has no use for it, so the value is discarded.
+    bool result_location_unused;
+    const cudaError_t status = detail::segmented_radix_sort_impl<Config, false>(
+        temporary_storage,
+        storage_size,
+        keys_input,
+        nullptr,
+        keys_output,
+        values_input,
+        nullptr,
+        values_output,
+        size,
+        result_location_unused,
+        segments,
+        begin_offsets,
+        end_offsets,
+        begin_bit,
+        end_bit,
+        stream,
+        debug_synchronous);
+    return status;
+}
+/// \brief Parallel descending radix sort-by-key primitive for device level.
+///
+/// \p segmented_radix_sort_pairs_desc function performs a device-wide radix sort across multiple,
+/// non-overlapping sequences of (key, value) pairs. Function sorts input pairs in descending order of keys.
+///
+/// \par Overview
+/// * The contents of the inputs are not altered by the sorting function.
+/// * Returns the required size of \p temporary_storage in \p storage_size
+/// if \p temporary_storage is a null pointer.
+/// * \p Key type (a \p value_type of \p KeysInputIterator and \p KeysOutputIterator) must be
+/// an arithmetic type (that is, an integral type or a floating-point type).
+/// * Ranges specified by \p keys_input, \p keys_output, \p values_input and \p values_output must
+/// have at least \p size elements.
+/// * Ranges specified by \p begin_offsets and \p end_offsets must have
+/// at least \p segments elements. They may use the same sequence <tt>offsets</tt> of at least
+/// <tt>segments + 1</tt> elements: <tt>offsets</tt> for \p begin_offsets and
+/// <tt>offsets + 1</tt> for \p end_offsets.
+/// * If \p Key is an integer type and the range of keys is known in advance, the performance
+/// can be improved by setting \p begin_bit and \p end_bit, for example if all keys are in range
+/// [100, 10000], <tt>begin_bit = 0</tt> and <tt>end_bit = 14</tt> will cover the whole range.
+///
+/// \tparam Config - [optional] configuration of the primitive. It can be
+/// \p segmented_radix_sort_config or a custom class with the same members.
+/// \tparam KeysInputIterator - random-access iterator type of the input range. Must meet the
+/// requirements of a C++ InputIterator concept. It can be a simple pointer type.
+/// \tparam KeysOutputIterator - random-access iterator type of the output range. Must meet the
+/// requirements of a C++ OutputIterator concept. It can be a simple pointer type.
+/// \tparam ValuesInputIterator - random-access iterator type of the input range. Must meet the
+/// requirements of a C++ InputIterator concept. It can be a simple pointer type.
+/// \tparam ValuesOutputIterator - random-access iterator type of the output range. Must meet the
+/// requirements of a C++ OutputIterator concept. It can be a simple pointer type.
+/// \tparam OffsetIterator - random-access iterator type of segment offsets. Must meet the
+/// requirements of a C++ InputIterator concept. It can be a simple pointer type.
+///
+/// \param [in] temporary_storage - pointer to a device-accessible temporary storage. When
+/// a null pointer is passed, the required allocation size (in bytes) is written to
+/// \p storage_size and function returns without performing the sort operation.
+/// \param [in,out] storage_size - reference to a size (in bytes) of \p temporary_storage.
+/// \param [in] keys_input - pointer to the first element in the range to sort.
+/// \param [out] keys_output - pointer to the first element in the output range.
+/// \param [in] values_input - pointer to the first element in the range to sort.
+/// \param [out] values_output - pointer to the first element in the output range.
+/// \param [in] size - number of elements in the input range.
+/// \param [in] segments - number of segments in the input range.
+/// \param [in] begin_offsets - iterator to the first element in the range of beginning offsets.
+/// \param [in] end_offsets - iterator to the first element in the range of ending offsets.
+/// \param [in] begin_bit - [optional] index of the first (least significant) bit used in
+/// key comparison. Must be in range <tt>[0; 8 * sizeof(Key))</tt>. Default value: \p 0.
+/// Non-default value not supported for floating-point key-types.
+/// \param [in] end_bit - [optional] past-the-end index (most significant) bit used in
+/// key comparison. Must be in range <tt>(begin_bit; 8 * sizeof(Key)]</tt>. Default
+/// value: \p <tt>8 * sizeof(Key)</tt>. Non-default value not supported for floating-point key-types.
+/// \param [in] stream - [optional] HIP stream object. Default is \p 0 (default stream).
+/// \param [in] debug_synchronous - [optional] If true, synchronization after every kernel
+/// launch is forced in order to check for errors. Default value is \p false.
+///
+/// \returns \p cudaSuccess (\p 0) after successful sort; otherwise a HIP runtime error of
+/// type \p cudaError_t.
+///
+/// \par Example
+/// \parblock
+/// In this example a device-level descending radix sort is performed where input keys are
+/// represented by an array of integers and input values by an array of <tt>double</tt>s.
+///
+/// \code{.cpp}
+/// #include <rocprim/rocprim.hpp>
+///
+/// // Prepare input and output (declare pointers, allocate device memory etc.)
+/// size_t input_size;       // e.g., 8
+/// int * keys_input;        // e.g., [ 6, 3,  5, 4,  1,  8,  1, 7]
+/// double * values_input;   // e.g., [-5, 2, -4, 3, -1, -8, -2, 7]
+/// int * keys_output;       // empty array of 8 elements
+/// double * values_output;  // empty array of 8 elements
+/// unsigned int segments;   // e.g., 3
+/// int * offsets;           // e.g. [0, 2, 3, 8]
+///
+/// size_t temporary_storage_size_bytes;
+/// void * temporary_storage_ptr = nullptr;
+/// // Get required size of the temporary storage
+/// rocprim::segmented_radix_sort_pairs_desc(
+///     temporary_storage_ptr, temporary_storage_size_bytes,
+///     keys_input, keys_output, values_input, values_output,
+///     input_size,
+///     segments, offsets, offsets + 1
+/// );
+///
+/// // allocate temporary storage
+/// cudaMalloc(&temporary_storage_ptr, temporary_storage_size_bytes);
+///
+/// // perform sort
+/// rocprim::segmented_radix_sort_pairs_desc(
+///     temporary_storage_ptr, temporary_storage_size_bytes,
+///     keys_input, keys_output, values_input, values_output,
+///     input_size,
+///     segments, offsets, offsets + 1
+/// );
+/// // keys_output:   [ 6, 3,  5,  8, 7, 4,  1,  1]
+/// // values_output: [-5, 2, -4, -8, 7, 3, -1, -2]
+/// \endcode
+/// \endparblock
+template<class Config = default_config,
+         class KeysInputIterator,
+         class KeysOutputIterator,
+         class ValuesInputIterator,
+         class ValuesOutputIterator,
+         class OffsetIterator,
+         class Key = typename std::iterator_traits<KeysInputIterator>::value_type>
+inline cudaError_t segmented_radix_sort_pairs_desc(
+    void * temporary_storage,
+    size_t& storage_size,
+    KeysInputIterator keys_input,
+    KeysOutputIterator keys_output,
+    ValuesInputIterator values_input,
+    ValuesOutputIterator values_output,
+    unsigned int size,
+    unsigned int segments,
+    OffsetIterator begin_offsets,
+    OffsetIterator end_offsets,
+    unsigned int begin_bit = 0,
+    unsigned int end_bit = 8 * sizeof(Key),
+    cudaStream_t stream = 0,
+    bool debug_synchronous = false)
+{
+    // Descending overload with separate input and output ranges: the shared
+    // implementation is instantiated with `true` as its second template
+    // argument. No double_buffer is involved, so nullptr is passed in the
+    // slots between each input and output iterator.
+    // The implementation reports a flag through a reference; this overload
+    // has no use for it, so the value is discarded.
+    bool result_location_unused;
+    const cudaError_t status = detail::segmented_radix_sort_impl<Config, true>(
+        temporary_storage,
+        storage_size,
+        keys_input,
+        nullptr,
+        keys_output,
+        values_input,
+        nullptr,
+        values_output,
+        size,
+        result_location_unused,
+        segments,
+        begin_offsets,
+        end_offsets,
+        begin_bit,
+        end_bit,
+        stream,
+        debug_synchronous);
+    return status;
+}
+/// \brief Parallel ascending radix sort primitive for device level.
+///
+/// \p segmented_radix_sort_keys function performs a device-wide radix sort across multiple,
+/// non-overlapping sequences of keys. Function sorts input keys in ascending order.
+///
+/// \par Overview
+/// * The contents of both buffers of \p keys may be altered by the sorting function.
+/// * \p current() of \p keys is used as the input.
+/// * The function will update \p current() of \p keys to point to the buffer
+/// that contains the output range.
+/// * Returns the required size of \p temporary_storage in \p storage_size
+/// if \p temporary_storage is a null pointer.
+/// * The function requires small \p temporary_storage as it does not need
+/// a temporary buffer of \p size elements.
+/// * \p Key type must be an arithmetic type (that is, an integral type or a floating-point
+/// type).
+/// * Buffers of \p keys must have at least \p size elements.
+/// * Ranges specified by \p begin_offsets and \p end_offsets must have
+/// at least \p segments elements. They may use the same sequence <tt>offsets</tt> of at least
+/// <tt>segments + 1</tt> elements: <tt>offsets</tt> for \p begin_offsets and
+/// <tt>offsets + 1</tt> for \p end_offsets.
+/// * If \p Key is an integer type and the range of keys is known in advance, the performance
+/// can be improved by setting \p begin_bit and \p end_bit, for example if all keys are in range
+/// [100, 10000], <tt>begin_bit = 0</tt> and <tt>end_bit = 14</tt> will cover the whole range.
+///
+/// \tparam Config - [optional] configuration of the primitive. It can be
+/// \p segmented_radix_sort_config or a custom class with the same members.
+/// \tparam Key - key type. Must be an integral type or a floating-point type.
+/// \tparam OffsetIterator - random-access iterator type of segment offsets. Must meet the
+/// requirements of a C++ InputIterator concept. It can be a simple pointer type.
+///
+/// \param [in] temporary_storage - pointer to a device-accessible temporary storage. When
+/// a null pointer is passed, the required allocation size (in bytes) is written to
+/// \p storage_size and function returns without performing the sort operation.
+/// \param [in,out] storage_size - reference to a size (in bytes) of \p temporary_storage.
+/// \param [in,out] keys - reference to the double-buffer of keys, its \p current()
+/// contains the input range and will be updated to point to the output range.
+/// \param [in] size - number of elements in the input range.
+/// \param [in] segments - number of segments in the input range.
+/// \param [in] begin_offsets - iterator to the first element in the range of beginning offsets.
+/// \param [in] end_offsets - iterator to the first element in the range of ending offsets.
+/// \param [in] begin_bit - [optional] index of the first (least significant) bit used in
+/// key comparison. Must be in range <tt>[0; 8 * sizeof(Key))</tt>. Default value: \p 0.
+/// Non-default value not supported for floating-point key-types.
+/// \param [in] end_bit - [optional] past-the-end index (most significant) bit used in
+/// key comparison. Must be in range <tt>(begin_bit; 8 * sizeof(Key)]</tt>. Default
+/// value: \p <tt>8 * sizeof(Key)</tt>. Non-default value not supported for floating-point key-types.
+/// \param [in] stream - [optional] HIP stream object. Default is \p 0 (default stream).
+/// \param [in] debug_synchronous - [optional] If true, synchronization after every kernel
+/// launch is forced in order to check for errors. Default value is \p false.
+///
+/// \returns \p cudaSuccess (\p 0) after successful sort; otherwise a HIP runtime error of
+/// type \p cudaError_t.
+///
+/// \par Example
+/// \parblock
+/// In this example a device-level ascending radix sort is performed on an array of
+/// \p float values.
+///
+/// \code{.cpp}
+/// #include <rocprim/rocprim.hpp>
+///
+/// // Prepare input and tmp (declare pointers, allocate device memory etc.)
+/// size_t input_size;       // e.g., 8
+/// float * input;           // e.g., [0.6, 0.3, 0.65, 0.4, 0.2, 0.08, 1, 0.7]
+/// float * tmp;             // empty array of 8 elements
+/// unsigned int segments;   // e.g., 3
+/// int * offsets;           // e.g. [0, 2, 3, 8]
+/// // Create double-buffer
+/// rocprim::double_buffer<float> keys(input, tmp);
+///
+/// size_t temporary_storage_size_bytes;
+/// void * temporary_storage_ptr = nullptr;
+/// // Get required size of the temporary storage
+/// rocprim::segmented_radix_sort_keys(
+///     temporary_storage_ptr, temporary_storage_size_bytes,
+///     keys, input_size,
+///     segments, offsets, offsets + 1
+/// );
+///
+/// // allocate temporary storage
+/// cudaMalloc(&temporary_storage_ptr, temporary_storage_size_bytes);
+///
+/// // perform sort
+/// rocprim::segmented_radix_sort_keys(
+///     temporary_storage_ptr, temporary_storage_size_bytes,
+///     keys, input_size,
+///     segments, offsets, offsets + 1
+/// );
+/// // keys.current(): [0.3, 0.6, 0.65, 0.08, 0.2, 0.4, 0.7, 1]
+/// \endcode
+/// \endparblock
+template<
+    class Config = default_config,
+    class Key,
+    class OffsetIterator
+>
+inline
+cudaError_t segmented_radix_sort_keys(void * temporary_storage,
+                                     size_t& storage_size,
+                                     double_buffer<Key>& keys,
+                                     unsigned int size,
+                                     unsigned int segments,
+                                     OffsetIterator begin_offsets,
+                                     OffsetIterator end_offsets,
+                                     unsigned int begin_bit = 0,
+                                     unsigned int end_bit = 8 * sizeof(Key),
+                                     cudaStream_t stream = 0,
+                                     bool debug_synchronous = false)
+{
+    // Keys-only sort: empty_type stands in for the value type and every value
+    // pointer handed to the implementation is null.
+    empty_type * values = nullptr;
+    // Out-flag written by the implementation through a reference. It starts as
+    // false so the check below never reads an indeterminate value if the
+    // implementation returns before assigning it; in that case no swap happens.
+    bool is_result_in_output = false;
+    cudaError_t error = detail::segmented_radix_sort_impl<Config, false>(
+        temporary_storage, storage_size,
+        keys.current(), keys.current(), keys.alternate(),
+        values, values, values,
+        size, is_result_in_output,
+        segments, begin_offsets, end_offsets,
+        begin_bit, end_bit,
+        stream, debug_synchronous
+    );
+    // A null temporary_storage is only a size query, so the buffers are left
+    // untouched. Otherwise swap when the flag is set so that keys.current()
+    // refers to the buffer holding the sorted output.
+    if(temporary_storage != nullptr && is_result_in_output)
+    {
+        keys.swap();
+    }
+    return error;
+}
+/// \brief Parallel descending radix sort primitive for device level.
+///
+/// \p segmented_radix_sort_keys_desc function performs a device-wide radix sort across multiple,
+/// non-overlapping sequences of keys. Function sorts input keys in descending order.
+///
+/// \par Overview
+/// * The contents of both buffers of \p keys may be altered by the sorting function.
+/// * \p current() of \p keys is used as the input.
+/// * The function will update \p current() of \p keys to point to the buffer
+/// that contains the output range.
+/// * Returns the required size of \p temporary_storage in \p storage_size
+/// if \p temporary_storage is a null pointer.
+/// * The function requires small \p temporary_storage as it does not need
+/// a temporary buffer of \p size elements.
+/// * \p Key type must be an arithmetic type (that is, an integral type or a floating-point
+/// type).
+/// * Buffers of \p keys must have at least \p size elements.
+/// * Ranges specified by \p begin_offsets and \p end_offsets must have
+/// at least \p segments elements. They may use the same sequence <tt>offsets</tt> of at least
+/// <tt>segments + 1</tt> elements: <tt>offsets</tt> for \p begin_offsets and
+/// <tt>offsets + 1</tt> for \p end_offsets.
+/// * If \p Key is an integer type and the range of keys is known in advance, the performance
+/// can be improved by setting \p begin_bit and \p end_bit, for example if all keys are in range
+/// [100, 10000], <tt>begin_bit = 0</tt> and <tt>end_bit = 14</tt> will cover the whole range.
+///
+/// \tparam Config - [optional] configuration of the primitive. It can be
+/// \p segmented_radix_sort_config or a custom class with the same members.
+/// \tparam Key - key type. Must be an integral type or a floating-point type.
+/// \tparam OffsetIterator - random-access iterator type of segment offsets. Must meet the
+/// requirements of a C++ InputIterator concept. It can be a simple pointer type.
+///
+/// \param [in] temporary_storage - pointer to a device-accessible temporary storage. When
+/// a null pointer is passed, the required allocation size (in bytes) is written to
+/// \p storage_size and function returns without performing the sort operation.
+/// \param [in,out] storage_size - reference to a size (in bytes) of \p temporary_storage.
+/// \param [in,out] keys - reference to the double-buffer of keys, its \p current()
+/// contains the input range and will be updated to point to the output range.
+/// \param [in] size - number of elements in the input range.
+/// \param [in] segments - number of segments in the input range.
+/// \param [in] begin_offsets - iterator to the first element in the range of beginning offsets.
+/// \param [in] end_offsets - iterator to the first element in the range of ending offsets.
+/// \param [in] begin_bit - [optional] index of the first (least significant) bit used in
+/// key comparison. Must be in range <tt>[0; 8 * sizeof(Key))</tt>. Default value: \p 0.
+/// Non-default value not supported for floating-point key-types.
+/// \param [in] end_bit - [optional] past-the-end index (most significant) bit used in
+/// key comparison. Must be in range <tt>(begin_bit; 8 * sizeof(Key)]</tt>. Default
+/// value: \p <tt>8 * sizeof(Key)</tt>. Non-default value not supported for floating-point key-types.
+/// \param [in] stream - [optional] HIP stream object. Default is \p 0 (default stream).
+/// \param [in] debug_synchronous - [optional] If true, synchronization after every kernel
+/// launch is forced in order to check for errors. Default value is \p false.
+///
+/// \returns \p cudaSuccess (\p 0) after successful sort; otherwise a HIP runtime error of
+/// type \p cudaError_t.
+///
+/// \par Example
+/// \parblock
+/// In this example a device-level descending radix sort is performed on an array of
+/// integer values.
+///
+/// \code{.cpp}
+/// #include <rocprim/rocprim.hpp>
+///
+/// // Prepare input and tmp (declare pointers, allocate device memory etc.)
+/// size_t input_size;       // e.g., 8
+/// int * input;             // e.g., [6, 3, 5, 4, 2, 8, 1, 7]
+/// int * tmp;               // empty array of 8 elements
+/// unsigned int segments;   // e.g., 3
+/// int * offsets;           // e.g. [0, 2, 3, 8]
+/// // Create double-buffer
+/// rocprim::double_buffer<int> keys(input, tmp);
+///
+/// size_t temporary_storage_size_bytes;
+/// void * temporary_storage_ptr = nullptr;
+/// // Get required size of the temporary storage
+/// rocprim::segmented_radix_sort_keys_desc(
+///     temporary_storage_ptr, temporary_storage_size_bytes,
+///     keys, input_size,
+///     segments, offsets, offsets + 1
+/// );
+///
+/// // allocate temporary storage
+/// cudaMalloc(&temporary_storage_ptr, temporary_storage_size_bytes);
+///
+/// // perform sort
+/// rocprim::segmented_radix_sort_keys_desc(
+///     temporary_storage_ptr, temporary_storage_size_bytes,
+///     keys, input_size,
+///     segments, offsets, offsets + 1
+/// );
+/// // keys.current(): [6, 3, 5, 8, 7, 4, 2, 1]
+/// \endcode
+/// \endparblock
+template<
+    class Config = default_config,
+    class Key,
+    class OffsetIterator
+>
+inline
+cudaError_t segmented_radix_sort_keys_desc(void * temporary_storage,
+                                          size_t& storage_size,
+                                          double_buffer<Key>& keys,
+                                          unsigned int size,
+                                          unsigned int segments,
+                                          OffsetIterator begin_offsets,
+                                          OffsetIterator end_offsets,
+                                          unsigned int begin_bit = 0,
+                                          unsigned int end_bit = 8 * sizeof(Key),
+                                          cudaStream_t stream = 0,
+                                          bool debug_synchronous = false)
+{
+    // Keys-only sort: empty_type stands in for the value type and every value
+    // pointer handed to the implementation is null.
+    empty_type * values = nullptr;
+    // Out-flag written by the implementation through a reference. It starts as
+    // false so the check below never reads an indeterminate value if the
+    // implementation returns before assigning it; in that case no swap happens.
+    bool is_result_in_output = false;
+    cudaError_t error = detail::segmented_radix_sort_impl<Config, true>(
+        temporary_storage, storage_size,
+        keys.current(), keys.current(), keys.alternate(),
+        values, values, values,
+        size, is_result_in_output,
+        segments, begin_offsets, end_offsets,
+        begin_bit, end_bit,
+        stream, debug_synchronous
+    );
+    // A null temporary_storage is only a size query, so the buffers are left
+    // untouched. Otherwise swap when the flag is set so that keys.current()
+    // refers to the buffer holding the sorted output.
+    if(temporary_storage != nullptr && is_result_in_output)
+    {
+        keys.swap();
+    }
+    return error;
+}
+/// \brief Parallel ascending radix sort-by-key primitive for device level.
+///
+/// \p segmented_radix_sort_pairs function performs a device-wide radix sort across multiple,
+/// non-overlapping sequences of (key, value) pairs. Function sorts input pairs in ascending order of keys.
+///
+/// \par Overview
+/// * The contents of both buffers of \p keys and \p values may be altered by the sorting function.
+/// * \p current() of \p keys and \p values are used as the input.
+/// * The function will update \p current() of \p keys and \p values to point to buffers
+/// that contain the output range.
+/// * Returns the required size of \p temporary_storage in \p storage_size
+/// if \p temporary_storage is a null pointer.
+/// * The function requires small \p temporary_storage as it does not need
+/// a temporary buffer of \p size elements.
+/// * \p Key type must be an arithmetic type (that is, an integral type or a floating-point
+/// type).
+/// * Buffers of \p keys and \p values must have at least \p size elements.
+/// * Ranges specified by \p begin_offsets and \p end_offsets must have
+/// at least \p segments elements. They may use the same sequence <tt>offsets</tt> of at least
+/// <tt>segments + 1</tt> elements: <tt>offsets</tt> for \p begin_offsets and
+/// <tt>offsets + 1</tt> for \p end_offsets.
+/// * If \p Key is an integer type and the range of keys is known in advance, the performance
+/// can be improved by setting \p begin_bit and \p end_bit, for example if all keys are in range
+/// [100, 10000], <tt>begin_bit = 0</tt> and <tt>end_bit = 14</tt> will cover the whole range.
+///
+/// \tparam Config - [optional] configuration of the primitive. It can be
+/// \p segmented_radix_sort_config or a custom class with the same members.
+/// \tparam Key - key type. Must be an integral type or a floating-point type.
+/// \tparam Value - value type.
+/// \tparam OffsetIterator - random-access iterator type of segment offsets. Must meet the
+/// requirements of a C++ InputIterator concept. It can be a simple pointer type.
+///
+/// \param [in] temporary_storage - pointer to a device-accessible temporary storage. When
+/// a null pointer is passed, the required allocation size (in bytes) is written to
+/// \p storage_size and function returns without performing the sort operation.
+/// \param [in,out] storage_size - reference to a size (in bytes) of \p temporary_storage.
+/// \param [in,out] keys - reference to the double-buffer of keys, its \p current()
+/// contains the input range and will be updated to point to the output range.
+/// \param [in,out] values - reference to the double-buffer of values, its \p current()
+/// contains the input range and will be updated to point to the output range.
+/// \param [in] size - number of elements in the input range.
+/// \param [in] segments - number of segments in the input range.
+/// \param [in] begin_offsets - iterator to the first element in the range of beginning offsets.
+/// \param [in] end_offsets - iterator to the first element in the range of ending offsets.
+/// \param [in] begin_bit - [optional] index of the first (least significant) bit used in
+/// key comparison. Must be in range <tt>[0; 8 * sizeof(Key))</tt>. Default value: \p 0.
+/// Non-default value not supported for floating-point key-types.
+/// \param [in] end_bit - [optional] past-the-end index (most significant) bit used in
+/// key comparison. Must be in range <tt>(begin_bit; 8 * sizeof(Key)]</tt>. Default
+/// value: \p <tt>8 * sizeof(Key)</tt>. Non-default value not supported for floating-point key-types.
+/// \param [in] stream - [optional] HIP stream object. Default is \p 0 (default stream).
+/// \param [in] debug_synchronous - [optional] If true, synchronization after every kernel
+/// launch is forced in order to check for errors. Default value is \p false.
+///
+/// \returns \p cudaSuccess (\p 0) after successful sort; otherwise a HIP runtime error of
+/// type \p cudaError_t.
+///
+/// \par Example
+/// \parblock
+/// In this example a device-level ascending radix sort is performed where input keys are
+/// represented by an array of unsigned integers and input values by an array of <tt>double</tt>s.
+///
+/// \code{.cpp}
+/// #include <rocprim/rocprim.hpp>
+///
+/// // Prepare input and tmp (declare pointers, allocate device memory etc.)
+/// size_t input_size;          // e.g., 8
+/// unsigned int * keys_input;  // e.g., [ 6, 3,  5, 4,  1,  8,  1, 7]
+/// double * values_input;      // e.g., [-5, 2, -4, 3, -1, -8, -2, 7]
+/// unsigned int * keys_tmp;    // empty array of 8 elements
+/// double*  values_tmp;        // empty array of 8 elements
+/// unsigned int segments;      // e.g., 3
+/// int * offsets;              // e.g. [0, 2, 3, 8]
+/// // Create double-buffers
+/// rocprim::double_buffer<unsigned int> keys(keys_input, keys_tmp);
+/// rocprim::double_buffer<double> values(values_input, values_tmp);
+///
+/// // Keys are in range [0; 8], so we can limit compared bit to bits on indexes
+/// // 0, 1, 2, 3, and 4. In order to do this begin_bit is set to 0 and end_bit
+/// // is set to 5.
+///
+/// size_t temporary_storage_size_bytes;
+/// void * temporary_storage_ptr = nullptr;
+/// // Get required size of the temporary storage
+/// rocprim::segmented_radix_sort_pairs(
+///     temporary_storage_ptr, temporary_storage_size_bytes,
+///     keys, values, input_size,
+///     segments, offsets, offsets + 1,
+///     0, 5
+/// );
+///
+/// // allocate temporary storage
+/// cudaMalloc(&temporary_storage_ptr, temporary_storage_size_bytes);
+///
+/// // perform sort
+/// rocprim::segmented_radix_sort_pairs(
+///     temporary_storage_ptr, temporary_storage_size_bytes,
+///     keys, values, input_size,
+///     segments, offsets, offsets + 1,
+///     0, 5
+/// );
+/// // keys.current():   [3,  6,  5,  1,  1, 4, 7,  8]
+/// // values.current(): [2, -5, -4, -1, -2, 3, 7, -8]
+/// \endcode
+/// \endparblock
+template<
+    class Config = default_config,
+    class Key,
+    class Value,
+    class OffsetIterator
+>
+inline
+cudaError_t segmented_radix_sort_pairs(void * temporary_storage,
+                                      size_t& storage_size,
+                                      double_buffer<Key>& keys,
+                                      double_buffer<Value>& values,
+                                      unsigned int size,
+                                      unsigned int segments,
+                                      OffsetIterator begin_offsets,
+                                      OffsetIterator end_offsets,
+                                      unsigned int begin_bit = 0,
+                                      unsigned int end_bit = 8 * sizeof(Key),
+                                      cudaStream_t stream = 0,
+                                      bool debug_synchronous = false)
+{
+    // Out-flag written by the implementation through a reference. It starts as
+    // false so the check below never reads an indeterminate value if the
+    // implementation returns before assigning it; in that case no swap happens.
+    bool is_result_in_output = false;
+    // For both keys and values, current() is passed twice and alternate() is
+    // passed as the remaining buffer of the pair.
+    cudaError_t error = detail::segmented_radix_sort_impl<Config, false>(
+        temporary_storage, storage_size,
+        keys.current(), keys.current(), keys.alternate(),
+        values.current(), values.current(), values.alternate(),
+        size, is_result_in_output,
+        segments, begin_offsets, end_offsets,
+        begin_bit, end_bit,
+        stream, debug_synchronous
+    );
+    // A null temporary_storage is only a size query, so the buffers are left
+    // untouched. Otherwise swap keys and values together when the flag is set
+    // so that current() of each refers to the buffer holding the sorted output.
+    if(temporary_storage != nullptr && is_result_in_output)
+    {
+        keys.swap();
+        values.swap();
+    }
+    return error;
+}
+/// \brief Parallel descending radix sort-by-key primitive for device level.
+///
+/// \p segmented_radix_sort_pairs_desc function performs a device-wide radix sort across multiple,
+/// non-overlapping sequences of (key, value) pairs. Function sorts input pairs in descending order of keys.
+///
+/// \par Overview
+/// * The contents of both buffers of \p keys and \p values may be altered by the sorting function.
+/// * \p current() of \p keys and \p values are used as the input.
+/// * The function will update \p current() of \p keys and \p values to point to buffers
+/// that contain the output range.
+/// * Returns the required size of \p temporary_storage in \p storage_size
+/// if \p temporary_storage is a null pointer.
+/// * The function requires small \p temporary_storage as it does not need
+/// a temporary buffer of \p size elements.
+/// * \p Key type must be an arithmetic type (that is, an integral type or a floating-point
+/// type).
+/// * Buffers of \p keys and \p values must have at least \p size elements.
+/// * Ranges specified by \p begin_offsets and \p end_offsets must have
+/// at least \p segments elements. They may use the same sequence <tt>offsets</tt> of at least
+/// <tt>segments + 1</tt> elements: <tt>offsets</tt> for \p begin_offsets and
+/// <tt>offsets + 1</tt> for \p end_offsets.
+/// * If \p Key is an integer type and the range of keys is known in advance, the performance
+/// can be improved by setting \p begin_bit and \p end_bit, for example if all keys are in range
+/// [100, 10000], <tt>begin_bit = 0</tt> and <tt>end_bit = 14</tt> will cover the whole range.
+///
+/// \tparam Config - [optional] configuration of the primitive. It can be
+/// \p segmented_radix_sort_config or a custom class with the same members.
+/// \tparam Key - key type. Must be an integral type or a floating-point type.
+/// \tparam Value - value type.
+/// \tparam OffsetIterator - random-access iterator type of segment offsets. Must meet the
+/// requirements of a C++ InputIterator concept. It can be a simple pointer type.
+///
+/// \param [in] temporary_storage - pointer to a device-accessible temporary storage. When
+/// a null pointer is passed, the required allocation size (in bytes) is written to
+/// \p storage_size and function returns without performing the sort operation.
+/// \param [in,out] storage_size - reference to a size (in bytes) of \p temporary_storage.
+/// \param [in,out] keys - reference to the double-buffer of keys, its \p current()
+/// contains the input range and will be updated to point to the output range.
+/// \param [in,out] values - reference to the double-buffer of values, its \p current()
+/// contains the input range and will be updated to point to the output range.
+/// \param [in] size - number of elements in the input range.
+/// \param [in] segments - number of segments in the input range.
+/// \param [in] begin_offsets - iterator to the first element in the range of beginning offsets.
+/// \param [in] end_offsets - iterator to the first element in the range of ending offsets.
+/// \param [in] begin_bit - [optional] index of the first (least significant) bit used in
+/// key comparison. Must be in range <tt>[0; 8 * sizeof(Key))</tt>. Default value: \p 0.
+/// Non-default value not supported for floating-point key-types.
+/// \param [in] end_bit - [optional] past-the-end index (most significant) bit used in
+/// key comparison. Must be in range <tt>(begin_bit; 8 * sizeof(Key)]</tt>. Default
+/// value: \p <tt>8 * sizeof(Key)</tt>. Non-default value not supported for floating-point key-types.
+/// \param [in] stream - [optional] HIP stream object. Default is \p 0 (default stream).
+/// \param [in] debug_synchronous - [optional] If true, synchronization after every kernel
+/// launch is forced in order to check for errors. Default value is \p false.
+///
+/// \returns \p cudaSuccess (\p 0) after successful sort; otherwise a HIP runtime error of
+/// type \p cudaError_t.
+///
+/// \par Example
+/// \parblock
+/// In this example a device-level descending radix sort is performed where input keys are
+/// represented by an array of integers and input values by an array of <tt>double</tt>s.
+///
+/// \code{.cpp}
+/// #include <rocprim/rocprim.hpp>
+///
+/// // Prepare input and tmp (declare pointers, allocate device memory etc.)
+/// size_t input_size;       // e.g., 8
+/// int * keys_input;        // e.g., [ 6, 3,  5, 4,  1,  8,  1, 7]
+/// double * values_input;   // e.g., [-5, 2, -4, 3, -1, -8, -2, 7]
+/// int * keys_tmp;          // empty array of 8 elements
+/// double * values_tmp;     // empty array of 8 elements
+/// unsigned int segments;   // e.g., 3
+/// int * offsets;           // e.g. [0, 2, 3, 8]
+/// // Create double-buffers
+/// rocprim::double_buffer<int> keys(keys_input, keys_tmp);
+/// rocprim::double_buffer<double> values(values_input, values_tmp);
+///
+/// size_t temporary_storage_size_bytes;
+/// void * temporary_storage_ptr = nullptr;
+/// // Get required size of the temporary storage
+/// rocprim::segmented_radix_sort_pairs_desc(
+///     temporary_storage_ptr, temporary_storage_size_bytes,
+///     keys, values, input_size,
+///     segments, offsets, offsets + 1
+/// );
+///
+/// // allocate temporary storage
+/// cudaMalloc(&temporary_storage_ptr, temporary_storage_size_bytes);
+///
+/// // perform sort
+/// rocprim::segmented_radix_sort_pairs_desc(
+///     temporary_storage_ptr, temporary_storage_size_bytes,
+///     keys, values, input_size,
+///     segments, offsets, offsets + 1
+/// );
+/// // keys.current():   [ 6, 3,  5,  8, 7, 4,  1,  1]
+/// // values.current(): [-5, 2, -4, -8, 7, 3, -1, -2]
+/// \endcode
+/// \endparblock
+template<
+    class Config = default_config,
+    class Key,
+    class Value,
+    class OffsetIterator
+>
+inline
+cudaError_t segmented_radix_sort_pairs_desc(void * temporary_storage,
+                                           size_t& storage_size,
+                                           double_buffer<Key>& keys,
+                                           double_buffer<Value>& values,
+                                           unsigned int size,
+                                           unsigned int segments,
+                                           OffsetIterator begin_offsets,
+                                           OffsetIterator end_offsets,
+                                           unsigned int begin_bit = 0,
+                                           unsigned int end_bit = 8 * sizeof(Key),
+                                           cudaStream_t stream = 0,
+                                           bool debug_synchronous = false)
+{
+    // Out-parameter of the implementation. It is initialized here so that the check
+    // below never reads an indeterminate value should the implementation return
+    // (e.g. on an error or an early exit) before assigning it; in that case the
+    // buffers are left unswapped and current() keeps pointing at the input.
+    bool is_result_in_output = false;
+    // The second template argument is `true` for this descending variant (the
+    // ascending double-buffer overload passes `false`). The current() buffers are
+    // handed over twice and the alternate() buffers once.
+    // NOTE(review): presumably (input, tmp, output) - confirm against the implementation.
+    const cudaError_t error = detail::segmented_radix_sort_impl<Config, true>(
+        temporary_storage, storage_size,
+        keys.current(), keys.current(), keys.alternate(),
+        values.current(), values.current(), values.alternate(),
+        size, is_result_in_output,
+        segments, begin_offsets, end_offsets,
+        begin_bit, end_bit,
+        stream, debug_synchronous
+    );
+    // A null temporary_storage is the size query: nothing was sorted, so never swap.
+    // Otherwise swap only when the implementation reports that the result ended up
+    // in the alternate() buffers, so that current() refers to the sorted output.
+    if(temporary_storage != nullptr && is_result_in_output)
+    {
+        keys.swap();
+        values.swap();
+    }
+    return error;
+}
+END_ROCPRIM_NAMESPACE
+/// @}
+// end of group devicemodule
+#endif // ROCPRIM_DEVICE_DEVICE_SEGMENTED_RADIX_SORT_HPP_
--- a/3rdparty/cub/rocprim/device/device_segmented_radix_sort_config.hpp
+++ b/3rdparty/cub/rocprim/device/device_segmented_radix_sort_config.hpp
+// Copyright (c) 2018-2020 Advanced Micro Devices, Inc. All rights reserved.
+//
+// Permission is hereby granted, free of charge, to any person obtaining a copy
+// of this software and associated documentation files (the "Software"), to deal
+// in the Software without restriction, including without limitation the rights
+// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+// copies of the Software, and to permit persons to whom the Software is
+// furnished to do so, subject to the following conditions:
+//
+// The above copyright notice and this permission notice shall be included in
+// all copies or substantial portions of the Software.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+// THE SOFTWARE.
+#ifndef ROCPRIM_DEVICE_DEVICE_SEGMENTED_RADIX_SORT_CONFIG_HPP_
+#define ROCPRIM_DEVICE_DEVICE_SEGMENTED_RADIX_SORT_CONFIG_HPP_
+#include <algorithm>
+#include <type_traits>
+#include "../config.hpp"
+#include "../detail/various.hpp"
+#include "config_types.hpp"
+/// \addtogroup primitivesmodule_deviceconfigs
+/// @{
+BEGIN_ROCPRIM_NAMESPACE
+/// \brief Configuration of the warp sort part of the device segmented radix sort operation.
+/// Short enough segments are processed on warp level.
+///
+/// \tparam LogicalWarpSizeSmall - number of threads in the logical warp of the kernel
+/// that processes small segments.
+/// \tparam ItemsPerThreadSmall - number of items processed by a thread in the kernel that processes
+/// small segments.
+/// \tparam BlockSizeSmall - number of threads per block in the kernel which processes the small segments. Default: \p 256.
+/// \tparam PartitioningThreshold - if the number of segments is at least this threshold, the
+/// segments are partitioned into a small, a medium and a large segment collection. Each collection
+/// is sorted by a different kernel. Otherwise, all segments are sorted by a single kernel. Default: \p 3000.
+/// \tparam EnableUnpartitionedWarpSort - If set to \p true, warp sort can be used to sort
+/// the small segments, even if the total number of segments is below \p PartitioningThreshold. Default: \p true.
+/// \tparam LogicalWarpSizeMedium - number of threads in the logical warp of the kernel
+/// that processes medium segments. Default: the larger of \p 32 and \p LogicalWarpSizeSmall.
+/// \tparam ItemsPerThreadMedium - number of items processed by a thread in the kernel that processes
+/// medium segments. Default: the larger of \p 4 and \p ItemsPerThreadSmall.
+/// \tparam BlockSizeMedium - number of threads per block in the kernel which processes the medium segments. Default: \p 256.
+template<unsigned int LogicalWarpSizeSmall,
+         unsigned int ItemsPerThreadSmall,
+         unsigned int BlockSizeSmall              = 256,
+         unsigned int PartitioningThreshold       = 3000,
+         bool         EnableUnpartitionedWarpSort = true,
+         unsigned int LogicalWarpSizeMedium       = std::max(32u, LogicalWarpSizeSmall),
+         unsigned int ItemsPerThreadMedium        = std::max(4u, ItemsPerThreadSmall),
+         unsigned int BlockSizeMedium             = 256>
+struct WarpSortConfig
+{
+    static_assert(LogicalWarpSizeSmall * ItemsPerThreadSmall
+                      <= LogicalWarpSizeMedium * ItemsPerThreadMedium,
+                  "The number of items processed by a small warp cannot be larger than the number "
+                  "of items processed by a medium warp");
+    /// \brief The number of threads in the logical warp in the small segment processing kernel.
+    static constexpr unsigned int logical_warp_size_small = LogicalWarpSizeSmall;
+    /// \brief The number of items processed by a thread in the small segment processing kernel.
+    static constexpr unsigned int items_per_thread_small = ItemsPerThreadSmall;
+    /// \brief The number of threads per block in the small segment processing kernel.
+    static constexpr unsigned int block_size_small = BlockSizeSmall;
+    /// \brief If the number of segments is at least \p partitioning_threshold, then the segments are partitioned into
+    /// small and large segment groups, and each group is handled by a different, specialized kernel.
+    static constexpr unsigned int partitioning_threshold = PartitioningThreshold;
+    /// \brief If set to \p true, warp sort can be used to sort the small segments, even if the total number of
+    /// segments is below \p PartitioningThreshold.
+    static constexpr bool enable_unpartitioned_warp_sort = EnableUnpartitionedWarpSort;
+    /// \brief The number of threads in the logical warp in the medium segment processing kernel.
+    static constexpr unsigned int logical_warp_size_medium = LogicalWarpSizeMedium;
+    /// \brief The number of items processed by a thread in the medium segment processing kernel.
+    static constexpr unsigned int items_per_thread_medium = ItemsPerThreadMedium;
+    /// \brief The number of threads per block in the medium segment processing kernel.
+    static constexpr unsigned int block_size_medium = BlockSizeMedium;
+};
+/// \brief Indicates that the warp level sorting is disabled in the
+/// device segmented radix sort configuration. Every member holds a minimal value (\p 1, \p 0 or \p false).
+struct DisabledWarpSortConfig
+{
+    /// \brief The number of threads in the logical warp in the small segment processing kernel.
+    static constexpr unsigned int logical_warp_size_small = 1;
+    /// \brief The number of items processed by a thread in the small segment processing kernel.
+    static constexpr unsigned int items_per_thread_small = 1;
+    /// \brief The number of threads per block in the small segment processing kernel.
+    static constexpr unsigned int block_size_small = 1;
+    /// \brief If the number of segments is at least \p partitioning_threshold, then the segments are partitioned into
+    /// small and large segment groups, and each group is handled by a different, specialized kernel.
+    static constexpr unsigned int partitioning_threshold = 0;
+    /// \brief If set to \p true, warp sort can be used to sort the small segments, even if the total number of
+    /// segments is below \p partitioning_threshold.
+    static constexpr bool enable_unpartitioned_warp_sort = false;
+    /// \brief The number of threads in the logical warp in the medium segment processing kernel.
+    static constexpr unsigned int logical_warp_size_medium = 1;
+    /// \brief The number of items processed by a thread in the medium segment processing kernel.
+    static constexpr unsigned int items_per_thread_medium = 1;
+    /// \brief The number of threads per block in the medium segment processing kernel.
+    static constexpr unsigned int block_size_medium = 1;
+};
+/// \brief Selects the appropriate \p WarpSortConfig based on the size of the key type.
+///
+/// \tparam Key - the type of the sorted keys.
+/// \tparam MediumWarpSize - the logical warp size of the medium segment processing kernel. Default: \p ROCPRIM_WARP_SIZE_32.
+template<class Key, unsigned int MediumWarpSize = ROCPRIM_WARP_SIZE_32>
+using select_warp_sort_config_t
+    = std::conditional_t<sizeof(Key) < 2, // keys narrower than 2 bytes: warp sort is disabled
+                         DisabledWarpSortConfig,
+                         WarpSortConfig<32, //< logical warp size - small kernel
+                                        4, //< items per thread - small kernel
+                                        256, //< block size - small kernel
+                                        3000, //< partitioning threshold
+                                        (sizeof(Key) > 2), //< enable unpartitioned warp sort (only for keys wider than 2 bytes)
+                                        MediumWarpSize, //< logical warp size - medium kernel
+                                        4, //< items per thread - medium kernel
+                                        256 //< block size - medium kernel
+                                        >>;
+/// \brief Configuration of device-level segmented radix sort operation.
+///
+/// Radix sort is executed in a few iterations (passes) depending on total number of bits to be sorted
+/// (\p begin_bit and \p end_bit), each iteration sorts either \p LongRadixBits or \p ShortRadixBits bits
+/// chosen to cover whole bit range in optimal way.
+///
+/// For example, if \p LongRadixBits is 7, \p ShortRadixBits is 6, \p begin_bit is 0 and \p end_bit is 32
+/// there will be 5 iterations: 7 + 7 + 6 + 6 + 6 = 32 bits.
+///
+/// If a segment's element count is low ( <= warp_sort_config::items_per_thread_{small,medium} * warp_sort_config::logical_warp_size_{small,medium} ),
+/// it is sorted by a special warp-level sorting method.
+///
+/// \tparam LongRadixBits - number of bits in long iterations.
+/// \tparam ShortRadixBits - number of bits in short iterations, must be equal to or less than \p LongRadixBits.
+/// \tparam SortConfig - configuration of radix sort kernel. Must be \p kernel_config.
+/// \tparam WarpSortConfig - configuration of the warp sort that is used on the short segments. Default: \p DisabledWarpSortConfig.
+template<
+    unsigned int LongRadixBits,
+    unsigned int ShortRadixBits,
+    class SortConfig,
+    class WarpSortConfig = DisabledWarpSortConfig
+>
+struct segmented_radix_sort_config
+{
+    /// \brief Number of bits in long iterations.
+    static constexpr unsigned int long_radix_bits = LongRadixBits;
+    /// \brief Number of bits in short iterations.
+    static constexpr unsigned int short_radix_bits = ShortRadixBits;
+    /// \brief Configuration of radix sort kernel.
+    using sort = SortConfig;
+    /// \brief Configuration of the warp sort method.
+    using warp_sort_config = WarpSortConfig;
+};
+namespace detail
+{
+template<class Key, class Value>
+struct segmented_radix_sort_config_803 // key-value configuration listed for TargetArch 803 in default_segmented_radix_sort_config
+{
+    static constexpr unsigned int item_scale = // larger of sizeof(Key) and sizeof(Value) in units of sizeof(int), rounded up (going by ceiling_div's name)
+        ::rocprim::detail::ceiling_div<unsigned int>(::rocprim::max(sizeof(Key), sizeof(Value)), sizeof(int));
+    using type = select_type< // one case per key size 1/2/4/8 with values of at most 8 bytes; the last argument is presumably the fallback
+        select_type_case<
+            (sizeof(Key) == 1 && sizeof(Value) <= 8),
+            segmented_radix_sort_config<8, 7, kernel_config<256, 10>, select_warp_sort_config_t<Key> >
+        >,
+        select_type_case<
+            (sizeof(Key) == 2 && sizeof(Value) <= 8),
+            segmented_radix_sort_config<8, 7, kernel_config<256, 10>, select_warp_sort_config_t<Key> >
+        >,
+        select_type_case<
+            (sizeof(Key) == 4 && sizeof(Value) <= 8),
+            segmented_radix_sort_config<7, 6, kernel_config<256, 15>, select_warp_sort_config_t<Key> >
+        >,
+        select_type_case<
+            (sizeof(Key) == 8 && sizeof(Value) <= 8),
+            segmented_radix_sort_config<7, 6, kernel_config<256, 13>, select_warp_sort_config_t<Key> >
+        >,
+        segmented_radix_sort_config<7, 6, kernel_config<256, ::rocprim::max(1u, 15u / item_scale)>, select_warp_sort_config_t<Key> > // second kernel_config argument shrinks with item_scale, never below 1
+    >;
+};
+template<class Key>
+struct segmented_radix_sort_config_803<Key, empty_type> // specialization for Value = empty_type (presumably the keys-only sort - confirm)
+    : select_type< // chosen by sizeof(Key) alone; no fallback entry is listed for other key sizes
+        select_type_case<sizeof(Key) == 1, segmented_radix_sort_config<8, 7, kernel_config<256, 10>, select_warp_sort_config_t<Key> > >,
+        select_type_case<sizeof(Key) == 2, segmented_radix_sort_config<8, 7, kernel_config<256, 10>, select_warp_sort_config_t<Key> > >,
+        select_type_case<sizeof(Key) == 4, segmented_radix_sort_config<7, 6, kernel_config<256, 9>, select_warp_sort_config_t<Key> > >,
+        select_type_case<sizeof(Key) == 8, segmented_radix_sort_config<7, 6, kernel_config<256, 7>, select_warp_sort_config_t<Key> > >
+    > { };
+template<class Key, class Value>
+struct segmented_radix_sort_config_900 // key-value configuration listed for TargetArch 900 and as the last (presumably fallback) entry of default_segmented_radix_sort_config
+{
+    static constexpr unsigned int item_scale = // larger of sizeof(Key) and sizeof(Value) in units of sizeof(int), rounded up (going by ceiling_div's name)
+        ::rocprim::detail::ceiling_div<unsigned int>(::rocprim::max(sizeof(Key), sizeof(Value)), sizeof(int));
+    using type = select_type< // one case per key size 1/2/4/8 with values of at most 8 bytes; the last argument is presumably the fallback
+        select_type_case<
+            (sizeof(Key) == 1 && sizeof(Value) <= 8),
+            segmented_radix_sort_config<4, 4, kernel_config<256, 10>, select_warp_sort_config_t<Key> >
+        >,
+        select_type_case<
+            (sizeof(Key) == 2 && sizeof(Value) <= 8),
+            segmented_radix_sort_config<6, 5, kernel_config<256, 10>, select_warp_sort_config_t<Key> >
+        >,
+        select_type_case<
+            (sizeof(Key) == 4 && sizeof(Value) <= 8),
+            segmented_radix_sort_config<7, 6, kernel_config<256, 15>, select_warp_sort_config_t<Key> >
+        >,
+        select_type_case<
+            (sizeof(Key) == 8 && sizeof(Value) <= 8),
+            segmented_radix_sort_config<7, 6, kernel_config<256, 15>, select_warp_sort_config_t<Key> >
+        >,
+        segmented_radix_sort_config<7, 6, kernel_config<256, ::rocprim::max(1u, 15u / item_scale)>, select_warp_sort_config_t<Key> > // second kernel_config argument shrinks with item_scale, never below 1
+    >;
+};
+template<class Key>
+struct segmented_radix_sort_config_900<Key, empty_type> // specialization for Value = empty_type (presumably the keys-only sort - confirm)
+    : select_type< // chosen by sizeof(Key) alone; no fallback entry is listed for other key sizes
+        select_type_case<sizeof(Key) == 1, segmented_radix_sort_config<4, 3, kernel_config<256, 10>, select_warp_sort_config_t<Key> > >,
+        select_type_case<sizeof(Key) == 2, segmented_radix_sort_config<6, 5, kernel_config<256, 10>, select_warp_sort_config_t<Key> > >,
+        select_type_case<sizeof(Key) == 4, segmented_radix_sort_config<7, 6, kernel_config<256, 17>, select_warp_sort_config_t<Key> > >,
+        select_type_case<sizeof(Key) == 8, segmented_radix_sort_config<7, 6, kernel_config<256, 15>, select_warp_sort_config_t<Key> > >
+    > { };
+template<class Key, class Value>
+struct segmented_radix_sort_config_90a // key-value configuration listed for TargetArch 906, 908 and ROCPRIM_ARCH_90a in default_segmented_radix_sort_config
+{
+    static constexpr unsigned int item_scale = // larger of sizeof(Key) and sizeof(Value) in units of sizeof(int), rounded up (going by ceiling_div's name)
+        ::rocprim::detail::ceiling_div<unsigned int>(::rocprim::max(sizeof(Key), sizeof(Value)), sizeof(int));
+    using type = select_type< // every entry passes ROCPRIM_WARP_SIZE_64 as the medium-kernel logical warp size
+        select_type_case<
+            (sizeof(Key) == 1 && sizeof(Value) <= 8),
+            segmented_radix_sort_config<4,
+                                        4,
+                                        kernel_config<256, 10>,
+                                        select_warp_sort_config_t<Key, ROCPRIM_WARP_SIZE_64>>>,
+        select_type_case<
+            (sizeof(Key) == 2 && sizeof(Value) <= 8),
+            segmented_radix_sort_config<6,
+                                        5,
+                                        kernel_config<256, 10>,
+                                        select_warp_sort_config_t<Key, ROCPRIM_WARP_SIZE_64>>>,
+        select_type_case<
+            (sizeof(Key) == 4 && sizeof(Value) <= 8),
+            segmented_radix_sort_config<7,
+                                        6,
+                                        kernel_config<256, 15>,
+                                        select_warp_sort_config_t<Key, ROCPRIM_WARP_SIZE_64>>>,
+        select_type_case<
+            (sizeof(Key) == 8 && sizeof(Value) <= 8),
+            segmented_radix_sort_config<7,
+                                        6,
+                                        kernel_config<256, 15>,
+                                        select_warp_sort_config_t<Key, ROCPRIM_WARP_SIZE_64>>>,
+        segmented_radix_sort_config<7, // last argument of select_type: presumably the fallback for all other size combinations
+                                    6,
+                                    kernel_config<256, ::rocprim::max(1u, 15u / item_scale)>, // second argument shrinks with item_scale, never below 1
+                                    select_warp_sort_config_t<Key, ROCPRIM_WARP_SIZE_64>>>;
+};
+template<class Key>
+struct segmented_radix_sort_config_90a<Key, empty_type> // specialization for Value = empty_type (presumably the keys-only sort - confirm)
+    : select_type< // chosen by sizeof(Key) alone; no fallback entry is listed for other key sizes
+          select_type_case<
+              sizeof(Key) == 1,
+              segmented_radix_sort_config<4,
+                                          3,
+                                          kernel_config<256, 10>,
+                                          select_warp_sort_config_t<Key, ROCPRIM_WARP_SIZE_64>>>,
+          select_type_case<
+              sizeof(Key) == 2,
+              segmented_radix_sort_config<6,
+                                          5,
+                                          kernel_config<256, 10>,
+                                          select_warp_sort_config_t<Key, ROCPRIM_WARP_SIZE_64>>>,
+          select_type_case<
+              sizeof(Key) == 4,
+              segmented_radix_sort_config<7,
+                                          6,
+                                          kernel_config<256, 17>,
+                                          select_warp_sort_config_t<Key, ROCPRIM_WARP_SIZE_64>>>,
+          select_type_case<
+              sizeof(Key) == 8,
+              segmented_radix_sort_config<7,
+                                          6,
+                                          kernel_config<256, 15>,
+                                          select_warp_sort_config_t<Key, ROCPRIM_WARP_SIZE_64>>>>
+{};
+template<class Key, class Value>
+struct segmented_radix_sort_config_1030 // key-value configuration listed for TargetArch 1030 in default_segmented_radix_sort_config
+{
+    static constexpr unsigned int item_scale = // larger of sizeof(Key) and sizeof(Value) in units of sizeof(int), rounded up (going by ceiling_div's name)
+        ::rocprim::detail::ceiling_div<unsigned int>(::rocprim::max(sizeof(Key), sizeof(Value)), sizeof(int));
+    using type = select_type< // one case per key size 1/2/4/8 with values of at most 8 bytes; the last argument is presumably the fallback
+        select_type_case<
+            (sizeof(Key) == 1 && sizeof(Value) <= 8),
+            segmented_radix_sort_config<4, 4, kernel_config<256, 10>, select_warp_sort_config_t<Key> >
+        >,
+        select_type_case<
+            (sizeof(Key) == 2 && sizeof(Value) <= 8),
+            segmented_radix_sort_config<6, 5, kernel_config<256, 10>, select_warp_sort_config_t<Key> >
+        >,
+        select_type_case<
+            (sizeof(Key) == 4 && sizeof(Value) <= 8),
+            segmented_radix_sort_config<7, 6, kernel_config<256, 15>, select_warp_sort_config_t<Key> >
+        >,
+        select_type_case<
+            (sizeof(Key) == 8 && sizeof(Value) <= 8),
+            segmented_radix_sort_config<7, 6, kernel_config<256, 15>, select_warp_sort_config_t<Key> >
+        >,
+        segmented_radix_sort_config<7, 6, kernel_config<256, ::rocprim::max(1u, 15u / item_scale)>, select_warp_sort_config_t<Key> > // second kernel_config argument shrinks with item_scale, never below 1
+    >;
+};
+template<class Key>
+struct segmented_radix_sort_config_1030<Key, empty_type> // specialization for Value = empty_type (presumably the keys-only sort - confirm)
+    : select_type< // chosen by sizeof(Key) alone; no fallback entry is listed for other key sizes
+        select_type_case<sizeof(Key) == 1, segmented_radix_sort_config<4, 3, kernel_config<256, 10>, select_warp_sort_config_t<Key> > >,
+        select_type_case<sizeof(Key) == 2, segmented_radix_sort_config<6, 5, kernel_config<256, 10>, select_warp_sort_config_t<Key> > >,
+        select_type_case<sizeof(Key) == 4, segmented_radix_sort_config<7, 6, kernel_config<256, 17>, select_warp_sort_config_t<Key> > >,
+        select_type_case<sizeof(Key) == 8, segmented_radix_sort_config<7, 6, kernel_config<256, 15>, select_warp_sort_config_t<Key> > >
+    > { };
+template<unsigned int TargetArch, class Key, class Value>
+struct default_segmented_radix_sort_config // maps TargetArch to one of the per-architecture configurations of this namespace
+    : select_arch<
+          TargetArch,
+          select_arch_case<803, detail::segmented_radix_sort_config_803<Key, Value>>,
+          select_arch_case<900, detail::segmented_radix_sort_config_900<Key, Value>>,
+          select_arch_case<906, detail::segmented_radix_sort_config_90a<Key, Value>>, // 906 and 908 reuse the 90a configuration
+          select_arch_case<908, detail::segmented_radix_sort_config_90a<Key, Value>>,
+          select_arch_case<ROCPRIM_ARCH_90a, detail::segmented_radix_sort_config_90a<Key, Value>>,
+          select_arch_case<1030, detail::segmented_radix_sort_config_1030<Key, Value>>,
+          detail::segmented_radix_sort_config_900<Key, Value>> // last argument: presumably used when no case matches TargetArch - confirm against select_arch
+{};
+} // end namespace detail
+END_ROCPRIM_NAMESPACE
+/// @}
+// end of group primitivesmodule_deviceconfigs
+#endif // ROCPRIM_DEVICE_DEVICE_SEGMENTED_RADIX_SORT_CONFIG_HPP_