Commit 0211193c authored by zhuwenwen's avatar zhuwenwen
Browse files

initial llama

parents
Pipeline #509 failed with stages
in 0 seconds
/******************************************************************************
* Copyright (c) 2010-2011, Duane Merrill. All rights reserved.
* Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved.
* Modifications Copyright (c) 2021, Advanced Micro Devices, Inc. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of the NVIDIA CORPORATION nor the
* names of its contributors may be used to endorse or promote products
* derived from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
* DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
******************************************************************************/
#ifndef HIPCUB_ROCPRIM_BLOCK_BLOCK_RUN_LENGTH_DECODE_HPP_
#define HIPCUB_ROCPRIM_BLOCK_BLOCK_RUN_LENGTH_DECODE_HPP_
#include "../config.hpp"
#include "../thread/thread_search.cuh"
#include "../util_math.cuh"
#include "../util_ptx.cuh"
#include "../util_type.cuh"
#include "block_scan.cuh"
#include <limits>
#include <type_traits>
BEGIN_HIPCUB_NAMESPACE
/**
* \brief The BlockRunLengthDecode class supports decoding a run-length encoded array of items. That is, given
* the two arrays run_value[N] and run_lengths[N], run_value[i] is repeated run_lengths[i] many times in the output
* array.
* Due to the nature of the run-length decoding algorithm ("decompression"), the output size of the run-length decoded
* array is runtime-dependent and potentially without any upper bound. To address this, BlockRunLengthDecode allows
* retrieving a "window" from the run-length decoded array. The window's offset can be specified and BLOCK_THREADS *
* DECODED_ITEMS_PER_THREAD (i.e., referred to as window_size) decoded items from the specified window will be returned.
*
* \note: Trailing runs of length 0 are supported (i.e., they may only appear at the end of the run_lengths array).
* A run of length zero may not be followed by a run length that is not zero.
*
* \par
* \code
* __global__ void ExampleKernel(...)
* {
* // Specialising BlockRunLengthDecode to run-length decode items of type uint64_t
* using RunItemT = uint64_t;
* // Type large enough to index into the run-length decoded array
* using RunLengthT = uint32_t;
*
* // Specialising BlockRunLengthDecode for a 1D block of 128 threads
* constexpr int BLOCK_DIM_X = 128;
* // Specialising BlockRunLengthDecode to have each thread contribute 2 run-length encoded runs
* constexpr int RUNS_PER_THREAD = 2;
* // Specialising BlockRunLengthDecode to have each thread hold 4 run-length decoded items
* constexpr int DECODED_ITEMS_PER_THREAD = 4;
*
* // Specialize BlockRadixSort for a 1D block of 128 threads owning 4 integer items each
* using BlockRunLengthDecodeT =
* cub::BlockRunLengthDecode<RunItemT, BLOCK_DIM_X, RUNS_PER_THREAD, DECODED_ITEMS_PER_THREAD>;
*
* // Allocate shared memory for BlockRunLengthDecode
* __shared__ typename BlockRunLengthDecodeT::TempStorage temp_storage;
*
* // The run-length encoded items and how often they shall be repeated in the run-length decoded output
* RunItemT run_values[RUNS_PER_THREAD];
* RunLengthT run_lengths[RUNS_PER_THREAD];
* ...
*
* // Initialize the BlockRunLengthDecode with the runs that we want to run-length decode
* uint32_t total_decoded_size = 0;
* BlockRunLengthDecodeT block_rld(temp_storage, run_values, run_lengths, total_decoded_size);
*
* // Run-length decode ("decompress") the runs into a window buffer of limited size. This is repeated until all runs
* // have been decoded.
* uint32_t decoded_window_offset = 0U;
* while (decoded_window_offset < total_decoded_size)
* {
* RunLengthT relative_offsets[DECODED_ITEMS_PER_THREAD];
* RunItemT decoded_items[DECODED_ITEMS_PER_THREAD];
*
* // The number of decoded items that are valid within this window (aka pass) of run-length decoding
* uint32_t num_valid_items = total_decoded_size - decoded_window_offset;
* block_rld.RunLengthDecode(decoded_items, relative_offsets, decoded_window_offset);
*
* decoded_window_offset += BLOCK_DIM_X * DECODED_ITEMS_PER_THREAD;
*
* ...
* }
* }
* \endcode
* \par
* Suppose the set of input \p run_values across the block of threads is
* <tt>{ [0, 1], [2, 3], [4, 5], [6, 7], ..., [254, 255] }</tt> and
* \p run_lengths is <tt>{ [1, 2], [3, 4], [5, 1], [2, 3], ..., [5, 1] }</tt>.
* The corresponding output \p decoded_items in those threads will be <tt>{ [0, 1, 1, 2], [2, 2, 3, 3], [3, 3, 4, 4],
* [4, 4, 4, 5], ..., [169, 169, 170, 171] }</tt> and \p relative_offsets will be <tt>{ [0, 0, 1, 0], [1, 2, 0, 1], [2,
* 3, 0, 1], [2, 3, 4, 0], ..., [3, 4, 0, 0] }</tt> during the first iteration of the while loop.
*
* \tparam ItemT The data type of the items being run-length decoded
* \tparam BLOCK_DIM_X The thread block length in threads along the X dimension
* \tparam RUNS_PER_THREAD The number of consecutive runs that each thread contributes
* \tparam DECODED_ITEMS_PER_THREAD The maximum number of decoded items that each thread holds
* \tparam DecodedOffsetT Type used to index into the block's decoded items (large enough to hold the sum over all the
* runs' lengths)
* \tparam BLOCK_DIM_Y The thread block length in threads along the Y dimension
* \tparam BLOCK_DIM_Z The thread block length in threads along the Z dimension
*/
template <typename ItemT,
int BLOCK_DIM_X,
int RUNS_PER_THREAD,
int DECODED_ITEMS_PER_THREAD,
typename DecodedOffsetT = uint32_t,
int BLOCK_DIM_Y = 1,
int BLOCK_DIM_Z = 1>
class BlockRunLengthDecode
{
//---------------------------------------------------------------------
// CONFIGS & TYPE ALIASES
//---------------------------------------------------------------------
private:
/// The thread block size in threads
static constexpr int BLOCK_THREADS = BLOCK_DIM_X * BLOCK_DIM_Y * BLOCK_DIM_Z;
/// The number of runs that the block decodes (out-of-bounds items may be padded with run lengths of '0')
static constexpr int BLOCK_RUNS = BLOCK_THREADS * RUNS_PER_THREAD;
/// BlockScan used to determine the beginning of each run (i.e., prefix sum over the runs' length)
using RunOffsetScanT = BlockScan<DecodedOffsetT, BLOCK_DIM_X, BLOCK_SCAN_WARP_SCANS, BLOCK_DIM_Y, BLOCK_DIM_Z>;
/// Type used to index into the block's runs
using RunOffsetT = uint32_t;
/// Shared memory type required by this thread block.
/// NOTE: the scan storage and the runs storage are a union — the prefix scan's
/// temporary storage is only needed during InitWithRunLengths, before the runs
/// arrays are written (separated by a CTA_SYNC), so the aliasing is safe.
union _TempStorage
{
typename RunOffsetScanT::TempStorage offset_scan;
struct
{
ItemT run_values[BLOCK_RUNS];
DecodedOffsetT run_offsets[BLOCK_RUNS];
} runs;
}; // union TempStorage
/// Internal storage allocator (used when the user does not provide pre-allocated shared memory)
HIPCUB_DEVICE __forceinline__ _TempStorage &PrivateStorage()
{
__shared__ _TempStorage private_storage;
return private_storage;
}
/// Shared storage reference
_TempStorage &temp_storage;
/// Linear thread-id
uint32_t linear_tid;
public:
/// \brief The operations exposed by BlockRunLengthDecode require a temporary memory allocation of
/// this nested type for thread communication (may alias other per-block allocations).
struct TempStorage : Uninitialized<_TempStorage>
{
};
//---------------------------------------------------------------------
// CONSTRUCTOR
//---------------------------------------------------------------------
/**
* \brief Constructor specialised for user-provided temporary storage, initializing using the runs' lengths. The
* algorithm's temporary storage may not be repurposed between the constructor call and subsequent
* <b>RunLengthDecode</b> calls.
*/
template <typename RunLengthT, typename TotalDecodedSizeT>
HIPCUB_DEVICE __forceinline__ BlockRunLengthDecode(TempStorage &temp_storage,
ItemT (&run_values)[RUNS_PER_THREAD],
RunLengthT (&run_lengths)[RUNS_PER_THREAD],
TotalDecodedSizeT &total_decoded_size)
: temp_storage(temp_storage.Alias()), linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z))
{
InitWithRunLengths(run_values, run_lengths, total_decoded_size);
}
/**
* \brief Constructor specialised for user-provided temporary storage, initializing using the runs' offsets. The
* algorithm's temporary storage may not be repurposed between the constructor call and subsequent
* <b>RunLengthDecode</b> calls.
*/
template <typename UserRunOffsetT>
HIPCUB_DEVICE __forceinline__ BlockRunLengthDecode(TempStorage &temp_storage,
ItemT (&run_values)[RUNS_PER_THREAD],
UserRunOffsetT (&run_offsets)[RUNS_PER_THREAD])
: temp_storage(temp_storage.Alias()), linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z))
{
InitWithRunOffsets(run_values, run_offsets);
}
/**
* \brief Constructor specialised for static temporary storage, initializing using the runs' lengths.
*/
template <typename RunLengthT, typename TotalDecodedSizeT>
HIPCUB_DEVICE __forceinline__ BlockRunLengthDecode(ItemT (&run_values)[RUNS_PER_THREAD],
RunLengthT (&run_lengths)[RUNS_PER_THREAD],
TotalDecodedSizeT &total_decoded_size)
: temp_storage(PrivateStorage()), linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z))
{
InitWithRunLengths(run_values, run_lengths, total_decoded_size);
}
/**
* \brief Constructor specialised for static temporary storage, initializing using the runs' offsets.
*/
template <typename UserRunOffsetT>
HIPCUB_DEVICE __forceinline__ BlockRunLengthDecode(ItemT (&run_values)[RUNS_PER_THREAD],
UserRunOffsetT (&run_offsets)[RUNS_PER_THREAD])
: temp_storage(PrivateStorage()), linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z))
{
InitWithRunOffsets(run_values, run_offsets);
}
private:
/**
* \brief Returns the offset of the first value within \p input which compares greater than \p val. This version takes
* \p MAX_NUM_ITEMS, an upper bound of the array size, which will be used to determine the number of binary search
* iterations at compile time.
*
* \note \p input must be sorted in non-decreasing order for the result to be meaningful
* (it holds the exclusive prefix sum of the runs' lengths here).
*/
template <int MAX_NUM_ITEMS,
typename InputIteratorT,
typename OffsetT,
typename T>
HIPCUB_DEVICE __forceinline__ OffsetT StaticUpperBound(InputIteratorT input, ///< [in] Input sequence
OffsetT num_items, ///< [in] Input sequence length
T val) ///< [in] Search key
{
OffsetT lower_bound = 0;
OffsetT upper_bound = num_items;
#pragma unroll
for (int i = 0; i <= Log2<MAX_NUM_ITEMS>::VALUE; i++)
{
OffsetT mid = cub::MidPoint<OffsetT>(lower_bound, upper_bound);
// Clamp so that the unconditional body below never reads past the end of the sequence
mid = (rocprim::min)(mid, num_items - 1);
if (val < input[mid])
{
upper_bound = mid;
}
else
{
lower_bound = mid + 1;
}
}
return lower_bound;
}
/**
* \brief Stores each thread's \p run_values and \p run_offsets (the exclusive prefix sum over the runs'
* lengths) into the block's shared-memory staging arrays in a blocked arrangement, followed by a
* block-wide barrier so subsequent RunLengthDecode calls can read any run.
*/
template <typename RunOffsetT>
HIPCUB_DEVICE __forceinline__ void InitWithRunOffsets(ItemT (&run_values)[RUNS_PER_THREAD],
RunOffsetT (&run_offsets)[RUNS_PER_THREAD])
{
// Keep the runs' items and the offsets of each run's beginning in the temporary storage
RunOffsetT thread_dst_offset = static_cast<RunOffsetT>(linear_tid) * static_cast<RunOffsetT>(RUNS_PER_THREAD);
#pragma unroll
for (int i = 0; i < RUNS_PER_THREAD; i++)
{
temp_storage.runs.run_values[thread_dst_offset] = run_values[i];
temp_storage.runs.run_offsets[thread_dst_offset] = run_offsets[i];
thread_dst_offset++;
}
// Ensure run offsets and run values have been written to shared memory
CTA_SYNC();
}
/**
* \brief Computes the runs' begin offsets from the given \p run_lengths via a block-wide exclusive
* prefix sum, returns the total decoded size in \p total_decoded_size, and delegates to
* InitWithRunOffsets to populate shared memory.
*/
template <typename RunLengthT, typename TotalDecodedSizeT>
HIPCUB_DEVICE __forceinline__ void InitWithRunLengths(ItemT (&run_values)[RUNS_PER_THREAD],
RunLengthT (&run_lengths)[RUNS_PER_THREAD],
TotalDecodedSizeT &total_decoded_size)
{
// Compute the offset for the beginning of each run
DecodedOffsetT run_offsets[RUNS_PER_THREAD];
#pragma unroll
for (int i = 0; i < RUNS_PER_THREAD; i++)
{
run_offsets[i] = static_cast<DecodedOffsetT>(run_lengths[i]);
}
DecodedOffsetT decoded_size_aggregate;
RunOffsetScanT(this->temp_storage.offset_scan).ExclusiveSum(run_offsets, run_offsets, decoded_size_aggregate);
total_decoded_size = static_cast<TotalDecodedSizeT>(decoded_size_aggregate);
// Ensure the prefix scan's temporary storage can be reused (may be superfluous, but depends on scan implementation)
CTA_SYNC();
InitWithRunOffsets(run_values, run_offsets);
}
public:
/**
* \brief Run-length decodes the runs previously passed via a call to Init(...) and returns the run-length decoded
* items in a blocked arrangement to \p decoded_items. If the number of run-length decoded items exceeds the
* run-length decode buffer (i.e., <b>DECODED_ITEMS_PER_THREAD * BLOCK_THREADS</b>), only the items that fit within
* the buffer are returned. Subsequent calls to <b>RunLengthDecode</b> adjusting \p from_decoded_offset can be
* used to retrieve the remaining run-length decoded items. Calling __syncthreads() between any two calls to
* <b>RunLengthDecode</b> is not required.
* \p item_offsets can be used to retrieve each run-length decoded item's relative index within its run. E.g., the
* run-length encoded array of `3, 1, 4` with the respective run lengths of `2, 1, 3` would yield the run-length
* decoded array of `3, 3, 1, 4, 4, 4` with the relative offsets of `0, 1, 0, 0, 1, 2`.
* \smemreuse
*
* \param[out] decoded_items The run-length decoded items to be returned in a blocked arrangement
* \param[out] item_offsets The run-length decoded items' relative offset within the run they belong to
* \param[in] from_decoded_offset If invoked with from_decoded_offset that is larger than total_decoded_size results
* in undefined behavior.
*/
template <typename RelativeOffsetT>
HIPCUB_DEVICE __forceinline__ void RunLengthDecode(ItemT (&decoded_items)[DECODED_ITEMS_PER_THREAD],
RelativeOffsetT (&item_offsets)[DECODED_ITEMS_PER_THREAD],
DecodedOffsetT from_decoded_offset = 0)
{
// The (global) offset of the first item decoded by this thread
DecodedOffsetT thread_decoded_offset = from_decoded_offset + linear_tid * DECODED_ITEMS_PER_THREAD;
// The run that the first decoded item of this thread belongs to
// If this thread's <thread_decoded_offset> is already beyond the total decoded size, it will be assigned to the
// last run
RunOffsetT assigned_run =
StaticUpperBound<BLOCK_RUNS>(temp_storage.runs.run_offsets, BLOCK_RUNS, thread_decoded_offset) -
static_cast<RunOffsetT>(1U);
DecodedOffsetT assigned_run_begin = temp_storage.runs.run_offsets[assigned_run];
// If this thread is getting assigned the last run, we make sure it will not fetch any other run after this
DecodedOffsetT assigned_run_end = (assigned_run == BLOCK_RUNS - 1)
? thread_decoded_offset + DECODED_ITEMS_PER_THREAD
: temp_storage.runs.run_offsets[assigned_run + 1];
ItemT val = temp_storage.runs.run_values[assigned_run];
#pragma unroll
for (DecodedOffsetT i = 0; i < DECODED_ITEMS_PER_THREAD; i++)
{
decoded_items[i] = val;
item_offsets[i] = thread_decoded_offset - assigned_run_begin;
// Advance to the next run when the current item is the last one of the assigned run
if (thread_decoded_offset == assigned_run_end - 1)
{
// We make sure that a thread is not re-entering this conditional when being assigned to the last run already by
// extending the last run's length to all the thread's item
assigned_run++;
assigned_run_begin = temp_storage.runs.run_offsets[assigned_run];
// If this thread is getting assigned the last run, we make sure it will not fetch any other run after this
assigned_run_end = (assigned_run == BLOCK_RUNS - 1) ? thread_decoded_offset + DECODED_ITEMS_PER_THREAD
: temp_storage.runs.run_offsets[assigned_run + 1];
val = temp_storage.runs.run_values[assigned_run];
}
thread_decoded_offset++;
}
}
/**
* \brief Run-length decodes the runs previously passed via a call to Init(...) and returns the run-length decoded
* items in a blocked arrangement to \p decoded_items. If the number of run-length decoded items exceeds the
* run-length decode buffer (i.e., <b>DECODED_ITEMS_PER_THREAD * BLOCK_THREADS</b>), only the items that fit within
* the buffer are returned. Subsequent calls to <b>RunLengthDecode</b> adjusting \p from_decoded_offset can be
* used to retrieve the remaining run-length decoded items. Calling __syncthreads() between any two calls to
* <b>RunLengthDecode</b> is not required.
*
* \param[out] decoded_items The run-length decoded items to be returned in a blocked arrangement
* \param[in] from_decoded_offset If invoked with from_decoded_offset that is larger than total_decoded_size results
* in undefined behavior.
*/
HIPCUB_DEVICE __forceinline__ void RunLengthDecode(ItemT (&decoded_items)[DECODED_ITEMS_PER_THREAD],
DecodedOffsetT from_decoded_offset = 0)
{
// Convenience overload: the per-item relative offsets are computed but discarded
DecodedOffsetT item_offsets[DECODED_ITEMS_PER_THREAD];
RunLengthDecode(decoded_items, item_offsets, from_decoded_offset);
}
};
END_HIPCUB_NAMESPACE
#endif // HIPCUB_ROCPRIM_BLOCK_BLOCK_RUN_LENGTH_DECODE_HPP_
/******************************************************************************
* Copyright (c) 2010-2011, Duane Merrill. All rights reserved.
* Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved.
* Modifications Copyright (c) 2017-2020, Advanced Micro Devices, Inc. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of the NVIDIA CORPORATION nor the
* names of its contributors may be used to endorse or promote products
* derived from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
* DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
******************************************************************************/
#ifndef HIPCUB_ROCPRIM_BLOCK_BLOCK_SCAN_HPP_
#define HIPCUB_ROCPRIM_BLOCK_BLOCK_SCAN_HPP_
#include <type_traits>
#include "../config.hpp"
#include "../thread/thread_operators.cuh"
#include <cub/rocprim/block/block_scan.hpp>
BEGIN_HIPCUB_NAMESPACE
namespace detail
{
// Maps a rocprim block-scan algorithm enumerator onto its underlying integral
// value, so it can serve as an initializer for the BlockScanAlgorithm enum below.
inline constexpr
typename std::underlying_type<::rocprim::block_scan_algorithm>::type
to_BlockScanAlgorithm_enum(::rocprim::block_scan_algorithm v)
{
    return static_cast<typename std::underlying_type<::rocprim::block_scan_algorithm>::type>(v);
}
}
/// \brief hipCUB's counterparts to CUB's block-scan algorithm selectors, expressed
/// as the integral values of the corresponding rocPRIM algorithms.
/// NOTE: BLOCK_SCAN_RAKING and BLOCK_SCAN_RAKING_MEMOIZE both map to
/// rocprim's reduce_then_scan — rocPRIM provides no separate memoized variant,
/// so the two enumerators intentionally share a value.
enum BlockScanAlgorithm
{
BLOCK_SCAN_RAKING
= detail::to_BlockScanAlgorithm_enum(::rocprim::block_scan_algorithm::reduce_then_scan),
BLOCK_SCAN_RAKING_MEMOIZE
= detail::to_BlockScanAlgorithm_enum(::rocprim::block_scan_algorithm::reduce_then_scan),
BLOCK_SCAN_WARP_SCANS
= detail::to_BlockScanAlgorithm_enum(::rocprim::block_scan_algorithm::using_warp_scan)
};
/// \brief CUB-compatible BlockScan facade implemented as a thin wrapper over
/// ::rocprim::block_scan. Each member forwards to the corresponding rocprim
/// inclusive_scan/exclusive_scan overload, passing the temporary storage
/// reference held by this object. The \p ARCH parameter is accepted for
/// interface compatibility only and is ignored.
template<
typename T,
int BLOCK_DIM_X,
BlockScanAlgorithm ALGORITHM = BLOCK_SCAN_RAKING,
int BLOCK_DIM_Y = 1,
int BLOCK_DIM_Z = 1,
int ARCH = HIPCUB_ARCH /* ignored */
>
class BlockScan
: private ::rocprim::block_scan<
T,
BLOCK_DIM_X,
static_cast<::rocprim::block_scan_algorithm>(ALGORITHM),
BLOCK_DIM_Y,
BLOCK_DIM_Z
>
{
static_assert(
BLOCK_DIM_X * BLOCK_DIM_Y * BLOCK_DIM_Z > 0,
"BLOCK_DIM_X * BLOCK_DIM_Y * BLOCK_DIM_Z must be greater than 0"
);
using base_type =
typename ::rocprim::block_scan<
T,
BLOCK_DIM_X,
static_cast<::rocprim::block_scan_algorithm>(ALGORITHM),
BLOCK_DIM_Y,
BLOCK_DIM_Z
>;
// Reference to temporary storage (usually shared memory)
typename base_type::storage_type& temp_storage_;
public:
/// \brief The temporary storage type required by the underlying rocprim scan.
using TempStorage = typename base_type::storage_type;
/// \brief Constructor using private (function-local __shared__) storage.
HIPCUB_DEVICE inline
BlockScan() : temp_storage_(private_storage())
{
}
/// \brief Constructor using caller-provided temporary storage.
HIPCUB_DEVICE inline
BlockScan(TempStorage& temp_storage) : temp_storage_(temp_storage)
{
}
/// \brief Block-wide inclusive prefix sum over one item per thread.
HIPCUB_DEVICE inline
void InclusiveSum(T input, T& output)
{
base_type::inclusive_scan(input, output, temp_storage_);
}
/// \brief Inclusive prefix sum; also returns the block-wide aggregate to every thread.
HIPCUB_DEVICE inline
void InclusiveSum(T input, T& output, T& block_aggregate)
{
base_type::inclusive_scan(input, output, block_aggregate, temp_storage_);
}
/// \brief Inclusive prefix sum seeded by a block-prefix callback functor
/// (invoked by the first warp to obtain the running prefix).
template<typename BlockPrefixCallbackOp>
HIPCUB_DEVICE inline
void InclusiveSum(T input, T& output, BlockPrefixCallbackOp& block_prefix_callback_op)
{
base_type::inclusive_scan(
input, output, temp_storage_, block_prefix_callback_op, ::cub::Sum()
);
}
/// \brief Inclusive prefix sum over ITEMS_PER_THREAD items per thread (blocked arrangement).
template<int ITEMS_PER_THREAD>
HIPCUB_DEVICE inline
void InclusiveSum(T(&input)[ITEMS_PER_THREAD], T(&output)[ITEMS_PER_THREAD])
{
base_type::inclusive_scan(input, output, temp_storage_);
}
/// \brief Multi-item inclusive prefix sum; also returns the block-wide aggregate.
template<int ITEMS_PER_THREAD>
HIPCUB_DEVICE inline
void InclusiveSum(T(&input)[ITEMS_PER_THREAD], T(&output)[ITEMS_PER_THREAD],
T& block_aggregate)
{
base_type::inclusive_scan(input, output, block_aggregate, temp_storage_);
}
/// \brief Multi-item inclusive prefix sum seeded by a block-prefix callback functor.
template<int ITEMS_PER_THREAD, typename BlockPrefixCallbackOp>
HIPCUB_DEVICE inline
void InclusiveSum(T(&input)[ITEMS_PER_THREAD], T(&output)[ITEMS_PER_THREAD],
BlockPrefixCallbackOp& block_prefix_callback_op)
{
base_type::inclusive_scan(
input, output, temp_storage_, block_prefix_callback_op, ::cub::Sum()
);
}
/// \brief Inclusive scan with a user-supplied binary scan operator.
template<typename ScanOp>
HIPCUB_DEVICE inline
void InclusiveScan(T input, T& output, ScanOp scan_op)
{
base_type::inclusive_scan(input, output, temp_storage_, scan_op);
}
/// \brief Inclusive scan with a user operator; also returns the block-wide aggregate.
template<typename ScanOp>
HIPCUB_DEVICE inline
void InclusiveScan(T input, T& output, ScanOp scan_op, T& block_aggregate)
{
base_type::inclusive_scan(input, output, block_aggregate, temp_storage_, scan_op);
}
/// \brief Inclusive scan with a user operator, seeded by a block-prefix callback functor.
template<typename ScanOp, typename BlockPrefixCallbackOp>
HIPCUB_DEVICE inline
void InclusiveScan(T input, T& output, ScanOp scan_op, BlockPrefixCallbackOp& block_prefix_callback_op)
{
base_type::inclusive_scan(
input, output, temp_storage_, block_prefix_callback_op, scan_op
);
}
/// \brief Multi-item inclusive scan with a user-supplied binary scan operator.
template<int ITEMS_PER_THREAD, typename ScanOp>
HIPCUB_DEVICE inline
void InclusiveScan(T(&input)[ITEMS_PER_THREAD], T(&output)[ITEMS_PER_THREAD], ScanOp scan_op)
{
base_type::inclusive_scan(input, output, temp_storage_, scan_op);
}
/// \brief Multi-item inclusive scan with a user operator; also returns the block-wide aggregate.
template<int ITEMS_PER_THREAD, typename ScanOp>
HIPCUB_DEVICE inline
void InclusiveScan(T(&input)[ITEMS_PER_THREAD], T(&output)[ITEMS_PER_THREAD],
ScanOp scan_op, T& block_aggregate)
{
base_type::inclusive_scan(input, output, block_aggregate, temp_storage_, scan_op);
}
/// \brief Multi-item inclusive scan with a user operator, seeded by a block-prefix callback functor.
template<int ITEMS_PER_THREAD, typename ScanOp, typename BlockPrefixCallbackOp>
HIPCUB_DEVICE inline
void InclusiveScan(T(&input)[ITEMS_PER_THREAD], T(&output)[ITEMS_PER_THREAD],
ScanOp scan_op, BlockPrefixCallbackOp& block_prefix_callback_op)
{
base_type::inclusive_scan(
input, output, temp_storage_, block_prefix_callback_op, scan_op
);
}
/// \brief Block-wide exclusive prefix sum over one item per thread (identity T(0)).
HIPCUB_DEVICE inline
void ExclusiveSum(T input, T& output)
{
base_type::exclusive_scan(input, output, T(0), temp_storage_);
}
/// \brief Exclusive prefix sum; also returns the block-wide aggregate to every thread.
HIPCUB_DEVICE inline
void ExclusiveSum(T input, T& output, T& block_aggregate)
{
base_type::exclusive_scan(input, output, T(0), block_aggregate, temp_storage_);
}
/// \brief Exclusive prefix sum seeded by a block-prefix callback functor
/// (no explicit initial value — the callback supplies the running prefix).
template<typename BlockPrefixCallbackOp>
HIPCUB_DEVICE inline
void ExclusiveSum(T input, T& output, BlockPrefixCallbackOp& block_prefix_callback_op)
{
base_type::exclusive_scan(
input, output, temp_storage_, block_prefix_callback_op, ::cub::Sum()
);
}
/// \brief Exclusive prefix sum over ITEMS_PER_THREAD items per thread (blocked arrangement).
template<int ITEMS_PER_THREAD>
HIPCUB_DEVICE inline
void ExclusiveSum(T(&input)[ITEMS_PER_THREAD], T(&output)[ITEMS_PER_THREAD])
{
base_type::exclusive_scan(input, output, T(0), temp_storage_);
}
/// \brief Multi-item exclusive prefix sum; also returns the block-wide aggregate.
template<int ITEMS_PER_THREAD>
HIPCUB_DEVICE inline
void ExclusiveSum(T(&input)[ITEMS_PER_THREAD], T(&output)[ITEMS_PER_THREAD],
T& block_aggregate)
{
base_type::exclusive_scan(input, output, T(0), block_aggregate, temp_storage_);
}
/// \brief Multi-item exclusive prefix sum seeded by a block-prefix callback functor.
template<int ITEMS_PER_THREAD, typename BlockPrefixCallbackOp>
HIPCUB_DEVICE inline
void ExclusiveSum(T(&input)[ITEMS_PER_THREAD], T(&output)[ITEMS_PER_THREAD],
BlockPrefixCallbackOp& block_prefix_callback_op)
{
base_type::exclusive_scan(
input, output, temp_storage_, block_prefix_callback_op, ::cub::Sum()
);
}
/// \brief Exclusive scan with an explicit initial value and a user-supplied operator.
template<typename ScanOp>
HIPCUB_DEVICE inline
void ExclusiveScan(T input, T& output, T initial_value, ScanOp scan_op)
{
base_type::exclusive_scan(input, output, initial_value, temp_storage_, scan_op);
}
/// \brief Exclusive scan with an initial value and user operator; also returns the block-wide aggregate.
template<typename ScanOp>
HIPCUB_DEVICE inline
void ExclusiveScan(T input, T& output, T initial_value,
ScanOp scan_op, T& block_aggregate)
{
base_type::exclusive_scan(
input, output, initial_value, block_aggregate, temp_storage_, scan_op
);
}
/// \brief Exclusive scan with a user operator, seeded by a block-prefix callback functor.
template<typename ScanOp, typename BlockPrefixCallbackOp>
HIPCUB_DEVICE inline
void ExclusiveScan(T input, T& output, ScanOp scan_op,
BlockPrefixCallbackOp& block_prefix_callback_op)
{
base_type::exclusive_scan(
input, output, temp_storage_, block_prefix_callback_op, scan_op
);
}
/// \brief Multi-item exclusive scan with an explicit initial value and user operator.
template<int ITEMS_PER_THREAD, typename ScanOp>
HIPCUB_DEVICE inline
void ExclusiveScan(T(&input)[ITEMS_PER_THREAD], T(&output)[ITEMS_PER_THREAD],
T initial_value, ScanOp scan_op)
{
base_type::exclusive_scan(input, output, initial_value, temp_storage_, scan_op);
}
/// \brief Multi-item exclusive scan with an initial value and user operator; also returns the aggregate.
template<int ITEMS_PER_THREAD, typename ScanOp>
HIPCUB_DEVICE inline
void ExclusiveScan(T(&input)[ITEMS_PER_THREAD], T(&output)[ITEMS_PER_THREAD],
T initial_value, ScanOp scan_op, T& block_aggregate)
{
base_type::exclusive_scan(
input, output, initial_value, block_aggregate, temp_storage_, scan_op
);
}
/// \brief Multi-item exclusive scan with a user operator, seeded by a block-prefix callback functor.
template<int ITEMS_PER_THREAD, typename ScanOp, typename BlockPrefixCallbackOp>
HIPCUB_DEVICE inline
void ExclusiveScan(T(&input)[ITEMS_PER_THREAD], T(&output)[ITEMS_PER_THREAD],
ScanOp scan_op, BlockPrefixCallbackOp& block_prefix_callback_op)
{
base_type::exclusive_scan(
input, output, temp_storage_, block_prefix_callback_op, scan_op
);
}
private:
/// \brief Function-local shared-memory fallback used by the default constructor.
HIPCUB_DEVICE inline
TempStorage& private_storage()
{
HIPCUB_SHARED_MEMORY TempStorage private_storage;
return private_storage;
}
};
END_HIPCUB_NAMESPACE
#endif // HIPCUB_ROCPRIM_BLOCK_BLOCK_SCAN_HPP_
/******************************************************************************
* Copyright (c) 2010-2011, Duane Merrill. All rights reserved.
* Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved.
* Modifications Copyright (c) 2017-2020, Advanced Micro Devices, Inc. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of the NVIDIA CORPORATION nor the
* names of its contributors may be used to endorse or promote products
* derived from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
* DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
******************************************************************************/
#ifndef HIPCUB_ROCPRIM_BLOCK_BLOCK_SHUFFLE_HPP_
#define HIPCUB_ROCPRIM_BLOCK_BLOCK_SHUFFLE_HPP_
#include <type_traits>
#include "../config.hpp"
#include "../thread/thread_operators.cuh"
#include <cub/rocprim/block/block_shuffle.hpp>
BEGIN_HIPCUB_NAMESPACE
template <
typename T,
int BLOCK_DIM_X,
int BLOCK_DIM_Y = 1,
int BLOCK_DIM_Z = 1,
int ARCH = HIPCUB_ARCH>
class BlockShuffle : public ::rocprim::block_shuffle<
T,
BLOCK_DIM_X,
BLOCK_DIM_Y,
BLOCK_DIM_Z>
{
static_assert(
BLOCK_DIM_X * BLOCK_DIM_Y * BLOCK_DIM_Z > 0,
"BLOCK_DIM_X * BLOCK_DIM_Y * BLOCK_DIM_Z must be greater than 0"
);
using base_type =
typename ::rocprim::block_shuffle<
T,
BLOCK_DIM_X,
BLOCK_DIM_Y,
BLOCK_DIM_Z
>;
// Reference to temporary storage (usually shared memory)
typename base_type::storage_type& temp_storage_;
public:
using TempStorage = typename base_type::storage_type;
HIPCUB_DEVICE inline
BlockShuffle() : temp_storage_(private_storage())
{}
HIPCUB_DEVICE inline
BlockShuffle(TempStorage &temp_storage) ///< [in] Reference to memory allocation having layout type TempStorage
: temp_storage_(temp_storage)
{}
/**
* \brief Each <em>thread<sub>i</sub></em> obtains the \p input provided by <em>thread</em><sub><em>i</em>+<tt>distance</tt></sub>. The offset \p distance may be negative.
*
* \par
* - \smemreuse
*/
HIPCUB_DEVICE inline void Offset(
T input, ///< [in] The input item from the calling thread (<em>thread<sub>i</sub></em>)
T& output, ///< [out] The \p input item from the successor (or predecessor) thread <em>thread</em><sub><em>i</em>+<tt>distance</tt></sub> (may be aliased to \p input). This value is only updated for for <em>thread<sub>i</sub></em> when 0 <= (<em>i</em> + \p distance) < <tt>BLOCK_THREADS-1</tt>
int distance = 1) ///< [in] Offset distance (may be negative)
{
base_type::offset(input,output,distance);
}
/**
* \brief Each <em>thread<sub>i</sub></em> obtains the \p input provided by <em>thread</em><sub><em>i</em>+<tt>distance</tt></sub>.
*
* \par
* - \smemreuse
*/
HIPCUB_DEVICE inline void Rotate(
T input, ///< [in] The calling thread's input item
T& output, ///< [out] The \p input item from thread <em>thread</em><sub>(<em>i</em>+<tt>distance></tt>)%<tt><BLOCK_THREADS></tt></sub> (may be aliased to \p input). This value is not updated for <em>thread</em><sub>BLOCK_THREADS-1</sub>
unsigned int distance = 1) ///< [in] Offset distance (0 < \p distance < <tt>BLOCK_THREADS</tt>)
{
base_type::rotate(input,output,distance);
}
/**
* \brief The thread block rotates its [<em>blocked arrangement</em>](index.html#sec5sec3) of \p input items, shifting it up by one item
*
* \par
* - \blocked
* - \granularity
* - \smemreuse
*/
template <int ITEMS_PER_THREAD>
HIPCUB_DEVICE inline void Up(
T (&input)[ITEMS_PER_THREAD], ///< [in] The calling thread's input items
T (&prev)[ITEMS_PER_THREAD]) ///< [out] The corresponding predecessor items (may be aliased to \p input). The item \p prev[0] is not updated for <em>thread</em><sub>0</sub>.
{
base_type::up(input,prev);
}
/**
* \brief The thread block rotates its [<em>blocked arrangement</em>](index.html#sec5sec3) of \p input items, shifting it up by one item. All threads receive the \p input provided by <em>thread</em><sub><tt>BLOCK_THREADS-1</tt></sub>.
*
* \par
* - \blocked
* - \granularity
* - \smemreuse
*/
template <int ITEMS_PER_THREAD>
HIPCUB_DEVICE inline void Up(
T (&input)[ITEMS_PER_THREAD], ///< [in] The calling thread's input items
T (&prev)[ITEMS_PER_THREAD], ///< [out] The corresponding predecessor items (may be aliased to \p input). The item \p prev[0] is not updated for <em>thread</em><sub>0</sub>.
T &block_suffix) ///< [out] The item \p input[ITEMS_PER_THREAD-1] from <em>thread</em><sub><tt>BLOCK_THREADS-1</tt></sub>, provided to all threads
{
// Thin forwarder to rocPRIM; block_suffix is produced by the base implementation.
base_type::up(input,prev,block_suffix);
}
/**
* \brief The thread block rotates its [<em>blocked arrangement</em>](index.html#sec5sec3) of \p input items, shifting it down by one item
*
* \par
* - \blocked
* - \granularity
* - \smemreuse
*/
template <int ITEMS_PER_THREAD>
HIPCUB_DEVICE inline void Down(
T (&input)[ITEMS_PER_THREAD], ///< [in] The calling thread's input items
T (&next)[ITEMS_PER_THREAD]) ///< [out] The corresponding successor items (may be aliased to \p input). The value \p next[0] is not updated for <em>thread</em><sub>BLOCK_THREADS-1</sub>.
{
// Thin forwarder to rocPRIM's blocked shift-down implementation.
base_type::down(input,next);
}
/**
* \brief The thread block rotates its [<em>blocked arrangement</em>](index.html#sec5sec3) of input items, shifting it down by one item. All threads receive \p input[0] provided by <em>thread</em><sub><tt>0</tt></sub>.
*
* \par
* - \blocked
* - \granularity
* - \smemreuse
*/
template <int ITEMS_PER_THREAD>
HIPCUB_DEVICE inline void Down(
T (&input)[ITEMS_PER_THREAD], ///< [in] The calling thread's input items
T (&next)[ITEMS_PER_THREAD], ///< [out] The corresponding successor items (may be aliased to \p input). The value \p next[0] is not updated for <em>thread</em><sub>BLOCK_THREADS-1</sub>.
T &block_prefix) ///< [out] The item \p input[0] from <em>thread</em><sub><tt>0</tt></sub>, provided to all threads
{
// Thin forwarder to rocPRIM; block_prefix is produced by the base implementation.
base_type::down(input,next,block_prefix);
}
private:
// Returns a reference to a function-scope __shared__ TempStorage instance,
// used when the caller did not pass temporary storage to the constructor.
// All threads of the block receive a reference to the same object.
HIPCUB_DEVICE inline
TempStorage& private_storage()
{
HIPCUB_SHARED_MEMORY TempStorage private_storage;
return private_storage;
}
};
END_HIPCUB_NAMESPACE
#endif // HIPCUB_ROCPRIM_BLOCK_BLOCK_SHUFFLE_HPP_
/******************************************************************************
* Copyright (c) 2010-2011, Duane Merrill. All rights reserved.
* Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved.
* Modifications Copyright (c) 2017-2020, Advanced Micro Devices, Inc. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of the NVIDIA CORPORATION nor the
* names of its contributors may be used to endorse or promote products
* derived from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
* DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
******************************************************************************/
#ifndef HIPCUB_ROCPRIM_BLOCK_BLOCK_STORE_HPP_
#define HIPCUB_ROCPRIM_BLOCK_BLOCK_STORE_HPP_
#include <type_traits>
#include "../config.hpp"
#include "block_store_func.hpp"
#include <cub/rocprim/block/block_store.hpp>
BEGIN_HIPCUB_NAMESPACE
namespace detail
{

/// Converts a rocPRIM ::rocprim::block_store_method enumerator to its
/// underlying integer type, so that it can seed the matching hipCUB
/// BlockStoreAlgorithm enumerator below.
inline constexpr
typename std::underlying_type<::rocprim::block_store_method>::type
to_BlockStoreAlgorithm_enum(::rocprim::block_store_method v)
{
    return static_cast<typename std::underlying_type<::rocprim::block_store_method>::type>(v);
}

}
// Block-store algorithm selector for BlockStore.  Each enumerator's value is
// taken directly from the corresponding rocPRIM block_store_method enumerator.
enum BlockStoreAlgorithm
{
BLOCK_STORE_DIRECT
= detail::to_BlockStoreAlgorithm_enum(::rocprim::block_store_method::block_store_direct),
BLOCK_STORE_STRIPED
= detail::to_BlockStoreAlgorithm_enum(::rocprim::block_store_method::block_store_striped),
BLOCK_STORE_VECTORIZE
= detail::to_BlockStoreAlgorithm_enum(::rocprim::block_store_method::block_store_vectorize),
BLOCK_STORE_TRANSPOSE
= detail::to_BlockStoreAlgorithm_enum(::rocprim::block_store_method::block_store_transpose),
BLOCK_STORE_WARP_TRANSPOSE
= detail::to_BlockStoreAlgorithm_enum(::rocprim::block_store_method::block_store_warp_transpose),
// NOTE: rocPRIM has no time-sliced warp-transpose method, so this enumerator
// deliberately aliases BLOCK_STORE_WARP_TRANSPOSE (same underlying value).
BLOCK_STORE_WARP_TRANSPOSE_TIMESLICED
= detail::to_BlockStoreAlgorithm_enum(::rocprim::block_store_method::block_store_warp_transpose)
};
// BlockStore: CUB-compatible facade over ::rocprim::block_store.
// Template parameters mirror CUB's BlockStore; ARCH is accepted for API
// compatibility but ignored (see the parameter comment below).
template<
typename T,
int BLOCK_DIM_X,
int ITEMS_PER_THREAD,
BlockStoreAlgorithm ALGORITHM = BLOCK_STORE_DIRECT,
int BLOCK_DIM_Y = 1,
int BLOCK_DIM_Z = 1,
int ARCH = HIPCUB_ARCH /* ignored */
>
class BlockStore
: private ::rocprim::block_store<
T,
BLOCK_DIM_X,
ITEMS_PER_THREAD,
static_cast<::rocprim::block_store_method>(ALGORITHM),
BLOCK_DIM_Y,
BLOCK_DIM_Z
>
{
static_assert(
BLOCK_DIM_X * BLOCK_DIM_Y * BLOCK_DIM_Z > 0,
"BLOCK_DIM_X * BLOCK_DIM_Y * BLOCK_DIM_Z must be greater than 0"
);
// BlockStoreAlgorithm values are defined from rocPRIM's enum (above), so the
// static_cast back to ::rocprim::block_store_method is value-preserving.
using base_type =
typename ::rocprim::block_store<
T,
BLOCK_DIM_X,
ITEMS_PER_THREAD,
static_cast<::rocprim::block_store_method>(ALGORITHM),
BLOCK_DIM_Y,
BLOCK_DIM_Z
>;
// Reference to temporary storage (usually shared memory)
typename base_type::storage_type& temp_storage_;
public:
// Temporary storage type required by the selected algorithm; place an
// instance in shared memory and pass it to the second constructor to
// share storage with other block-level primitives.
using TempStorage = typename base_type::storage_type;
// Default constructor: binds to a function-scope __shared__ TempStorage
// obtained from private_storage().
HIPCUB_DEVICE inline
BlockStore() : temp_storage_(private_storage())
{
}
// Constructor binding to caller-provided temporary storage.
HIPCUB_DEVICE inline
BlockStore(TempStorage& temp_storage) : temp_storage_(temp_storage)
{
}
// Stores the calling thread's items through block_iter (full tile).
template<class OutputIteratorT>
HIPCUB_DEVICE inline
void Store(OutputIteratorT block_iter,
T (&items)[ITEMS_PER_THREAD])
{
base_type::store(block_iter, items, temp_storage_);
}
// Guarded store: valid_items bounds how many items the block writes.
template<class OutputIteratorT>
HIPCUB_DEVICE inline
void Store(OutputIteratorT block_iter,
T (&items)[ITEMS_PER_THREAD],
int valid_items)
{
base_type::store(block_iter, items, valid_items, temp_storage_);
}
private:
// Fallback storage used by the default constructor: every thread of the
// block receives a reference to the same __shared__ instance.
HIPCUB_DEVICE inline
TempStorage& private_storage()
{
HIPCUB_SHARED_MEMORY TempStorage private_storage;
return private_storage;
}
};
END_HIPCUB_NAMESPACE
#endif // HIPCUB_ROCPRIM_BLOCK_BLOCK_STORE_HPP_
/******************************************************************************
* Copyright (c) 2010-2011, Duane Merrill. All rights reserved.
* Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved.
* Modifications Copyright (c) 2017-2020, Advanced Micro Devices, Inc. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of the NVIDIA CORPORATION nor the
* names of its contributors may be used to endorse or promote products
* derived from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
* DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
******************************************************************************/
#ifndef HIPCUB_ROCPRIM_BLOCK_BLOCK_STORE_FUNC_HPP_
#define HIPCUB_ROCPRIM_BLOCK_BLOCK_STORE_FUNC_HPP_
#include "../config.hpp"
#include <cub/rocprim/block/block_store_func.hpp>
BEGIN_HIPCUB_NAMESPACE
/// Stores the calling thread's blocked arrangement of items through
/// block_iter; delegates to rocPRIM's block_store_direct_blocked.
template<
    typename T,
    int ITEMS_PER_THREAD,
    typename OutputIteratorT
>
HIPCUB_DEVICE inline
void StoreDirectBlocked(int linear_id,
                        OutputIteratorT block_iter,
                        T (&items)[ITEMS_PER_THREAD])
{
    ::rocprim::block_store_direct_blocked(linear_id, block_iter, items);
}
/// Guarded variant: forwards the valid_items bound to rocPRIM's
/// block_store_direct_blocked so partial tiles can be written safely.
template<
    typename T,
    int ITEMS_PER_THREAD,
    typename OutputIteratorT
>
HIPCUB_DEVICE inline
void StoreDirectBlocked(int linear_id,
                        OutputIteratorT block_iter,
                        T (&items)[ITEMS_PER_THREAD],
                        int valid_items)
{
    ::rocprim::block_store_direct_blocked(linear_id, block_iter, items, valid_items);
}
/// Vectorized blocked store; note the destination must be a raw T* (not a
/// generic iterator). Delegates to rocPRIM's
/// block_store_direct_blocked_vectorized.
template <
    typename T,
    int ITEMS_PER_THREAD
>
HIPCUB_DEVICE inline
void StoreDirectBlockedVectorized(int linear_id,
                                  T* block_iter,
                                  T (&items)[ITEMS_PER_THREAD])
{
    ::rocprim::block_store_direct_blocked_vectorized(linear_id, block_iter, items);
}
/// Stores a striped arrangement (stride BLOCK_THREADS between a thread's
/// consecutive items); delegates to rocPRIM's block_store_direct_striped.
template<
    int BLOCK_THREADS,
    typename T,
    int ITEMS_PER_THREAD,
    typename OutputIteratorT
>
HIPCUB_DEVICE inline
void StoreDirectStriped(int linear_id,
                        OutputIteratorT block_iter,
                        T (&items)[ITEMS_PER_THREAD])
{
    ::rocprim::block_store_direct_striped<BLOCK_THREADS>(linear_id, block_iter, items);
}
/// Guarded striped store: forwards the valid_items bound to rocPRIM's
/// block_store_direct_striped for partial tiles.
template<
    int BLOCK_THREADS,
    typename T,
    int ITEMS_PER_THREAD,
    typename OutputIteratorT
>
HIPCUB_DEVICE inline
void StoreDirectStriped(int linear_id,
                        OutputIteratorT block_iter,
                        T (&items)[ITEMS_PER_THREAD],
                        int valid_items)
{
    ::rocprim::block_store_direct_striped<BLOCK_THREADS>(linear_id, block_iter, items, valid_items);
}
/// Stores a warp-striped arrangement; delegates to rocPRIM's
/// block_store_direct_warp_striped.
template<
    typename T,
    int ITEMS_PER_THREAD,
    typename OutputIteratorT
>
HIPCUB_DEVICE inline
void StoreDirectWarpStriped(int linear_id,
                            OutputIteratorT block_iter,
                            T (&items)[ITEMS_PER_THREAD])
{
    ::rocprim::block_store_direct_warp_striped(linear_id, block_iter, items);
}
/// Guarded warp-striped store: forwards the valid_items bound to rocPRIM's
/// block_store_direct_warp_striped for partial tiles.
template<
    typename T,
    int ITEMS_PER_THREAD,
    typename OutputIteratorT
>
HIPCUB_DEVICE inline
void StoreDirectWarpStriped(int linear_id,
                            OutputIteratorT block_iter,
                            T (&items)[ITEMS_PER_THREAD],
                            int valid_items)
{
    ::rocprim::block_store_direct_warp_striped(linear_id, block_iter, items, valid_items);
}
END_HIPCUB_NAMESPACE
#endif // HIPCUB_ROCPRIM_BLOCK_BLOCK_STORE_FUNC_HPP_
/******************************************************************************
* Copyright (c) 2011-2020, NVIDIA CORPORATION. All rights reserved.
* Modifications Copyright (c) 2021, Advanced Micro Devices, Inc. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of the NVIDIA CORPORATION nor the
* names of its contributors may be used to endorse or promote products
* derived from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
* DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
******************************************************************************/
/**
* \file
* radix_rank_sort_operations.cuh contains common abstractions, definitions and
* operations used for radix sorting and ranking.
*/
#ifndef HIPCUB_ROCPRIM_BLOCK_RADIX_RANK_SORT_OPERATIONS_HPP_
#define HIPCUB_ROCPRIM_BLOCK_RADIX_RANK_SORT_OPERATIONS_HPP_
#include <type_traits>
#include "../config.hpp"
#include <cub/rocprim/config.hpp>
#include <cub/rocprim/type_traits.hpp>
#include <cub/rocprim/detail/various.hpp>
BEGIN_HIPCUB_NAMESPACE
/** \brief Twiddling keys for radix sort. */
template <bool IS_DESCENDING, typename KeyT>
struct RadixSortTwiddle
{
typedef Traits<KeyT> TraitsT;
typedef typename TraitsT::UnsignedBits UnsignedBits;
// Maps a raw key into twiddled (radix-sortable) form: applies the trait's
// TwiddleIn, then bit-complements when sorting in descending order.
static HIPCUB_HOST_DEVICE __forceinline__ UnsignedBits In(UnsignedBits key)
{
key = TraitsT::TwiddleIn(key);
if (IS_DESCENDING) key = ~key;
return key;
}
// Inverse of In(): undoes the complement first, then applies TwiddleOut —
// the reverse order of operations, so Out(In(k)) == k.
static HIPCUB_HOST_DEVICE __forceinline__ UnsignedBits Out(UnsignedBits key)
{
if (IS_DESCENDING) key = ~key;
key = TraitsT::TwiddleOut(key);
return key;
}
// The twiddled-out form of the all-ones bit pattern; used as a filler key
// for out-of-bounds elements.
static HIPCUB_HOST_DEVICE __forceinline__ UnsignedBits DefaultKey()
{
return Out(~UnsignedBits(0));
}
};
/** \brief Base struct for digit extractor. Contains common code to provide
special handling for floating-point -0.0.
\note This handles correctly both the case when the keys are
bitwise-complemented after twiddling for descending sort (in onesweep) as
well as when the keys are not bit-negated, but the implementation handles
descending sort separately (in other implementations in CUB). Twiddling
alone maps -0.0f to 0x7fffffff and +0.0f to 0x80000000 for float, which are
subsequent bit patterns and bitwise complements of each other. For onesweep,
both -0.0f and +0.0f are mapped to the bit pattern of +0.0f (0x80000000) for
ascending sort, and to the pattern of -0.0f (0x7fffffff) for descending
sort. For all other sorting implementations in CUB, both are always mapped
to +0.0f. Since bit patterns for both -0.0f and +0.0f are next to each other
and only one of them is used, the sorting works correctly. For double, the
same applies, but with 64-bit patterns.
*/
template <typename KeyT>
struct BaseDigitExtractor
{
typedef Traits<KeyT> TraitsT;
typedef typename TraitsT::UnsignedBits UnsignedBits;
enum
{
// Nonzero iff KeyT is a floating-point key, which needs -0.0 handling.
FLOAT_KEY = TraitsT::CATEGORY == FLOATING_POINT,
};
// Canonicalizes the twiddled bit pattern of -0.0 to that of +0.0 so both
// zeros rank identically; pass-through for non-floating-point keys. See the
// explanatory comment above this struct for the full rationale.
static __device__ __forceinline__ UnsignedBits ProcessFloatMinusZero(UnsignedBits key)
{
if (!FLOAT_KEY) {
return key;
} else {
// Twiddled bit pattern of -0.0 (sign bit only set before twiddling).
UnsignedBits TWIDDLED_MINUS_ZERO_BITS =
TraitsT::TwiddleIn(UnsignedBits(1) << UnsignedBits(8 * sizeof(UnsignedBits) - 1));
UnsignedBits TWIDDLED_ZERO_BITS = TraitsT::TwiddleIn(0);
return key == TWIDDLED_MINUS_ZERO_BITS ? TWIDDLED_ZERO_BITS : key;
}
}
};
/** \brief A wrapper type to extract digits. Uses the BFE intrinsic to extract a
 * key from a digit. */
template <typename KeyT>
struct BFEDigitExtractor : BaseDigitExtractor<KeyT>
{
using typename BaseDigitExtractor<KeyT>::UnsignedBits;
// Least-significant bit position of the digit, and its width in bits.
uint32_t bit_start, num_bits;
explicit __device__ __forceinline__ BFEDigitExtractor(
uint32_t bit_start = 0, uint32_t num_bits = 0)
: bit_start(bit_start), num_bits(num_bits)
{ }
// Extracts the digit via the bit-field-extract helper, after canonicalizing
// the -0.0 bit pattern for floating-point keys.
__device__ __forceinline__ uint32_t Digit(UnsignedBits key)
{
return BFE(this->ProcessFloatMinusZero(key), bit_start, num_bits);
}
};
/** \brief A wrapper type to extract digits. Uses a combination of shift and
 * bitwise and to extract digits. */
template <typename KeyT>
struct ShiftDigitExtractor : BaseDigitExtractor<KeyT>
{
using typename BaseDigitExtractor<KeyT>::UnsignedBits;
// Least-significant bit position of the digit, and the precomputed mask
// selecting num_bits bits.
uint32_t bit_start, mask;
/// \param bit_start least-significant bit position of the digit
/// \param num_bits  digit width in bits (expected to be <= 32)
explicit __device__ __forceinline__ ShiftDigitExtractor(
uint32_t bit_start = 0, uint32_t num_bits = 0)
// Widen to 64 bits before shifting: the previous `(1 << num_bits) - 1`
// was undefined behavior for num_bits == 31 (signed overflow) and
// num_bits == 32 (shift by the full width of int). Results for valid
// smaller widths are unchanged.
: bit_start(bit_start), mask(static_cast<uint32_t>((uint64_t{1} << num_bits) - 1))
{ }
// Extracts the digit by shift-and-mask, after canonicalizing the -0.0 bit
// pattern for floating-point keys.
__device__ __forceinline__ uint32_t Digit(UnsignedBits key)
{
return uint32_t(this->ProcessFloatMinusZero(key) >> UnsignedBits(bit_start)) & mask;
}
};
END_HIPCUB_NAMESPACE
#endif //HIPCUB_ROCPRIM_BLOCK_RADIX_RANK_SORT_OPERATIONS_HPP_
/******************************************************************************
* Copyright (c) 2010-2011, Duane Merrill. All rights reserved.
* Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved.
* Modifications Copyright (c) 2019-2020, Advanced Micro Devices, Inc. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of the NVIDIA CORPORATION nor the
* names of its contributors may be used to endorse or promote products
* derived from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
* DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
******************************************************************************/
#ifndef HIPCUB_CONFIG_HPP_
#define HIPCUB_CONFIG_HPP_
#include <cuda_runtime.h>
#define HIPCUB_NAMESPACE cub
#define BEGIN_HIPCUB_NAMESPACE \
namespace cub {
#define END_HIPCUB_NAMESPACE \
} /* hipcub */
#ifndef HIPCUB_ARCH
#define HIPCUB_ARCH 1
#endif
#define CUB_DEVICE_WARP_THREADS 64
#ifdef __CUDACC__
#define HIPCUB_ROCPRIM_API 1
#define HIPCUB_RUNTIME_FUNCTION __host__
#elif defined(__HIP_PLATFORM_NVIDIA__)
#define HIPCUB_CUB_API 1
#define HIPCUB_RUNTIME_FUNCTION CUB_RUNTIME_FUNCTION
#include <cub/util_arch.cuh>
#define HIPCUB_WARP_THREADS CUB_PTX_WARP_THREADS
#define HIPCUB_DEVICE_WARP_THREADS CUB_PTX_WARP_THREADS
#define HIPCUB_HOST_WARP_THREADS CUB_PTX_WARP_THREADS
#define HIPCUB_ARCH CUB_PTX_ARCH
BEGIN_HIPCUB_NAMESPACE
using namespace cub;
END_HIPCUB_NAMESPACE
#endif
/// Supported warp sizes
#define HIPCUB_WARP_SIZE_32 32u
#define HIPCUB_WARP_SIZE_64 64u
#define HIPCUB_MAX_WARP_SIZE HIPCUB_WARP_SIZE_64
#define HIPCUB_HOST __host__
#define HIPCUB_DEVICE __device__
#define HIPCUB_HOST_DEVICE __host__ __device__
#define HIPCUB_SHARED_MEMORY __shared__
// Helper macros to disable warnings in clang
#ifdef __clang__
#define HIPCUB_PRAGMA_TO_STR(x) _Pragma(#x)
#define HIPCUB_CLANG_SUPPRESS_WARNING_PUSH _Pragma("clang diagnostic push")
#define HIPCUB_CLANG_SUPPRESS_WARNING(w) HIPCUB_PRAGMA_TO_STR(clang diagnostic ignored w)
#define HIPCUB_CLANG_SUPPRESS_WARNING_POP _Pragma("clang diagnostic pop")
#define HIPCUB_CLANG_SUPPRESS_WARNING_WITH_PUSH(w) \
HIPCUB_CLANG_SUPPRESS_WARNING_PUSH HIPCUB_CLANG_SUPPRESS_WARNING(w)
#else // __clang__
#define HIPCUB_CLANG_SUPPRESS_WARNING_PUSH
#define HIPCUB_CLANG_SUPPRESS_WARNING(w)
#define HIPCUB_CLANG_SUPPRESS_WARNING_POP
#define HIPCUB_CLANG_SUPPRESS_WARNING_WITH_PUSH(w)
#endif // __clang__
BEGIN_HIPCUB_NAMESPACE
/// hipCUB error reporting macro (prints error messages to stderr)
#if (defined(DEBUG) || defined(_DEBUG)) && !defined(HIPCUB_STDERR)
#define HIPCUB_STDERR
#endif
/// Reports a CUDA error to stderr (only when HIPCUB_STDERR is defined) and
/// returns it unchanged, so the macro below can wrap any call expression.
/// \param error    the error code to inspect and pass through
/// \param filename source file of the failing call (from __FILE__)
/// \param line     source line of the failing call (from __LINE__)
inline
cudaError_t Debug(
    cudaError_t error,
    const char* filename,
    int line)
{
#ifdef HIPCUB_STDERR
    if (error)
    {
        fprintf(stderr, "cuda error %d [%s, %d]: %s\n", error, filename, line, cudaGetErrorString(error));
        fflush(stderr);
    }
#else
    // Silence unused-parameter warnings in the non-reporting build.
    (void)filename;
    (void)line;
#endif
    return error;
}
#ifndef cubDebug
#define cubDebug(e) cub::Debug((cudaError_t) (e), __FILE__, __LINE__)
#endif
END_HIPCUB_NAMESPACE
#endif // HIPCUB_CONFIG_HPP_
/******************************************************************************
* Copyright (c) 2010-2011, Duane Merrill. All rights reserved.
* Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved.
* Modifications Copyright (c) 2017-2020, Advanced Micro Devices, Inc. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of the NVIDIA CORPORATION nor the
* names of its contributors may be used to endorse or promote products
* derived from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
* DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
******************************************************************************/
#ifndef HIPCUB_ROCPRIM_HIPCUB_HPP_
#define HIPCUB_ROCPRIM_HIPCUB_HPP_
#include "config.hpp"
#include "version.cuh"
#include "util_allocator.cuh"
#include "util_type.cuh"
#include "util_ptx.cuh"
#include "thread/thread_operators.cuh"
// Iterator
#include "iterator/arg_index_input_iterator.cuh"
#include "iterator/cache_modified_input_iterator.cuh"
#include "iterator/cache_modified_output_iterator.cuh"
#include "iterator/constant_input_iterator.cuh"
#include "iterator/counting_input_iterator.cuh"
#include "iterator/discard_output_iterator.cuh"
#include "iterator/tex_obj_input_iterator.cuh"
#include "iterator/tex_ref_input_iterator.cuh"
#include "iterator/transform_input_iterator.cuh"
// Warp
#include "warp/warp_exchange.hpp"
#include "warp/warp_load.hpp"
#include "warp/warp_merge_sort.hpp"
#include "warp/warp_reduce.cuh"
#include "warp/warp_scan.cuh"
#include "warp/warp_store.hpp"
// Thread
#include "thread/thread_load.cuh"
#include "thread/thread_operators.cuh"
#include "thread/thread_reduce.cuh"
#include "thread/thread_scan.cuh"
#include "thread/thread_search.cuh"
#include "thread/thread_sort.hpp"
#include "thread/thread_store.cuh"
// Block
#include "block/block_discontinuity.cuh"
#include "block/block_exchange.cuh"
#include "block/block_histogram.cuh"
#include "block/block_load.cuh"
#include "block/block_radix_sort.cuh"
#include "block/block_reduce.cuh"
#include "block/block_scan.cuh"
#include "block/block_store.cuh"
// Device
#include "device/device_adjacent_difference.hpp"
#include "device/device_histogram.cuh"
#include "device/device_radix_sort.cuh"
#include "device/device_reduce.cuh"
#include "device/device_run_length_encode.cuh"
#include "device/device_scan.cuh"
#include "device/device_segmented_radix_sort.cuh"
#include "device/device_segmented_reduce.cuh"
#include "device/device_segmented_sort.hpp"
#include "device/device_select.cuh"
#include "device/device_partition.cuh"
#endif // HIPCUB_ROCPRIM_HIPCUB_HPP_
/******************************************************************************
* Copyright (c) 2011-2021, NVIDIA CORPORATION. All rights reserved.
* Modifications Copyright (c) 2022, Advanced Micro Devices, Inc. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of the NVIDIA CORPORATION nor the
* names of its contributors may be used to endorse or promote products
* derived from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
* DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
******************************************************************************/
#ifndef HIPCUB_ROCPRIM_DEVICE_DEVICE_ADJACENT_DIFFERENCE_HPP_
#define HIPCUB_ROCPRIM_DEVICE_DEVICE_ADJACENT_DIFFERENCE_HPP_
#include "../config.hpp"
#include <cub/thread/thread_operators.cuh>
#include <cub/rocprim/device/device_adjacent_difference.hpp>
BEGIN_HIPCUB_NAMESPACE
// CUB-compatible device-wide adjacent-difference interface, implemented by
// forwarding to the corresponding rocPRIM algorithms. All methods follow the
// usual two-phase CUB protocol: call once with d_temp_storage == nullptr to
// query temp_storage_bytes, allocate, then call again to run.
struct DeviceAdjacentDifference
{
// out[i] = difference_op(in[i], in[i-1]); forwards to
// ::rocprim::adjacent_difference (out-of-place).
template <typename InputIteratorT,
typename OutputIteratorT,
typename DifferenceOpT = ::cub::Difference>
static HIPCUB_RUNTIME_FUNCTION cudaError_t
SubtractLeftCopy(void *d_temp_storage,
std::size_t &temp_storage_bytes,
InputIteratorT d_input,
OutputIteratorT d_output,
std::size_t num_items,
DifferenceOpT difference_op = {},
cudaStream_t stream = 0,
bool debug_synchronous = false)
{
return (cudaError_t)::rocprim::adjacent_difference(
d_temp_storage, temp_storage_bytes, d_input, d_output,
num_items, difference_op, stream, debug_synchronous
);
}
// In-place variant of SubtractLeftCopy; forwards to
// ::rocprim::adjacent_difference_inplace.
template <typename RandomAccessIteratorT,
typename DifferenceOpT = ::cub::Difference>
static HIPCUB_RUNTIME_FUNCTION cudaError_t
SubtractLeft(void *d_temp_storage,
std::size_t &temp_storage_bytes,
RandomAccessIteratorT d_input,
std::size_t num_items,
DifferenceOpT difference_op = {},
cudaStream_t stream = 0,
bool debug_synchronous = false)
{
return (cudaError_t)::rocprim::adjacent_difference_inplace(
d_temp_storage, temp_storage_bytes, d_input,
num_items, difference_op, stream, debug_synchronous
);
}
// out[i] = difference_op(in[i], in[i+1]); forwards to
// ::rocprim::adjacent_difference_right (out-of-place).
template <typename InputIteratorT,
typename OutputIteratorT,
typename DifferenceOpT = ::cub::Difference>
static HIPCUB_RUNTIME_FUNCTION cudaError_t
SubtractRightCopy(void *d_temp_storage,
std::size_t &temp_storage_bytes,
InputIteratorT d_input,
OutputIteratorT d_output,
std::size_t num_items,
DifferenceOpT difference_op = {},
cudaStream_t stream = 0,
bool debug_synchronous = false)
{
return (cudaError_t)::rocprim::adjacent_difference_right(
d_temp_storage, temp_storage_bytes, d_input, d_output,
num_items, difference_op, stream, debug_synchronous
);
}
// In-place variant of SubtractRightCopy; forwards to
// ::rocprim::adjacent_difference_right_inplace.
template <typename RandomAccessIteratorT,
typename DifferenceOpT = ::cub::Difference>
static HIPCUB_RUNTIME_FUNCTION cudaError_t
SubtractRight(void *d_temp_storage,
std::size_t &temp_storage_bytes,
RandomAccessIteratorT d_input,
std::size_t num_items,
DifferenceOpT difference_op = {},
cudaStream_t stream = 0,
bool debug_synchronous = false)
{
return (cudaError_t)::rocprim::adjacent_difference_right_inplace(
d_temp_storage, temp_storage_bytes, d_input,
num_items, difference_op, stream, debug_synchronous
);
}
};
END_HIPCUB_NAMESPACE
#endif // HIPCUB_ROCPRIM_DEVICE_DEVICE_ADJACENT_DIFFERENCE_HPP_
/******************************************************************************
* Copyright (c) 2010-2011, Duane Merrill. All rights reserved.
* Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved.
* Modifications Copyright (c) 2017-2020, Advanced Micro Devices, Inc. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of the NVIDIA CORPORATION nor the
* names of its contributors may be used to endorse or promote products
* derived from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
* DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
******************************************************************************/
#ifndef HIPCUB_ROCPRIM_DEVICE_DEVICE_HISTOGRAM_HPP_
#define HIPCUB_ROCPRIM_DEVICE_DEVICE_HISTOGRAM_HPP_
#include "../config.hpp"
#include "../util_type.cuh"
#include <cub/rocprim/device/device_histogram.hpp>
BEGIN_HIPCUB_NAMESPACE
struct DeviceHistogram
{
// Single-channel, evenly-segmented histogram over num_samples samples;
// forwards to ::rocprim::histogram_even. Note the argument-order difference:
// rocPRIM takes (samples, count) before the histogram pointer.
template<
typename SampleIteratorT,
typename CounterT,
typename LevelT,
typename OffsetT
>
HIPCUB_RUNTIME_FUNCTION static
cudaError_t HistogramEven(void * d_temp_storage,
size_t& temp_storage_bytes,
SampleIteratorT d_samples,
CounterT * d_histogram,
int num_levels,
LevelT lower_level,
LevelT upper_level,
OffsetT num_samples,
cudaStream_t stream = 0,
bool debug_synchronous = false)
{
return (cudaError_t)::rocprim::histogram_even(
d_temp_storage, temp_storage_bytes,
d_samples, num_samples,
d_histogram,
num_levels, lower_level, upper_level,
stream, debug_synchronous
);
}
// 2D (row-major, possibly padded) variant: histograms num_rows rows of
// num_row_samples samples each, with row_stride_bytes between row starts;
// forwards to ::rocprim::histogram_even.
template<
typename SampleIteratorT,
typename CounterT,
typename LevelT,
typename OffsetT
>
HIPCUB_RUNTIME_FUNCTION static
cudaError_t HistogramEven(void * d_temp_storage,
size_t& temp_storage_bytes,
SampleIteratorT d_samples,
CounterT * d_histogram,
int num_levels,
LevelT lower_level,
LevelT upper_level,
OffsetT num_row_samples,
OffsetT num_rows,
size_t row_stride_bytes,
cudaStream_t stream = 0,
bool debug_synchronous = false)
{
return (cudaError_t)::rocprim::histogram_even(
d_temp_storage, temp_storage_bytes,
d_samples, num_row_samples, num_rows, row_stride_bytes,
d_histogram,
num_levels, lower_level, upper_level,
stream, debug_synchronous
);
}
template<
int NUM_CHANNELS,
int NUM_ACTIVE_CHANNELS,
typename SampleIteratorT,
typename CounterT,
typename LevelT,
typename OffsetT
>
HIPCUB_RUNTIME_FUNCTION static
cudaError_t MultiHistogramEven(void * d_temp_storage,
size_t& temp_storage_bytes,
SampleIteratorT d_samples,
CounterT * d_histogram[NUM_ACTIVE_CHANNELS],
int num_levels[NUM_ACTIVE_CHANNELS],
LevelT lower_level[NUM_ACTIVE_CHANNELS],
LevelT upper_level[NUM_ACTIVE_CHANNELS],
OffsetT num_pixels,
cudaStream_t stream = 0,
bool debug_synchronous = false)
{
unsigned int levels[NUM_ACTIVE_CHANNELS];
for(unsigned int channel = 0; channel < NUM_ACTIVE_CHANNELS; channel++)
{
levels[channel] = num_levels[channel];
}
return (cudaError_t)::rocprim::multi_histogram_even<NUM_CHANNELS, NUM_ACTIVE_CHANNELS>(
d_temp_storage, temp_storage_bytes,
d_samples, num_pixels,
d_histogram,
levels, lower_level, upper_level,
stream, debug_synchronous
);
}
template<
int NUM_CHANNELS,
int NUM_ACTIVE_CHANNELS,
typename SampleIteratorT,
typename CounterT,
typename LevelT,
typename OffsetT
>
HIPCUB_RUNTIME_FUNCTION static
cudaError_t MultiHistogramEven(void * d_temp_storage,
size_t& temp_storage_bytes,
SampleIteratorT d_samples,
CounterT * d_histogram[NUM_ACTIVE_CHANNELS],
int num_levels[NUM_ACTIVE_CHANNELS],
LevelT lower_level[NUM_ACTIVE_CHANNELS],
LevelT upper_level[NUM_ACTIVE_CHANNELS],
OffsetT num_row_pixels,
OffsetT num_rows,
size_t row_stride_bytes,
cudaStream_t stream = 0,
bool debug_synchronous = false)
{
unsigned int levels[NUM_ACTIVE_CHANNELS];
for(unsigned int channel = 0; channel < NUM_ACTIVE_CHANNELS; channel++)
{
levels[channel] = num_levels[channel];
}
return (cudaError_t)::rocprim::multi_histogram_even<NUM_CHANNELS, NUM_ACTIVE_CHANNELS>(
d_temp_storage, temp_storage_bytes,
d_samples, num_row_pixels, num_rows, row_stride_bytes,
d_histogram,
levels, lower_level, upper_level,
stream, debug_synchronous
);
}
template<
typename SampleIteratorT,
typename CounterT,
typename LevelT,
typename OffsetT
>
HIPCUB_RUNTIME_FUNCTION static
cudaError_t HistogramRange(void * d_temp_storage,
size_t& temp_storage_bytes,
SampleIteratorT d_samples,
CounterT * d_histogram,
int num_levels,
LevelT * d_levels,
OffsetT num_samples,
cudaStream_t stream = 0,
bool debug_synchronous = false)
{
return (cudaError_t)::rocprim::histogram_range(
d_temp_storage, temp_storage_bytes,
d_samples, num_samples,
d_histogram,
num_levels, d_levels,
stream, debug_synchronous
);
}
template<
typename SampleIteratorT,
typename CounterT,
typename LevelT,
typename OffsetT
>
HIPCUB_RUNTIME_FUNCTION static
cudaError_t HistogramRange(void * d_temp_storage,
size_t& temp_storage_bytes,
SampleIteratorT d_samples,
CounterT * d_histogram,
int num_levels,
LevelT * d_levels,
OffsetT num_row_samples,
OffsetT num_rows,
size_t row_stride_bytes,
cudaStream_t stream = 0,
bool debug_synchronous = false)
{
return (cudaError_t)::rocprim::histogram_range(
d_temp_storage, temp_storage_bytes,
d_samples, num_row_samples, num_rows, row_stride_bytes,
d_histogram,
num_levels, d_levels,
stream, debug_synchronous
);
}
template<
int NUM_CHANNELS,
int NUM_ACTIVE_CHANNELS,
typename SampleIteratorT,
typename CounterT,
typename LevelT,
typename OffsetT
>
HIPCUB_RUNTIME_FUNCTION static
cudaError_t MultiHistogramRange(void * d_temp_storage,
size_t& temp_storage_bytes,
SampleIteratorT d_samples,
CounterT * d_histogram[NUM_ACTIVE_CHANNELS],
int num_levels[NUM_ACTIVE_CHANNELS],
LevelT * d_levels[NUM_ACTIVE_CHANNELS],
OffsetT num_pixels,
cudaStream_t stream = 0,
bool debug_synchronous = false)
{
unsigned int levels[NUM_ACTIVE_CHANNELS];
for(unsigned int channel = 0; channel < NUM_ACTIVE_CHANNELS; channel++)
{
levels[channel] = num_levels[channel];
}
return (cudaError_t)::rocprim::multi_histogram_range<NUM_CHANNELS, NUM_ACTIVE_CHANNELS>(
d_temp_storage, temp_storage_bytes,
d_samples, num_pixels,
d_histogram,
levels, d_levels,
stream, debug_synchronous
);
}
template<
int NUM_CHANNELS,
int NUM_ACTIVE_CHANNELS,
typename SampleIteratorT,
typename CounterT,
typename LevelT,
typename OffsetT
>
HIPCUB_RUNTIME_FUNCTION static
cudaError_t MultiHistogramRange(void * d_temp_storage,
size_t& temp_storage_bytes,
SampleIteratorT d_samples,
CounterT * d_histogram[NUM_ACTIVE_CHANNELS],
int num_levels[NUM_ACTIVE_CHANNELS],
LevelT * d_levels[NUM_ACTIVE_CHANNELS],
OffsetT num_row_pixels,
OffsetT num_rows,
size_t row_stride_bytes,
cudaStream_t stream = 0,
bool debug_synchronous = false)
{
unsigned int levels[NUM_ACTIVE_CHANNELS];
for(unsigned int channel = 0; channel < NUM_ACTIVE_CHANNELS; channel++)
{
levels[channel] = num_levels[channel];
}
return (cudaError_t)::rocprim::multi_histogram_range<NUM_CHANNELS, NUM_ACTIVE_CHANNELS>(
d_temp_storage, temp_storage_bytes,
d_samples, num_row_pixels, num_rows, row_stride_bytes,
d_histogram,
levels, d_levels,
stream, debug_synchronous
);
}
};
END_HIPCUB_NAMESPACE
#endif // HIPCUB_ROCPRIM_DEVICE_DEVICE_HISTOGRAM_HPP_
/******************************************************************************
* Copyright (c) 2010-2011, Duane Merrill. All rights reserved.
* Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved.
* Modifications Copyright (c) 2017-2021, Advanced Micro Devices, Inc. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of the NVIDIA CORPORATION nor the
* names of its contributors may be used to endorse or promote products
* derived from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
* DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
******************************************************************************/
#ifndef HIPCUB_ROCPRIM_DEVICE_DEVICE_MERGE_SORT_HPP_
#define HIPCUB_ROCPRIM_DEVICE_DEVICE_MERGE_SORT_HPP_
#include "../config.hpp"
#include "../util_type.cuh"
#include <cub/rocprim/device/device_merge_sort.hpp>
BEGIN_HIPCUB_NAMESPACE
/// \brief CUB-compatible device-wide merge sort, forwarding to
/// ::rocprim::merge_sort with custom comparison operators.
///
/// Standard two-phase protocol: when \p d_temp_storage is NULL the required
/// size is written to \p temp_storage_bytes and no sorting is performed.
/// In-place variants pass the same iterator as both input and output.
struct DeviceMergeSort
{
    /// \brief Sorts keys with associated values in place using \p compare_op.
    template<typename KeyIteratorT, typename ValueIteratorT, typename OffsetT, typename CompareOpT>
    HIPCUB_RUNTIME_FUNCTION static cudaError_t SortPairs(void * d_temp_storage,
                                                         std::size_t & temp_storage_bytes,
                                                         KeyIteratorT d_keys,
                                                         ValueIteratorT d_items,
                                                         OffsetT num_items,
                                                         CompareOpT compare_op,
                                                         cudaStream_t stream = 0,
                                                         bool debug_synchronous = false)
    {
        // In-place: the same iterators serve as both input and output.
        return (cudaError_t)::rocprim::merge_sort(d_temp_storage,
                                                  temp_storage_bytes,
                                                  d_keys,
                                                  d_keys,
                                                  d_items,
                                                  d_items,
                                                  num_items,
                                                  compare_op,
                                                  stream,
                                                  debug_synchronous);
    }
    /// \brief Sorts keys with associated values out of place: inputs are read
    /// from the *_input iterators and the sorted result is written to the
    /// *_output iterators.
    template<typename KeyInputIteratorT,
             typename ValueInputIteratorT,
             typename KeyIteratorT,
             typename ValueIteratorT,
             typename OffsetT,
             typename CompareOpT>
    HIPCUB_RUNTIME_FUNCTION static cudaError_t SortPairsCopy(void * d_temp_storage,
                                                             std::size_t & temp_storage_bytes,
                                                             KeyInputIteratorT d_input_keys,
                                                             ValueInputIteratorT d_input_items,
                                                             KeyIteratorT d_output_keys,
                                                             ValueIteratorT d_output_items,
                                                             OffsetT num_items,
                                                             CompareOpT compare_op,
                                                             cudaStream_t stream = 0,
                                                             bool debug_synchronous = false)
    {
        return (cudaError_t)::rocprim::merge_sort(d_temp_storage,
                                                  temp_storage_bytes,
                                                  d_input_keys,
                                                  d_output_keys,
                                                  d_input_items,
                                                  d_output_items,
                                                  num_items,
                                                  compare_op,
                                                  stream,
                                                  debug_synchronous);
    }
    /// \brief Sorts keys in place (no associated values).
    template<typename KeyIteratorT, typename OffsetT, typename CompareOpT>
    HIPCUB_RUNTIME_FUNCTION static cudaError_t SortKeys(void * d_temp_storage,
                                                        std::size_t & temp_storage_bytes,
                                                        KeyIteratorT d_keys,
                                                        OffsetT num_items,
                                                        CompareOpT compare_op,
                                                        cudaStream_t stream = 0,
                                                        bool debug_synchronous = false)
    {
        return (cudaError_t)::rocprim::merge_sort(
            d_temp_storage, temp_storage_bytes,
            d_keys, d_keys, num_items,
            compare_op, stream, debug_synchronous
        );
    }
    /// \brief Sorts keys out of place (no associated values).
    template<typename KeyInputIteratorT,
             typename KeyIteratorT,
             typename OffsetT,
             typename CompareOpT>
    HIPCUB_RUNTIME_FUNCTION static cudaError_t SortKeysCopy(void * d_temp_storage,
                                                            std::size_t & temp_storage_bytes,
                                                            KeyInputIteratorT d_input_keys,
                                                            KeyIteratorT d_output_keys,
                                                            OffsetT num_items,
                                                            CompareOpT compare_op,
                                                            cudaStream_t stream = 0,
                                                            bool debug_synchronous = false)
    {
        return (cudaError_t)::rocprim::merge_sort(
            d_temp_storage, temp_storage_bytes,
            d_input_keys, d_output_keys, num_items,
            compare_op, stream, debug_synchronous
        );
    }
    /// \brief Stable in-place key/value sort.
    /// NOTE(review): this forwards exactly like SortPairs, so stability is
    /// presumably a guarantee of rocprim::merge_sort itself — confirm against
    /// the rocPRIM documentation.
    template <typename KeyIteratorT,
              typename ValueIteratorT,
              typename OffsetT,
              typename CompareOpT>
    HIPCUB_RUNTIME_FUNCTION static cudaError_t
    StableSortPairs(void *d_temp_storage,
                    std::size_t &temp_storage_bytes,
                    KeyIteratorT d_keys,
                    ValueIteratorT d_items,
                    OffsetT num_items,
                    CompareOpT compare_op,
                    cudaStream_t stream = 0,
                    bool debug_synchronous = false)
    {
        return (cudaError_t)::rocprim::merge_sort(d_temp_storage,
                                                  temp_storage_bytes,
                                                  d_keys,
                                                  d_keys,
                                                  d_items,
                                                  d_items,
                                                  num_items,
                                                  compare_op,
                                                  stream,
                                                  debug_synchronous);
    }
    /// \brief Stable in-place key-only sort (same forwarding as SortKeys).
    template<typename KeyIteratorT, typename OffsetT, typename CompareOpT>
    HIPCUB_RUNTIME_FUNCTION static cudaError_t StableSortKeys(void * d_temp_storage,
                                                              std::size_t & temp_storage_bytes,
                                                              KeyIteratorT d_keys,
                                                              OffsetT num_items,
                                                              CompareOpT compare_op,
                                                              cudaStream_t stream = 0,
                                                              bool debug_synchronous = false)
    {
        return (cudaError_t)::rocprim::merge_sort(
            d_temp_storage, temp_storage_bytes,
            d_keys, d_keys, num_items,
            compare_op, stream, debug_synchronous
        );
    }
};
END_HIPCUB_NAMESPACE
#endif // HIPCUB_ROCPRIM_DEVICE_DEVICE_MERGE_SORT_HPP_
/******************************************************************************
* Copyright (c) 2010-2011, Duane Merrill. All rights reserved.
* Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved.
* Modifications Copyright (c) 2017-2020, Advanced Micro Devices, Inc. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of the NVIDIA CORPORATION nor the
* names of its contributors may be used to endorse or promote products
* derived from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
* DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
******************************************************************************/
#ifndef HIPCUB_ROCPRIM_DEVICE_PARTITION_HPP_
#define HIPCUB_ROCPRIM_DEVICE_PARTITION_HPP_
#include "../config.hpp"
#include <cub/rocprim/device/device_partition.hpp>
BEGIN_HIPCUB_NAMESPACE
/// \brief CUB-compatible device-wide partitioning, forwarding to rocPRIM.
///
/// Standard two-phase protocol: when \p d_temp_storage is NULL the required
/// size is written to \p temp_storage_bytes and no work is done.
struct DevicePartition
{
    /// \brief Partitions items into selected/unselected according to per-item flags.
    template <
        typename InputIteratorT,
        typename FlagIterator,
        typename OutputIteratorT,
        typename NumSelectedIteratorT>
    HIPCUB_RUNTIME_FUNCTION __forceinline__
    static cudaError_t Flagged(
        void* d_temp_storage,                ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
        size_t &temp_storage_bytes,          ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
        InputIteratorT d_in,                 ///< [in] Pointer to the input sequence of data items
        FlagIterator d_flags,                ///< [in] Pointer to the input sequence of selection flags
        OutputIteratorT d_out,               ///< [out] Pointer to the output sequence of partitioned data items
        NumSelectedIteratorT d_num_selected_out, ///< [out] Pointer to the output total number of items selected (i.e., the offset of the unselected partition)
        int num_items,                       ///< [in] Total number of items to select from
        cudaStream_t stream = 0,             ///< [in] <b>[optional]</b> hip stream to launch kernels within. Default is stream<sub>0</sub>.
        bool debug_synchronous = false)      ///< [in] <b>[optional]</b> Whether or not to synchronize the stream after every kernel launch to check for errors. May cause significant slowdown. Default is \p false.
    {
        return (cudaError_t)rocprim::partition(
            d_temp_storage,
            temp_storage_bytes,
            d_in,
            d_flags,
            d_out,
            d_num_selected_out,
            num_items,
            stream,
            debug_synchronous);
    }
    /// \brief Partitions items into selected/unselected according to a unary predicate.
    template <
        typename InputIteratorT,
        typename OutputIteratorT,
        typename NumSelectedIteratorT,
        typename SelectOp>
    HIPCUB_RUNTIME_FUNCTION __forceinline__
    static cudaError_t If(
        void* d_temp_storage,                ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
        size_t &temp_storage_bytes,          ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
        InputIteratorT d_in,                 ///< [in] Pointer to the input sequence of data items
        OutputIteratorT d_out,               ///< [out] Pointer to the output sequence of partitioned data items
        NumSelectedIteratorT d_num_selected_out, ///< [out] Pointer to the output total number of items selected (i.e., the offset of the unselected partition)
        int num_items,                       ///< [in] Total number of items to select from
        SelectOp select_op,                  ///< [in] Unary selection operator
        cudaStream_t stream = 0,             ///< [in] <b>[optional]</b> hip stream to launch kernels within. Default is stream<sub>0</sub>.
        bool debug_synchronous = false)      ///< [in] <b>[optional]</b> Whether or not to synchronize the stream after every kernel launch to check for errors. May cause significant slowdown. Default is \p false.
    {
        return (cudaError_t)rocprim::partition(
            d_temp_storage,
            temp_storage_bytes,
            d_in,
            d_out,
            d_num_selected_out,
            num_items,
            select_op,
            stream,
            debug_synchronous);
    }
    /// \brief Three-way partition: items satisfying \p select_first_part_op go to
    /// \p d_first_part_out, remaining items satisfying \p select_second_part_op go to
    /// \p d_second_part_out, and the rest go to \p d_unselected_out.
    /// \p d_num_selected_out receives the selection counts.
    template <typename InputIteratorT,
              typename FirstOutputIteratorT,
              typename SecondOutputIteratorT,
              typename UnselectedOutputIteratorT,
              typename NumSelectedIteratorT,
              typename SelectFirstPartOp,
              typename SelectSecondPartOp>
    HIPCUB_RUNTIME_FUNCTION __forceinline__ static cudaError_t
    If(void *d_temp_storage,
       std::size_t &temp_storage_bytes,
       InputIteratorT d_in,
       FirstOutputIteratorT d_first_part_out,
       SecondOutputIteratorT d_second_part_out,
       UnselectedOutputIteratorT d_unselected_out,
       NumSelectedIteratorT d_num_selected_out,
       int num_items,
       SelectFirstPartOp select_first_part_op,
       SelectSecondPartOp select_second_part_op,
       cudaStream_t stream = 0,
       bool debug_synchronous = false)
    {
        return (cudaError_t)rocprim::partition_three_way(
            d_temp_storage,
            temp_storage_bytes,
            d_in,
            d_first_part_out,
            d_second_part_out,
            d_unselected_out,
            d_num_selected_out,
            num_items,
            select_first_part_op,
            select_second_part_op,
            stream,
            debug_synchronous
        );
    }
};
END_HIPCUB_NAMESPACE
#endif
/******************************************************************************
* Copyright (c) 2010-2011, Duane Merrill. All rights reserved.
* Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved.
* Modifications Copyright (c) 2017-2020, Advanced Micro Devices, Inc. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of the NVIDIA CORPORATION nor the
* names of its contributors may be used to endorse or promote products
* derived from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
* DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
******************************************************************************/
#ifndef HIPCUB_ROCPRIM_DEVICE_DEVICE_RADIX_SORT_HPP_
#define HIPCUB_ROCPRIM_DEVICE_DEVICE_RADIX_SORT_HPP_
#include "../config.hpp"
#include "../util_type.cuh"
#include <cub/rocprim/device/device_radix_sort.hpp>
BEGIN_HIPCUB_NAMESPACE
/// \brief CUB-compatible device-wide radix sort, forwarding to rocPRIM.
///
/// Standard two-phase protocol: when \p d_temp_storage is NULL the required
/// size is written to \p temp_storage_bytes and no sorting is performed.
/// \p begin_bit / \p end_bit restrict the sort to a sub-range of key bits
/// (defaults cover the full key). DoubleBuffer overloads convert the hipCUB
/// DoubleBuffer to a rocprim::double_buffer, sort, and write the (possibly
/// swapped) current-buffer selection back into the caller's DoubleBuffer.
struct DeviceRadixSort
{
    /// \brief Ascending key/value sort from separate input to output arrays.
    template<typename KeyT, typename ValueT, typename NumItemsT>
    HIPCUB_RUNTIME_FUNCTION static
    cudaError_t SortPairs(void * d_temp_storage,
                          size_t& temp_storage_bytes,
                          const KeyT * d_keys_in,
                          KeyT * d_keys_out,
                          const ValueT * d_values_in,
                          ValueT * d_values_out,
                          NumItemsT num_items,
                          int begin_bit = 0,
                          int end_bit = sizeof(KeyT) * 8,
                          cudaStream_t stream = 0,
                          bool debug_synchronous = false)
    {
        return (cudaError_t)::rocprim::radix_sort_pairs(
            d_temp_storage, temp_storage_bytes,
            d_keys_in, d_keys_out, d_values_in, d_values_out, num_items,
            begin_bit, end_bit,
            stream, debug_synchronous
        );
    }
    /// \brief Ascending key/value sort using double buffers (ping-pong storage).
    template<typename KeyT, typename ValueT, typename NumItemsT>
    HIPCUB_RUNTIME_FUNCTION static
    cudaError_t SortPairs(void * d_temp_storage,
                          size_t& temp_storage_bytes,
                          DoubleBuffer<KeyT>& d_keys,
                          DoubleBuffer<ValueT>& d_values,
                          NumItemsT num_items,
                          int begin_bit = 0,
                          int end_bit = sizeof(KeyT) * 8,
                          cudaStream_t stream = 0,
                          bool debug_synchronous = false)
    {
        // Round-trip through rocPRIM's double_buffer so the "current"
        // selector is propagated back to the caller after sorting.
        ::rocprim::double_buffer<KeyT> d_keys_db = detail::to_double_buffer(d_keys);
        ::rocprim::double_buffer<ValueT> d_values_db = detail::to_double_buffer(d_values);
        cudaError_t error = (cudaError_t)::rocprim::radix_sort_pairs(
            d_temp_storage, temp_storage_bytes,
            d_keys_db, d_values_db, num_items,
            begin_bit, end_bit,
            stream, debug_synchronous
        );
        detail::update_double_buffer(d_keys, d_keys_db);
        detail::update_double_buffer(d_values, d_values_db);
        return error;
    }
    /// \brief Descending key/value sort from separate input to output arrays.
    template<typename KeyT, typename ValueT, typename NumItemsT>
    HIPCUB_RUNTIME_FUNCTION static
    cudaError_t SortPairsDescending(void * d_temp_storage,
                                    size_t& temp_storage_bytes,
                                    const KeyT * d_keys_in,
                                    KeyT * d_keys_out,
                                    const ValueT * d_values_in,
                                    ValueT * d_values_out,
                                    NumItemsT num_items,
                                    int begin_bit = 0,
                                    int end_bit = sizeof(KeyT) * 8,
                                    cudaStream_t stream = 0,
                                    bool debug_synchronous = false)
    {
        return (cudaError_t)::rocprim::radix_sort_pairs_desc(
            d_temp_storage, temp_storage_bytes,
            d_keys_in, d_keys_out, d_values_in, d_values_out, num_items,
            begin_bit, end_bit,
            stream, debug_synchronous
        );
    }
    /// \brief Descending key/value sort using double buffers.
    template<typename KeyT, typename ValueT, typename NumItemsT>
    HIPCUB_RUNTIME_FUNCTION static
    cudaError_t SortPairsDescending(void * d_temp_storage,
                                    size_t& temp_storage_bytes,
                                    DoubleBuffer<KeyT>& d_keys,
                                    DoubleBuffer<ValueT>& d_values,
                                    NumItemsT num_items,
                                    int begin_bit = 0,
                                    int end_bit = sizeof(KeyT) * 8,
                                    cudaStream_t stream = 0,
                                    bool debug_synchronous = false)
    {
        ::rocprim::double_buffer<KeyT> d_keys_db = detail::to_double_buffer(d_keys);
        ::rocprim::double_buffer<ValueT> d_values_db = detail::to_double_buffer(d_values);
        cudaError_t error = (cudaError_t)::rocprim::radix_sort_pairs_desc(
            d_temp_storage, temp_storage_bytes,
            d_keys_db, d_values_db, num_items,
            begin_bit, end_bit,
            stream, debug_synchronous
        );
        detail::update_double_buffer(d_keys, d_keys_db);
        detail::update_double_buffer(d_values, d_values_db);
        return error;
    }
    /// \brief Ascending key-only sort from separate input to output arrays.
    template<typename KeyT, typename NumItemsT>
    HIPCUB_RUNTIME_FUNCTION static
    cudaError_t SortKeys(void * d_temp_storage,
                         size_t& temp_storage_bytes,
                         const KeyT * d_keys_in,
                         KeyT * d_keys_out,
                         NumItemsT num_items,
                         int begin_bit = 0,
                         int end_bit = sizeof(KeyT) * 8,
                         cudaStream_t stream = 0,
                         bool debug_synchronous = false)
    {
        return (cudaError_t)::rocprim::radix_sort_keys(
            d_temp_storage, temp_storage_bytes,
            d_keys_in, d_keys_out, num_items,
            begin_bit, end_bit,
            stream, debug_synchronous
        );
    }
    /// \brief Ascending key-only sort using a double buffer.
    template<typename KeyT, typename NumItemsT>
    HIPCUB_RUNTIME_FUNCTION static
    cudaError_t SortKeys(void * d_temp_storage,
                         size_t& temp_storage_bytes,
                         DoubleBuffer<KeyT>& d_keys,
                         NumItemsT num_items,
                         int begin_bit = 0,
                         int end_bit = sizeof(KeyT) * 8,
                         cudaStream_t stream = 0,
                         bool debug_synchronous = false)
    {
        ::rocprim::double_buffer<KeyT> d_keys_db = detail::to_double_buffer(d_keys);
        cudaError_t error = (cudaError_t)::rocprim::radix_sort_keys(
            d_temp_storage, temp_storage_bytes,
            d_keys_db, num_items,
            begin_bit, end_bit,
            stream, debug_synchronous
        );
        detail::update_double_buffer(d_keys, d_keys_db);
        return error;
    }
    /// \brief Descending key-only sort from separate input to output arrays.
    template<typename KeyT, typename NumItemsT>
    HIPCUB_RUNTIME_FUNCTION static
    cudaError_t SortKeysDescending(void * d_temp_storage,
                                   size_t& temp_storage_bytes,
                                   const KeyT * d_keys_in,
                                   KeyT * d_keys_out,
                                   NumItemsT num_items,
                                   int begin_bit = 0,
                                   int end_bit = sizeof(KeyT) * 8,
                                   cudaStream_t stream = 0,
                                   bool debug_synchronous = false)
    {
        return (cudaError_t)::rocprim::radix_sort_keys_desc(
            d_temp_storage, temp_storage_bytes,
            d_keys_in, d_keys_out, num_items,
            begin_bit, end_bit,
            stream, debug_synchronous
        );
    }
    /// \brief Descending key-only sort using a double buffer.
    template<typename KeyT, typename NumItemsT>
    HIPCUB_RUNTIME_FUNCTION static
    cudaError_t SortKeysDescending(void * d_temp_storage,
                                   size_t& temp_storage_bytes,
                                   DoubleBuffer<KeyT>& d_keys,
                                   NumItemsT num_items,
                                   int begin_bit = 0,
                                   int end_bit = sizeof(KeyT) * 8,
                                   cudaStream_t stream = 0,
                                   bool debug_synchronous = false)
    {
        ::rocprim::double_buffer<KeyT> d_keys_db = detail::to_double_buffer(d_keys);
        cudaError_t error = (cudaError_t)::rocprim::radix_sort_keys_desc(
            d_temp_storage, temp_storage_bytes,
            d_keys_db, num_items,
            begin_bit, end_bit,
            stream, debug_synchronous
        );
        detail::update_double_buffer(d_keys, d_keys_db);
        return error;
    }
};
END_HIPCUB_NAMESPACE
#endif // HIPCUB_ROCPRIM_DEVICE_DEVICE_RADIX_SORT_HPP_
/******************************************************************************
* Copyright (c) 2010-2011, Duane Merrill. All rights reserved.
* Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved.
* Modifications Copyright (c) 2017-2020, Advanced Micro Devices, Inc. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of the NVIDIA CORPORATION nor the
* names of its contributors may be used to endorse or promote products
* derived from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
* DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
******************************************************************************/
#ifndef HIPCUB_ROCPRIM_DEVICE_DEVICE_REDUCE_HPP_
#define HIPCUB_ROCPRIM_DEVICE_DEVICE_REDUCE_HPP_
#include <cstring> // memcpy (well-defined bit reinterpretation in detail::get_lowest_value / get_max_value)
#include <iterator>
#include <limits>
#include <cuda_fp16.h> // __half
#include <thrust/system/cuda/cuda_bfloat16.h> // hip_bfloat16
#include "../config.hpp"
#include "../iterator/arg_index_input_iterator.cuh"
#include "../thread/thread_operators.cuh"
#include <cub/rocprim/device/device_reduce.hpp>
#include <cub/rocprim/device/device_reduce_by_key.hpp>
BEGIN_HIPCUB_NAMESPACE
namespace detail
{

/// Returns the lowest finite value of T, used as the identity for Max /
/// ArgMax reductions. The generic version relies on std::numeric_limits.
template<class T>
inline
T get_lowest_value()
{
    return std::numeric_limits<T>::lowest();
}

/// Specialization for __half, for which std::numeric_limits is not
/// specialized: builds the lowest finite half (-65504) from its IEEE
/// binary16 bit pattern 0xfbff.
template<>
inline
__half get_lowest_value<__half>()
{
    // memcpy (not reinterpret_cast of an unsigned short*) so the bit
    // reinterpretation is well-defined — the original cast violated
    // strict aliasing.
    unsigned short lowest_half = 0xfbff;
    __half lowest_value;
    memcpy(&lowest_value, &lowest_half, sizeof(lowest_value));
    return lowest_value;
}

/// Specialization for cuda_bfloat16: approximately the lowest finite
/// bfloat16 value, constructed from a float literal.
template<>
inline
cuda_bfloat16 get_lowest_value<cuda_bfloat16>()
{
    return cuda_bfloat16(-3.38953138925e+38f);
}

/// Returns the largest finite value of T, used as the identity for Min /
/// ArgMin reductions. The generic version relies on std::numeric_limits.
template<class T>
inline
T get_max_value()
{
    return std::numeric_limits<T>::max();
}

/// Specialization for __half: builds the largest finite half (65504) from
/// its IEEE binary16 bit pattern 0x7bff.
template<>
inline
__half get_max_value<__half>()
{
    // Well-defined bit reinterpretation via memcpy (see get_lowest_value).
    unsigned short max_half = 0x7bff;
    __half max_value;
    memcpy(&max_value, &max_half, sizeof(max_value));
    return max_value;
}

/// Specialization for cuda_bfloat16: approximately the largest finite
/// bfloat16 value.
template<>
inline
cuda_bfloat16 get_max_value<cuda_bfloat16>()
{
    return cuda_bfloat16(3.38953138925e+38f);
}

} // end detail namespace
/// \brief CUB-compatible device-wide reductions, forwarding to rocPRIM.
///
/// Standard two-phase protocol: when \p d_temp_storage is NULL the required
/// size is written to \p temp_storage_bytes and no work is done.
class DeviceReduce
{
public:
    /// \brief Generic reduction of \p num_items items with a user-supplied
    /// binary operator and initial value \p init. The operator is wrapped in
    /// convert_result_type so intermediate results use the output value type.
    template <
        typename InputIteratorT,
        typename OutputIteratorT,
        typename ReduceOpT,
        typename T
    >
    HIPCUB_RUNTIME_FUNCTION static
    cudaError_t Reduce(void *d_temp_storage,
                       size_t &temp_storage_bytes,
                       InputIteratorT d_in,
                       OutputIteratorT d_out,
                       int num_items,
                       ReduceOpT reduction_op,
                       T init,
                       cudaStream_t stream = 0,
                       bool debug_synchronous = false)
    {
        return (cudaError_t)::rocprim::reduce(
            d_temp_storage, temp_storage_bytes,
            d_in, d_out, init, num_items,
            ::cub::detail::convert_result_type<InputIteratorT, OutputIteratorT>(reduction_op),
            stream, debug_synchronous
        );
    }
    /// \brief Sum reduction; identity is T(0) of the input value type.
    template <
        typename InputIteratorT,
        typename OutputIteratorT
    >
    HIPCUB_RUNTIME_FUNCTION static
    cudaError_t Sum(void *d_temp_storage,
                    size_t &temp_storage_bytes,
                    InputIteratorT d_in,
                    OutputIteratorT d_out,
                    int num_items,
                    cudaStream_t stream = 0,
                    bool debug_synchronous = false)
    {
        using T = typename std::iterator_traits<InputIteratorT>::value_type;
        return Reduce(
            d_temp_storage, temp_storage_bytes,
            d_in, d_out, num_items, ::cub::Sum(), T(0),
            stream, debug_synchronous
        );
    }
    /// \brief Minimum reduction; identity is the largest finite value of the
    /// input value type (detail::get_max_value handles __half / bfloat16).
    template <
        typename InputIteratorT,
        typename OutputIteratorT
    >
    HIPCUB_RUNTIME_FUNCTION static
    cudaError_t Min(void *d_temp_storage,
                    size_t &temp_storage_bytes,
                    InputIteratorT d_in,
                    OutputIteratorT d_out,
                    int num_items,
                    cudaStream_t stream = 0,
                    bool debug_synchronous = false)
    {
        using T = typename std::iterator_traits<InputIteratorT>::value_type;
        return Reduce(
            d_temp_storage, temp_storage_bytes,
            d_in, d_out, num_items, ::cub::Min(), detail::get_max_value<T>(),
            stream, debug_synchronous
        );
    }
    /// \brief Finds the (index, value) pair of the minimum item. The input is
    /// wrapped in an ArgIndexInputIterator so the reduction runs over
    /// KeyValuePair items; the result is written to \p d_out.
    template <
        typename InputIteratorT,
        typename OutputIteratorT
    >
    HIPCUB_RUNTIME_FUNCTION static
    cudaError_t ArgMin(void *d_temp_storage,
                       size_t &temp_storage_bytes,
                       InputIteratorT d_in,
                       OutputIteratorT d_out,
                       int num_items,
                       cudaStream_t stream = 0,
                       bool debug_synchronous = false)
    {
        using OffsetT = int;
        using T = typename std::iterator_traits<InputIteratorT>::value_type;
        using O = typename std::iterator_traits<OutputIteratorT>::value_type;
        // If the output iterator erases its value type (void), fall back to
        // KeyValuePair<int, T> as the tuple type.
        using OutputTupleT =
            typename std::conditional<
                std::is_same<O, void>::value,
                KeyValuePair<OffsetT, T>,
                O
            >::type;
        using OutputValueT = typename OutputTupleT::Value;
        using IteratorT = ArgIndexInputIterator<InputIteratorT, OffsetT, OutputValueT>;
        IteratorT d_indexed_in(d_in);
        // Initial pair: key 1 with the max value — presumably the
        // out-of-bounds sentinel returned for empty input, mirroring CUB's
        // ArgMin convention; verify against upstream CUB.
        OutputTupleT init(1, detail::get_max_value<T>());
        return Reduce(
            d_temp_storage, temp_storage_bytes,
            d_indexed_in, d_out, num_items, ::cub::ArgMin(), init,
            stream, debug_synchronous
        );
    }
    /// \brief Maximum reduction; identity is the lowest finite value of the
    /// input value type (detail::get_lowest_value handles __half / bfloat16).
    template <
        typename InputIteratorT,
        typename OutputIteratorT
    >
    HIPCUB_RUNTIME_FUNCTION static
    cudaError_t Max(void *d_temp_storage,
                    size_t &temp_storage_bytes,
                    InputIteratorT d_in,
                    OutputIteratorT d_out,
                    int num_items,
                    cudaStream_t stream = 0,
                    bool debug_synchronous = false)
    {
        using T = typename std::iterator_traits<InputIteratorT>::value_type;
        return Reduce(
            d_temp_storage, temp_storage_bytes,
            d_in, d_out, num_items, ::cub::Max(), detail::get_lowest_value<T>(),
            stream, debug_synchronous
        );
    }
    /// \brief Finds the (index, value) pair of the maximum item; mirrors
    /// ArgMin with the lowest-value sentinel.
    template <
        typename InputIteratorT,
        typename OutputIteratorT
    >
    HIPCUB_RUNTIME_FUNCTION static
    cudaError_t ArgMax(void *d_temp_storage,
                       size_t &temp_storage_bytes,
                       InputIteratorT d_in,
                       OutputIteratorT d_out,
                       int num_items,
                       cudaStream_t stream = 0,
                       bool debug_synchronous = false)
    {
        using OffsetT = int;
        using T = typename std::iterator_traits<InputIteratorT>::value_type;
        using O = typename std::iterator_traits<OutputIteratorT>::value_type;
        using OutputTupleT =
            typename std::conditional<
                std::is_same<O, void>::value,
                KeyValuePair<OffsetT, T>,
                O
            >::type;
        using OutputValueT = typename OutputTupleT::Value;
        using IteratorT = ArgIndexInputIterator<InputIteratorT, OffsetT, OutputValueT>;
        IteratorT d_indexed_in(d_in);
        OutputTupleT init(1, detail::get_lowest_value<T>());
        return Reduce(
            d_temp_storage, temp_storage_bytes,
            d_indexed_in, d_out, num_items, ::cub::ArgMax(), init,
            stream, debug_synchronous
        );
    }
    /// \brief Segmented reduction over runs of consecutive equal keys:
    /// writes one unique key and one aggregate per run, and the number of
    /// runs to \p d_num_runs_out. Key equality uses rocprim::equal_to.
    template<
        typename KeysInputIteratorT,
        typename UniqueOutputIteratorT,
        typename ValuesInputIteratorT,
        typename AggregatesOutputIteratorT,
        typename NumRunsOutputIteratorT,
        typename ReductionOpT
    >
    HIPCUB_RUNTIME_FUNCTION static
    cudaError_t ReduceByKey(void * d_temp_storage,
                            size_t& temp_storage_bytes,
                            KeysInputIteratorT d_keys_in,
                            UniqueOutputIteratorT d_unique_out,
                            ValuesInputIteratorT d_values_in,
                            AggregatesOutputIteratorT d_aggregates_out,
                            NumRunsOutputIteratorT d_num_runs_out,
                            ReductionOpT reduction_op,
                            int num_items,
                            cudaStream_t stream = 0,
                            bool debug_synchronous = false)
    {
        using key_compare_op =
            ::rocprim::equal_to<typename std::iterator_traits<KeysInputIteratorT>::value_type>;
        return (cudaError_t)::rocprim::reduce_by_key(
            d_temp_storage, temp_storage_bytes,
            d_keys_in, d_values_in, num_items,
            d_unique_out, d_aggregates_out, d_num_runs_out,
            ::cub::detail::convert_result_type<ValuesInputIteratorT, AggregatesOutputIteratorT>(reduction_op),
            key_compare_op(),
            stream, debug_synchronous
        );
    }
};
END_HIPCUB_NAMESPACE
#endif // HIPCUB_ROCPRIM_DEVICE_DEVICE_REDUCE_HPP_
/******************************************************************************
* Copyright (c) 2010-2011, Duane Merrill. All rights reserved.
* Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved.
* Modifications Copyright (c) 2017-2020, Advanced Micro Devices, Inc. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of the NVIDIA CORPORATION nor the
* names of its contributors may be used to endorse or promote products
* derived from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
* DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
******************************************************************************/
#ifndef HIPCUB_ROCPRIM_DEVICE_DEVICE_RUN_LENGTH_ENCODE_HPP_
#define HIPCUB_ROCPRIM_DEVICE_DEVICE_RUN_LENGTH_ENCODE_HPP_
#include "../config.hpp"
#include <cub/rocprim/device/device_run_length_encode.hpp>
BEGIN_HIPCUB_NAMESPACE
/// Device-wide run-length encoding: a thin hipCUB façade over rocPRIM's
/// run-length-encode algorithms. Both entry points follow the two-phase
/// protocol: when d_temp_storage is NULL only temp_storage_bytes is written;
/// call again with allocated storage to run the algorithm.
class DeviceRunLengthEncode
{
public:
    /// Compacts runs of consecutive equal items: unique values go to
    /// d_unique_out, run lengths to d_counts_out, and the number of runs
    /// (a single count) to d_num_runs_out.
    template<
        typename InputIteratorT,
        typename UniqueOutputIteratorT,
        typename LengthsOutputIteratorT,
        typename NumRunsOutputIteratorT
    >
    HIPCUB_RUNTIME_FUNCTION static
    cudaError_t Encode(void * d_temp_storage,
                       size_t& temp_storage_bytes,
                       InputIteratorT d_in,
                       UniqueOutputIteratorT d_unique_out,
                       LengthsOutputIteratorT d_counts_out,
                       NumRunsOutputIteratorT d_num_runs_out,
                       int num_items,
                       cudaStream_t stream = 0,
                       bool debug_synchronous = false)
    {
        const auto status = ::rocprim::run_length_encode(
            d_temp_storage, temp_storage_bytes,
            d_in, num_items,
            d_unique_out, d_counts_out, d_num_runs_out,
            stream, debug_synchronous);
        // rocPRIM reports its own error code; expose it through this API's
        // cudaError_t alias.
        return static_cast<cudaError_t>(status);
    }

    /// Identifies non-trivial runs: run start offsets go to d_offsets_out,
    /// run lengths to d_lengths_out, and the run count to d_num_runs_out.
    template<
        typename InputIteratorT,
        typename OffsetsOutputIteratorT,
        typename LengthsOutputIteratorT,
        typename NumRunsOutputIteratorT
    >
    HIPCUB_RUNTIME_FUNCTION static
    cudaError_t NonTrivialRuns(void * d_temp_storage,
                               size_t& temp_storage_bytes,
                               InputIteratorT d_in,
                               OffsetsOutputIteratorT d_offsets_out,
                               LengthsOutputIteratorT d_lengths_out,
                               NumRunsOutputIteratorT d_num_runs_out,
                               int num_items,
                               cudaStream_t stream = 0,
                               bool debug_synchronous = false)
    {
        const auto status = ::rocprim::run_length_encode_non_trivial_runs(
            d_temp_storage, temp_storage_bytes,
            d_in, num_items,
            d_offsets_out, d_lengths_out, d_num_runs_out,
            stream, debug_synchronous);
        return static_cast<cudaError_t>(status);
    }
};
END_HIPCUB_NAMESPACE
#endif // HIPCUB_ROCPRIM_DEVICE_DEVICE_RUN_LENGTH_ENCODE_HPP_
/******************************************************************************
* Copyright (c) 2010-2011, Duane Merrill. All rights reserved.
* Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved.
* Modifications Copyright (c) 2017-2020, Advanced Micro Devices, Inc. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of the NVIDIA CORPORATION nor the
* names of its contributors may be used to endorse or promote products
* derived from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
* DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
******************************************************************************/
#ifndef HIPCUB_ROCPRIM_DEVICE_DEVICE_SCAN_HPP_
#define HIPCUB_ROCPRIM_DEVICE_DEVICE_SCAN_HPP_
#include <iostream>
#include "../config.hpp"
#include "../thread/thread_operators.cuh"
#include <cub/rocprim/device/device_scan.hpp>
#include <cub/rocprim/device/device_scan_by_key.hpp>
BEGIN_HIPCUB_NAMESPACE
/// Device-wide prefix scans: a thin hipCUB façade over rocPRIM's scan and
/// scan-by-key algorithms. Every entry point follows the two-phase protocol:
/// when d_temp_storage is NULL only temp_storage_bytes is written; call again
/// with allocated storage to run the scan.
class DeviceScan
{
public:
    /// Inclusive prefix sum of d_in into d_out.
    template <
        typename InputIteratorT,
        typename OutputIteratorT
    >
    HIPCUB_RUNTIME_FUNCTION static
    cudaError_t InclusiveSum(void *d_temp_storage,
                             size_t &temp_storage_bytes,
                             InputIteratorT d_in,
                             OutputIteratorT d_out,
                             size_t num_items,
                             cudaStream_t stream = 0,
                             bool debug_synchronous = false)
    {
        // A sum is just an inclusive scan with the addition functor.
        return InclusiveScan(
            d_temp_storage, temp_storage_bytes,
            d_in, d_out, ::cub::Sum(), num_items,
            stream, debug_synchronous
        );
    }

    /// Inclusive prefix scan of d_in into d_out using scan_op.
    template <
        typename InputIteratorT,
        typename OutputIteratorT,
        typename ScanOpT
    >
    HIPCUB_RUNTIME_FUNCTION static
    cudaError_t InclusiveScan(void *d_temp_storage,
                              size_t &temp_storage_bytes,
                              InputIteratorT d_in,
                              OutputIteratorT d_out,
                              ScanOpT scan_op,
                              size_t num_items,
                              cudaStream_t stream = 0,
                              bool debug_synchronous = false)
    {
        const auto status = ::rocprim::inclusive_scan(
            d_temp_storage, temp_storage_bytes,
            d_in, d_out, num_items,
            scan_op,
            stream, debug_synchronous);
        return static_cast<cudaError_t>(status);
    }

    /// Exclusive prefix sum of d_in into d_out, seeded with a zero of the
    /// input's value type.
    template <
        typename InputIteratorT,
        typename OutputIteratorT
    >
    HIPCUB_RUNTIME_FUNCTION static
    cudaError_t ExclusiveSum(void *d_temp_storage,
                             size_t &temp_storage_bytes,
                             InputIteratorT d_in,
                             OutputIteratorT d_out,
                             size_t num_items,
                             cudaStream_t stream = 0,
                             bool debug_synchronous = false)
    {
        using input_t = typename std::iterator_traits<InputIteratorT>::value_type;
        return ExclusiveScan(
            d_temp_storage, temp_storage_bytes,
            d_in, d_out, ::cub::Sum(), input_t(0), num_items,
            stream, debug_synchronous
        );
    }

    /// Exclusive prefix scan of d_in into d_out using scan_op, seeded with
    /// init_value.
    template <
        typename InputIteratorT,
        typename OutputIteratorT,
        typename ScanOpT,
        typename InitValueT
    >
    HIPCUB_RUNTIME_FUNCTION static
    cudaError_t ExclusiveScan(void *d_temp_storage,
                              size_t &temp_storage_bytes,
                              InputIteratorT d_in,
                              OutputIteratorT d_out,
                              ScanOpT scan_op,
                              InitValueT init_value,
                              size_t num_items,
                              cudaStream_t stream = 0,
                              bool debug_synchronous = false)
    {
        const auto status = ::rocprim::exclusive_scan(
            d_temp_storage, temp_storage_bytes,
            d_in, d_out, init_value, num_items,
            scan_op,
            stream, debug_synchronous);
        return static_cast<cudaError_t>(status);
    }

    /// Exclusive prefix scan whose initial value is supplied later through a
    /// FutureValue (resolved on the device at scan time).
    template <
        typename InputIteratorT,
        typename OutputIteratorT,
        typename ScanOpT,
        typename InitValueT,
        typename InitValueIterT = InitValueT*
    >
    HIPCUB_RUNTIME_FUNCTION static
    cudaError_t ExclusiveScan(void *d_temp_storage,
                              size_t &temp_storage_bytes,
                              InputIteratorT d_in,
                              OutputIteratorT d_out,
                              ScanOpT scan_op,
                              FutureValue<InitValueT, InitValueIterT> init_value,
                              int num_items,
                              cudaStream_t stream = 0,
                              bool debug_synchronous = false)
    {
        // The FutureValue wrapper is forwarded to rocPRIM as-is.
        const auto status = ::rocprim::exclusive_scan(
            d_temp_storage, temp_storage_bytes,
            d_in, d_out, init_value, num_items,
            scan_op,
            stream, debug_synchronous);
        return static_cast<cudaError_t>(status);
    }

    /// Exclusive prefix sum of d_values_in into d_values_out, restarting at
    /// each new key run (runs delimited by equality_op on d_keys_in).
    template <
        typename KeysInputIteratorT,
        typename ValuesInputIteratorT,
        typename ValuesOutputIteratorT,
        typename EqualityOpT = ::cub::Equality
    >
    HIPCUB_RUNTIME_FUNCTION static
    cudaError_t ExclusiveSumByKey(void *d_temp_storage,
                                  size_t &temp_storage_bytes,
                                  KeysInputIteratorT d_keys_in,
                                  ValuesInputIteratorT d_values_in,
                                  ValuesOutputIteratorT d_values_out,
                                  int num_items,
                                  EqualityOpT equality_op = EqualityOpT(),
                                  cudaStream_t stream = 0,
                                  bool debug_synchronous = false)
    {
        using value_t = typename std::iterator_traits<ValuesInputIteratorT>::value_type;
        // Zero of the input value type seeds each key run.
        const auto status = ::rocprim::exclusive_scan_by_key(
            d_temp_storage, temp_storage_bytes,
            d_keys_in, d_values_in, d_values_out,
            static_cast<value_t>(0), static_cast<size_t>(num_items),
            ::cub::Sum(), equality_op, stream, debug_synchronous);
        return static_cast<cudaError_t>(status);
    }

    /// Exclusive prefix scan by key with a caller-provided operator and
    /// initial value (the scan restarts at init_value for each key run).
    template <
        typename KeysInputIteratorT,
        typename ValuesInputIteratorT,
        typename ValuesOutputIteratorT,
        typename ScanOpT,
        typename InitValueT,
        typename EqualityOpT = ::cub::Equality
    >
    HIPCUB_RUNTIME_FUNCTION static
    cudaError_t ExclusiveScanByKey(void *d_temp_storage,
                                   size_t &temp_storage_bytes,
                                   KeysInputIteratorT d_keys_in,
                                   ValuesInputIteratorT d_values_in,
                                   ValuesOutputIteratorT d_values_out,
                                   ScanOpT scan_op,
                                   InitValueT init_value,
                                   int num_items,
                                   EqualityOpT equality_op = EqualityOpT(),
                                   cudaStream_t stream = 0,
                                   bool debug_synchronous = false)
    {
        const auto status = ::rocprim::exclusive_scan_by_key(
            d_temp_storage, temp_storage_bytes,
            d_keys_in, d_values_in, d_values_out,
            init_value, static_cast<size_t>(num_items),
            scan_op, equality_op, stream, debug_synchronous);
        return static_cast<cudaError_t>(status);
    }

    /// Inclusive prefix sum by key (sum restarts at each new key run).
    template <
        typename KeysInputIteratorT,
        typename ValuesInputIteratorT,
        typename ValuesOutputIteratorT,
        typename EqualityOpT = ::cub::Equality
    >
    HIPCUB_RUNTIME_FUNCTION static
    cudaError_t InclusiveSumByKey(void *d_temp_storage,
                                  size_t &temp_storage_bytes,
                                  KeysInputIteratorT d_keys_in,
                                  ValuesInputIteratorT d_values_in,
                                  ValuesOutputIteratorT d_values_out,
                                  int num_items,
                                  EqualityOpT equality_op = EqualityOpT(),
                                  cudaStream_t stream = 0,
                                  bool debug_synchronous = false)
    {
        const auto status = ::rocprim::inclusive_scan_by_key(
            d_temp_storage, temp_storage_bytes,
            d_keys_in, d_values_in, d_values_out,
            static_cast<size_t>(num_items), ::cub::Sum(),
            equality_op, stream, debug_synchronous);
        return static_cast<cudaError_t>(status);
    }

    /// Inclusive prefix scan by key with a caller-provided operator.
    template <
        typename KeysInputIteratorT,
        typename ValuesInputIteratorT,
        typename ValuesOutputIteratorT,
        typename ScanOpT,
        typename EqualityOpT = ::cub::Equality
    >
    HIPCUB_RUNTIME_FUNCTION static
    cudaError_t InclusiveScanByKey(void *d_temp_storage,
                                   size_t &temp_storage_bytes,
                                   KeysInputIteratorT d_keys_in,
                                   ValuesInputIteratorT d_values_in,
                                   ValuesOutputIteratorT d_values_out,
                                   ScanOpT scan_op,
                                   int num_items,
                                   EqualityOpT equality_op = EqualityOpT(),
                                   cudaStream_t stream = 0,
                                   bool debug_synchronous = false)
    {
        const auto status = ::rocprim::inclusive_scan_by_key(
            d_temp_storage, temp_storage_bytes,
            d_keys_in, d_values_in, d_values_out,
            static_cast<size_t>(num_items), scan_op,
            equality_op, stream, debug_synchronous);
        return static_cast<cudaError_t>(status);
    }
};
END_HIPCUB_NAMESPACE
#endif // HIPCUB_ROCPRIM_DEVICE_DEVICE_SCAN_HPP_
/******************************************************************************
* Copyright (c) 2010-2011, Duane Merrill. All rights reserved.
* Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved.
* Modifications Copyright (c) 2017-2020, Advanced Micro Devices, Inc. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of the NVIDIA CORPORATION nor the
* names of its contributors may be used to endorse or promote products
* derived from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
* DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
******************************************************************************/
#ifndef HIPCUB_ROCPRIM_DEVICE_DEVICE_SEGMENTED_RADIX_SORT_HPP_
#define HIPCUB_ROCPRIM_DEVICE_DEVICE_SEGMENTED_RADIX_SORT_HPP_
#include "../config.hpp"
#include "../util_type.cuh"
#include <cub/rocprim/device/device_segmented_radix_sort.hpp>
BEGIN_HIPCUB_NAMESPACE
// DeviceSegmentedRadixSort: hipCUB wrapper forwarding segmented radix sorts
// to rocPRIM. Segment i covers item indices [d_begin_offsets[i],
// d_end_offsets[i]). begin_bit/end_bit select the key bit range to sort on
// (defaults cover the whole key). All methods follow the two-phase protocol:
// a NULL d_temp_storage only sets temp_storage_bytes.
struct DeviceSegmentedRadixSort
{
// Ascending key-value sort per segment (out-of-place pointer interface).
template<typename KeyT, typename ValueT, typename OffsetIteratorT>
HIPCUB_RUNTIME_FUNCTION static
cudaError_t SortPairs(void * d_temp_storage,
size_t& temp_storage_bytes,
const KeyT * d_keys_in,
KeyT * d_keys_out,
const ValueT * d_values_in,
ValueT * d_values_out,
int num_items,
int num_segments,
OffsetIteratorT d_begin_offsets,
OffsetIteratorT d_end_offsets,
int begin_bit = 0,
int end_bit = sizeof(KeyT) * 8,
cudaStream_t stream = 0,
bool debug_synchronous = false)
{
// Cast exposes rocPRIM's error code through this API's cudaError_t alias.
return (cudaError_t)::rocprim::segmented_radix_sort_pairs(
d_temp_storage, temp_storage_bytes,
d_keys_in, d_keys_out, d_values_in, d_values_out, num_items,
num_segments, d_begin_offsets, d_end_offsets,
begin_bit, end_bit,
stream, debug_synchronous
);
}
// Ascending key-value sort per segment, DoubleBuffer (ping-pong) interface:
// sorts in place across the two buffers and updates d_keys/d_values so the
// caller can read back which buffer holds the result.
template<typename KeyT, typename ValueT, typename OffsetIteratorT>
HIPCUB_RUNTIME_FUNCTION static
cudaError_t SortPairs(void * d_temp_storage,
size_t& temp_storage_bytes,
DoubleBuffer<KeyT>& d_keys,
DoubleBuffer<ValueT>& d_values,
int num_items,
int num_segments,
OffsetIteratorT d_begin_offsets,
OffsetIteratorT d_end_offsets,
int begin_bit = 0,
int end_bit = sizeof(KeyT) * 8,
cudaStream_t stream = 0,
bool debug_synchronous = false)
{
// Bridge hipCUB DoubleBuffers to rocPRIM double_buffers for the call.
::rocprim::double_buffer<KeyT> d_keys_db = detail::to_double_buffer(d_keys);
::rocprim::double_buffer<ValueT> d_values_db = detail::to_double_buffer(d_values);
cudaError_t error = (cudaError_t)::rocprim::segmented_radix_sort_pairs(
d_temp_storage, temp_storage_bytes,
d_keys_db, d_values_db, num_items,
num_segments, d_begin_offsets, d_end_offsets,
begin_bit, end_bit,
stream, debug_synchronous
);
// Propagate the sort's buffer state back to the caller's DoubleBuffers
// (runs even on error; presumably a no-op in the size-query phase).
detail::update_double_buffer(d_keys, d_keys_db);
detail::update_double_buffer(d_values, d_values_db);
return error;
}
// Descending key-value sort per segment (pointer interface).
template<typename KeyT, typename ValueT, typename OffsetIteratorT>
HIPCUB_RUNTIME_FUNCTION static
cudaError_t SortPairsDescending(void * d_temp_storage,
size_t& temp_storage_bytes,
const KeyT * d_keys_in,
KeyT * d_keys_out,
const ValueT * d_values_in,
ValueT * d_values_out,
int num_items,
int num_segments,
OffsetIteratorT d_begin_offsets,
OffsetIteratorT d_end_offsets,
int begin_bit = 0,
int end_bit = sizeof(KeyT) * 8,
cudaStream_t stream = 0,
bool debug_synchronous = false)
{
return (cudaError_t)::rocprim::segmented_radix_sort_pairs_desc(
d_temp_storage, temp_storage_bytes,
d_keys_in, d_keys_out, d_values_in, d_values_out, num_items,
num_segments, d_begin_offsets, d_end_offsets,
begin_bit, end_bit,
stream, debug_synchronous
);
}
// Descending key-value sort per segment, DoubleBuffer interface.
template<typename KeyT, typename ValueT, typename OffsetIteratorT>
HIPCUB_RUNTIME_FUNCTION static
cudaError_t SortPairsDescending(void * d_temp_storage,
size_t& temp_storage_bytes,
DoubleBuffer<KeyT>& d_keys,
DoubleBuffer<ValueT>& d_values,
int num_items,
int num_segments,
OffsetIteratorT d_begin_offsets,
OffsetIteratorT d_end_offsets,
int begin_bit = 0,
int end_bit = sizeof(KeyT) * 8,
cudaStream_t stream = 0,
bool debug_synchronous = false)
{
::rocprim::double_buffer<KeyT> d_keys_db = detail::to_double_buffer(d_keys);
::rocprim::double_buffer<ValueT> d_values_db = detail::to_double_buffer(d_values);
cudaError_t error = (cudaError_t)::rocprim::segmented_radix_sort_pairs_desc(
d_temp_storage, temp_storage_bytes,
d_keys_db, d_values_db, num_items,
num_segments, d_begin_offsets, d_end_offsets,
begin_bit, end_bit,
stream, debug_synchronous
);
detail::update_double_buffer(d_keys, d_keys_db);
detail::update_double_buffer(d_values, d_values_db);
return error;
}
// Ascending keys-only sort per segment (pointer interface).
template<typename KeyT, typename OffsetIteratorT>
HIPCUB_RUNTIME_FUNCTION static
cudaError_t SortKeys(void * d_temp_storage,
size_t& temp_storage_bytes,
const KeyT * d_keys_in,
KeyT * d_keys_out,
int num_items,
int num_segments,
OffsetIteratorT d_begin_offsets,
OffsetIteratorT d_end_offsets,
int begin_bit = 0,
int end_bit = sizeof(KeyT) * 8,
cudaStream_t stream = 0,
bool debug_synchronous = false)
{
return (cudaError_t)::rocprim::segmented_radix_sort_keys(
d_temp_storage, temp_storage_bytes,
d_keys_in, d_keys_out, num_items,
num_segments, d_begin_offsets, d_end_offsets,
begin_bit, end_bit,
stream, debug_synchronous
);
}
// Ascending keys-only sort per segment, DoubleBuffer interface.
template<typename KeyT, typename OffsetIteratorT>
HIPCUB_RUNTIME_FUNCTION static
cudaError_t SortKeys(void * d_temp_storage,
size_t& temp_storage_bytes,
DoubleBuffer<KeyT>& d_keys,
int num_items,
int num_segments,
OffsetIteratorT d_begin_offsets,
OffsetIteratorT d_end_offsets,
int begin_bit = 0,
int end_bit = sizeof(KeyT) * 8,
cudaStream_t stream = 0,
bool debug_synchronous = false)
{
::rocprim::double_buffer<KeyT> d_keys_db = detail::to_double_buffer(d_keys);
cudaError_t error = (cudaError_t)::rocprim::segmented_radix_sort_keys(
d_temp_storage, temp_storage_bytes,
d_keys_db, num_items,
num_segments, d_begin_offsets, d_end_offsets,
begin_bit, end_bit,
stream, debug_synchronous
);
detail::update_double_buffer(d_keys, d_keys_db);
return error;
}
// Descending keys-only sort per segment (pointer interface).
template<typename KeyT, typename OffsetIteratorT>
HIPCUB_RUNTIME_FUNCTION static
cudaError_t SortKeysDescending(void * d_temp_storage,
size_t& temp_storage_bytes,
const KeyT * d_keys_in,
KeyT * d_keys_out,
int num_items,
int num_segments,
OffsetIteratorT d_begin_offsets,
OffsetIteratorT d_end_offsets,
int begin_bit = 0,
int end_bit = sizeof(KeyT) * 8,
cudaStream_t stream = 0,
bool debug_synchronous = false)
{
return (cudaError_t)::rocprim::segmented_radix_sort_keys_desc(
d_temp_storage, temp_storage_bytes,
d_keys_in, d_keys_out, num_items,
num_segments, d_begin_offsets, d_end_offsets,
begin_bit, end_bit,
stream, debug_synchronous
);
}
// Descending keys-only sort per segment, DoubleBuffer interface.
template<typename KeyT, typename OffsetIteratorT>
HIPCUB_RUNTIME_FUNCTION static
cudaError_t SortKeysDescending(void * d_temp_storage,
size_t& temp_storage_bytes,
DoubleBuffer<KeyT>& d_keys,
int num_items,
int num_segments,
OffsetIteratorT d_begin_offsets,
OffsetIteratorT d_end_offsets,
int begin_bit = 0,
int end_bit = sizeof(KeyT) * 8,
cudaStream_t stream = 0,
bool debug_synchronous = false)
{
::rocprim::double_buffer<KeyT> d_keys_db = detail::to_double_buffer(d_keys);
cudaError_t error = (cudaError_t)::rocprim::segmented_radix_sort_keys_desc(
d_temp_storage, temp_storage_bytes,
d_keys_db, num_items,
num_segments, d_begin_offsets, d_end_offsets,
begin_bit, end_bit,
stream, debug_synchronous
);
detail::update_double_buffer(d_keys, d_keys_db);
return error;
}
};
END_HIPCUB_NAMESPACE
#endif // HIPCUB_ROCPRIM_DEVICE_DEVICE_SEGMENTED_RADIX_SORT_HPP_
/******************************************************************************
* Copyright (c) 2010-2011, Duane Merrill. All rights reserved.
* Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved.
* Modifications Copyright (c) 2017-2020, Advanced Micro Devices, Inc. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of the NVIDIA CORPORATION nor the
* names of its contributors may be used to endorse or promote products
* derived from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
* DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
******************************************************************************/
#ifndef HIPCUB_ROCPRIM_DEVICE_DEVICE_SEGMENTED_REDUCE_HPP_
#define HIPCUB_ROCPRIM_DEVICE_DEVICE_SEGMENTED_REDUCE_HPP_
#include <limits>
#include <iterator>
#include "../config.hpp"
#include "../thread/thread_operators.cuh"
#include "../iterator/arg_index_input_iterator.cuh"
#include <cub/rocprim/device/device_segmented_reduce.hpp>
BEGIN_HIPCUB_NAMESPACE
// DeviceSegmentedReduce: hipCUB wrapper forwarding per-segment reductions to
// rocPRIM. Segment i covers item indices [d_begin_offsets[i],
// d_end_offsets[i]); one reduced value is written per segment. All methods
// follow the two-phase protocol: a NULL d_temp_storage only sets
// temp_storage_bytes.
struct DeviceSegmentedReduce
{
// Generic per-segment reduction with a caller-provided operator and
// initial value (the initial value is also the result for empty segments).
template<
typename InputIteratorT,
typename OutputIteratorT,
typename OffsetIteratorT,
typename ReductionOp,
typename T
>
HIPCUB_RUNTIME_FUNCTION static
cudaError_t Reduce(void * d_temp_storage,
size_t& temp_storage_bytes,
InputIteratorT d_in,
OutputIteratorT d_out,
int num_segments,
OffsetIteratorT d_begin_offsets,
OffsetIteratorT d_end_offsets,
ReductionOp reduction_op,
T initial_value,
cudaStream_t stream = 0,
bool debug_synchronous = false)
{
// convert_result_type adapts the operator so intermediate results use the
// output iterator's value type; cast exposes rocPRIM's error code as the
// cudaError_t alias used by this API.
return (cudaError_t)::rocprim::segmented_reduce(
d_temp_storage, temp_storage_bytes,
d_in, d_out,
num_segments, d_begin_offsets, d_end_offsets,
::cub::detail::convert_result_type<InputIteratorT, OutputIteratorT>(reduction_op),
initial_value,
stream, debug_synchronous
);
}
// Per-segment sum; empty segments produce a value-initialized input_type().
template<
typename InputIteratorT,
typename OutputIteratorT,
typename OffsetIteratorT
>
HIPCUB_RUNTIME_FUNCTION static
cudaError_t Sum(void * d_temp_storage,
size_t& temp_storage_bytes,
InputIteratorT d_in,
OutputIteratorT d_out,
int num_segments,
OffsetIteratorT d_begin_offsets,
OffsetIteratorT d_end_offsets,
cudaStream_t stream = 0,
bool debug_synchronous = false)
{
using input_type = typename std::iterator_traits<InputIteratorT>::value_type;
return Reduce(
d_temp_storage, temp_storage_bytes,
d_in, d_out,
num_segments, d_begin_offsets, d_end_offsets,
::cub::Sum(), input_type(),
stream, debug_synchronous
);
}
// Per-segment minimum; empty segments produce numeric_limits::max().
// NOTE(review): uses std::numeric_limits rather than a detail helper like
// detail::get_lowest_value used by the non-segmented DeviceReduce in this
// file — may not cover non-standard GPU types (e.g. half); confirm.
template<
typename InputIteratorT,
typename OutputIteratorT,
typename OffsetIteratorT
>
HIPCUB_RUNTIME_FUNCTION static
cudaError_t Min(void * d_temp_storage,
size_t& temp_storage_bytes,
InputIteratorT d_in,
OutputIteratorT d_out,
int num_segments,
OffsetIteratorT d_begin_offsets,
OffsetIteratorT d_end_offsets,
cudaStream_t stream = 0,
bool debug_synchronous = false)
{
using input_type = typename std::iterator_traits<InputIteratorT>::value_type;
return Reduce(
d_temp_storage, temp_storage_bytes,
d_in, d_out,
num_segments, d_begin_offsets, d_end_offsets,
::cub::Min(), std::numeric_limits<input_type>::max(),
stream, debug_synchronous
);
}
// Per-segment argmin: writes a (index, value) pair per segment. The input
// is wrapped in an index-attaching iterator and reduced with ArgMin.
template<
typename InputIteratorT,
typename OutputIteratorT,
typename OffsetIteratorT
>
HIPCUB_RUNTIME_FUNCTION static
cudaError_t ArgMin(void * d_temp_storage,
size_t& temp_storage_bytes,
InputIteratorT d_in,
OutputIteratorT d_out,
int num_segments,
OffsetIteratorT d_begin_offsets,
OffsetIteratorT d_end_offsets,
cudaStream_t stream = 0,
bool debug_synchronous = false)
{
using OffsetT = int;
using T = typename std::iterator_traits<InputIteratorT>::value_type;
using O = typename std::iterator_traits<OutputIteratorT>::value_type;
// When the output iterator exposes no value type, fall back to a
// KeyValuePair of (offset, input value).
using OutputTupleT = typename std::conditional<
std::is_same<O, void>::value,
KeyValuePair<OffsetT, T>,
O
>::type;
using OutputValueT = typename OutputTupleT::Value;
using IteratorT = ArgIndexInputIterator<InputIteratorT, OffsetT, OutputValueT>;
IteratorT d_indexed_in(d_in);
// Initial pair: index 1 with the maximum representable value, so empty
// segments yield this sentinel pair.
const OutputTupleT init(1, std::numeric_limits<T>::max());
return Reduce(
d_temp_storage, temp_storage_bytes,
d_indexed_in, d_out,
num_segments, d_begin_offsets, d_end_offsets,
::cub::ArgMin(), init,
stream, debug_synchronous
);
}
// Per-segment maximum; empty segments produce numeric_limits::lowest().
template<
typename InputIteratorT,
typename OutputIteratorT,
typename OffsetIteratorT
>
HIPCUB_RUNTIME_FUNCTION static
cudaError_t Max(void * d_temp_storage,
size_t& temp_storage_bytes,
InputIteratorT d_in,
OutputIteratorT d_out,
int num_segments,
OffsetIteratorT d_begin_offsets,
OffsetIteratorT d_end_offsets,
cudaStream_t stream = 0,
bool debug_synchronous = false)
{
using input_type = typename std::iterator_traits<InputIteratorT>::value_type;
return Reduce(
d_temp_storage, temp_storage_bytes,
d_in, d_out,
num_segments, d_begin_offsets, d_end_offsets,
::cub::Max(), std::numeric_limits<input_type>::lowest(),
stream, debug_synchronous
);
}
// Per-segment argmax: writes a (index, value) pair per segment; mirrors
// ArgMin with the lowest representable value as the sentinel.
template<
typename InputIteratorT,
typename OutputIteratorT,
typename OffsetIteratorT
>
HIPCUB_RUNTIME_FUNCTION static
cudaError_t ArgMax(void * d_temp_storage,
size_t& temp_storage_bytes,
InputIteratorT d_in,
OutputIteratorT d_out,
int num_segments,
OffsetIteratorT d_begin_offsets,
OffsetIteratorT d_end_offsets,
cudaStream_t stream = 0,
bool debug_synchronous = false)
{
using OffsetT = int;
using T = typename std::iterator_traits<InputIteratorT>::value_type;
using O = typename std::iterator_traits<OutputIteratorT>::value_type;
using OutputTupleT = typename std::conditional<
std::is_same<O, void>::value,
KeyValuePair<OffsetT, T>,
O
>::type;
using OutputValueT = typename OutputTupleT::Value;
using IteratorT = ArgIndexInputIterator<InputIteratorT, OffsetT, OutputValueT>;
IteratorT d_indexed_in(d_in);
const OutputTupleT init(1, std::numeric_limits<T>::lowest());
return Reduce(
d_temp_storage, temp_storage_bytes,
d_indexed_in, d_out,
num_segments, d_begin_offsets, d_end_offsets,
::cub::ArgMax(), init,
stream, debug_synchronous
);
}
};
END_HIPCUB_NAMESPACE
#endif // HIPCUB_ROCPRIM_DEVICE_DEVICE_SEGMENTED_REDUCE_HPP_
/******************************************************************************
* Copyright (c) 2010-2011, Duane Merrill. All rights reserved.
* Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved.
* Modifications Copyright (c) 2017-2020, Advanced Micro Devices, Inc. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of the NVIDIA CORPORATION nor the
* names of its contributors may be used to endorse or promote products
* derived from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
* DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
******************************************************************************/
#ifndef HIPCUB_ROCPRIM_DEVICE_DEVICE_SEGMENTED_SORT_HPP_
#define HIPCUB_ROCPRIM_DEVICE_DEVICE_SEGMENTED_SORT_HPP_
#include "../config.hpp"
#include "../util_type.cuh"
#include <cub/rocprim/device/device_segmented_radix_sort.hpp>
BEGIN_HIPCUB_NAMESPACE
struct DeviceSegmentedSort
{
/// Ascending per-segment (key, value) sort, pointer interface. Forwards to
/// rocPRIM's segmented radix sort over the full bit range of KeyT.
/// Two-phase protocol: a NULL d_temp_storage only sets temp_storage_bytes.
template<typename KeyT, typename ValueT, typename OffsetIteratorT>
HIPCUB_RUNTIME_FUNCTION static
cudaError_t SortPairs(void * d_temp_storage,
                      size_t& temp_storage_bytes,
                      const KeyT * d_keys_in,
                      KeyT * d_keys_out,
                      const ValueT * d_values_in,
                      ValueT * d_values_out,
                      int num_items,
                      int num_segments,
                      OffsetIteratorT d_begin_offsets,
                      OffsetIteratorT d_end_offsets,
                      cudaStream_t stream = 0,
                      bool debug_synchronous = false)
{
    const auto status = ::rocprim::segmented_radix_sort_pairs(
        d_temp_storage, temp_storage_bytes,
        d_keys_in, d_keys_out, d_values_in, d_values_out, num_items,
        num_segments, d_begin_offsets, d_end_offsets,
        0, sizeof(KeyT) * 8, // sort on all key bits
        stream, debug_synchronous);
    return static_cast<cudaError_t>(status);
}
/// Ascending per-segment (key, value) sort, DoubleBuffer (ping-pong)
/// interface. Bridges the hipCUB DoubleBuffers to rocPRIM double_buffers,
/// sorts over the full key bit range, then propagates the resulting buffer
/// state back via detail::update_double_buffer.
template<typename KeyT, typename ValueT, typename OffsetIteratorT>
HIPCUB_RUNTIME_FUNCTION static
cudaError_t SortPairs(void * d_temp_storage,
                      size_t& temp_storage_bytes,
                      DoubleBuffer<KeyT>& d_keys,
                      DoubleBuffer<ValueT>& d_values,
                      int num_items,
                      int num_segments,
                      OffsetIteratorT d_begin_offsets,
                      OffsetIteratorT d_end_offsets,
                      cudaStream_t stream = 0,
                      bool debug_synchronous = false)
{
    auto keys_buffer = detail::to_double_buffer(d_keys);
    auto values_buffer = detail::to_double_buffer(d_values);
    const auto status = ::rocprim::segmented_radix_sort_pairs(
        d_temp_storage, temp_storage_bytes,
        keys_buffer, values_buffer, num_items,
        num_segments, d_begin_offsets, d_end_offsets,
        0, sizeof(KeyT) * 8, // sort on all key bits
        stream, debug_synchronous);
    detail::update_double_buffer(d_keys, keys_buffer);
    detail::update_double_buffer(d_values, values_buffer);
    return static_cast<cudaError_t>(status);
}
/// Descending per-segment (key, value) sort, pointer interface. Forwards to
/// rocPRIM's descending segmented radix sort over the full key bit range.
/// Two-phase protocol: a NULL d_temp_storage only sets temp_storage_bytes.
template<typename KeyT, typename ValueT, typename OffsetIteratorT>
HIPCUB_RUNTIME_FUNCTION static
cudaError_t SortPairsDescending(void * d_temp_storage,
                                size_t& temp_storage_bytes,
                                const KeyT * d_keys_in,
                                KeyT * d_keys_out,
                                const ValueT * d_values_in,
                                ValueT * d_values_out,
                                int num_items,
                                int num_segments,
                                OffsetIteratorT d_begin_offsets,
                                OffsetIteratorT d_end_offsets,
                                cudaStream_t stream = 0,
                                bool debug_synchronous = false)
{
    const auto status = ::rocprim::segmented_radix_sort_pairs_desc(
        d_temp_storage, temp_storage_bytes,
        d_keys_in, d_keys_out, d_values_in, d_values_out, num_items,
        num_segments, d_begin_offsets, d_end_offsets,
        0, sizeof(KeyT) * 8, // sort on all key bits
        stream, debug_synchronous);
    return static_cast<cudaError_t>(status);
}
/// Descending per-segment (key, value) sort, DoubleBuffer interface.
/// Bridges to rocPRIM double_buffers, sorts over the full key bit range,
/// then propagates the resulting buffer state back to the caller.
template<typename KeyT, typename ValueT, typename OffsetIteratorT>
HIPCUB_RUNTIME_FUNCTION static
cudaError_t SortPairsDescending(void * d_temp_storage,
                                size_t& temp_storage_bytes,
                                DoubleBuffer<KeyT>& d_keys,
                                DoubleBuffer<ValueT>& d_values,
                                int num_items,
                                int num_segments,
                                OffsetIteratorT d_begin_offsets,
                                OffsetIteratorT d_end_offsets,
                                cudaStream_t stream = 0,
                                bool debug_synchronous = false)
{
    auto keys_buffer = detail::to_double_buffer(d_keys);
    auto values_buffer = detail::to_double_buffer(d_values);
    const auto status = ::rocprim::segmented_radix_sort_pairs_desc(
        d_temp_storage, temp_storage_bytes,
        keys_buffer, values_buffer, num_items,
        num_segments, d_begin_offsets, d_end_offsets,
        0, sizeof(KeyT) * 8, // sort on all key bits
        stream, debug_synchronous);
    detail::update_double_buffer(d_keys, keys_buffer);
    detail::update_double_buffer(d_values, values_buffer);
    return static_cast<cudaError_t>(status);
}
/// \brief Sorts segmented keys into ascending order (contiguous array
/// interface).
///
/// Thin forwarding wrapper around rocPRIM's ascending segmented radix
/// sort, covering the full key bit range [0, sizeof(KeyT) * 8).
template<typename KeyT, typename OffsetIteratorT>
HIPCUB_RUNTIME_FUNCTION static
cudaError_t SortKeys(void * d_temp_storage,
                     size_t& temp_storage_bytes,
                     const KeyT * d_keys_in,
                     KeyT * d_keys_out,
                     int num_items,
                     int num_segments,
                     OffsetIteratorT d_begin_offsets,
                     OffsetIteratorT d_end_offsets,
                     cudaStream_t stream = 0,
                     bool debug_synchronous = false)
{
    return static_cast<cudaError_t>(
        ::rocprim::segmented_radix_sort_keys(
            d_temp_storage, temp_storage_bytes,
            d_keys_in, d_keys_out,
            num_items, num_segments,
            d_begin_offsets, d_end_offsets,
            0, sizeof(KeyT) * 8,
            stream, debug_synchronous));
}
/// \brief Sorts segmented keys into ascending order (DoubleBuffer
/// interface).
///
/// Converts the hipCUB double buffer to the rocPRIM equivalent for the
/// call, then writes the resulting buffer state back to the caller.
template<typename KeyT, typename OffsetIteratorT>
HIPCUB_RUNTIME_FUNCTION static
cudaError_t SortKeys(void * d_temp_storage,
                     size_t& temp_storage_bytes,
                     DoubleBuffer<KeyT>& d_keys,
                     int num_items,
                     int num_segments,
                     OffsetIteratorT d_begin_offsets,
                     OffsetIteratorT d_end_offsets,
                     cudaStream_t stream = 0,
                     bool debug_synchronous = false)
{
    ::rocprim::double_buffer<KeyT> keys_buffer = detail::to_double_buffer(d_keys);
    const cudaError_t result = static_cast<cudaError_t>(
        ::rocprim::segmented_radix_sort_keys(
            d_temp_storage, temp_storage_bytes,
            keys_buffer,
            num_items, num_segments,
            d_begin_offsets, d_end_offsets,
            0, sizeof(KeyT) * 8,
            stream, debug_synchronous));
    // Propagate the (possibly swapped) buffer selection back to the caller.
    detail::update_double_buffer(d_keys, keys_buffer);
    return result;
}
/// \brief Sorts segmented keys into descending order (contiguous array
/// interface).
///
/// Thin forwarding wrapper around rocPRIM's descending segmented radix
/// sort, covering the full key bit range [0, sizeof(KeyT) * 8).
template<typename KeyT, typename OffsetIteratorT>
HIPCUB_RUNTIME_FUNCTION static
cudaError_t SortKeysDescending(void * d_temp_storage,
                               size_t& temp_storage_bytes,
                               const KeyT * d_keys_in,
                               KeyT * d_keys_out,
                               int num_items,
                               int num_segments,
                               OffsetIteratorT d_begin_offsets,
                               OffsetIteratorT d_end_offsets,
                               cudaStream_t stream = 0,
                               bool debug_synchronous = false)
{
    return static_cast<cudaError_t>(
        ::rocprim::segmented_radix_sort_keys_desc(
            d_temp_storage, temp_storage_bytes,
            d_keys_in, d_keys_out,
            num_items, num_segments,
            d_begin_offsets, d_end_offsets,
            0, sizeof(KeyT) * 8,
            stream, debug_synchronous));
}
/// \brief Sorts segmented keys into descending order (DoubleBuffer
/// interface).
///
/// Converts the hipCUB double buffer to the rocPRIM equivalent for the
/// call, then writes the resulting buffer state back to the caller.
template<typename KeyT, typename OffsetIteratorT>
HIPCUB_RUNTIME_FUNCTION static
cudaError_t SortKeysDescending(void * d_temp_storage,
                               size_t& temp_storage_bytes,
                               DoubleBuffer<KeyT>& d_keys,
                               int num_items,
                               int num_segments,
                               OffsetIteratorT d_begin_offsets,
                               OffsetIteratorT d_end_offsets,
                               cudaStream_t stream = 0,
                               bool debug_synchronous = false)
{
    ::rocprim::double_buffer<KeyT> keys_buffer = detail::to_double_buffer(d_keys);
    const cudaError_t result = static_cast<cudaError_t>(
        ::rocprim::segmented_radix_sort_keys_desc(
            d_temp_storage, temp_storage_bytes,
            keys_buffer,
            num_items, num_segments,
            d_begin_offsets, d_end_offsets,
            0, sizeof(KeyT) * 8,
            stream, debug_synchronous));
    // Propagate the (possibly swapped) buffer selection back to the caller.
    detail::update_double_buffer(d_keys, keys_buffer);
    return result;
}
/// \brief Stable variant of SortPairs (contiguous array interface).
///
/// Forwards directly to SortPairs: the underlying radix sort is
/// inherently stable, so no extra work is needed.
template<typename KeyT, typename ValueT, typename OffsetIteratorT>
HIPCUB_RUNTIME_FUNCTION static
cudaError_t StableSortPairs(void * d_temp_storage,
                            size_t& temp_storage_bytes,
                            const KeyT * d_keys_in,
                            KeyT * d_keys_out,
                            const ValueT * d_values_in,
                            ValueT * d_values_out,
                            int num_items,
                            int num_segments,
                            OffsetIteratorT d_begin_offsets,
                            OffsetIteratorT d_end_offsets,
                            cudaStream_t stream = 0,
                            bool debug_synchronous = false)
{
    return SortPairs(d_temp_storage,
                     temp_storage_bytes,
                     d_keys_in,
                     d_keys_out,
                     d_values_in,
                     d_values_out,
                     num_items,
                     num_segments,
                     d_begin_offsets,
                     d_end_offsets,
                     stream,
                     debug_synchronous);
}
/// \brief Stable variant of SortPairs (DoubleBuffer interface).
///
/// Forwards directly to SortPairs: the underlying radix sort is
/// inherently stable, so no extra work is needed.
template<typename KeyT, typename ValueT, typename OffsetIteratorT>
HIPCUB_RUNTIME_FUNCTION static
cudaError_t StableSortPairs(void * d_temp_storage,
                            size_t& temp_storage_bytes,
                            DoubleBuffer<KeyT>& d_keys,
                            DoubleBuffer<ValueT>& d_values,
                            int num_items,
                            int num_segments,
                            OffsetIteratorT d_begin_offsets,
                            OffsetIteratorT d_end_offsets,
                            cudaStream_t stream = 0,
                            bool debug_synchronous = false)
{
    return SortPairs(d_temp_storage,
                     temp_storage_bytes,
                     d_keys,
                     d_values,
                     num_items,
                     num_segments,
                     d_begin_offsets,
                     d_end_offsets,
                     stream,
                     debug_synchronous);
}
/// \brief Stable variant of SortPairsDescending (contiguous array
/// interface).
///
/// Forwards directly to SortPairsDescending: the underlying radix sort
/// is inherently stable, so no extra work is needed.
template<typename KeyT, typename ValueT, typename OffsetIteratorT>
HIPCUB_RUNTIME_FUNCTION static
cudaError_t StableSortPairsDescending(void * d_temp_storage,
                                      size_t& temp_storage_bytes,
                                      const KeyT * d_keys_in,
                                      KeyT * d_keys_out,
                                      const ValueT * d_values_in,
                                      ValueT * d_values_out,
                                      int num_items,
                                      int num_segments,
                                      OffsetIteratorT d_begin_offsets,
                                      OffsetIteratorT d_end_offsets,
                                      cudaStream_t stream = 0,
                                      bool debug_synchronous = false)
{
    return SortPairsDescending(d_temp_storage,
                               temp_storage_bytes,
                               d_keys_in,
                               d_keys_out,
                               d_values_in,
                               d_values_out,
                               num_items,
                               num_segments,
                               d_begin_offsets,
                               d_end_offsets,
                               stream,
                               debug_synchronous);
}
/// \brief Stable variant of SortPairsDescending (DoubleBuffer interface).
///
/// Forwards directly to SortPairsDescending: the underlying radix sort
/// is inherently stable, so no extra work is needed.
template<typename KeyT, typename ValueT, typename OffsetIteratorT>
HIPCUB_RUNTIME_FUNCTION static
cudaError_t StableSortPairsDescending(void * d_temp_storage,
                                      size_t& temp_storage_bytes,
                                      DoubleBuffer<KeyT>& d_keys,
                                      DoubleBuffer<ValueT>& d_values,
                                      int num_items,
                                      int num_segments,
                                      OffsetIteratorT d_begin_offsets,
                                      OffsetIteratorT d_end_offsets,
                                      cudaStream_t stream = 0,
                                      bool debug_synchronous = false)
{
    return SortPairsDescending(d_temp_storage,
                               temp_storage_bytes,
                               d_keys,
                               d_values,
                               num_items,
                               num_segments,
                               d_begin_offsets,
                               d_end_offsets,
                               stream,
                               debug_synchronous);
}
/// \brief Stable variant of SortKeys (contiguous array interface).
///
/// Forwards directly to SortKeys: the underlying radix sort is
/// inherently stable, so no extra work is needed.
template<typename KeyT, typename OffsetIteratorT>
HIPCUB_RUNTIME_FUNCTION static
cudaError_t StableSortKeys(void * d_temp_storage,
                           size_t& temp_storage_bytes,
                           const KeyT * d_keys_in,
                           KeyT * d_keys_out,
                           int num_items,
                           int num_segments,
                           OffsetIteratorT d_begin_offsets,
                           OffsetIteratorT d_end_offsets,
                           cudaStream_t stream = 0,
                           bool debug_synchronous = false)
{
    return SortKeys(d_temp_storage,
                    temp_storage_bytes,
                    d_keys_in,
                    d_keys_out,
                    num_items,
                    num_segments,
                    d_begin_offsets,
                    d_end_offsets,
                    stream,
                    debug_synchronous);
}
/// \brief Stable variant of SortKeys (DoubleBuffer interface).
///
/// Forwards directly to SortKeys: the underlying radix sort is
/// inherently stable, so no extra work is needed.
template<typename KeyT, typename OffsetIteratorT>
HIPCUB_RUNTIME_FUNCTION static
cudaError_t StableSortKeys(void * d_temp_storage,
                           size_t& temp_storage_bytes,
                           DoubleBuffer<KeyT>& d_keys,
                           int num_items,
                           int num_segments,
                           OffsetIteratorT d_begin_offsets,
                           OffsetIteratorT d_end_offsets,
                           cudaStream_t stream = 0,
                           bool debug_synchronous = false)
{
    return SortKeys(d_temp_storage,
                    temp_storage_bytes,
                    d_keys,
                    num_items,
                    num_segments,
                    d_begin_offsets,
                    d_end_offsets,
                    stream,
                    debug_synchronous);
}
/// \brief Stable variant of SortKeysDescending (contiguous array
/// interface).
///
/// Forwards directly to SortKeysDescending: the underlying radix sort
/// is inherently stable, so no extra work is needed.
template<typename KeyT, typename OffsetIteratorT>
HIPCUB_RUNTIME_FUNCTION static
cudaError_t StableSortKeysDescending(void * d_temp_storage,
                                     size_t& temp_storage_bytes,
                                     const KeyT * d_keys_in,
                                     KeyT * d_keys_out,
                                     int num_items,
                                     int num_segments,
                                     OffsetIteratorT d_begin_offsets,
                                     OffsetIteratorT d_end_offsets,
                                     cudaStream_t stream = 0,
                                     bool debug_synchronous = false)
{
    return SortKeysDescending(d_temp_storage,
                              temp_storage_bytes,
                              d_keys_in,
                              d_keys_out,
                              num_items,
                              num_segments,
                              d_begin_offsets,
                              d_end_offsets,
                              stream,
                              debug_synchronous);
}
/// \brief Stable variant of SortKeysDescending (DoubleBuffer interface).
///
/// Forwards directly to SortKeysDescending: the underlying radix sort
/// is inherently stable, so no extra work is needed.
template<typename KeyT, typename OffsetIteratorT>
HIPCUB_RUNTIME_FUNCTION static
cudaError_t StableSortKeysDescending(void * d_temp_storage,
                                     size_t& temp_storage_bytes,
                                     DoubleBuffer<KeyT>& d_keys,
                                     int num_items,
                                     int num_segments,
                                     OffsetIteratorT d_begin_offsets,
                                     OffsetIteratorT d_end_offsets,
                                     cudaStream_t stream = 0,
                                     bool debug_synchronous = false)
{
    return SortKeysDescending(d_temp_storage,
                              temp_storage_bytes,
                              d_keys,
                              num_items,
                              num_segments,
                              d_begin_offsets,
                              d_end_offsets,
                              stream,
                              debug_synchronous);
}
};
END_HIPCUB_NAMESPACE
#endif // HIPCUB_ROCPRIM_DEVICE_DEVICE_SEGMENTED_SORT_HPP_
/******************************************************************************
* Copyright (c) 2010-2011, Duane Merrill. All rights reserved.
* Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved.
* Modifications Copyright (c) 2017-2020, Advanced Micro Devices, Inc. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of the NVIDIA CORPORATION nor the
* names of its contributors may be used to endorse or promote products
* derived from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
* DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
******************************************************************************/
#ifndef HIPCUB_ROCPRIM_DEVICE_DEVICE_SELECT_HPP_
#define HIPCUB_ROCPRIM_DEVICE_DEVICE_SELECT_HPP_
#include "../config.hpp"
#include "../thread/thread_operators.cuh"
#include <cub/rocprim/device/device_select.hpp>
BEGIN_HIPCUB_NAMESPACE
/// \brief Device-wide selection and compaction operations, implemented
/// as thin wrappers over rocPRIM's select / unique algorithms.
///
/// All methods follow the CUB two-phase convention: when d_temp_storage
/// is null, only temp_storage_bytes is computed; otherwise the
/// algorithm runs on the given stream.
class DeviceSelect
{
public:
    /// \brief Compacts items from d_in into d_out wherever the
    /// corresponding flag in d_flags is set; the count of selected
    /// items is written through d_num_selected_out.
    template <
        typename InputIteratorT,
        typename FlagIterator,
        typename OutputIteratorT,
        typename NumSelectedIteratorT
    >
    HIPCUB_RUNTIME_FUNCTION static
    cudaError_t Flagged(void *d_temp_storage,
                        size_t &temp_storage_bytes,
                        InputIteratorT d_in,
                        FlagIterator d_flags,
                        OutputIteratorT d_out,
                        NumSelectedIteratorT d_num_selected_out,
                        int num_items,
                        cudaStream_t stream = 0,
                        bool debug_synchronous = false)
    {
        return static_cast<cudaError_t>(
            ::rocprim::select(
                d_temp_storage, temp_storage_bytes,
                d_in, d_flags, d_out,
                d_num_selected_out, num_items,
                stream, debug_synchronous));
    }

    /// \brief Compacts items from d_in into d_out wherever
    /// select_op(item) is true; the count of selected items is written
    /// through d_num_selected_out.
    template <
        typename InputIteratorT,
        typename OutputIteratorT,
        typename NumSelectedIteratorT,
        typename SelectOp
    >
    HIPCUB_RUNTIME_FUNCTION static
    cudaError_t If(void *d_temp_storage,
                   size_t &temp_storage_bytes,
                   InputIteratorT d_in,
                   OutputIteratorT d_out,
                   NumSelectedIteratorT d_num_selected_out,
                   int num_items,
                   SelectOp select_op,
                   cudaStream_t stream = 0,
                   bool debug_synchronous = false)
    {
        return static_cast<cudaError_t>(
            ::rocprim::select(
                d_temp_storage, temp_storage_bytes,
                d_in, d_out,
                d_num_selected_out, num_items,
                select_op,
                stream, debug_synchronous));
    }

    /// \brief Copies one representative of each run of consecutive
    /// equal items (compared with cub::Equality) from d_in to d_out;
    /// the count of kept items is written through d_num_selected_out.
    template <
        typename InputIteratorT,
        typename OutputIteratorT,
        typename NumSelectedIteratorT
    >
    HIPCUB_RUNTIME_FUNCTION static
    cudaError_t Unique(void *d_temp_storage,
                       size_t &temp_storage_bytes,
                       InputIteratorT d_in,
                       OutputIteratorT d_out,
                       NumSelectedIteratorT d_num_selected_out,
                       int num_items,
                       cudaStream_t stream = 0,
                       bool debug_synchronous = false)
    {
        return static_cast<cudaError_t>(
            ::rocprim::unique(
                d_temp_storage, temp_storage_bytes,
                d_in, d_out,
                d_num_selected_out, num_items,
                cub::Equality(),
                stream, debug_synchronous));
    }

    /// \brief Key-value variant of Unique: keeps one key-value pair per
    /// run of consecutive equal keys (compared with cub::Equality); the
    /// count of kept pairs is written through d_num_selected_out.
    template <
        typename KeyIteratorT,
        typename ValueIteratorT,
        typename OutputKeyIteratorT,
        typename OutputValueIteratorT,
        typename NumSelectedIteratorT
    >
    HIPCUB_RUNTIME_FUNCTION static
    cudaError_t UniqueByKey(void *d_temp_storage,
                            size_t &temp_storage_bytes,
                            KeyIteratorT d_keys_input,
                            ValueIteratorT d_values_input,
                            OutputKeyIteratorT d_keys_output,
                            OutputValueIteratorT d_values_output,
                            NumSelectedIteratorT d_num_selected_out,
                            int num_items,
                            cudaStream_t stream = 0,
                            bool debug_synchronous = false)
    {
        return static_cast<cudaError_t>(
            ::rocprim::unique_by_key(
                d_temp_storage, temp_storage_bytes,
                d_keys_input, d_values_input,
                d_keys_output, d_values_output,
                d_num_selected_out, num_items,
                cub::Equality(),
                stream, debug_synchronous));
    }
};
END_HIPCUB_NAMESPACE
#endif // HIPCUB_ROCPRIM_DEVICE_DEVICE_SELECT_HPP_
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment