Commit f8a481f8 authored by zhouxiang

Add the cub header files from dtk

parent 7b7c64c5
/opt/dtk-23.04/cuda/include/cub
\ No newline at end of file
/******************************************************************************
* Copyright (c) 2010-2011, Duane Merrill. All rights reserved.
* Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved.
* Modifications Copyright (c) 2017-2020, Advanced Micro Devices, Inc. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of the NVIDIA CORPORATION nor the
* names of its contributors may be used to endorse or promote products
* derived from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
* DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
******************************************************************************/
#ifndef HIPCUB_ROCPRIM_BLOCK_BLOCK_ADJACENT_DIFFERENCE_HPP_
#define HIPCUB_ROCPRIM_BLOCK_BLOCK_ADJACENT_DIFFERENCE_HPP_
#include "../config.hpp"
#include <cub/rocprim/block/block_adjacent_difference.hpp>
BEGIN_HIPCUB_NAMESPACE
namespace detail
{
// Trait checks if FlagOp can be called with 3 arguments (a, b, b_index)
template<class T, class FlagOp, class = void>
struct WithBIndexArg
: std::false_type
{ };
template<class T, class FlagOp>
struct WithBIndexArg<
T, FlagOp,
typename std::conditional<
true,
void,
decltype(std::declval<FlagOp>()(std::declval<T>(), std::declval<T>(), 0))
>::type
> : std::true_type
{ };
}
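// A minimal illustration (not part of the original header) of what this
// trait detects, using hypothetical functor types:
//
//   struct TwoArgOp { HIPCUB_DEVICE bool operator()(int a, int b) const; };
//   struct ThreeArgOp { HIPCUB_DEVICE bool operator()(int a, int b, unsigned b_index) const; };
//
//   static_assert(!detail::WithBIndexArg<int, TwoArgOp>::value, "");
//   static_assert(detail::WithBIndexArg<int, ThreeArgOp>::value, "");
//
// The std::conditional<true, void, decltype(...)> construct is a void_t-style
// SFINAE probe: the partial specialization above is viable only when FlagOp
// is callable with (T, T, int).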
template<
typename T,
int BLOCK_DIM_X,
int BLOCK_DIM_Y = 1,
int BLOCK_DIM_Z = 1,
int ARCH = HIPCUB_ARCH /* ignored */
>
class BlockAdjacentDifference
: private ::rocprim::block_adjacent_difference<
T,
BLOCK_DIM_X,
BLOCK_DIM_Y,
BLOCK_DIM_Z
>
{
static_assert(
BLOCK_DIM_X * BLOCK_DIM_Y * BLOCK_DIM_Z > 0,
"BLOCK_DIM_X * BLOCK_DIM_Y * BLOCK_DIM_Z must be greater than 0"
);
using base_type =
typename ::rocprim::block_adjacent_difference<
T,
BLOCK_DIM_X,
BLOCK_DIM_Y,
BLOCK_DIM_Z
>;
// Reference to temporary storage (usually shared memory)
typename base_type::storage_type& temp_storage_;
public:
using TempStorage = typename base_type::storage_type;
HIPCUB_DEVICE inline
BlockAdjacentDifference() : temp_storage_(private_storage())
{
}
HIPCUB_DEVICE inline
BlockAdjacentDifference(TempStorage& temp_storage) : temp_storage_(temp_storage)
{
}
template<int ITEMS_PER_THREAD, typename FlagT, typename FlagOp>
[[deprecated("The Flags API of BlockAdjacentDifference is deprecated.")]]
HIPCUB_DEVICE inline
void FlagHeads(FlagT (&head_flags)[ITEMS_PER_THREAD],
T (&input)[ITEMS_PER_THREAD],
FlagOp flag_op)
{
HIPCUB_CLANG_SUPPRESS_WARNING_WITH_PUSH("-Wdeprecated")
base_type::flag_heads(head_flags, input, flag_op, temp_storage_);
HIPCUB_CLANG_SUPPRESS_WARNING_POP
}
template<int ITEMS_PER_THREAD, typename FlagT, typename FlagOp>
[[deprecated("The Flags API of BlockAdjacentDifference is deprecated.")]]
HIPCUB_DEVICE inline
void FlagHeads(FlagT (&head_flags)[ITEMS_PER_THREAD],
T (&input)[ITEMS_PER_THREAD],
FlagOp flag_op,
T tile_predecessor_item)
{
HIPCUB_CLANG_SUPPRESS_WARNING_WITH_PUSH("-Wdeprecated")
base_type::flag_heads(head_flags, tile_predecessor_item, input, flag_op, temp_storage_);
HIPCUB_CLANG_SUPPRESS_WARNING_POP
}
template<int ITEMS_PER_THREAD, typename FlagT, typename FlagOp>
[[deprecated("The Flags API of BlockAdjacentDifference is deprecated.")]]
HIPCUB_DEVICE inline
void FlagTails(FlagT (&tail_flags)[ITEMS_PER_THREAD],
T (&input)[ITEMS_PER_THREAD],
FlagOp flag_op)
{
HIPCUB_CLANG_SUPPRESS_WARNING_WITH_PUSH("-Wdeprecated")
base_type::flag_tails(tail_flags, input, flag_op, temp_storage_);
HIPCUB_CLANG_SUPPRESS_WARNING_POP
}
template<int ITEMS_PER_THREAD, typename FlagT, typename FlagOp>
[[deprecated("The Flags API of BlockAdjacentDifference is deprecated.")]]
HIPCUB_DEVICE inline
void FlagTails(FlagT (&tail_flags)[ITEMS_PER_THREAD],
T (&input)[ITEMS_PER_THREAD],
FlagOp flag_op,
T tile_successor_item)
{
HIPCUB_CLANG_SUPPRESS_WARNING_WITH_PUSH("-Wdeprecated")
base_type::flag_tails(tail_flags, tile_successor_item, input, flag_op, temp_storage_);
HIPCUB_CLANG_SUPPRESS_WARNING_POP
}
template<int ITEMS_PER_THREAD, typename FlagT, typename FlagOp>
[[deprecated("The Flags API of BlockAdjacentDifference is deprecated.")]]
HIPCUB_DEVICE inline
void FlagHeadsAndTails(FlagT (&head_flags)[ITEMS_PER_THREAD],
FlagT (&tail_flags)[ITEMS_PER_THREAD],
T (&input)[ITEMS_PER_THREAD],
FlagOp flag_op)
{
HIPCUB_CLANG_SUPPRESS_WARNING_WITH_PUSH("-Wdeprecated")
base_type::flag_heads_and_tails(
head_flags, tail_flags, input,
flag_op, temp_storage_
);
HIPCUB_CLANG_SUPPRESS_WARNING_POP
}
template<int ITEMS_PER_THREAD, typename FlagT, typename FlagOp>
[[deprecated("The Flags API of BlockAdjacentDifference is deprecated.")]]
HIPCUB_DEVICE inline
void FlagHeadsAndTails(FlagT (&head_flags)[ITEMS_PER_THREAD],
FlagT (&tail_flags)[ITEMS_PER_THREAD],
T tile_successor_item,
T (&input)[ITEMS_PER_THREAD],
FlagOp flag_op)
{
HIPCUB_CLANG_SUPPRESS_WARNING_WITH_PUSH("-Wdeprecated")
base_type::flag_heads_and_tails(
head_flags, tail_flags, tile_successor_item, input,
flag_op, temp_storage_
);
HIPCUB_CLANG_SUPPRESS_WARNING_POP
}
template<int ITEMS_PER_THREAD, typename FlagT, typename FlagOp>
[[deprecated("The Flags API of BlockAdjacentDifference is deprecated.")]]
HIPCUB_DEVICE inline
void FlagHeadsAndTails(FlagT (&head_flags)[ITEMS_PER_THREAD],
T tile_predecessor_item,
FlagT (&tail_flags)[ITEMS_PER_THREAD],
T (&input)[ITEMS_PER_THREAD],
FlagOp flag_op)
{
HIPCUB_CLANG_SUPPRESS_WARNING_WITH_PUSH("-Wdeprecated")
base_type::flag_heads_and_tails(
head_flags, tile_predecessor_item, tail_flags, input,
flag_op, temp_storage_
);
HIPCUB_CLANG_SUPPRESS_WARNING_POP
}
template<int ITEMS_PER_THREAD, typename FlagT, typename FlagOp>
[[deprecated("The Flags API of BlockAdjacentDifference is deprecated.")]]
HIPCUB_DEVICE inline
void FlagHeadsAndTails(FlagT (&head_flags)[ITEMS_PER_THREAD],
T tile_predecessor_item,
FlagT (&tail_flags)[ITEMS_PER_THREAD],
T tile_successor_item,
T (&input)[ITEMS_PER_THREAD],
FlagOp flag_op)
{
HIPCUB_CLANG_SUPPRESS_WARNING_WITH_PUSH("-Wdeprecated")
base_type::flag_heads_and_tails(
head_flags, tile_predecessor_item, tail_flags, tile_successor_item, input,
flag_op, temp_storage_
);
HIPCUB_CLANG_SUPPRESS_WARNING_POP
}
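/**
 * A usage sketch (illustrative only; CustomDifference and the load step are
 * assumptions, not part of this header). SubtractLeft computes, per item,
 * difference_op(item, predecessor), where the predecessor of a thread's
 * first item is the last item of the previous thread; in this overload the
 * very first item of the block is passed through unchanged.
 * @code
 * struct CustomDifference
 * {
 *     template <typename DataType>
 *     HIPCUB_DEVICE DataType operator()(DataType lhs, DataType rhs)
 *     {
 *         return lhs - rhs;
 *     }
 * };
 *
 * __global__ void ExampleKernel(...)
 * {
 *     // Specialize for a 1D block of 128 threads owning 4 integer items each
 *     using BlockAdjacentDifferenceT = hipcub::BlockAdjacentDifference<int, 128>;
 *     __shared__ typename BlockAdjacentDifferenceT::TempStorage temp_storage;
 *
 *     int thread_data[4];
 *     // ... load thread_data ...
 *     BlockAdjacentDifferenceT(temp_storage)
 *         .SubtractLeft(thread_data, thread_data, CustomDifference());
 * }
 * @endcode
 */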
template <int ITEMS_PER_THREAD, typename OutputType, typename DifferenceOpT>
HIPCUB_DEVICE inline
void SubtractLeft(T (&input)[ITEMS_PER_THREAD],
OutputType (&output)[ITEMS_PER_THREAD],
DifferenceOpT difference_op)
{
base_type::subtract_left(
input, output, difference_op, temp_storage_
);
}
template <int ITEMS_PER_THREAD, typename OutputT, typename DifferenceOpT>
HIPCUB_DEVICE inline
void SubtractLeft(T (&input)[ITEMS_PER_THREAD],
OutputT (&output)[ITEMS_PER_THREAD],
DifferenceOpT difference_op,
T tile_predecessor_item)
{
base_type::subtract_left(
input, output, difference_op, tile_predecessor_item, temp_storage_
);
}
template <int ITEMS_PER_THREAD, typename OutputType, typename DifferenceOpT>
HIPCUB_DEVICE inline
void SubtractLeftPartialTile(T (&input)[ITEMS_PER_THREAD],
OutputType (&output)[ITEMS_PER_THREAD],
DifferenceOpT difference_op,
int valid_items)
{
base_type::subtract_left_partial(
input, output, difference_op, valid_items, temp_storage_
);
}
template <int ITEMS_PER_THREAD, typename OutputT, typename DifferenceOpT>
HIPCUB_DEVICE inline
void SubtractRight(T (&input)[ITEMS_PER_THREAD],
OutputT (&output)[ITEMS_PER_THREAD],
DifferenceOpT difference_op)
{
base_type::subtract_right(
input, output, difference_op, temp_storage_
);
}
template <int ITEMS_PER_THREAD, typename OutputT, typename DifferenceOpT>
HIPCUB_DEVICE inline
void SubtractRight(T (&input)[ITEMS_PER_THREAD],
OutputT (&output)[ITEMS_PER_THREAD],
DifferenceOpT difference_op,
T tile_successor_item)
{
base_type::subtract_right(
input, output, difference_op, tile_successor_item, temp_storage_
);
}
template <int ITEMS_PER_THREAD, typename OutputT, typename DifferenceOpT>
HIPCUB_DEVICE inline
void SubtractRightPartialTile(T (&input)[ITEMS_PER_THREAD],
OutputT (&output)[ITEMS_PER_THREAD],
DifferenceOpT difference_op,
int valid_items)
{
base_type::subtract_right_partial(
input, output, difference_op, valid_items, temp_storage_
);
}
private:
HIPCUB_DEVICE inline
TempStorage& private_storage()
{
HIPCUB_SHARED_MEMORY TempStorage private_storage;
return private_storage;
}
};
END_HIPCUB_NAMESPACE
#endif // HIPCUB_ROCPRIM_BLOCK_BLOCK_ADJACENT_DIFFERENCE_HPP_
/******************************************************************************
* Copyright (c) 2010-2011, Duane Merrill. All rights reserved.
* Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved.
* Modifications Copyright (c) 2017-2020, Advanced Micro Devices, Inc. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of the NVIDIA CORPORATION nor the
* names of its contributors may be used to endorse or promote products
* derived from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
* DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
******************************************************************************/
#ifndef HIPCUB_ROCPRIM_BLOCK_BLOCK_DISCONTINUITY_HPP_
#define HIPCUB_ROCPRIM_BLOCK_BLOCK_DISCONTINUITY_HPP_
#include "../config.hpp"
#include <cub/rocprim/block/block_discontinuity.hpp>
BEGIN_HIPCUB_NAMESPACE
template<
typename T,
int BLOCK_DIM_X,
int BLOCK_DIM_Y = 1,
int BLOCK_DIM_Z = 1,
int ARCH = HIPCUB_ARCH /* ignored */
>
class BlockDiscontinuity
: private ::rocprim::block_discontinuity<
T,
BLOCK_DIM_X,
BLOCK_DIM_Y,
BLOCK_DIM_Z
>
{
static_assert(
BLOCK_DIM_X * BLOCK_DIM_Y * BLOCK_DIM_Z > 0,
"BLOCK_DIM_X * BLOCK_DIM_Y * BLOCK_DIM_Z must be greater than 0"
);
using base_type =
typename ::rocprim::block_discontinuity<
T,
BLOCK_DIM_X,
BLOCK_DIM_Y,
BLOCK_DIM_Z
>;
// Reference to temporary storage (usually shared memory)
typename base_type::storage_type& temp_storage_;
public:
using TempStorage = typename base_type::storage_type;
HIPCUB_DEVICE inline
BlockDiscontinuity() : temp_storage_(private_storage())
{
}
HIPCUB_DEVICE inline
BlockDiscontinuity(TempStorage& temp_storage) : temp_storage_(temp_storage)
{
}
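/**
 * A usage sketch (illustrative only; the load step is an assumption).
 * FlagHeads sets head_flags[i] to 1 wherever flag_op reports a
 * discontinuity between an item and its predecessor; in this overload the
 * first item of thread 0 is always flagged as a head.
 * @code
 * using BlockDiscontinuityT = hipcub::BlockDiscontinuity<int, 128>;
 * __shared__ typename BlockDiscontinuityT::TempStorage temp_storage;
 *
 * int thread_data[4];
 * // ... load thread_data ...
 * int head_flags[4];
 * BlockDiscontinuityT(temp_storage).FlagHeads(head_flags, thread_data,
 *                                             hipcub::Inequality());
 * @endcode
 */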
template<int ITEMS_PER_THREAD, typename FlagT, typename FlagOp>
HIPCUB_DEVICE inline
void FlagHeads(FlagT (&head_flags)[ITEMS_PER_THREAD],
T (&input)[ITEMS_PER_THREAD],
FlagOp flag_op)
{
base_type::flag_heads(head_flags, input, flag_op, temp_storage_);
}
template<int ITEMS_PER_THREAD, typename FlagT, typename FlagOp>
HIPCUB_DEVICE inline
void FlagHeads(FlagT (&head_flags)[ITEMS_PER_THREAD],
T (&input)[ITEMS_PER_THREAD],
FlagOp flag_op,
T tile_predecessor_item)
{
base_type::flag_heads(head_flags, tile_predecessor_item, input, flag_op, temp_storage_);
}
template<int ITEMS_PER_THREAD, typename FlagT, typename FlagOp>
HIPCUB_DEVICE inline
void FlagTails(FlagT (&tail_flags)[ITEMS_PER_THREAD],
T (&input)[ITEMS_PER_THREAD],
FlagOp flag_op)
{
base_type::flag_tails(tail_flags, input, flag_op, temp_storage_);
}
template<int ITEMS_PER_THREAD, typename FlagT, typename FlagOp>
HIPCUB_DEVICE inline
void FlagTails(FlagT (&tail_flags)[ITEMS_PER_THREAD],
T (&input)[ITEMS_PER_THREAD],
FlagOp flag_op,
T tile_successor_item)
{
base_type::flag_tails(tail_flags, tile_successor_item, input, flag_op, temp_storage_);
}
template<int ITEMS_PER_THREAD, typename FlagT, typename FlagOp>
HIPCUB_DEVICE inline
void FlagHeadsAndTails(FlagT (&head_flags)[ITEMS_PER_THREAD],
FlagT (&tail_flags)[ITEMS_PER_THREAD],
T (&input)[ITEMS_PER_THREAD],
FlagOp flag_op)
{
base_type::flag_heads_and_tails(
head_flags, tail_flags, input,
flag_op, temp_storage_
);
}
template<int ITEMS_PER_THREAD, typename FlagT, typename FlagOp>
HIPCUB_DEVICE inline
void FlagHeadsAndTails(FlagT (&head_flags)[ITEMS_PER_THREAD],
FlagT (&tail_flags)[ITEMS_PER_THREAD],
T tile_successor_item,
T (&input)[ITEMS_PER_THREAD],
FlagOp flag_op)
{
base_type::flag_heads_and_tails(
head_flags, tail_flags, tile_successor_item, input,
flag_op, temp_storage_
);
}
template<int ITEMS_PER_THREAD, typename FlagT, typename FlagOp>
HIPCUB_DEVICE inline
void FlagHeadsAndTails(FlagT (&head_flags)[ITEMS_PER_THREAD],
T tile_predecessor_item,
FlagT (&tail_flags)[ITEMS_PER_THREAD],
T (&input)[ITEMS_PER_THREAD],
FlagOp flag_op)
{
base_type::flag_heads_and_tails(
head_flags, tile_predecessor_item, tail_flags, input,
flag_op, temp_storage_
);
}
template<int ITEMS_PER_THREAD, typename FlagT, typename FlagOp>
HIPCUB_DEVICE inline
void FlagHeadsAndTails(FlagT (&head_flags)[ITEMS_PER_THREAD],
T tile_predecessor_item,
FlagT (&tail_flags)[ITEMS_PER_THREAD],
T tile_successor_item,
T (&input)[ITEMS_PER_THREAD],
FlagOp flag_op)
{
base_type::flag_heads_and_tails(
head_flags, tile_predecessor_item, tail_flags, tile_successor_item, input,
flag_op, temp_storage_
);
}
private:
HIPCUB_DEVICE inline
TempStorage& private_storage()
{
HIPCUB_SHARED_MEMORY TempStorage private_storage;
return private_storage;
}
};
END_HIPCUB_NAMESPACE
#endif // HIPCUB_ROCPRIM_BLOCK_BLOCK_DISCONTINUITY_HPP_
/******************************************************************************
* Copyright (c) 2010-2011, Duane Merrill. All rights reserved.
* Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved.
* Modifications Copyright (c) 2017-2020, Advanced Micro Devices, Inc. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of the NVIDIA CORPORATION nor the
* names of its contributors may be used to endorse or promote products
* derived from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
* DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
******************************************************************************/
#ifndef HIPCUB_ROCPRIM_BLOCK_BLOCK_EXCHANGE_HPP_
#define HIPCUB_ROCPRIM_BLOCK_BLOCK_EXCHANGE_HPP_
#include "../config.hpp"
#include <cub/rocprim/block/block_exchange.hpp>
BEGIN_HIPCUB_NAMESPACE
template<
typename InputT,
int BLOCK_DIM_X,
int ITEMS_PER_THREAD,
bool WARP_TIME_SLICING = false, /* ignored */
int BLOCK_DIM_Y = 1,
int BLOCK_DIM_Z = 1,
int ARCH = HIPCUB_ARCH /* ignored */
>
class BlockExchange
: private ::rocprim::block_exchange<
InputT,
BLOCK_DIM_X,
ITEMS_PER_THREAD,
BLOCK_DIM_Y,
BLOCK_DIM_Z
>
{
static_assert(
BLOCK_DIM_X * BLOCK_DIM_Y * BLOCK_DIM_Z > 0,
"BLOCK_DIM_X * BLOCK_DIM_Y * BLOCK_DIM_Z must be greater than 0"
);
using base_type =
typename ::rocprim::block_exchange<
InputT,
BLOCK_DIM_X,
ITEMS_PER_THREAD,
BLOCK_DIM_Y,
BLOCK_DIM_Z
>;
// Reference to temporary storage (usually shared memory)
typename base_type::storage_type& temp_storage_;
public:
using TempStorage = typename base_type::storage_type;
HIPCUB_DEVICE inline
BlockExchange() : temp_storage_(private_storage())
{
}
HIPCUB_DEVICE inline
BlockExchange(TempStorage& temp_storage) : temp_storage_(temp_storage)
{
}
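/**
 * A usage sketch (illustrative only). With 128 threads and 4 items each,
 * a striped arrangement places item i of thread t at tile rank i * 128 + t,
 * while a blocked arrangement places it at rank t * 4 + i;
 * StripedToBlocked converts the former into the latter.
 * @code
 * using BlockExchangeT = hipcub::BlockExchange<int, 128, 4>;
 * __shared__ typename BlockExchangeT::TempStorage temp_storage;
 *
 * int thread_data[4];
 * // ... load a striped arrangement, e.g. with LoadDirectStriped<128> ...
 * BlockExchangeT(temp_storage).StripedToBlocked(thread_data, thread_data);
 * @endcode
 */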
template<typename OutputT>
HIPCUB_DEVICE inline
void StripedToBlocked(InputT (&input_items)[ITEMS_PER_THREAD],
OutputT (&output_items)[ITEMS_PER_THREAD])
{
base_type::striped_to_blocked(input_items, output_items, temp_storage_);
}
template<typename OutputT>
HIPCUB_DEVICE inline
void BlockedToStriped(InputT (&input_items)[ITEMS_PER_THREAD],
OutputT (&output_items)[ITEMS_PER_THREAD])
{
base_type::blocked_to_striped(input_items, output_items, temp_storage_);
}
template<typename OutputT>
HIPCUB_DEVICE inline
void WarpStripedToBlocked(InputT (&input_items)[ITEMS_PER_THREAD],
OutputT (&output_items)[ITEMS_PER_THREAD])
{
base_type::warp_striped_to_blocked(input_items, output_items, temp_storage_);
}
template<typename OutputT>
HIPCUB_DEVICE inline
void BlockedToWarpStriped(InputT (&input_items)[ITEMS_PER_THREAD],
OutputT (&output_items)[ITEMS_PER_THREAD])
{
base_type::blocked_to_warp_striped(input_items, output_items, temp_storage_);
}
template<typename OutputT, typename OffsetT>
HIPCUB_DEVICE inline
void ScatterToBlocked(InputT (&input_items)[ITEMS_PER_THREAD],
OutputT (&output_items)[ITEMS_PER_THREAD],
OffsetT (&ranks)[ITEMS_PER_THREAD])
{
base_type::scatter_to_blocked(input_items, output_items, ranks, temp_storage_);
}
template<typename OutputT, typename OffsetT>
HIPCUB_DEVICE inline
void ScatterToStriped(InputT (&input_items)[ITEMS_PER_THREAD],
OutputT (&output_items)[ITEMS_PER_THREAD],
OffsetT (&ranks)[ITEMS_PER_THREAD])
{
base_type::scatter_to_striped(input_items, output_items, ranks, temp_storage_);
}
template<typename OutputT, typename OffsetT>
HIPCUB_DEVICE inline
void ScatterToStripedGuarded(InputT (&input_items)[ITEMS_PER_THREAD],
OutputT (&output_items)[ITEMS_PER_THREAD],
OffsetT (&ranks)[ITEMS_PER_THREAD])
{
base_type::scatter_to_striped_guarded(input_items, output_items, ranks, temp_storage_);
}
template<typename OutputT, typename OffsetT, typename ValidFlag>
HIPCUB_DEVICE inline
void ScatterToStripedFlagged(InputT (&input_items)[ITEMS_PER_THREAD],
OutputT (&output_items)[ITEMS_PER_THREAD],
OffsetT (&ranks)[ITEMS_PER_THREAD],
ValidFlag (&is_valid)[ITEMS_PER_THREAD])
{
base_type::scatter_to_striped_flagged(input_items, output_items, ranks, is_valid, temp_storage_);
}
#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document
HIPCUB_DEVICE inline void StripedToBlocked(
InputT (&items)[ITEMS_PER_THREAD]) ///< [in-out] Items to exchange, converting between <em>striped</em> and <em>blocked</em> arrangements.
{
StripedToBlocked(items, items);
}
HIPCUB_DEVICE inline void BlockedToStriped(
InputT (&items)[ITEMS_PER_THREAD]) ///< [in-out] Items to exchange, converting between <em>striped</em> and <em>blocked</em> arrangements.
{
BlockedToStriped(items, items);
}
HIPCUB_DEVICE inline void WarpStripedToBlocked(
InputT (&items)[ITEMS_PER_THREAD]) ///< [in-out] Items to exchange, converting between <em>striped</em> and <em>blocked</em> arrangements.
{
WarpStripedToBlocked(items, items);
}
HIPCUB_DEVICE inline void BlockedToWarpStriped(
InputT (&items)[ITEMS_PER_THREAD]) ///< [in-out] Items to exchange, converting between <em>striped</em> and <em>blocked</em> arrangements.
{
BlockedToWarpStriped(items, items);
}
template <typename OffsetT>
HIPCUB_DEVICE inline void ScatterToBlocked(
InputT (&items)[ITEMS_PER_THREAD], ///< [in-out] Items to exchange, converting between <em>striped</em> and <em>blocked</em> arrangements.
OffsetT (&ranks)[ITEMS_PER_THREAD]) ///< [in] Corresponding scatter ranks
{
ScatterToBlocked(items, items, ranks);
}
template <typename OffsetT>
HIPCUB_DEVICE inline void ScatterToStriped(
InputT (&items)[ITEMS_PER_THREAD], ///< [in-out] Items to exchange, converting between <em>striped</em> and <em>blocked</em> arrangements.
OffsetT (&ranks)[ITEMS_PER_THREAD]) ///< [in] Corresponding scatter ranks
{
ScatterToStriped(items, items, ranks);
}
template <typename OffsetT>
HIPCUB_DEVICE inline void ScatterToStripedGuarded(
InputT (&items)[ITEMS_PER_THREAD], ///< [in-out] Items to exchange, converting between <em>striped</em> and <em>blocked</em> arrangements.
OffsetT (&ranks)[ITEMS_PER_THREAD]) ///< [in] Corresponding scatter ranks
{
ScatterToStripedGuarded(items, items, ranks);
}
template <typename OffsetT, typename ValidFlag>
HIPCUB_DEVICE inline void ScatterToStripedFlagged(
InputT (&items)[ITEMS_PER_THREAD], ///< [in-out] Items to exchange, converting between <em>striped</em> and <em>blocked</em> arrangements.
OffsetT (&ranks)[ITEMS_PER_THREAD], ///< [in] Corresponding scatter ranks
ValidFlag (&is_valid)[ITEMS_PER_THREAD]) ///< [in] Corresponding flag denoting item validity
{
ScatterToStripedFlagged(items, items, ranks, is_valid);
}
#endif // DOXYGEN_SHOULD_SKIP_THIS
private:
HIPCUB_DEVICE inline
TempStorage& private_storage()
{
HIPCUB_SHARED_MEMORY TempStorage private_storage;
return private_storage;
}
};
END_HIPCUB_NAMESPACE
#endif // HIPCUB_ROCPRIM_BLOCK_BLOCK_EXCHANGE_HPP_
/******************************************************************************
* Copyright (c) 2010-2011, Duane Merrill. All rights reserved.
* Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved.
* Modifications Copyright (c) 2017-2020, Advanced Micro Devices, Inc. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of the NVIDIA CORPORATION nor the
* names of its contributors may be used to endorse or promote products
* derived from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
* DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
******************************************************************************/
#ifndef HIPCUB_ROCPRIM_BLOCK_BLOCK_HISTOGRAM_HPP_
#define HIPCUB_ROCPRIM_BLOCK_BLOCK_HISTOGRAM_HPP_
#include <type_traits>
#include "../config.hpp"
#include <cub/rocprim/block/block_histogram.hpp>
BEGIN_HIPCUB_NAMESPACE
namespace detail
{
inline constexpr
typename std::underlying_type<::rocprim::block_histogram_algorithm>::type
to_BlockHistogramAlgorithm_enum(::rocprim::block_histogram_algorithm v)
{
using utype = std::underlying_type<::rocprim::block_histogram_algorithm>::type;
return static_cast<utype>(v);
}
}
enum BlockHistogramAlgorithm
{
BLOCK_HISTO_ATOMIC
= detail::to_BlockHistogramAlgorithm_enum(::rocprim::block_histogram_algorithm::using_atomic),
BLOCK_HISTO_SORT
= detail::to_BlockHistogramAlgorithm_enum(::rocprim::block_histogram_algorithm::using_sort)
};
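// BLOCK_HISTO_ATOMIC composites items into the bin counters with atomic
// additions, while BLOCK_HISTO_SORT sorts the items and then counts run
// lengths; which of the two is faster generally depends on the number of
// bins and on how heavily the input collides on the same bins.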
template<
typename T,
int BLOCK_DIM_X,
int ITEMS_PER_THREAD,
int BINS,
BlockHistogramAlgorithm ALGORITHM = BLOCK_HISTO_SORT,
int BLOCK_DIM_Y = 1,
int BLOCK_DIM_Z = 1,
int ARCH = HIPCUB_ARCH /* ignored */
>
class BlockHistogram
: private ::rocprim::block_histogram<
T,
BLOCK_DIM_X,
ITEMS_PER_THREAD,
BINS,
static_cast<::rocprim::block_histogram_algorithm>(ALGORITHM),
BLOCK_DIM_Y,
BLOCK_DIM_Z
>
{
static_assert(
BLOCK_DIM_X * BLOCK_DIM_Y * BLOCK_DIM_Z > 0,
"BLOCK_DIM_X * BLOCK_DIM_Y * BLOCK_DIM_Z must be greater than 0"
);
using base_type =
typename ::rocprim::block_histogram<
T,
BLOCK_DIM_X,
ITEMS_PER_THREAD,
BINS,
static_cast<::rocprim::block_histogram_algorithm>(ALGORITHM),
BLOCK_DIM_Y,
BLOCK_DIM_Z
>;
// Reference to temporary storage (usually shared memory)
typename base_type::storage_type& temp_storage_;
public:
using TempStorage = typename base_type::storage_type;
HIPCUB_DEVICE inline
BlockHistogram() : temp_storage_(private_storage())
{
}
HIPCUB_DEVICE inline
BlockHistogram(TempStorage& temp_storage) : temp_storage_(temp_storage)
{
}
template<class CounterT>
HIPCUB_DEVICE inline
void InitHistogram(CounterT histogram[BINS])
{
base_type::init_histogram(histogram);
}
template<class CounterT>
HIPCUB_DEVICE inline
void Composite(T (&items)[ITEMS_PER_THREAD],
CounterT histogram[BINS])
{
base_type::composite(items, histogram, temp_storage_);
}
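/**
 * A usage sketch (illustrative only; the sample-loading step is an
 * assumption). Histogram() is InitHistogram() followed by a block-wide
 * synchronization and Composite(), i.e. it both zeroes and populates the
 * bin counters.
 * @code
 * // 128 threads, 4 samples per thread, 256 bins
 * using BlockHistogramT = hipcub::BlockHistogram<unsigned char, 128, 4, 256>;
 * __shared__ typename BlockHistogramT::TempStorage temp_storage;
 * __shared__ unsigned int smem_histogram[256];
 *
 * unsigned char thread_samples[4];
 * // ... load thread_samples ...
 * BlockHistogramT(temp_storage).Histogram(thread_samples, smem_histogram);
 * @endcode
 */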
template<class CounterT>
HIPCUB_DEVICE inline
void Histogram(T (&items)[ITEMS_PER_THREAD],
CounterT histogram[BINS])
{
base_type::init_histogram(histogram);
CTA_SYNC();
base_type::composite(items, histogram, temp_storage_);
}
private:
HIPCUB_DEVICE inline
TempStorage& private_storage()
{
HIPCUB_SHARED_MEMORY TempStorage private_storage;
return private_storage;
}
};
END_HIPCUB_NAMESPACE
#endif // HIPCUB_ROCPRIM_BLOCK_BLOCK_HISTOGRAM_HPP_
/******************************************************************************
* Copyright (c) 2010-2011, Duane Merrill. All rights reserved.
* Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved.
* Modifications Copyright (c) 2017-2020, Advanced Micro Devices, Inc. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of the NVIDIA CORPORATION nor the
* names of its contributors may be used to endorse or promote products
* derived from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
* DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
******************************************************************************/
#ifndef HIPCUB_ROCPRIM_BLOCK_BLOCK_LOAD_HPP_
#define HIPCUB_ROCPRIM_BLOCK_BLOCK_LOAD_HPP_
#include <type_traits>
#include "../config.hpp"
#include <cub/rocprim/block/block_load.hpp>
#include "block_load_func.cuh"
BEGIN_HIPCUB_NAMESPACE
namespace detail
{
inline constexpr
typename std::underlying_type<::rocprim::block_load_method>::type
to_BlockLoadAlgorithm_enum(::rocprim::block_load_method v)
{
using utype = std::underlying_type<::rocprim::block_load_method>::type;
return static_cast<utype>(v);
}
}
enum BlockLoadAlgorithm
{
BLOCK_LOAD_DIRECT
= detail::to_BlockLoadAlgorithm_enum(::rocprim::block_load_method::block_load_direct),
BLOCK_LOAD_STRIPED
= detail::to_BlockLoadAlgorithm_enum(::rocprim::block_load_method::block_load_striped),
BLOCK_LOAD_VECTORIZE
= detail::to_BlockLoadAlgorithm_enum(::rocprim::block_load_method::block_load_vectorize),
BLOCK_LOAD_TRANSPOSE
= detail::to_BlockLoadAlgorithm_enum(::rocprim::block_load_method::block_load_transpose),
BLOCK_LOAD_WARP_TRANSPOSE
= detail::to_BlockLoadAlgorithm_enum(::rocprim::block_load_method::block_load_warp_transpose),
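// rocPRIM has no time-sliced warp-transpose variant, so the time-sliced
// algorithm below maps onto the same method as BLOCK_LOAD_WARP_TRANSPOSE.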
BLOCK_LOAD_WARP_TRANSPOSE_TIMESLICED
= detail::to_BlockLoadAlgorithm_enum(::rocprim::block_load_method::block_load_warp_transpose)
};
template<
typename T,
int BLOCK_DIM_X,
int ITEMS_PER_THREAD,
BlockLoadAlgorithm ALGORITHM = BLOCK_LOAD_DIRECT,
int BLOCK_DIM_Y = 1,
int BLOCK_DIM_Z = 1,
int ARCH = HIPCUB_ARCH /* ignored */
>
class BlockLoad
: private ::rocprim::block_load<
T,
BLOCK_DIM_X,
ITEMS_PER_THREAD,
static_cast<::rocprim::block_load_method>(ALGORITHM),
BLOCK_DIM_Y,
BLOCK_DIM_Z
>
{
static_assert(
BLOCK_DIM_X * BLOCK_DIM_Y * BLOCK_DIM_Z > 0,
"BLOCK_DIM_X * BLOCK_DIM_Y * BLOCK_DIM_Z must be greater than 0"
);
using base_type =
typename ::rocprim::block_load<
T,
BLOCK_DIM_X,
ITEMS_PER_THREAD,
static_cast<::rocprim::block_load_method>(ALGORITHM),
BLOCK_DIM_Y,
BLOCK_DIM_Z
>;
// Reference to temporary storage (usually shared memory)
typename base_type::storage_type& temp_storage_;
public:
using TempStorage = typename base_type::storage_type;
HIPCUB_DEVICE inline
BlockLoad() : temp_storage_(private_storage())
{
}
HIPCUB_DEVICE inline
BlockLoad(TempStorage& temp_storage) : temp_storage_(temp_storage)
{
}
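/**
 * A usage sketch (illustrative only; d_in and the grid mapping are
 * assumptions). Load() reads one tile of BLOCK_THREADS * ITEMS_PER_THREAD
 * items starting at block_iter, arranged according to ALGORITHM.
 * @code
 * using BlockLoadT = hipcub::BlockLoad<int, 128, 4, hipcub::BLOCK_LOAD_TRANSPOSE>;
 * __shared__ typename BlockLoadT::TempStorage temp_storage;
 *
 * int thread_data[4];
 * BlockLoadT(temp_storage).Load(d_in + blockIdx.x * (128 * 4), thread_data);
 * @endcode
 */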
template<class InputIteratorT>
HIPCUB_DEVICE inline
void Load(InputIteratorT block_iter,
T (&items)[ITEMS_PER_THREAD])
{
base_type::load(block_iter, items, temp_storage_);
}
template<class InputIteratorT>
HIPCUB_DEVICE inline
void Load(InputIteratorT block_iter,
T (&items)[ITEMS_PER_THREAD],
int valid_items)
{
base_type::load(block_iter, items, valid_items, temp_storage_);
}
template<
class InputIteratorT,
class Default
>
HIPCUB_DEVICE inline
void Load(InputIteratorT block_iter,
T (&items)[ITEMS_PER_THREAD],
int valid_items,
Default oob_default)
{
base_type::load(block_iter, items, valid_items, oob_default, temp_storage_);
}
private:
HIPCUB_DEVICE inline
TempStorage& private_storage()
{
HIPCUB_SHARED_MEMORY TempStorage private_storage;
return private_storage;
}
};
END_HIPCUB_NAMESPACE
#endif // HIPCUB_ROCPRIM_BLOCK_BLOCK_LOAD_HPP_
/******************************************************************************
* Copyright (c) 2010-2011, Duane Merrill. All rights reserved.
* Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved.
* Modifications Copyright (c) 2017-2020, Advanced Micro Devices, Inc. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of the NVIDIA CORPORATION nor the
* names of its contributors may be used to endorse or promote products
* derived from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
* DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
******************************************************************************/
#ifndef HIPCUB_ROCPRIM_BLOCK_BLOCK_LOAD_FUNC_HPP_
#define HIPCUB_ROCPRIM_BLOCK_BLOCK_LOAD_FUNC_HPP_
#include "../config.hpp"
#include <cub/rocprim/block/block_load_func.hpp>
BEGIN_HIPCUB_NAMESPACE
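// The free functions below are thin wrappers over the corresponding rocPRIM
// primitives. As a concrete illustration (assuming 128 threads and 4 items
// per thread): LoadDirectBlocked hands thread t the consecutive range
// block_iter[t*4 .. t*4+3], whereas LoadDirectStriped<128> hands it the
// strided items block_iter[t], block_iter[t+128], block_iter[t+256] and
// block_iter[t+384].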
template<
typename T,
int ITEMS_PER_THREAD,
typename InputIteratorT
>
HIPCUB_DEVICE inline
void LoadDirectBlocked(int linear_id,
InputIteratorT block_iter,
T (&items)[ITEMS_PER_THREAD])
{
::rocprim::block_load_direct_blocked(
linear_id, block_iter, items
);
}
template<
typename T,
int ITEMS_PER_THREAD,
typename InputIteratorT
>
HIPCUB_DEVICE inline
void LoadDirectBlocked(int linear_id,
InputIteratorT block_iter,
T (&items)[ITEMS_PER_THREAD],
int valid_items)
{
::rocprim::block_load_direct_blocked(
linear_id, block_iter, items, valid_items
);
}
template<
typename T,
typename Default,
int ITEMS_PER_THREAD,
typename InputIteratorT
>
HIPCUB_DEVICE inline
void LoadDirectBlocked(int linear_id,
InputIteratorT block_iter,
T (&items)[ITEMS_PER_THREAD],
int valid_items,
Default oob_default)
{
::rocprim::block_load_direct_blocked(
linear_id, block_iter, items, valid_items, oob_default
);
}
template <
typename T,
int ITEMS_PER_THREAD
>
HIPCUB_DEVICE inline
void LoadDirectBlockedVectorized(int linear_id,
T* block_iter,
T (&items)[ITEMS_PER_THREAD])
{
::rocprim::block_load_direct_blocked_vectorized(
linear_id, block_iter, items
);
}
template<
int BLOCK_THREADS,
typename T,
int ITEMS_PER_THREAD,
typename InputIteratorT
>
HIPCUB_DEVICE inline
void LoadDirectStriped(int linear_id,
InputIteratorT block_iter,
T (&items)[ITEMS_PER_THREAD])
{
::rocprim::block_load_direct_striped<BLOCK_THREADS>(
linear_id, block_iter, items
);
}
template<
int BLOCK_THREADS,
typename T,
int ITEMS_PER_THREAD,
typename InputIteratorT
>
HIPCUB_DEVICE inline
void LoadDirectStriped(int linear_id,
InputIteratorT block_iter,
T (&items)[ITEMS_PER_THREAD],
int valid_items)
{
::rocprim::block_load_direct_striped<BLOCK_THREADS>(
linear_id, block_iter, items, valid_items
);
}
template<
int BLOCK_THREADS,
typename T,
typename Default,
int ITEMS_PER_THREAD,
typename InputIteratorT
>
HIPCUB_DEVICE inline
void LoadDirectStriped(int linear_id,
InputIteratorT block_iter,
T (&items)[ITEMS_PER_THREAD],
int valid_items,
Default oob_default)
{
::rocprim::block_load_direct_striped<BLOCK_THREADS>(
linear_id, block_iter, items, valid_items, oob_default
);
}
template<
typename T,
int ITEMS_PER_THREAD,
typename InputIteratorT
>
HIPCUB_DEVICE inline
void LoadDirectWarpStriped(int linear_id,
InputIteratorT block_iter,
T (&items)[ITEMS_PER_THREAD])
{
::rocprim::block_load_direct_warp_striped(
linear_id, block_iter, items
);
}
template<
typename T,
int ITEMS_PER_THREAD,
typename InputIteratorT
>
HIPCUB_DEVICE inline
void LoadDirectWarpStriped(int linear_id,
InputIteratorT block_iter,
T (&items)[ITEMS_PER_THREAD],
int valid_items)
{
::rocprim::block_load_direct_warp_striped(
linear_id, block_iter, items, valid_items
);
}
template<
typename T,
typename Default,
int ITEMS_PER_THREAD,
typename InputIteratorT
>
HIPCUB_DEVICE inline
void LoadDirectWarpStriped(int linear_id,
InputIteratorT block_iter,
T (&items)[ITEMS_PER_THREAD],
int valid_items,
Default oob_default)
{
::rocprim::block_load_direct_warp_striped(
linear_id, block_iter, items, valid_items, oob_default
);
}
END_HIPCUB_NAMESPACE
#endif // HIPCUB_ROCPRIM_BLOCK_BLOCK_LOAD_FUNC_HPP_
/******************************************************************************
* Copyright (c) 2011-2021, NVIDIA CORPORATION. All rights reserved.
* Modifications Copyright (c) 2021, Advanced Micro Devices, Inc. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of the NVIDIA CORPORATION nor the
* names of its contributors may be used to endorse or promote products
* derived from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
* DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
******************************************************************************/
#ifndef HIPCUB_ROCPRIM_BLOCK_BLOCK_MERGE_SORT_HPP_
#define HIPCUB_ROCPRIM_BLOCK_BLOCK_MERGE_SORT_HPP_
#include "../thread/thread_sort.hpp"
#include "../util_math.cuh"
#include "../util_type.cuh"
#include <cub/rocprim/detail/various.hpp>
#include <cub/rocprim/functional.hpp>
BEGIN_HIPCUB_NAMESPACE
// Additional details of the Merge-Path Algorithm can be found in:
// S. Odeh, O. Green, Z. Mwassi, O. Shmueli, Y. Birk, "Merge Path - Parallel
// Merging Made Simple", Multithreaded Architectures and Applications (MTAAP)
// Workshop, IEEE 26th International Parallel & Distributed Processing
// Symposium (IPDPS), 2012
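// MergePath returns, for a given diagonal `diag` of the merge grid, how many
// elements of keys1 precede that diagonal in the merged output; the remaining
// diag - result elements come from keys2. A small worked example (illustrative
// only): merging keys1 = {1, 3, 5} with keys2 = {2, 4} under operator< gives
// {1, 2, 3, 4, 5}; at diag = 3 the first three merged elements {1, 2, 3} take
// two items from keys1 and one from keys2, so MergePath returns 2.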
template <typename KeyT,
typename KeyIteratorT,
typename OffsetT,
typename BinaryPred>
HIPCUB_DEVICE __forceinline__ OffsetT MergePath(KeyIteratorT keys1,
KeyIteratorT keys2,
OffsetT keys1_count,
OffsetT keys2_count,
OffsetT diag,
BinaryPred binary_pred)
{
OffsetT keys1_begin = diag < keys2_count ? 0 : diag - keys2_count;
OffsetT keys1_end = (::rocprim::min)(diag, keys1_count);
while (keys1_begin < keys1_end)
{
OffsetT mid = cub::MidPoint<OffsetT>(keys1_begin, keys1_end);
KeyT key1 = keys1[mid];
KeyT key2 = keys2[diag - 1 - mid];
bool pred = binary_pred(key2, key1);
if (pred)
{
keys1_end = mid;
}
else
{
keys1_begin = mid + 1;
}
}
return keys1_begin;
}
template <typename KeyT, typename CompareOp, int ITEMS_PER_THREAD>
HIPCUB_DEVICE __forceinline__ void SerialMerge(KeyT *keys_shared,
int keys1_beg,
int keys2_beg,
int keys1_count,
int keys2_count,
KeyT (&output)[ITEMS_PER_THREAD],
int (&indices)[ITEMS_PER_THREAD],
CompareOp compare_op)
{
int keys1_end = keys1_beg + keys1_count;
int keys2_end = keys2_beg + keys2_count;
KeyT key1 = keys_shared[keys1_beg];
KeyT key2 = keys_shared[keys2_beg];
#pragma unroll
for (int item = 0; item < ITEMS_PER_THREAD; ++item)
{
bool p = (keys2_beg < keys2_end) &&
((keys1_beg >= keys1_end)
|| compare_op(key2, key1));
output[item] = p ? key2 : key1;
indices[item] = p ? keys2_beg++ : keys1_beg++;
if (p)
{
key2 = keys_shared[keys2_beg];
}
else
{
key1 = keys_shared[keys1_beg];
}
}
}
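// Note: on its final iteration SerialMerge may read keys_shared one element
// past the consumed ranges, which is why the shared-memory arrays in
// BlockMergeSortStrategy below are sized ITEMS_PER_TILE + 1.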
/**
* @brief Generalized merge sort algorithm
*
* This class is used to reduce code duplication. Warp and Block merge sort
* differ only in how they compute thread index and how they synchronize
* threads. Since synchronization might require access to custom data
* (like member mask), CRTP is used.
*
* @par
* The code snippet below illustrates the way this class can be used.
* @par
* @code
* #include <hipcub/hipcub.hpp> // or equivalently <hipcub/block/block_merge_sort.hpp>
*
* constexpr int BLOCK_THREADS = 256;
* constexpr int ITEMS_PER_THREAD = 9;
*
* class BlockMergeSort : public BlockMergeSortStrategy<int,
* cub::NullType,
* BLOCK_THREADS,
* ITEMS_PER_THREAD,
* BlockMergeSort>
* {
* using BlockMergeSortStrategyT =
* BlockMergeSortStrategy<int,
* cub::NullType,
* BLOCK_THREADS,
* ITEMS_PER_THREAD,
* BlockMergeSort>;
* public:
* __device__ __forceinline__ explicit BlockMergeSort(
* typename BlockMergeSortStrategyT::TempStorage &temp_storage)
* : BlockMergeSortStrategyT(temp_storage, threadIdx.x)
* {}
*
* __device__ __forceinline__ void SyncImplementation() const
* {
* __syncthreads();
* }
* };
* @endcode
*
* @tparam KeyT
* KeyT type
*
* @tparam ValueT
* ValueT type. cub::NullType indicates a keys-only sort
*
* @tparam SynchronizationPolicy
* Provides a way of synchronizing threads. Should be derived from
* `BlockMergeSortStrategy`.
*/
template <typename KeyT,
typename ValueT,
int NUM_THREADS,
int ITEMS_PER_THREAD,
typename SynchronizationPolicy>
class BlockMergeSortStrategy
{
static_assert(PowerOfTwo<NUM_THREADS>::VALUE,
"NUM_THREADS must be a power of two");
private:
static constexpr int ITEMS_PER_TILE = ITEMS_PER_THREAD * NUM_THREADS;
// Whether or not there are values to be trucked along with keys
static constexpr bool KEYS_ONLY = ::rocprim::Equals<ValueT, NullType>::VALUE;
/// Shared memory type required by this thread block
union _TempStorage
{
KeyT keys_shared[ITEMS_PER_TILE + 1];
ValueT items_shared[ITEMS_PER_TILE + 1];
}; // union TempStorage
/// Shared storage reference
_TempStorage &temp_storage;
/// Internal storage allocator
HIPCUB_DEVICE __forceinline__ _TempStorage& PrivateStorage()
{
__shared__ _TempStorage private_storage;
return private_storage;
}
const unsigned int linear_tid;
public:
/// \smemstorage{BlockMergeSort}
struct TempStorage : Uninitialized<_TempStorage> {};
BlockMergeSortStrategy() = delete;
explicit HIPCUB_DEVICE __forceinline__
BlockMergeSortStrategy(unsigned int linear_tid)
: temp_storage(PrivateStorage())
, linear_tid(linear_tid)
{}
HIPCUB_DEVICE __forceinline__ BlockMergeSortStrategy(TempStorage &temp_storage,
unsigned int linear_tid)
: temp_storage(temp_storage.Alias())
, linear_tid(linear_tid)
{}
HIPCUB_DEVICE __forceinline__ unsigned int get_linear_tid() const
{
return linear_tid;
}
/**
* @brief Sorts items partitioned across a CUDA thread block using
* a merge sorting method.
*
* @par
* Sort is not guaranteed to be stable. That is, suppose that i and j are
* equivalent: neither one is less than the other. It is not guaranteed
* that the relative order of these two elements will be preserved by sort.
*
* @tparam CompareOp
* functor type having member `bool operator()(KeyT lhs, KeyT rhs)`.
* `CompareOp` is a model of [Strict Weak Ordering].
*
* @param[in,out] keys
* Keys to sort
*
* @param[in] compare_op
* Comparison function object which returns true if the first argument is
* ordered before the second
*
* [Strict Weak Ordering]: https://en.cppreference.com/w/cpp/concepts/strict_weak_order
*/
template <typename CompareOp>
HIPCUB_DEVICE __forceinline__ void Sort(KeyT (&keys)[ITEMS_PER_THREAD],
CompareOp compare_op)
{
ValueT items[ITEMS_PER_THREAD];
Sort<CompareOp, false>(keys, items, compare_op, ITEMS_PER_TILE, keys[0]);
}
/**
* @brief Sorts items partitioned across a CUDA thread block using
* a merge sorting method.
*
* @par
* - Sort is not guaranteed to be stable. That is, suppose that `i` and `j`
* are equivalent: neither one is less than the other. It is not guaranteed
* that the relative order of these two elements will be preserved by sort.
* - The value of `oob_default` is assigned to all elements that are out of
* `valid_items` boundaries. It's expected that `oob_default` is ordered
* after any value in the `valid_items` boundaries. The algorithm always
* sorts a fixed amount of elements, which is equal to
* `ITEMS_PER_THREAD * BLOCK_THREADS`. If there is a value that is ordered
* after `oob_default`, it won't be placed within `valid_items` boundaries.
*
* @tparam CompareOp
* functor type having member `bool operator()(KeyT lhs, KeyT rhs)`.
* `CompareOp` is a model of [Strict Weak Ordering].
*
* @param[in,out] keys
* Keys to sort
*
* @param[in] compare_op
* Comparison function object which returns true if the first argument is
* ordered before the second
*
* @param[in] valid_items
* Number of valid items to sort
*
* @param[in] oob_default
* Default value to assign out-of-bound items
*
* [Strict Weak Ordering]: https://en.cppreference.com/w/cpp/concepts/strict_weak_order
*/
template <typename CompareOp>
HIPCUB_DEVICE __forceinline__ void Sort(KeyT (&keys)[ITEMS_PER_THREAD],
CompareOp compare_op,
int valid_items,
KeyT oob_default)
{
ValueT items[ITEMS_PER_THREAD];
Sort<CompareOp, true>(keys, items, compare_op, valid_items, oob_default);
}
/**
* @brief Sorts items partitioned across a CUDA thread block using a merge sorting method.
*
* @par
* Sort is not guaranteed to be stable. That is, suppose that `i` and `j` are
* equivalent: neither one is less than the other. It is not guaranteed
* that the relative order of these two elements will be preserved by sort.
*
* @tparam CompareOp
* functor type having member `bool operator()(KeyT lhs, KeyT rhs)`.
* `CompareOp` is a model of [Strict Weak Ordering].
*
* @param[in,out] keys
* Keys to sort
*
* @param[in,out] items
* Values to sort
*
* @param[in] compare_op
* Comparison function object which returns true if the first argument is
* ordered before the second
*
* [Strict Weak Ordering]: https://en.cppreference.com/w/cpp/concepts/strict_weak_order
*/
template <typename CompareOp>
HIPCUB_DEVICE __forceinline__ void Sort(KeyT (&keys)[ITEMS_PER_THREAD],
ValueT (&items)[ITEMS_PER_THREAD],
CompareOp compare_op)
{
Sort<CompareOp, false>(keys, items, compare_op, ITEMS_PER_TILE, keys[0]);
}
/**
* @brief Sorts items partitioned across a CUDA thread block using
* a merge sorting method.
*
* @par
* - Sort is not guaranteed to be stable. That is, suppose that `i` and `j`
* are equivalent: neither one is less than the other. It is not guaranteed
* that the relative order of these two elements will be preserved by sort.
* - The value of `oob_default` is assigned to all elements that are out of
* `valid_items` boundaries. It's expected that `oob_default` is ordered
* after any value in the `valid_items` boundaries. The algorithm always
* sorts a fixed amount of elements, which is equal to
* `ITEMS_PER_THREAD * BLOCK_THREADS`. If there is a value that is ordered
* after `oob_default`, it won't be placed within `valid_items` boundaries.
*
* @tparam CompareOp
* functor type having member `bool operator()(KeyT lhs, KeyT rhs)`
* `CompareOp` is a model of [Strict Weak Ordering].
*
* @tparam IS_LAST_TILE
* True if `valid_items` isn't equal to the `ITEMS_PER_TILE`
*
* @param[in,out] keys
* Keys to sort
*
* @param[in,out] items
* Values to sort
*
* @param[in] compare_op
* Comparison function object which returns true if the first argument is
* ordered before the second
*
* @param[in] valid_items
* Number of valid items to sort
*
* @param[in] oob_default
* Default value to assign out-of-bound items
*
* [Strict Weak Ordering]: https://en.cppreference.com/w/cpp/concepts/strict_weak_order
*/
template <typename CompareOp,
bool IS_LAST_TILE = true>
HIPCUB_DEVICE __forceinline__ void Sort(KeyT (&keys)[ITEMS_PER_THREAD],
ValueT (&items)[ITEMS_PER_THREAD],
CompareOp compare_op,
int valid_items,
KeyT oob_default)
{
if (IS_LAST_TILE)
{
// if last tile, find valid max_key
// and fill the remaining keys with it
//
KeyT max_key = oob_default;
#pragma unroll
for (int item = 1; item < ITEMS_PER_THREAD; ++item)
{
if (ITEMS_PER_THREAD * static_cast<int>(linear_tid) + item < valid_items)
{
max_key = compare_op(max_key, keys[item]) ? keys[item] : max_key;
}
else
{
keys[item] = max_key;
}
}
}
// if first element of thread is in input range, stable sort items
//
if (!IS_LAST_TILE || ITEMS_PER_THREAD * static_cast<int>(linear_tid) < valid_items)
{
StableOddEvenSort(keys, items, compare_op);
}
// each thread has sorted keys
// merge sort keys in shared memory
//
#pragma unroll
for (int target_merged_threads_number = 2;
target_merged_threads_number <= NUM_THREADS;
target_merged_threads_number *= 2)
{
int merged_threads_number = target_merged_threads_number / 2;
int mask = target_merged_threads_number - 1;
Sync();
// store keys in shmem
//
#pragma unroll
for (int item = 0; item < ITEMS_PER_THREAD; ++item)
{
int idx = ITEMS_PER_THREAD * linear_tid + item;
temp_storage.keys_shared[idx] = keys[item];
}
Sync();
int indices[ITEMS_PER_THREAD];
int first_thread_idx_in_thread_group_being_merged = ~mask & linear_tid;
int start = ITEMS_PER_THREAD * first_thread_idx_in_thread_group_being_merged;
int size = ITEMS_PER_THREAD * merged_threads_number;
int thread_idx_in_thread_group_being_merged = mask & linear_tid;
int diag =
(::rocprim::min)(valid_items,
ITEMS_PER_THREAD * thread_idx_in_thread_group_being_merged);
int keys1_beg = (::rocprim::min)(valid_items, start);
int keys1_end = (::rocprim::min)(valid_items, keys1_beg + size);
int keys2_beg = keys1_end;
int keys2_end = (::rocprim::min)(valid_items, keys2_beg + size);
int keys1_count = keys1_end - keys1_beg;
int keys2_count = keys2_end - keys2_beg;
int partition_diag = MergePath<KeyT>(&temp_storage.keys_shared[keys1_beg],
&temp_storage.keys_shared[keys2_beg],
keys1_count,
keys2_count,
diag,
compare_op);
int keys1_beg_loc = keys1_beg + partition_diag;
int keys1_end_loc = keys1_end;
int keys2_beg_loc = keys2_beg + diag - partition_diag;
int keys2_end_loc = keys2_end;
int keys1_count_loc = keys1_end_loc - keys1_beg_loc;
int keys2_count_loc = keys2_end_loc - keys2_beg_loc;
SerialMerge(&temp_storage.keys_shared[0],
keys1_beg_loc,
keys2_beg_loc,
keys1_count_loc,
keys2_count_loc,
keys,
indices,
compare_op);
if (!KEYS_ONLY)
{
Sync();
// store keys in shmem
//
#pragma unroll
for (int item = 0; item < ITEMS_PER_THREAD; ++item)
{
int idx = ITEMS_PER_THREAD * linear_tid + item;
temp_storage.items_shared[idx] = items[item];
}
Sync();
// gather items from shmem
//
#pragma unroll
for (int item = 0; item < ITEMS_PER_THREAD; ++item)
{
items[item] = temp_storage.items_shared[indices[item]];
}
}
}
} // Sort
/**
* @brief Sorts items partitioned across a CUDA thread block using
* a merge sorting method.
*
* @par
* StableSort is stable: it preserves the relative ordering of equivalent
* elements. That is, if `x` and `y` are elements such that `x` precedes `y`,
* and if the two elements are equivalent (neither `x < y` nor `y < x`) then
* a postcondition of StableSort is that `x` still precedes `y`.
*
* @tparam CompareOp
* functor type having member `bool operator()(KeyT lhs, KeyT rhs)`.
* `CompareOp` is a model of [Strict Weak Ordering].
*
* @param[in,out] keys
* Keys to sort
*
* @param[in] compare_op
* Comparison function object which returns true if the first argument is
* ordered before the second
*
* [Strict Weak Ordering]: https://en.cppreference.com/w/cpp/concepts/strict_weak_order
*/
template <typename CompareOp>
HIPCUB_DEVICE __forceinline__ void StableSort(KeyT (&keys)[ITEMS_PER_THREAD],
CompareOp compare_op)
{
Sort(keys, compare_op);
}
/**
* @brief Sorts items partitioned across a CUDA thread block using
* a merge sorting method.
*
* @par
* StableSort is stable: it preserves the relative ordering of equivalent
* elements. That is, if `x` and `y` are elements such that `x` precedes `y`,
* and if the two elements are equivalent (neither `x < y` nor `y < x`) then
* a postcondition of StableSort is that `x` still precedes `y`.
*
* @tparam CompareOp
* functor type having member `bool operator()(KeyT lhs, KeyT rhs)`.
* `CompareOp` is a model of [Strict Weak Ordering].
*
* @param[in,out] keys
* Keys to sort
*
* @param[in,out] items
* Values to sort
*
* @param[in] compare_op
* Comparison function object which returns true if the first argument is
* ordered before the second
*
* [Strict Weak Ordering]: https://en.cppreference.com/w/cpp/concepts/strict_weak_order
*/
template <typename CompareOp>
HIPCUB_DEVICE __forceinline__ void StableSort(KeyT (&keys)[ITEMS_PER_THREAD],
ValueT (&items)[ITEMS_PER_THREAD],
CompareOp compare_op)
{
Sort(keys, items, compare_op);
}
/**
* @brief Sorts items partitioned across a CUDA thread block using
* a merge sorting method.
*
* @par
* - StableSort is stable: it preserves the relative ordering of equivalent
* elements. That is, if `x` and `y` are elements such that `x` precedes
* `y`, and if the two elements are equivalent (neither `x < y` nor `y < x`)
* then a postcondition of StableSort is that `x` still precedes `y`.
* - The value of `oob_default` is assigned to all elements that are out of
* `valid_items` boundaries. It's expected that `oob_default` is ordered
* after any value in the `valid_items` boundaries. The algorithm always
* sorts a fixed amount of elements, which is equal to
* `ITEMS_PER_THREAD * BLOCK_THREADS`.
* If there is a value that is ordered after `oob_default`, it won't be
* placed within `valid_items` boundaries.
*
* @tparam CompareOp
* functor type having member `bool operator()(KeyT lhs, KeyT rhs)`.
* `CompareOp` is a model of [Strict Weak Ordering].
*
* @param[in,out] keys
* Keys to sort
*
* @param[in] compare_op
* Comparison function object which returns true if the first argument is
* ordered before the second
*
* @param[in] valid_items
* Number of valid items to sort
*
* @param[in] oob_default
* Default value to assign out-of-bound items
*
* [Strict Weak Ordering]: https://en.cppreference.com/w/cpp/concepts/strict_weak_order
*/
template <typename CompareOp>
HIPCUB_DEVICE __forceinline__ void StableSort(KeyT (&keys)[ITEMS_PER_THREAD],
CompareOp compare_op,
int valid_items,
KeyT oob_default)
{
Sort(keys, compare_op, valid_items, oob_default);
}
/**
* @brief Sorts items partitioned across a CUDA thread block using
* a merge sorting method.
*
* @par
* - StableSort is stable: it preserves the relative ordering of equivalent
* elements. That is, if `x` and `y` are elements such that `x` precedes
* `y`, and if the two elements are equivalent (neither `x < y` nor `y < x`)
* then a postcondition of StableSort is that `x` still precedes `y`.
* - The value of `oob_default` is assigned to all elements that are out of
* `valid_items` boundaries. It's expected that `oob_default` is ordered
* after any value in the `valid_items` boundaries. The algorithm always
* sorts a fixed amount of elements, which is equal to
* `ITEMS_PER_THREAD * BLOCK_THREADS`. If there is a value that is ordered
* after `oob_default`, it won't be placed within `valid_items` boundaries.
*
* @tparam CompareOp
* functor type having member `bool operator()(KeyT lhs, KeyT rhs)`.
* `CompareOp` is a model of [Strict Weak Ordering].
*
* @tparam IS_LAST_TILE
* True if `valid_items` isn't equal to the `ITEMS_PER_TILE`
*
* @param[in,out] keys
* Keys to sort
*
* @param[in,out] items
* Values to sort
*
* @param[in] compare_op
* Comparison function object which returns true if the first argument is
* ordered before the second
*
* @param[in] valid_items
* Number of valid items to sort
*
* @param[in] oob_default
* Default value to assign out-of-bound items
*
* [Strict Weak Ordering]: https://en.cppreference.com/w/cpp/concepts/strict_weak_order
*/
template <typename CompareOp,
bool IS_LAST_TILE = true>
HIPCUB_DEVICE __forceinline__ void StableSort(KeyT (&keys)[ITEMS_PER_THREAD],
ValueT (&items)[ITEMS_PER_THREAD],
CompareOp compare_op,
int valid_items,
KeyT oob_default)
{
Sort<CompareOp, IS_LAST_TILE>(keys,
items,
compare_op,
valid_items,
oob_default);
}
private:
HIPCUB_DEVICE __forceinline__ void Sync() const
{
static_cast<const SynchronizationPolicy*>(this)->SyncImplementation();
}
};
/**
* @brief The BlockMergeSort class provides methods for sorting items
* partitioned across a CUDA thread block using a merge sorting method.
* @ingroup BlockModule
*
* @tparam KeyT
* KeyT type
*
* @tparam BLOCK_DIM_X
* The thread block length in threads along the X dimension
*
* @tparam ITEMS_PER_THREAD
* The number of items per thread
*
* @tparam ValueT
* **[optional]** ValueT type (default: `cub::NullType`, which indicates
* a keys-only sort)
*
* @tparam BLOCK_DIM_Y
* **[optional]** The thread block length in threads along the Y dimension
* (default: 1)
*
* @tparam BLOCK_DIM_Z
* **[optional]** The thread block length in threads along the Z dimension
* (default: 1)
*
* @par Overview
* BlockMergeSort arranges items into ascending order using a comparison
* functor with less-than semantics. Merge sort can handle arbitrary types
* and comparison functors, but is slower than BlockRadixSort when sorting
* arithmetic types into ascending/descending order.
*
* @par A Simple Example
* @blockcollective{BlockMergeSort}
* @par
* The code snippet below illustrates a sort of 512 integer keys that are
* partitioned across 128 threads where each thread owns 4 consecutive items.
* @par
* @code
* #include <hipcub/hipcub.hpp> // or equivalently <hipcub/block/block_merge_sort.hpp>
*
* struct CustomLess
* {
* template <typename DataType>
* __device__ bool operator()(const DataType &lhs, const DataType &rhs)
* {
* return lhs < rhs;
* }
* };
*
* __global__ void ExampleKernel(...)
* {
* // Specialize BlockMergeSort for a 1D block of 128 threads owning 4 integer items each
* typedef cub::BlockMergeSort<int, 128, 4> BlockMergeSort;
*
* // Allocate shared memory for BlockMergeSort
* __shared__ typename BlockMergeSort::TempStorage temp_storage_shuffle;
*
* // Obtain a segment of consecutive items that are blocked across threads
* int thread_keys[4];
* ...
*
* BlockMergeSort(temp_storage_shuffle).Sort(thread_keys, CustomLess());
* ...
* }
* @endcode
* @par
* Suppose the set of input `thread_keys` across the block of threads is
* `{ [0,511,1,510], [2,509,3,508], [4,507,5,506], ..., [254,257,255,256] }`.
* The corresponding output `thread_keys` in those threads will be
* `{ [0,1,2,3], [4,5,6,7], [8,9,10,11], ..., [508,509,510,511] }`.
*
* @par Re-using dynamically allocated shared memory
* The following example under the examples/block folder illustrates usage of
* dynamically allocated shared memory with BlockReduce and how to re-purpose
* the same memory region:
* <a href="../../examples/block/example_block_reduce_dyn_smem.cu">example_block_reduce_dyn_smem.cu</a>
*
* This example can be easily adapted to the storage required by BlockMergeSort.
*/
template <typename KeyT,
int BLOCK_DIM_X,
int ITEMS_PER_THREAD,
typename ValueT = NullType,
int BLOCK_DIM_Y = 1,
int BLOCK_DIM_Z = 1>
class BlockMergeSort
: public BlockMergeSortStrategy<KeyT,
ValueT,
BLOCK_DIM_X * BLOCK_DIM_Y * BLOCK_DIM_Z,
ITEMS_PER_THREAD,
BlockMergeSort<KeyT,
BLOCK_DIM_X,
ITEMS_PER_THREAD,
ValueT,
BLOCK_DIM_Y,
BLOCK_DIM_Z>>
{
private:
// The thread block size in threads
static constexpr int BLOCK_THREADS = BLOCK_DIM_X * BLOCK_DIM_Y * BLOCK_DIM_Z;
static constexpr int ITEMS_PER_TILE = ITEMS_PER_THREAD * BLOCK_THREADS;
using BlockMergeSortStrategyT =
BlockMergeSortStrategy<KeyT,
ValueT,
BLOCK_THREADS,
ITEMS_PER_THREAD,
BlockMergeSort>;
public:
HIPCUB_DEVICE __forceinline__ BlockMergeSort()
: BlockMergeSortStrategyT(
RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z))
{}
HIPCUB_DEVICE __forceinline__ explicit BlockMergeSort(
typename BlockMergeSortStrategyT::TempStorage &temp_storage)
: BlockMergeSortStrategyT(
temp_storage,
RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z))
{}
private:
HIPCUB_DEVICE __forceinline__ void SyncImplementation() const
{
CTA_SYNC();
}
friend BlockMergeSortStrategyT;
};
END_HIPCUB_NAMESPACE
#endif // HIPCUB_ROCPRIM_BLOCK_BLOCK_MERGE_SORT_HPP_
/******************************************************************************
* Copyright (c) 2011, Duane Merrill. All rights reserved.
* Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved.
* Modifications Copyright (c) 2021, Advanced Micro Devices, Inc. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of the NVIDIA CORPORATION nor the
* names of its contributors may be used to endorse or promote products
* derived from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
* DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
******************************************************************************/
/**
* \file
* cub::BlockRadixRank provides operations for ranking unsigned integer types within a CUDA thread block
*/
#ifndef HIPCUB_ROCPRIM_BLOCK_BLOCK_RADIX_RANK_HPP_
#define HIPCUB_ROCPRIM_BLOCK_BLOCK_RADIX_RANK_HPP_
#include <stdint.h>
#include "../config.hpp"
#include "../util_type.cuh"
#include "../util_ptx.cuh"
#include "../thread/thread_reduce.cuh"
#include "../thread/thread_scan.cuh"
#include "../block/block_scan.cuh"
#include "../block/radix_rank_sort_operations.hpp"
BEGIN_HIPCUB_NAMESPACE
/**
* \brief BlockRadixRank provides operations for ranking unsigned integer types within a CUDA thread block.
* \ingroup BlockModule
*
* \tparam BLOCK_DIM_X The thread block length in threads along the X dimension
* \tparam RADIX_BITS The number of radix bits per digit place
* \tparam IS_DESCENDING Whether or not the sorted-order is high-to-low
* \tparam MEMOIZE_OUTER_SCAN <b>[optional]</b> Whether or not to buffer outer raking scan partials to incur fewer shared memory reads at the expense of higher register pressure (default: false). See BlockScanAlgorithm::BLOCK_SCAN_RAKING_MEMOIZE for more details.
* \tparam INNER_SCAN_ALGORITHM <b>[optional]</b> The cub::BlockScanAlgorithm algorithm to use (default: cub::BLOCK_SCAN_WARP_SCANS)
* \tparam SMEM_CONFIG <b>[optional]</b> Shared memory bank mode (default: \p cudaSharedMemBankSizeFourByte)
* \tparam BLOCK_DIM_Y <b>[optional]</b> The thread block length in threads along the Y dimension (default: 1)
* \tparam BLOCK_DIM_Z <b>[optional]</b> The thread block length in threads along the Z dimension (default: 1)
* \tparam ARCH <b>[optional]</b> \ptxversion
*
* \par Overview
* BlockRadixRank computes, for each key in a tile of keys partitioned across the block, the key's rank for a given digit place.
* - Keys must be in a form suitable for radix ranking (i.e., unsigned bits).
* - \blocked
*
* \par Performance Considerations
* - \granularity
*
* \par Examples
* \par
* - <b>Example 1:</b> Simple radix rank of 32-bit integer keys
* \code
* #include <hipcub/hipcub.hpp>
*
* template <int BLOCK_THREADS>
* __global__ void ExampleKernel(...)
* {
*     // Specialize BlockRadixRank for a 1D block of BLOCK_THREADS threads, ranking 4-bit digits
*     typedef cub::BlockRadixRank<BLOCK_THREADS, 4, false> BlockRadixRank;
*
*     // Allocate shared memory for BlockRadixRank
*     __shared__ typename BlockRadixRank::TempStorage temp_storage;
*     unsigned int keys[1];
*     int ranks[1];
*     ...
*     BlockRadixRank(temp_storage).RankKeys(keys, ranks, digit_extractor);
* }
* \endcode
*/
template <
int BLOCK_DIM_X,
int RADIX_BITS,
bool IS_DESCENDING,
bool MEMOIZE_OUTER_SCAN = false,
BlockScanAlgorithm INNER_SCAN_ALGORITHM = BLOCK_SCAN_WARP_SCANS,
cudaSharedMemConfig SMEM_CONFIG = cudaSharedMemBankSizeFourByte,
int BLOCK_DIM_Y = 1,
int BLOCK_DIM_Z = 1,
int ARCH = HIPCUB_ARCH /* ignored */>
class BlockRadixRank
{
private:
/******************************************************************************
* Type definitions and constants
******************************************************************************/
// Integer type for digit counters (to be packed into words of type PackedCounters)
typedef unsigned short DigitCounter;
// Integer type for packing DigitCounters into columns of shared memory banks
typedef typename std::conditional<(SMEM_CONFIG == cudaSharedMemBankSizeEightByte),
unsigned long long,
unsigned int>::type PackedCounter;
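// Example: with four-byte banks PackedCounter is unsigned int, so PACKING_RATIO =
// sizeof(unsigned int) / sizeof(unsigned short) = 2 and each packed word carries
// two digit counters that are scanned together in a single prefix-sum pass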
enum
{
// The thread block size in threads
BLOCK_THREADS = BLOCK_DIM_X * BLOCK_DIM_Y * BLOCK_DIM_Z,
RADIX_DIGITS = 1 << RADIX_BITS,
LOG_WARP_THREADS = Log2<HIPCUB_DEVICE_WARP_THREADS>::VALUE,
WARP_THREADS = 1 << LOG_WARP_THREADS,
WARPS = (BLOCK_THREADS + WARP_THREADS - 1) / WARP_THREADS,
BYTES_PER_COUNTER = sizeof(DigitCounter),
LOG_BYTES_PER_COUNTER = Log2<BYTES_PER_COUNTER>::VALUE,
PACKING_RATIO = sizeof(PackedCounter) / sizeof(DigitCounter),
LOG_PACKING_RATIO = Log2<PACKING_RATIO>::VALUE,
LOG_COUNTER_LANES = rocprim::maximum<int>()((int(RADIX_BITS) - int(LOG_PACKING_RATIO)), 0), // Always at least one lane
COUNTER_LANES = 1 << LOG_COUNTER_LANES,
// The number of packed counters per thread (plus one for padding)
PADDED_COUNTER_LANES = COUNTER_LANES + 1,
RAKING_SEGMENT = PADDED_COUNTER_LANES,
};
public:
enum
{
/// Number of bin-starting offsets tracked per thread
BINS_TRACKED_PER_THREAD = rocprim::maximum<int>()(1, (RADIX_DIGITS + BLOCK_THREADS - 1) / BLOCK_THREADS),
};
private:
/// BlockScan type
typedef BlockScan<
PackedCounter,
BLOCK_DIM_X,
INNER_SCAN_ALGORITHM,
BLOCK_DIM_Y,
BLOCK_DIM_Z,
ARCH>
BlockScan;
#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document
/// Shared memory storage layout type for BlockRadixRank
struct __align__(16) _TempStorage
{
union Aliasable
{
DigitCounter digit_counters[PADDED_COUNTER_LANES * BLOCK_THREADS * PACKING_RATIO];
PackedCounter raking_grid[BLOCK_THREADS * RAKING_SEGMENT];
} aliasable;
// Storage for scanning local ranks
typename BlockScan::TempStorage block_scan;
};
#endif
/******************************************************************************
* Thread fields
******************************************************************************/
/// Shared storage reference
_TempStorage &temp_storage;
/// Linear thread-id
unsigned int linear_tid;
/// Copy of raking segment, promoted to registers
PackedCounter cached_segment[RAKING_SEGMENT];
/******************************************************************************
* Utility methods
******************************************************************************/
/**
* Internal storage allocator
*/
HIPCUB_DEVICE inline _TempStorage& PrivateStorage()
{
__shared__ _TempStorage private_storage;
return private_storage;
}
/**
* Performs upsweep raking reduction, returning the aggregate
*/
HIPCUB_DEVICE inline PackedCounter Upsweep()
{
PackedCounter *smem_raking_ptr = &temp_storage.aliasable.raking_grid[linear_tid * RAKING_SEGMENT];
PackedCounter *raking_ptr;
if (MEMOIZE_OUTER_SCAN)
{
// Copy data into registers
#pragma unroll
for (int i = 0; i < RAKING_SEGMENT; i++)
{
cached_segment[i] = smem_raking_ptr[i];
}
raking_ptr = cached_segment;
}
else
{
raking_ptr = smem_raking_ptr;
}
return internal::ThreadReduce<RAKING_SEGMENT>(raking_ptr, Sum());
}
/// Performs exclusive downsweep raking scan
HIPCUB_DEVICE inline void ExclusiveDownsweep(
PackedCounter raking_partial)
{
PackedCounter *smem_raking_ptr = &temp_storage.aliasable.raking_grid[linear_tid * RAKING_SEGMENT];
PackedCounter *raking_ptr = (MEMOIZE_OUTER_SCAN) ?
cached_segment :
smem_raking_ptr;
// Exclusive raking downsweep scan
internal::ThreadScanExclusive<RAKING_SEGMENT>(raking_ptr, raking_ptr, Sum(), raking_partial);
if (MEMOIZE_OUTER_SCAN)
{
// Copy data back to smem
#pragma unroll
for (int i = 0; i < RAKING_SEGMENT; i++)
{
smem_raking_ptr[i] = cached_segment[i];
}
}
}
/**
* Reset shared memory digit counters
*/
HIPCUB_DEVICE inline void ResetCounters()
{
// Reset shared memory digit counters
#pragma unroll
for (int LANE = 0; LANE < PADDED_COUNTER_LANES; LANE++)
{
#pragma unroll
for (int SUB_COUNTER = 0; SUB_COUNTER < PACKING_RATIO; SUB_COUNTER++)
{
temp_storage.aliasable.digit_counters[(LANE * BLOCK_THREADS + linear_tid) * PACKING_RATIO + SUB_COUNTER] = 0;
}
}
}
/**
* Block-scan prefix callback
*/
struct PrefixCallBack
{
HIPCUB_DEVICE inline PackedCounter operator()(PackedCounter block_aggregate)
{
PackedCounter block_prefix = 0;
// Propagate totals in packed fields
#pragma unroll
for (int PACKED = 1; PACKED < PACKING_RATIO; PACKED++)
{
block_prefix += block_aggregate << (sizeof(DigitCounter) * 8 * PACKED);
}
return block_prefix;
}
};
/**
* Scan shared memory digit counters.
*/
HIPCUB_DEVICE inline void ScanCounters()
{
// Upsweep scan
PackedCounter raking_partial = Upsweep();
// Compute exclusive sum
PackedCounter exclusive_partial;
PrefixCallBack prefix_call_back;
BlockScan(temp_storage.block_scan).ExclusiveSum(raking_partial, exclusive_partial, prefix_call_back);
// Downsweep scan with exclusive partial
ExclusiveDownsweep(exclusive_partial);
}
public:
/// \smemstorage{BlockRadixRank}
struct TempStorage : Uninitialized<_TempStorage> {};
/******************************************************************//**
* \name Collective constructors
*********************************************************************/
//@{
/**
* \brief Collective constructor using a private static allocation of shared memory as temporary storage.
*/
HIPCUB_DEVICE inline BlockRadixRank()
:
temp_storage(PrivateStorage()),
linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z))
{}
/**
* \brief Collective constructor using the specified memory allocation as temporary storage.
*/
HIPCUB_DEVICE inline BlockRadixRank(
TempStorage &temp_storage) ///< [in] Reference to memory allocation having layout type TempStorage
:
temp_storage(temp_storage.Alias()),
linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z))
{}
//@} end member group
/******************************************************************//**
* \name Raking
*********************************************************************/
//@{
/**
* \brief Rank keys.
*/
template <
typename UnsignedBits,
int KEYS_PER_THREAD,
typename DigitExtractorT>
HIPCUB_DEVICE inline void RankKeys(
UnsignedBits (&keys)[KEYS_PER_THREAD], ///< [in] Keys for this tile
int (&ranks)[KEYS_PER_THREAD], ///< [out] For each key, the local rank within the tile
DigitExtractorT digit_extractor) ///< [in] The digit extractor
{
DigitCounter thread_prefixes[KEYS_PER_THREAD]; // For each key, the count of previous keys in this tile having the same digit
DigitCounter* digit_counters[KEYS_PER_THREAD]; // For each key, the byte-offset of its corresponding digit counter in smem
// Reset shared memory digit counters
ResetCounters();
#pragma unroll
for (int ITEM = 0; ITEM < KEYS_PER_THREAD; ++ITEM)
{
// Get digit
unsigned int digit = digit_extractor.Digit(keys[ITEM]);
// Get sub-counter
unsigned int sub_counter = digit >> LOG_COUNTER_LANES;
// Get counter lane
unsigned int counter_lane = digit & (COUNTER_LANES - 1);
if (IS_DESCENDING)
{
sub_counter = PACKING_RATIO - 1 - sub_counter;
counter_lane = COUNTER_LANES - 1 - counter_lane;
}
// Pointer to smem digit counter
digit_counters[ITEM] = &temp_storage.aliasable.digit_counters[counter_lane * BLOCK_THREADS * PACKING_RATIO + linear_tid * PACKING_RATIO + sub_counter];
// Load thread-exclusive prefix
thread_prefixes[ITEM] = *digit_counters[ITEM];
// Store inclusive prefix
*digit_counters[ITEM] = thread_prefixes[ITEM] + 1;
}
::rocprim::syncthreads();
// Scan shared memory counters
ScanCounters();
::rocprim::syncthreads();
// Extract the local ranks of each key
#pragma unroll
for (int ITEM = 0; ITEM < KEYS_PER_THREAD; ++ITEM)
{
// Add in thread block exclusive prefix
ranks[ITEM] = thread_prefixes[ITEM] + *digit_counters[ITEM];
}
}
/**
* \brief Rank keys. For the lower \p RADIX_DIGITS threads, digit counts for each digit are provided for the corresponding thread.
*/
template <
typename UnsignedBits,
int KEYS_PER_THREAD,
typename DigitExtractorT>
HIPCUB_DEVICE inline void RankKeys(
UnsignedBits (&keys)[KEYS_PER_THREAD], ///< [in] Keys for this tile
int (&ranks)[KEYS_PER_THREAD], ///< [out] For each key, the local rank within the tile (out parameter)
DigitExtractorT digit_extractor, ///< [in] The digit extractor
int (&exclusive_digit_prefix)[BINS_TRACKED_PER_THREAD]) ///< [out] The exclusive prefix sum for the digits [(threadIdx.x * BINS_TRACKED_PER_THREAD) ... (threadIdx.x * BINS_TRACKED_PER_THREAD) + BINS_TRACKED_PER_THREAD - 1]
{
// Rank keys
RankKeys(keys, ranks, digit_extractor);
// Get the inclusive and exclusive digit totals corresponding to the calling thread.
#pragma unroll
for (int track = 0; track < BINS_TRACKED_PER_THREAD; ++track)
{
int bin_idx = (linear_tid * BINS_TRACKED_PER_THREAD) + track;
if ((BLOCK_THREADS == RADIX_DIGITS) || (bin_idx < RADIX_DIGITS))
{
if (IS_DESCENDING)
bin_idx = RADIX_DIGITS - bin_idx - 1;
// Obtain ex/inclusive digit counts. (Unfortunately these all reside in the
// first counter column, resulting in unavoidable bank conflicts.)
unsigned int counter_lane = (bin_idx & (COUNTER_LANES - 1));
unsigned int sub_counter = bin_idx >> (LOG_COUNTER_LANES);
exclusive_digit_prefix[track] = temp_storage.aliasable.digit_counters[counter_lane * BLOCK_THREADS * PACKING_RATIO + sub_counter];
}
}
}
};
/**
* Radix-rank using match.any
*/
template <
int BLOCK_DIM_X,
int RADIX_BITS,
bool IS_DESCENDING,
BlockScanAlgorithm INNER_SCAN_ALGORITHM = BLOCK_SCAN_WARP_SCANS,
int BLOCK_DIM_Y = 1,
int BLOCK_DIM_Z = 1,
int ARCH = HIPCUB_ARCH>
class BlockRadixRankMatch
{
private:
/******************************************************************************
* Type definitions and constants
******************************************************************************/
typedef int32_t RankT;
typedef int32_t DigitCounterT;
enum
{
// The thread block size in threads
BLOCK_THREADS = BLOCK_DIM_X * BLOCK_DIM_Y * BLOCK_DIM_Z,
RADIX_DIGITS = 1 << RADIX_BITS,
LOG_WARP_THREADS = Log2<HIPCUB_DEVICE_WARP_THREADS>::VALUE,
WARP_THREADS = 1 << LOG_WARP_THREADS,
WARPS = (BLOCK_THREADS + WARP_THREADS - 1) / WARP_THREADS,
PADDED_WARPS = ((WARPS & 0x1) == 0) ?
WARPS + 1 :
WARPS,
COUNTERS = PADDED_WARPS * RADIX_DIGITS,
RAKING_SEGMENT = (COUNTERS + BLOCK_THREADS - 1) / BLOCK_THREADS,
PADDED_RAKING_SEGMENT = ((RAKING_SEGMENT & 0x1) == 0) ?
RAKING_SEGMENT + 1 :
RAKING_SEGMENT,
};
public:
enum
{
/// Number of bin-starting offsets tracked per thread
BINS_TRACKED_PER_THREAD = rocprim::maximum<int>()(1, (RADIX_DIGITS + BLOCK_THREADS - 1) / BLOCK_THREADS),
};
private:
/// BlockScan type
typedef BlockScan<
DigitCounterT,
BLOCK_DIM_X,
INNER_SCAN_ALGORITHM,
BLOCK_DIM_Y,
BLOCK_DIM_Z,
ARCH>
BlockScanT;
#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document
/// Shared memory storage layout type for BlockRadixRank
struct __align__(16) _TempStorage
{
typename BlockScanT::TempStorage block_scan;
union __align__(16) Aliasable
{
volatile DigitCounterT warp_digit_counters[RADIX_DIGITS * PADDED_WARPS];
DigitCounterT raking_grid[BLOCK_THREADS * PADDED_RAKING_SEGMENT];
} aliasable;
};
#endif
/******************************************************************************
* Thread fields
******************************************************************************/
/// Shared storage reference
_TempStorage &temp_storage;
/// Linear thread-id
unsigned int linear_tid;
public:
/// \smemstorage{BlockRadixRankMatch}
struct TempStorage : Uninitialized<_TempStorage> {};
/******************************************************************//**
* \name Collective constructors
*********************************************************************/
//@{
/**
* \brief Collective constructor using the specified memory allocation as temporary storage.
*/
HIPCUB_DEVICE inline BlockRadixRankMatch(
TempStorage &temp_storage) ///< [in] Reference to memory allocation having layout type TempStorage
:
temp_storage(temp_storage.Alias()),
linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z))
{}
//@} end member group
/******************************************************************//**
* \name Raking
*********************************************************************/
//@{
/**
* \brief Rank keys.
*/
template <
typename UnsignedBits,
int KEYS_PER_THREAD,
typename DigitExtractorT>
__device__ __forceinline__ void RankKeys(
UnsignedBits (&keys)[KEYS_PER_THREAD], ///< [in] Keys for this tile
int (&ranks)[KEYS_PER_THREAD], ///< [out] For each key, the local rank within the tile
DigitExtractorT digit_extractor) ///< [in] The digit extractor
{
// Initialize shared digit counters
#pragma unroll
for (int ITEM = 0; ITEM < PADDED_RAKING_SEGMENT; ++ITEM)
temp_storage.aliasable.raking_grid[linear_tid * PADDED_RAKING_SEGMENT + ITEM] = 0;
::rocprim::syncthreads();
// Each warp will strip-mine its section of input, one strip at a time
volatile DigitCounterT *digit_counters[KEYS_PER_THREAD];
uint32_t warp_id = linear_tid >> LOG_WARP_THREADS;
uint32_t lane_mask_lt = LaneMaskLt();
#pragma unroll
for (int ITEM = 0; ITEM < KEYS_PER_THREAD; ++ITEM)
{
// My digit
uint32_t digit = digit_extractor.Digit(keys[ITEM]);
if (IS_DESCENDING)
digit = RADIX_DIGITS - digit - 1;
// Mask of peers who have same digit as me
uint32_t peer_mask = rocprim::MatchAny<RADIX_BITS>(digit);
// Pointer to smem digit counter for this key
digit_counters[ITEM] = &temp_storage.aliasable.warp_digit_counters[digit * PADDED_WARPS + warp_id];
// Number of occurrences in previous strips
DigitCounterT warp_digit_prefix = *digit_counters[ITEM];
// Warp-sync
WARP_SYNC(0xFFFFFFFF);
// Number of peers having same digit as me
int32_t digit_count = __popc(peer_mask);
// Number of lower-ranked peers having same digit seen so far
int32_t peer_digit_prefix = __popc(peer_mask & lane_mask_lt);
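// Example: if lanes 3, 5 and 9 of the warp share this digit, then for lane 5
// digit_count == 3 and peer_digit_prefix == 1 (only lane 3 precedes it)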
if (peer_digit_prefix == 0)
{
// First thread for each digit updates the shared warp counter
*digit_counters[ITEM] = DigitCounterT(warp_digit_prefix + digit_count);
}
// Warp-sync
WARP_SYNC(0xFFFFFFFF);
// Number of prior keys having same digit
ranks[ITEM] = warp_digit_prefix + DigitCounterT(peer_digit_prefix);
}
::rocprim::syncthreads();
// Scan warp counters
DigitCounterT scan_counters[PADDED_RAKING_SEGMENT];
#pragma unroll
for (int ITEM = 0; ITEM < PADDED_RAKING_SEGMENT; ++ITEM)
scan_counters[ITEM] = temp_storage.aliasable.raking_grid[linear_tid * PADDED_RAKING_SEGMENT + ITEM];
BlockScanT(temp_storage.block_scan).ExclusiveSum(scan_counters, scan_counters);
#pragma unroll
for (int ITEM = 0; ITEM < PADDED_RAKING_SEGMENT; ++ITEM)
temp_storage.aliasable.raking_grid[linear_tid * PADDED_RAKING_SEGMENT + ITEM] = scan_counters[ITEM];
::rocprim::syncthreads();
// Seed ranks with counter values from previous warps
#pragma unroll
for (int ITEM = 0; ITEM < KEYS_PER_THREAD; ++ITEM)
ranks[ITEM] += *digit_counters[ITEM];
}
/**
* \brief Rank keys. For the lower \p RADIX_DIGITS threads, digit counts for each digit are provided for the corresponding thread.
*/
template <
typename UnsignedBits,
int KEYS_PER_THREAD,
typename DigitExtractorT>
__device__ __forceinline__ void RankKeys(
UnsignedBits (&keys)[KEYS_PER_THREAD], ///< [in] Keys for this tile
int (&ranks)[KEYS_PER_THREAD], ///< [out] For each key, the local rank within the tile (out parameter)
DigitExtractorT digit_extractor, ///< [in] The digit extractor
int (&exclusive_digit_prefix)[BINS_TRACKED_PER_THREAD]) ///< [out] The exclusive prefix sum for the digits [(threadIdx.x * BINS_TRACKED_PER_THREAD) ... (threadIdx.x * BINS_TRACKED_PER_THREAD) + BINS_TRACKED_PER_THREAD - 1]
{
RankKeys(keys, ranks, digit_extractor);
// Get exclusive count for each digit
#pragma unroll
for (int track = 0; track < BINS_TRACKED_PER_THREAD; ++track)
{
int bin_idx = (linear_tid * BINS_TRACKED_PER_THREAD) + track;
if ((BLOCK_THREADS == RADIX_DIGITS) || (bin_idx < RADIX_DIGITS))
{
if (IS_DESCENDING)
bin_idx = RADIX_DIGITS - bin_idx - 1;
exclusive_digit_prefix[track] = temp_storage.aliasable.warp_digit_counters[bin_idx * PADDED_WARPS];
}
}
}
};
END_HIPCUB_NAMESPACE
#endif // HIPCUB_ROCPRIM_BLOCK_BLOCK_RADIX_RANK_HPP_
/******************************************************************************
* Copyright (c) 2010-2011, Duane Merrill. All rights reserved.
* Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved.
* Modifications Copyright (c) 2017-2020, Advanced Micro Devices, Inc. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of the NVIDIA CORPORATION nor the
* names of its contributors may be used to endorse or promote products
* derived from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
* DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
******************************************************************************/
#ifndef HIPCUB_ROCPRIM_BLOCK_BLOCK_RADIX_SORT_HPP_
#define HIPCUB_ROCPRIM_BLOCK_BLOCK_RADIX_SORT_HPP_
#include "../config.hpp"
#include "../util_type.cuh"
#include <cub/rocprim/functional.hpp>
#include <cub/rocprim/block/block_radix_sort.hpp>
#include "block_scan.cuh"
BEGIN_HIPCUB_NAMESPACE
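/**
 * \brief BlockRadixSort provides methods for sorting items partitioned across
 * a thread block using a radix sorting method (a thin wrapper over
 * ::rocprim::block_radix_sort).
 *
 * \par
 * A minimal usage sketch (illustrative only, mirroring the conventions of the
 * BlockMergeSort example above):
 * \code
 * #include <hipcub/hipcub.hpp>
 *
 * __global__ void ExampleKernel(...)
 * {
 *     // Specialize BlockRadixSort for a 1D block of 128 threads owning 4 integer keys each
 *     typedef cub::BlockRadixSort<int, 128, 4> BlockRadixSort;
 *
 *     // Allocate shared memory for BlockRadixSort
 *     __shared__ typename BlockRadixSort::TempStorage temp_storage;
 *
 *     // Obtain a segment of consecutive keys that are blocked across threads
 *     int thread_keys[4];
 *     ...
 *
 *     BlockRadixSort(temp_storage).Sort(thread_keys);
 *     ...
 * }
 * \endcode
 */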
template<
typename KeyT,
int BLOCK_DIM_X,
int ITEMS_PER_THREAD,
typename ValueT = NullType,
int RADIX_BITS = 4, /* ignored */
bool MEMOIZE_OUTER_SCAN = true, /* ignored */
BlockScanAlgorithm INNER_SCAN_ALGORITHM = BLOCK_SCAN_WARP_SCANS, /* ignored */
cudaSharedMemConfig SMEM_CONFIG = cudaSharedMemBankSizeFourByte, /* ignored */
int BLOCK_DIM_Y = 1,
int BLOCK_DIM_Z = 1,
int PTX_ARCH = HIPCUB_ARCH /* ignored */
>
class BlockRadixSort
: private ::rocprim::block_radix_sort<
KeyT,
BLOCK_DIM_X,
ITEMS_PER_THREAD,
ValueT,
BLOCK_DIM_Y,
BLOCK_DIM_Z
>
{
static_assert(
BLOCK_DIM_X * BLOCK_DIM_Y * BLOCK_DIM_Z > 0,
"BLOCK_DIM_X * BLOCK_DIM_Y * BLOCK_DIM_Z must be greater than 0"
);
using base_type =
typename ::rocprim::block_radix_sort<
KeyT,
BLOCK_DIM_X,
ITEMS_PER_THREAD,
ValueT,
BLOCK_DIM_Y,
BLOCK_DIM_Z
>;
// Reference to temporary storage (usually shared memory)
typename base_type::storage_type& temp_storage_;
public:
using TempStorage = typename base_type::storage_type;
HIPCUB_DEVICE inline
BlockRadixSort() : temp_storage_(private_storage())
{
}
HIPCUB_DEVICE inline
BlockRadixSort(TempStorage& temp_storage) : temp_storage_(temp_storage)
{
}
HIPCUB_DEVICE inline
void Sort(KeyT (&keys)[ITEMS_PER_THREAD],
int begin_bit = 0,
int end_bit = sizeof(KeyT) * 8)
{
base_type::sort(keys, temp_storage_, begin_bit, end_bit);
}
HIPCUB_DEVICE inline
void Sort(KeyT (&keys)[ITEMS_PER_THREAD],
ValueT (&values)[ITEMS_PER_THREAD],
int begin_bit = 0,
int end_bit = sizeof(KeyT) * 8)
{
base_type::sort(keys, values, temp_storage_, begin_bit, end_bit);
}
HIPCUB_DEVICE inline
void SortDescending(KeyT (&keys)[ITEMS_PER_THREAD],
int begin_bit = 0,
int end_bit = sizeof(KeyT) * 8)
{
base_type::sort_desc(keys, temp_storage_, begin_bit, end_bit);
}
HIPCUB_DEVICE inline
void SortDescending(KeyT (&keys)[ITEMS_PER_THREAD],
ValueT (&values)[ITEMS_PER_THREAD],
int begin_bit = 0,
int end_bit = sizeof(KeyT) * 8)
{
base_type::sort_desc(keys, values, temp_storage_, begin_bit, end_bit);
}
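// The BlockedToStriped variants below leave results in a striped arrangement:
// e.g., with 128 threads and 4 items per thread, thread t ends up holding
// elements t, t + 128, t + 256 and t + 384 of the sorted sequence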
HIPCUB_DEVICE inline
void SortBlockedToStriped(KeyT (&keys)[ITEMS_PER_THREAD],
int begin_bit = 0,
int end_bit = sizeof(KeyT) * 8)
{
base_type::sort_to_striped(keys, temp_storage_, begin_bit, end_bit);
}
HIPCUB_DEVICE inline
void SortBlockedToStriped(KeyT (&keys)[ITEMS_PER_THREAD],
ValueT (&values)[ITEMS_PER_THREAD],
int begin_bit = 0,
int end_bit = sizeof(KeyT) * 8)
{
base_type::sort_to_striped(keys, values, temp_storage_, begin_bit, end_bit);
}
HIPCUB_DEVICE inline
void SortDescendingBlockedToStriped(KeyT (&keys)[ITEMS_PER_THREAD],
int begin_bit = 0,
int end_bit = sizeof(KeyT) * 8)
{
base_type::sort_desc_to_striped(keys, temp_storage_, begin_bit, end_bit);
}
HIPCUB_DEVICE inline
void SortDescendingBlockedToStriped(KeyT (&keys)[ITEMS_PER_THREAD],
ValueT (&values)[ITEMS_PER_THREAD],
int begin_bit = 0,
int end_bit = sizeof(KeyT) * 8)
{
base_type::sort_desc_to_striped(keys, values, temp_storage_, begin_bit, end_bit);
}
private:
HIPCUB_DEVICE inline
TempStorage& private_storage()
{
HIPCUB_SHARED_MEMORY TempStorage private_storage;
return private_storage;
}
};
END_HIPCUB_NAMESPACE
#endif // HIPCUB_ROCPRIM_BLOCK_BLOCK_RADIX_SORT_HPP_
/******************************************************************************
* Copyright (c) 2011, Duane Merrill. All rights reserved.
* Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of the NVIDIA CORPORATION nor the
* names of its contributors may be used to endorse or promote products
* derived from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
* DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
******************************************************************************/
/**
* \file
* cub::BlockRakingLayout provides a conflict-free shared memory layout abstraction for warp-raking across thread block data.
*/
#ifndef HIPCUB_ROCPRIM_BLOCK_BLOCK_RAKING_LAYOUT_HPP_
#define HIPCUB_ROCPRIM_BLOCK_BLOCK_RAKING_LAYOUT_HPP_
#include <type_traits>
#include "../config.hpp"
#include <cub/rocprim/config.hpp>
#include <cub/rocprim/detail/various.hpp>
BEGIN_HIPCUB_NAMESPACE
/**
* \brief BlockRakingLayout provides a conflict-free shared memory layout abstraction for 1D raking across thread block data.
* \ingroup BlockModule
*
* \par Overview
* This type facilitates a shared memory usage pattern where a block of CUDA
* threads places elements into shared memory and then reduces the active
* parallelism to one "raking" warp of threads for serially aggregating consecutive
* sequences of shared items. Padding is inserted to eliminate bank conflicts
* (for most data types).
*
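* \par
* A minimal sketch of the intended access pattern (illustrative only; BLOCK_THREADS,
* linear_tid and partial are assumed to be supplied by the caller):
* \code
* typedef cub::BlockRakingLayout<int, BLOCK_THREADS> BlockRakingLayoutT;
* __shared__ typename BlockRakingLayoutT::TempStorage temp_storage;
*
* // Every thread deposits its partial into the padded grid
* *BlockRakingLayoutT::PlacementPtr(temp_storage, linear_tid) = partial;
* __syncthreads();
*
* // A single warp then rakes consecutive segments of the grid
* if (linear_tid < BlockRakingLayoutT::RAKING_THREADS)
* {
*     int *raking_segment = BlockRakingLayoutT::RakingPtr(temp_storage, linear_tid);
*     ...
* }
* \endcode
*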
* \tparam T The data type to be exchanged.
* \tparam BLOCK_THREADS The thread block size in threads.
* \tparam ARCH <b>[optional]</b> \ptxversion
*/
template <
typename T,
int BLOCK_THREADS,
int ARCH = HIPCUB_ARCH /* ignored */
>
struct BlockRakingLayout
{
//---------------------------------------------------------------------
// Constants and type definitions
//---------------------------------------------------------------------
enum
{
/// The total number of elements that need to be cooperatively reduced
SHARED_ELEMENTS = BLOCK_THREADS,
/// Maximum number of warp-synchronous raking threads
MAX_RAKING_THREADS = ::rocprim::detail::get_min_warp_size(BLOCK_THREADS, HIPCUB_DEVICE_WARP_THREADS),
/// Number of raking elements per warp-synchronous raking thread (rounded up)
SEGMENT_LENGTH = (SHARED_ELEMENTS + MAX_RAKING_THREADS - 1) / MAX_RAKING_THREADS,
/// Never use a raking thread that will have no valid data (e.g., when BLOCK_THREADS is 62 and SEGMENT_LENGTH is 2, we should only use 31 raking threads)
RAKING_THREADS = (SHARED_ELEMENTS + SEGMENT_LENGTH - 1) / SEGMENT_LENGTH,
/// Pad each segment length with one element if segment length is not relatively prime to warp size and can't be optimized as a vector load
USE_SEGMENT_PADDING = ((SEGMENT_LENGTH & 1) == 0) && (SEGMENT_LENGTH > 2),
/// Total number of elements in the raking grid
GRID_ELEMENTS = RAKING_THREADS * (SEGMENT_LENGTH + USE_SEGMENT_PADDING),
/// Whether raking can proceed without bounds checking (the number of reduction elements is a multiple of the number of raking threads)
UNGUARDED = (SHARED_ELEMENTS % RAKING_THREADS == 0),
};
#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document
/**
* \brief Shared memory storage type
*/
struct __align__(16) _TempStorage
{
T buff[BlockRakingLayout::GRID_ELEMENTS];
};
#endif
/// Alias wrapper allowing storage to be unioned
struct TempStorage : Uninitialized<_TempStorage> {};
/**
* \brief Returns the location for the calling thread to place data into the grid
*/
static HIPCUB_DEVICE inline T* PlacementPtr(
TempStorage &temp_storage,
unsigned int linear_tid)
{
// Offset for partial
unsigned int offset = linear_tid;
// Add in one padding element for every segment
if (USE_SEGMENT_PADDING > 0)
{
offset += offset / SEGMENT_LENGTH;
}
// Incorporating a block of padding partials every shared memory segment
return temp_storage.Alias().buff + offset;
}
/**
* \brief Returns the location for the calling thread to begin sequential raking
*/
static HIPCUB_DEVICE inline T* RakingPtr(
TempStorage &temp_storage,
unsigned int linear_tid)
{
return temp_storage.Alias().buff + (linear_tid * (SEGMENT_LENGTH + USE_SEGMENT_PADDING));
}
};
END_HIPCUB_NAMESPACE
#endif // HIPCUB_ROCPRIM_BLOCK_BLOCK_RAKING_LAYOUT_HPP_
/******************************************************************************
* Copyright (c) 2010-2011, Duane Merrill. All rights reserved.
* Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved.
* Modifications Copyright (c) 2017-2020, Advanced Micro Devices, Inc. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of the NVIDIA CORPORATION nor the
* names of its contributors may be used to endorse or promote products
* derived from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
* DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
******************************************************************************/
#ifndef HIPCUB_ROCPRIM_BLOCK_BLOCK_REDUCE_HPP_
#define HIPCUB_ROCPRIM_BLOCK_BLOCK_REDUCE_HPP_
#include <type_traits>
#include <cub/rocprim/block/block_reduce.hpp>
BEGIN_HIPCUB_NAMESPACE
namespace detail
{
inline constexpr
typename std::underlying_type<::rocprim::block_reduce_algorithm>::type
to_BlockReduceAlgorithm_enum(::rocprim::block_reduce_algorithm v)
{
using utype = std::underlying_type<::rocprim::block_reduce_algorithm>::type;
return static_cast<utype>(v);
}
}
enum BlockReduceAlgorithm
{
BLOCK_REDUCE_RAKING_COMMUTATIVE_ONLY
= detail::to_BlockReduceAlgorithm_enum(::rocprim::block_reduce_algorithm::raking_reduce_commutative_only),
BLOCK_REDUCE_RAKING
= detail::to_BlockReduceAlgorithm_enum(::rocprim::block_reduce_algorithm::raking_reduce),
BLOCK_REDUCE_WARP_REDUCTIONS
= detail::to_BlockReduceAlgorithm_enum(::rocprim::block_reduce_algorithm::using_warp_reduce)
};
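/**
 * \brief BlockReduce provides collective reduction methods across a thread
 * block (a thin wrapper over ::rocprim::block_reduce).
 *
 * \par
 * A minimal usage sketch (illustrative only):
 * \code
 * #include <hipcub/hipcub.hpp>
 *
 * __global__ void ExampleKernel(...)
 * {
 *     // Specialize BlockReduce for a 1D block of 128 threads reducing ints
 *     typedef cub::BlockReduce<int, 128> BlockReduce;
 *
 *     // Allocate shared memory for BlockReduce
 *     __shared__ typename BlockReduce::TempStorage temp_storage;
 *
 *     int thread_data;
 *     ...
 *
 *     // Compute the block-wide sum (by convention the aggregate is only valid for thread 0)
 *     int aggregate = BlockReduce(temp_storage).Sum(thread_data);
 *     ...
 * }
 * \endcode
 */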
template<
typename T,
int BLOCK_DIM_X,
BlockReduceAlgorithm ALGORITHM = BLOCK_REDUCE_WARP_REDUCTIONS,
int BLOCK_DIM_Y = 1,
int BLOCK_DIM_Z = 1,
int ARCH = HIPCUB_ARCH /* ignored */
>
class BlockReduce
: private ::rocprim::block_reduce<
T,
BLOCK_DIM_X,
static_cast<::rocprim::block_reduce_algorithm>(ALGORITHM),
BLOCK_DIM_Y,
BLOCK_DIM_Z
>
{
static_assert(
BLOCK_DIM_X * BLOCK_DIM_Y * BLOCK_DIM_Z > 0,
"BLOCK_DIM_X * BLOCK_DIM_Y * BLOCK_DIM_Z must be greater than 0"
);
using base_type =
typename ::rocprim::block_reduce<
T,
BLOCK_DIM_X,
static_cast<::rocprim::block_reduce_algorithm>(ALGORITHM),
BLOCK_DIM_Y,
BLOCK_DIM_Z
>;
// Reference to temporary storage (usually shared memory)
typename base_type::storage_type& temp_storage_;
public:
using TempStorage = typename base_type::storage_type;
HIPCUB_DEVICE inline
BlockReduce() : temp_storage_(private_storage())
{
}
HIPCUB_DEVICE inline
BlockReduce(TempStorage& temp_storage) : temp_storage_(temp_storage)
{
}
HIPCUB_DEVICE inline
T Sum(T input)
{
base_type::reduce(input, input, temp_storage_);
return input;
}
HIPCUB_DEVICE inline
T Sum(T input, int valid_items)
{
base_type::reduce(input, input, valid_items, temp_storage_);
return input;
}
template<int ITEMS_PER_THREAD>
HIPCUB_DEVICE inline
T Sum(T(&input)[ITEMS_PER_THREAD])
{
T output;
base_type::reduce(input, output, temp_storage_);
return output;
}
template<typename ReduceOp>
HIPCUB_DEVICE inline
T Reduce(T input, ReduceOp reduce_op)
{
base_type::reduce(input, input, temp_storage_, reduce_op);
return input;
}
template<typename ReduceOp>
HIPCUB_DEVICE inline
T Reduce(T input, ReduceOp reduce_op, int valid_items)
{
base_type::reduce(input, input, valid_items, temp_storage_, reduce_op);
return input;
}
template<int ITEMS_PER_THREAD, typename ReduceOp>
HIPCUB_DEVICE inline
T Reduce(T(&input)[ITEMS_PER_THREAD], ReduceOp reduce_op)
{
T output;
base_type::reduce(input, output, temp_storage_, reduce_op);
return output;
}
private:
HIPCUB_DEVICE inline
TempStorage& private_storage()
{
HIPCUB_SHARED_MEMORY TempStorage private_storage;
return private_storage;
}
};
END_HIPCUB_NAMESPACE
#endif // HIPCUB_ROCPRIM_BLOCK_BLOCK_REDUCE_HPP_
/******************************************************************************
* Copyright (c) 2010-2011, Duane Merrill. All rights reserved.
* Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved.
* Modifications Copyright (c) 2021, Advanced Micro Devices, Inc. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of the NVIDIA CORPORATION nor the
* names of its contributors may be used to endorse or promote products
* derived from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
* DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
******************************************************************************/
#ifndef HIPCUB_ROCPRIM_BLOCK_BLOCK_RUN_LENGTH_DECODE_HPP_
#define HIPCUB_ROCPRIM_BLOCK_BLOCK_RUN_LENGTH_DECODE_HPP_
#include "../config.hpp"
#include "../thread/thread_search.cuh"
#include "../util_math.cuh"
#include "../util_ptx.cuh"
#include "../util_type.cuh"
#include "block_scan.cuh"
#include <limits>
#include <type_traits>
BEGIN_HIPCUB_NAMESPACE
/**
* \brief The BlockRunLengthDecode class supports decoding a run-length encoded array of items. That is, given
* the two arrays run_value[N] and run_lengths[N], run_value[i] is repeated run_lengths[i] many times in the output
* array.
* Due to the nature of the run-length decoding algorithm ("decompression"), the output size of the run-length decoded
* array is runtime-dependent and potentially without any upper bound. To address this, BlockRunLengthDecode allows
* retrieving a "window" from the run-length decoded array. The window's offset can be specified, and BLOCK_THREADS *
* DECODED_ITEMS_PER_THREAD (referred to as window_size) decoded items from the specified window will be returned.
*
* \note: Runs of length 0 are supported, but only as trailing runs (i.e., they may only appear at the end of the
* run_lengths array); a run of length zero may not be followed by a run of non-zero length.
*
* \par
* \code
* __global__ void ExampleKernel(...)
* {
* // Specialising BlockRunLengthDecode to run-length decode items of type uint64_t
* using RunItemT = uint64_t;
* // Type large enough to index into the run-length decoded array
* using RunLengthT = uint32_t;
*
* // Specialising BlockRunLengthDecode for a 1D block of 128 threads
* constexpr int BLOCK_DIM_X = 128;
* // Specialising BlockRunLengthDecode to have each thread contribute 2 run-length encoded runs
* constexpr int RUNS_PER_THREAD = 2;
* // Specialising BlockRunLengthDecode to have each thread hold 4 run-length decoded items
* constexpr int DECODED_ITEMS_PER_THREAD = 4;
*
* // Compose the BlockRunLengthDecode type from the parameters above
* using BlockRunLengthDecodeT =
* cub::BlockRunLengthDecode<RunItemT, BLOCK_DIM_X, RUNS_PER_THREAD, DECODED_ITEMS_PER_THREAD>;
*
* // Allocate shared memory for BlockRunLengthDecode
* __shared__ typename BlockRunLengthDecodeT::TempStorage temp_storage;
*
* // The run-length encoded items and how often they shall be repeated in the run-length decoded output
* RunItemT run_values[RUNS_PER_THREAD];
* RunLengthT run_lengths[RUNS_PER_THREAD];
* ...
*
* // Initialize the BlockRunLengthDecode with the runs that we want to run-length decode
* uint32_t total_decoded_size = 0;
* BlockRunLengthDecodeT block_rld(temp_storage, run_values, run_lengths, total_decoded_size);
*
* // Run-length decode ("decompress") the runs into a window buffer of limited size. This is repeated until all runs
* // have been decoded.
* uint32_t decoded_window_offset = 0U;
* while (decoded_window_offset < total_decoded_size)
* {
* RunLengthT relative_offsets[DECODED_ITEMS_PER_THREAD];
* RunItemT decoded_items[DECODED_ITEMS_PER_THREAD];
*
* // The number of decoded items that are valid within this window (aka pass) of run-length decoding
* uint32_t num_valid_items = total_decoded_size - decoded_window_offset;
* block_rld.RunLengthDecode(decoded_items, relative_offsets, decoded_window_offset);
*
* decoded_window_offset += BLOCK_DIM_X * DECODED_ITEMS_PER_THREAD;
*
* ...
* }
* }
* \endcode
* \par
* Suppose the set of input \p run_values across the block of threads is
* <tt>{ [0, 1], [2, 3], [4, 5], [6, 7], ..., [254, 255] }</tt> and
* \p run_lengths is <tt>{ [1, 2], [3, 4], [5, 1], [2, 3], ..., [5, 1] }</tt>.
* The corresponding output \p decoded_items in those threads will be <tt>{ [0, 1, 1, 2], [2, 2, 3, 3], [3, 3, 4, 4],
* [4, 4, 4, 5], ..., [169, 169, 170, 171] }</tt> and \p relative_offsets will be <tt>{ [0, 0, 1, 0], [1, 2, 0, 1], [2,
* 3, 0, 1], [2, 3, 4, 0], ..., [3, 4, 0, 0] }</tt> during the first iteration of the while loop.
*
* \tparam ItemT The data type of the items being run-length decoded
* \tparam BLOCK_DIM_X The thread block length in threads along the X dimension
* \tparam RUNS_PER_THREAD The number of consecutive runs that each thread contributes
* \tparam DECODED_ITEMS_PER_THREAD The maximum number of decoded items that each thread holds
* \tparam DecodedOffsetT Type used to index into the block's decoded items (large enough to hold the sum over all the
* runs' lengths)
* \tparam BLOCK_DIM_Y The thread block length in threads along the Y dimension
* \tparam BLOCK_DIM_Z The thread block length in threads along the Z dimension
*/
template <typename ItemT,
int BLOCK_DIM_X,
int RUNS_PER_THREAD,
int DECODED_ITEMS_PER_THREAD,
typename DecodedOffsetT = uint32_t,
int BLOCK_DIM_Y = 1,
int BLOCK_DIM_Z = 1>
class BlockRunLengthDecode
{
//---------------------------------------------------------------------
// CONFIGS & TYPE ALIASES
//---------------------------------------------------------------------
private:
/// The thread block size in threads
static constexpr int BLOCK_THREADS = BLOCK_DIM_X * BLOCK_DIM_Y * BLOCK_DIM_Z;
/// The number of runs that the block decodes (out-of-bounds items may be padded with run lengths of '0')
static constexpr int BLOCK_RUNS = BLOCK_THREADS * RUNS_PER_THREAD;
/// BlockScan used to determine the beginning of each run (i.e., prefix sum over the runs' length)
using RunOffsetScanT = BlockScan<DecodedOffsetT, BLOCK_DIM_X, BLOCK_SCAN_WARP_SCANS, BLOCK_DIM_Y, BLOCK_DIM_Z>;
/// Type used to index into the block's runs
using RunOffsetT = uint32_t;
/// Shared memory type required by this thread block
union _TempStorage
{
typename RunOffsetScanT::TempStorage offset_scan;
struct
{
ItemT run_values[BLOCK_RUNS];
DecodedOffsetT run_offsets[BLOCK_RUNS];
} runs;
}; // union TempStorage
/// Internal storage allocator (used when the user does not provide pre-allocated shared memory)
HIPCUB_DEVICE __forceinline__ _TempStorage &PrivateStorage()
{
__shared__ _TempStorage private_storage;
return private_storage;
}
/// Shared storage reference
_TempStorage &temp_storage;
/// Linear thread-id
uint32_t linear_tid;
public:
struct TempStorage : Uninitialized<_TempStorage>
{
};
//---------------------------------------------------------------------
// CONSTRUCTOR
//---------------------------------------------------------------------
/**
* \brief Constructor specialised for user-provided temporary storage, initializing using the runs' lengths. The
* algorithm's temporary storage may not be repurposed between the constructor call and subsequent
* <b>RunLengthDecode</b> calls.
*/
template <typename RunLengthT, typename TotalDecodedSizeT>
HIPCUB_DEVICE __forceinline__ BlockRunLengthDecode(TempStorage &temp_storage,
ItemT (&run_values)[RUNS_PER_THREAD],
RunLengthT (&run_lengths)[RUNS_PER_THREAD],
TotalDecodedSizeT &total_decoded_size)
: temp_storage(temp_storage.Alias()), linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z))
{
InitWithRunLengths(run_values, run_lengths, total_decoded_size);
}
/**
* \brief Constructor specialised for user-provided temporary storage, initializing using the runs' offsets. The
* algorithm's temporary storage may not be repurposed between the constructor call and subsequent
* <b>RunLengthDecode</b> calls.
*/
template <typename UserRunOffsetT>
HIPCUB_DEVICE __forceinline__ BlockRunLengthDecode(TempStorage &temp_storage,
ItemT (&run_values)[RUNS_PER_THREAD],
UserRunOffsetT (&run_offsets)[RUNS_PER_THREAD])
: temp_storage(temp_storage.Alias()), linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z))
{
InitWithRunOffsets(run_values, run_offsets);
}
/**
* \brief Constructor specialised for static temporary storage, initializing using the runs' lengths.
*/
template <typename RunLengthT, typename TotalDecodedSizeT>
HIPCUB_DEVICE __forceinline__ BlockRunLengthDecode(ItemT (&run_values)[RUNS_PER_THREAD],
RunLengthT (&run_lengths)[RUNS_PER_THREAD],
TotalDecodedSizeT &total_decoded_size)
: temp_storage(PrivateStorage()), linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z))
{
InitWithRunLengths(run_values, run_lengths, total_decoded_size);
}
/**
* \brief Constructor specialised for static temporary storage, initializing using the runs' offsets.
*/
template <typename UserRunOffsetT>
HIPCUB_DEVICE __forceinline__ BlockRunLengthDecode(ItemT (&run_values)[RUNS_PER_THREAD],
UserRunOffsetT (&run_offsets)[RUNS_PER_THREAD])
: temp_storage(PrivateStorage()), linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z))
{
InitWithRunOffsets(run_values, run_offsets);
}
private:
/**
* \brief Returns the offset of the first value within \p input which compares greater than \p val. This version takes
* \p MAX_NUM_ITEMS, an upper bound of the array size, which will be used to determine the number of binary search
* iterations at compile time.
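* For example, with \p input = {1, 3, 6}, \p num_items = 3 and \p val = 4, the result is 2: the offset of 6,
* the first value comparing greater than 4.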
*/
template <int MAX_NUM_ITEMS,
typename InputIteratorT,
typename OffsetT,
typename T>
HIPCUB_DEVICE __forceinline__ OffsetT StaticUpperBound(InputIteratorT input, ///< [in] Input sequence
OffsetT num_items, ///< [in] Input sequence length
T val) ///< [in] Search key
{
OffsetT lower_bound = 0;
OffsetT upper_bound = num_items;
#pragma unroll
for (int i = 0; i <= Log2<MAX_NUM_ITEMS>::VALUE; i++)
{
OffsetT mid = cub::MidPoint<OffsetT>(lower_bound, upper_bound);
mid = (rocprim::min)(mid, num_items - 1);
if (val < input[mid])
{
upper_bound = mid;
}
else
{
lower_bound = mid + 1;
}
}
return lower_bound;
}
template <typename RunOffsetT>
HIPCUB_DEVICE __forceinline__ void InitWithRunOffsets(ItemT (&run_values)[RUNS_PER_THREAD],
RunOffsetT (&run_offsets)[RUNS_PER_THREAD])
{
// Keep the runs' items and the offsets of each run's beginning in the temporary storage
RunOffsetT thread_dst_offset = static_cast<RunOffsetT>(linear_tid) * static_cast<RunOffsetT>(RUNS_PER_THREAD);
#pragma unroll
for (int i = 0; i < RUNS_PER_THREAD; i++)
{
temp_storage.runs.run_values[thread_dst_offset] = run_values[i];
temp_storage.runs.run_offsets[thread_dst_offset] = run_offsets[i];
thread_dst_offset++;
}
// Ensure run offsets and run values have been written to shared memory
CTA_SYNC();
}
template <typename RunLengthT, typename TotalDecodedSizeT>
HIPCUB_DEVICE __forceinline__ void InitWithRunLengths(ItemT (&run_values)[RUNS_PER_THREAD],
RunLengthT (&run_lengths)[RUNS_PER_THREAD],
TotalDecodedSizeT &total_decoded_size)
{
// Compute the offset for the beginning of each run
DecodedOffsetT run_offsets[RUNS_PER_THREAD];
#pragma unroll
for (int i = 0; i < RUNS_PER_THREAD; i++)
{
run_offsets[i] = static_cast<DecodedOffsetT>(run_lengths[i]);
}
DecodedOffsetT decoded_size_aggregate;
RunOffsetScanT(this->temp_storage.offset_scan).ExclusiveSum(run_offsets, run_offsets, decoded_size_aggregate);
total_decoded_size = static_cast<TotalDecodedSizeT>(decoded_size_aggregate);
// Ensure the prefix scan's temporary storage can be reused (may be superfluous, but depends on scan implementation)
CTA_SYNC();
InitWithRunOffsets(run_values, run_offsets);
}
public:
/**
* \brief Run-length decodes the runs previously passed via a call to Init(...) and returns the run-length decoded
* items in a blocked arrangement to \p decoded_items. If the number of run-length decoded items exceeds the
* run-length decode buffer (i.e., <b>DECODED_ITEMS_PER_THREAD * BLOCK_THREADS</b>), only the items that fit within
* the buffer are returned. Subsequent calls to <b>RunLengthDecode</b> adjusting \p from_decoded_offset can be
* used to retrieve the remaining run-length decoded items. Calling __syncthreads() between any two calls to
* <b>RunLengthDecode</b> is not required.
* \p item_offsets can be used to retrieve each run-length decoded item's relative index within its run. E.g., the
* run-length encoded array of `3, 1, 4` with the respective run lengths of `2, 1, 3` would yield the run-length
* decoded array of `3, 3, 1, 4, 4, 4` with the relative offsets of `0, 1, 0, 0, 1, 2`.
* \smemreuse
*
* \param[out] decoded_items The run-length decoded items to be returned in a blocked arrangement
* \param[out] item_offsets The run-length decoded items' relative offset within the run they belong to
* \param[in] from_decoded_offset The offset within the run-length decoded output at which the returned window begins;
* invoking with a from_decoded_offset larger than total_decoded_size results in undefined behavior.
*/
template <typename RelativeOffsetT>
HIPCUB_DEVICE __forceinline__ void RunLengthDecode(ItemT (&decoded_items)[DECODED_ITEMS_PER_THREAD],
RelativeOffsetT (&item_offsets)[DECODED_ITEMS_PER_THREAD],
DecodedOffsetT from_decoded_offset = 0)
{
// The (global) offset of the first item decoded by this thread
DecodedOffsetT thread_decoded_offset = from_decoded_offset + linear_tid * DECODED_ITEMS_PER_THREAD;
// The run that the first decoded item of this thread belongs to
// If this thread's <thread_decoded_offset> is already beyond the total decoded size, it will be assigned to the
// last run
RunOffsetT assigned_run =
StaticUpperBound<BLOCK_RUNS>(temp_storage.runs.run_offsets, BLOCK_RUNS, thread_decoded_offset) -
static_cast<RunOffsetT>(1U);
DecodedOffsetT assigned_run_begin = temp_storage.runs.run_offsets[assigned_run];
// If this thread is getting assigned the last run, we make sure it will not fetch any other run after this
DecodedOffsetT assigned_run_end = (assigned_run == BLOCK_RUNS - 1)
? thread_decoded_offset + DECODED_ITEMS_PER_THREAD
: temp_storage.runs.run_offsets[assigned_run + 1];
ItemT val = temp_storage.runs.run_values[assigned_run];
#pragma unroll
for (DecodedOffsetT i = 0; i < DECODED_ITEMS_PER_THREAD; i++)
{
decoded_items[i] = val;
item_offsets[i] = thread_decoded_offset - assigned_run_begin;
if (thread_decoded_offset == assigned_run_end - 1)
{
// Make sure a thread that has been assigned the last run does not re-enter this conditional,
// by extending the last run's length to cover all of the thread's remaining items
assigned_run++;
assigned_run_begin = temp_storage.runs.run_offsets[assigned_run];
// If this thread is getting assigned the last run, we make sure it will not fetch any other run after this
assigned_run_end = (assigned_run == BLOCK_RUNS - 1) ? thread_decoded_offset + DECODED_ITEMS_PER_THREAD
: temp_storage.runs.run_offsets[assigned_run + 1];
val = temp_storage.runs.run_values[assigned_run];
}
thread_decoded_offset++;
}
}
/**
* \brief Run-length decodes the runs previously passed via a call to Init(...) and returns the run-length decoded
* items in a blocked arrangement to \p decoded_items. If the number of run-length decoded items exceeds the
* run-length decode buffer (i.e., <b>DECODED_ITEMS_PER_THREAD * BLOCK_THREADS</b>), only the items that fit within
* the buffer are returned. Subsequent calls to <b>RunLengthDecode</b> adjusting \p from_decoded_offset can be
* used to retrieve the remaining run-length decoded items. Calling __syncthreads() between any two calls to
* <b>RunLengthDecode</b> is not required.
*
* \param[out] decoded_items The run-length decoded items to be returned in a blocked arrangement
* \param[in] from_decoded_offset The offset within the run-length decoded output at which the returned window begins;
* invoking with a from_decoded_offset larger than total_decoded_size results in undefined behavior.
*/
HIPCUB_DEVICE __forceinline__ void RunLengthDecode(ItemT (&decoded_items)[DECODED_ITEMS_PER_THREAD],
DecodedOffsetT from_decoded_offset = 0)
{
DecodedOffsetT item_offsets[DECODED_ITEMS_PER_THREAD];
RunLengthDecode(decoded_items, item_offsets, from_decoded_offset);
}
};
END_HIPCUB_NAMESPACE
#endif // HIPCUB_ROCPRIM_BLOCK_BLOCK_RUN_LENGTH_DECODE_HPP_
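// Usage sketch (illustrative only, not part of the header above): decoding the runs from the
// documentation example -- values {3, 1, 4} with lengths {2, 1, 3} -- inside a kernel. The class
// name, template parameters, and constructor follow the public CUB BlockRunLengthDecode
// interface; adapt them to the exact signatures declared earlier in this header.
__global__ void RunLengthDecodeExampleKernel(int *d_decoded, int *d_offsets)
{
    // 64 threads, 1 run per thread, 2 decoded items per thread
    using BlockRunLengthDecodeT = cub::BlockRunLengthDecode<int, 64, 1, 2>;
    __shared__ typename BlockRunLengthDecodeT::TempStorage temp_storage;
    // Zero-length runs act as padding for threads that hold no run
    int run_values[1]  = {0};
    int run_lengths[1] = {0};
    if (threadIdx.x == 0) { run_values[0] = 3; run_lengths[0] = 2; }
    if (threadIdx.x == 1) { run_values[0] = 1; run_lengths[0] = 1; }
    if (threadIdx.x == 2) { run_values[0] = 4; run_lengths[0] = 3; }
    int total_decoded_size = 0;
    BlockRunLengthDecodeT block_rld(temp_storage, run_values, run_lengths, total_decoded_size);
    // One window of 64 * 2 decoded items suffices for the 6 items here
    int decoded_items[2];
    int item_offsets[2];
    block_rld.RunLengthDecode(decoded_items, item_offsets, 0);
    // Guarded store of the blocked arrangement: {3, 3, 1, 4, 4, 4} with offsets {0, 1, 0, 0, 1, 2}
    for (int i = 0; i < 2; ++i)
    {
        int global_idx = threadIdx.x * 2 + i;
        if (global_idx < total_decoded_size)
        {
            d_decoded[global_idx] = decoded_items[i];
            d_offsets[global_idx] = item_offsets[i];
        }
    }
}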
/******************************************************************************
* Copyright (c) 2010-2011, Duane Merrill. All rights reserved.
* Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved.
* Modifications Copyright (c) 2017-2020, Advanced Micro Devices, Inc. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of the NVIDIA CORPORATION nor the
* names of its contributors may be used to endorse or promote products
* derived from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
* DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
******************************************************************************/
#ifndef HIPCUB_ROCPRIM_BLOCK_BLOCK_SCAN_HPP_
#define HIPCUB_ROCPRIM_BLOCK_BLOCK_SCAN_HPP_
#include <type_traits>
#include "../config.hpp"
#include "../thread/thread_operators.cuh"
#include <cub/rocprim/block/block_scan.hpp>
BEGIN_HIPCUB_NAMESPACE
namespace detail
{
inline constexpr
typename std::underlying_type<::rocprim::block_scan_algorithm>::type
to_BlockScanAlgorithm_enum(::rocprim::block_scan_algorithm v)
{
using utype = std::underlying_type<::rocprim::block_scan_algorithm>::type;
return static_cast<utype>(v);
}
}
enum BlockScanAlgorithm
{
BLOCK_SCAN_RAKING
= detail::to_BlockScanAlgorithm_enum(::rocprim::block_scan_algorithm::reduce_then_scan),
BLOCK_SCAN_RAKING_MEMOIZE
= detail::to_BlockScanAlgorithm_enum(::rocprim::block_scan_algorithm::reduce_then_scan),
BLOCK_SCAN_WARP_SCANS
= detail::to_BlockScanAlgorithm_enum(::rocprim::block_scan_algorithm::using_warp_scan)
};
template<
typename T,
int BLOCK_DIM_X,
BlockScanAlgorithm ALGORITHM = BLOCK_SCAN_RAKING,
int BLOCK_DIM_Y = 1,
int BLOCK_DIM_Z = 1,
int ARCH = HIPCUB_ARCH /* ignored */
>
class BlockScan
: private ::rocprim::block_scan<
T,
BLOCK_DIM_X,
static_cast<::rocprim::block_scan_algorithm>(ALGORITHM),
BLOCK_DIM_Y,
BLOCK_DIM_Z
>
{
static_assert(
BLOCK_DIM_X * BLOCK_DIM_Y * BLOCK_DIM_Z > 0,
"BLOCK_DIM_X * BLOCK_DIM_Y * BLOCK_DIM_Z must be greater than 0"
);
using base_type =
typename ::rocprim::block_scan<
T,
BLOCK_DIM_X,
static_cast<::rocprim::block_scan_algorithm>(ALGORITHM),
BLOCK_DIM_Y,
BLOCK_DIM_Z
>;
// Reference to temporary storage (usually shared memory)
typename base_type::storage_type& temp_storage_;
public:
using TempStorage = typename base_type::storage_type;
HIPCUB_DEVICE inline
BlockScan() : temp_storage_(private_storage())
{
}
HIPCUB_DEVICE inline
BlockScan(TempStorage& temp_storage) : temp_storage_(temp_storage)
{
}
HIPCUB_DEVICE inline
void InclusiveSum(T input, T& output)
{
base_type::inclusive_scan(input, output, temp_storage_);
}
HIPCUB_DEVICE inline
void InclusiveSum(T input, T& output, T& block_aggregate)
{
base_type::inclusive_scan(input, output, block_aggregate, temp_storage_);
}
template<typename BlockPrefixCallbackOp>
HIPCUB_DEVICE inline
void InclusiveSum(T input, T& output, BlockPrefixCallbackOp& block_prefix_callback_op)
{
base_type::inclusive_scan(
input, output, temp_storage_, block_prefix_callback_op, ::cub::Sum()
);
}
template<int ITEMS_PER_THREAD>
HIPCUB_DEVICE inline
void InclusiveSum(T(&input)[ITEMS_PER_THREAD], T(&output)[ITEMS_PER_THREAD])
{
base_type::inclusive_scan(input, output, temp_storage_);
}
template<int ITEMS_PER_THREAD>
HIPCUB_DEVICE inline
void InclusiveSum(T(&input)[ITEMS_PER_THREAD], T(&output)[ITEMS_PER_THREAD],
T& block_aggregate)
{
base_type::inclusive_scan(input, output, block_aggregate, temp_storage_);
}
template<int ITEMS_PER_THREAD, typename BlockPrefixCallbackOp>
HIPCUB_DEVICE inline
void InclusiveSum(T(&input)[ITEMS_PER_THREAD], T(&output)[ITEMS_PER_THREAD],
BlockPrefixCallbackOp& block_prefix_callback_op)
{
base_type::inclusive_scan(
input, output, temp_storage_, block_prefix_callback_op, ::cub::Sum()
);
}
template<typename ScanOp>
HIPCUB_DEVICE inline
void InclusiveScan(T input, T& output, ScanOp scan_op)
{
base_type::inclusive_scan(input, output, temp_storage_, scan_op);
}
template<typename ScanOp>
HIPCUB_DEVICE inline
void InclusiveScan(T input, T& output, ScanOp scan_op, T& block_aggregate)
{
base_type::inclusive_scan(input, output, block_aggregate, temp_storage_, scan_op);
}
template<typename ScanOp, typename BlockPrefixCallbackOp>
HIPCUB_DEVICE inline
void InclusiveScan(T input, T& output, ScanOp scan_op, BlockPrefixCallbackOp& block_prefix_callback_op)
{
base_type::inclusive_scan(
input, output, temp_storage_, block_prefix_callback_op, scan_op
);
}
template<int ITEMS_PER_THREAD, typename ScanOp>
HIPCUB_DEVICE inline
void InclusiveScan(T(&input)[ITEMS_PER_THREAD], T(&output)[ITEMS_PER_THREAD], ScanOp scan_op)
{
base_type::inclusive_scan(input, output, temp_storage_, scan_op);
}
template<int ITEMS_PER_THREAD, typename ScanOp>
HIPCUB_DEVICE inline
void InclusiveScan(T(&input)[ITEMS_PER_THREAD], T(&output)[ITEMS_PER_THREAD],
ScanOp scan_op, T& block_aggregate)
{
base_type::inclusive_scan(input, output, block_aggregate, temp_storage_, scan_op);
}
template<int ITEMS_PER_THREAD, typename ScanOp, typename BlockPrefixCallbackOp>
HIPCUB_DEVICE inline
void InclusiveScan(T(&input)[ITEMS_PER_THREAD], T(&output)[ITEMS_PER_THREAD],
ScanOp scan_op, BlockPrefixCallbackOp& block_prefix_callback_op)
{
base_type::inclusive_scan(
input, output, temp_storage_, block_prefix_callback_op, scan_op
);
}
HIPCUB_DEVICE inline
void ExclusiveSum(T input, T& output)
{
base_type::exclusive_scan(input, output, T(0), temp_storage_);
}
HIPCUB_DEVICE inline
void ExclusiveSum(T input, T& output, T& block_aggregate)
{
base_type::exclusive_scan(input, output, T(0), block_aggregate, temp_storage_);
}
template<typename BlockPrefixCallbackOp>
HIPCUB_DEVICE inline
void ExclusiveSum(T input, T& output, BlockPrefixCallbackOp& block_prefix_callback_op)
{
base_type::exclusive_scan(
input, output, temp_storage_, block_prefix_callback_op, ::cub::Sum()
);
}
template<int ITEMS_PER_THREAD>
HIPCUB_DEVICE inline
void ExclusiveSum(T(&input)[ITEMS_PER_THREAD], T(&output)[ITEMS_PER_THREAD])
{
base_type::exclusive_scan(input, output, T(0), temp_storage_);
}
template<int ITEMS_PER_THREAD>
HIPCUB_DEVICE inline
void ExclusiveSum(T(&input)[ITEMS_PER_THREAD], T(&output)[ITEMS_PER_THREAD],
T& block_aggregate)
{
base_type::exclusive_scan(input, output, T(0), block_aggregate, temp_storage_);
}
template<int ITEMS_PER_THREAD, typename BlockPrefixCallbackOp>
HIPCUB_DEVICE inline
void ExclusiveSum(T(&input)[ITEMS_PER_THREAD], T(&output)[ITEMS_PER_THREAD],
BlockPrefixCallbackOp& block_prefix_callback_op)
{
base_type::exclusive_scan(
input, output, temp_storage_, block_prefix_callback_op, ::cub::Sum()
);
}
template<typename ScanOp>
HIPCUB_DEVICE inline
void ExclusiveScan(T input, T& output, T initial_value, ScanOp scan_op)
{
base_type::exclusive_scan(input, output, initial_value, temp_storage_, scan_op);
}
template<typename ScanOp>
HIPCUB_DEVICE inline
void ExclusiveScan(T input, T& output, T initial_value,
ScanOp scan_op, T& block_aggregate)
{
base_type::exclusive_scan(
input, output, initial_value, block_aggregate, temp_storage_, scan_op
);
}
template<typename ScanOp, typename BlockPrefixCallbackOp>
HIPCUB_DEVICE inline
void ExclusiveScan(T input, T& output, ScanOp scan_op,
BlockPrefixCallbackOp& block_prefix_callback_op)
{
base_type::exclusive_scan(
input, output, temp_storage_, block_prefix_callback_op, scan_op
);
}
template<int ITEMS_PER_THREAD, typename ScanOp>
HIPCUB_DEVICE inline
void ExclusiveScan(T(&input)[ITEMS_PER_THREAD], T(&output)[ITEMS_PER_THREAD],
T initial_value, ScanOp scan_op)
{
base_type::exclusive_scan(input, output, initial_value, temp_storage_, scan_op);
}
template<int ITEMS_PER_THREAD, typename ScanOp>
HIPCUB_DEVICE inline
void ExclusiveScan(T(&input)[ITEMS_PER_THREAD], T(&output)[ITEMS_PER_THREAD],
T initial_value, ScanOp scan_op, T& block_aggregate)
{
base_type::exclusive_scan(
input, output, initial_value, block_aggregate, temp_storage_, scan_op
);
}
template<int ITEMS_PER_THREAD, typename ScanOp, typename BlockPrefixCallbackOp>
HIPCUB_DEVICE inline
void ExclusiveScan(T(&input)[ITEMS_PER_THREAD], T(&output)[ITEMS_PER_THREAD],
ScanOp scan_op, BlockPrefixCallbackOp& block_prefix_callback_op)
{
base_type::exclusive_scan(
input, output, temp_storage_, block_prefix_callback_op, scan_op
);
}
private:
HIPCUB_DEVICE inline
TempStorage& private_storage()
{
HIPCUB_SHARED_MEMORY TempStorage private_storage;
return private_storage;
}
};
END_HIPCUB_NAMESPACE
#endif // HIPCUB_ROCPRIM_BLOCK_BLOCK_SCAN_HPP_
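// Usage sketch (illustrative only, not part of the header above): a block-wide exclusive prefix
// sum over four items per thread, exercising the ExclusiveSum array overload defined above. The
// kernel name and block size are placeholders.
__global__ void BlockScanExampleKernel(int *d_data)
{
    using BlockScanT = cub::BlockScan<int, 128, cub::BLOCK_SCAN_WARP_SCANS>;
    __shared__ typename BlockScanT::TempStorage temp_storage;
    int thread_data[4];
    // Load a blocked arrangement: thread t owns elements [4*t, 4*t + 3]
    for (int i = 0; i < 4; ++i)
    {
        thread_data[i] = d_data[threadIdx.x * 4 + i];
    }
    // In-place exclusive scan; overloads returning a block aggregate are also available
    BlockScanT(temp_storage).ExclusiveSum(thread_data, thread_data);
    for (int i = 0; i < 4; ++i)
    {
        d_data[threadIdx.x * 4 + i] = thread_data[i];
    }
}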
/******************************************************************************
* Copyright (c) 2010-2011, Duane Merrill. All rights reserved.
* Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved.
* Modifications Copyright (c) 2017-2020, Advanced Micro Devices, Inc. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of the NVIDIA CORPORATION nor the
* names of its contributors may be used to endorse or promote products
* derived from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
* DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
******************************************************************************/
#ifndef HIPCUB_ROCPRIM_BLOCK_BLOCK_SHUFFLE_HPP_
#define HIPCUB_ROCPRIM_BLOCK_BLOCK_SHUFFLE_HPP_
#include <type_traits>
#include "../config.hpp"
#include "../thread/thread_operators.cuh"
#include <cub/rocprim/block/block_shuffle.hpp>
BEGIN_HIPCUB_NAMESPACE
template <
typename T,
int BLOCK_DIM_X,
int BLOCK_DIM_Y = 1,
int BLOCK_DIM_Z = 1,
int ARCH = HIPCUB_ARCH>
class BlockShuffle : public ::rocprim::block_shuffle<
T,
BLOCK_DIM_X,
BLOCK_DIM_Y,
BLOCK_DIM_Z>
{
static_assert(
BLOCK_DIM_X * BLOCK_DIM_Y * BLOCK_DIM_Z > 0,
"BLOCK_DIM_X * BLOCK_DIM_Y * BLOCK_DIM_Z must be greater than 0"
);
using base_type =
typename ::rocprim::block_shuffle<
T,
BLOCK_DIM_X,
BLOCK_DIM_Y,
BLOCK_DIM_Z
>;
// Reference to temporary storage (usually shared memory)
typename base_type::storage_type& temp_storage_;
public:
using TempStorage = typename base_type::storage_type;
HIPCUB_DEVICE inline
BlockShuffle() : temp_storage_(private_storage())
{}
HIPCUB_DEVICE inline
BlockShuffle(TempStorage &temp_storage) ///< [in] Reference to memory allocation having layout type TempStorage
: temp_storage_(temp_storage)
{}
/**
* \brief Each <em>thread<sub>i</sub></em> obtains the \p input provided by <em>thread</em><sub><em>i</em>+<tt>distance</tt></sub>. The offset \p distance may be negative.
*
* \par
* - \smemreuse
*/
HIPCUB_DEVICE inline void Offset(
T input, ///< [in] The input item from the calling thread (<em>thread<sub>i</sub></em>)
        T&  output,             ///< [out] The \p input item from the successor (or predecessor) thread <em>thread</em><sub><em>i</em>+<tt>distance</tt></sub> (may be aliased to \p input). This value is only updated for <em>thread<sub>i</sub></em> when 0 <= (<em>i</em> + \p distance) < <tt>BLOCK_THREADS-1</tt>
int distance = 1) ///< [in] Offset distance (may be negative)
{
base_type::offset(input,output,distance);
}
/**
 * \brief Each <em>thread<sub>i</sub></em> obtains the \p input provided by <em>thread</em><sub>(<em>i</em>+<tt>distance</tt>)%<tt>BLOCK_THREADS</tt></sub>.
*
* \par
* - \smemreuse
*/
HIPCUB_DEVICE inline void Rotate(
T input, ///< [in] The calling thread's input item
        T&  output,             ///< [out] The \p input item from thread <em>thread</em><sub>(<em>i</em>+<tt>distance</tt>)%<tt>BLOCK_THREADS</tt></sub> (may be aliased to \p input). This value is not updated for <em>thread</em><sub>BLOCK_THREADS-1</sub>
unsigned int distance = 1) ///< [in] Offset distance (0 < \p distance < <tt>BLOCK_THREADS</tt>)
{
base_type::rotate(input,output,distance);
}
/**
* \brief The thread block rotates its [<em>blocked arrangement</em>](index.html#sec5sec3) of \p input items, shifting it up by one item
*
* \par
* - \blocked
* - \granularity
* - \smemreuse
*/
template <int ITEMS_PER_THREAD>
HIPCUB_DEVICE inline void Up(
T (&input)[ITEMS_PER_THREAD], ///< [in] The calling thread's input items
T (&prev)[ITEMS_PER_THREAD]) ///< [out] The corresponding predecessor items (may be aliased to \p input). The item \p prev[0] is not updated for <em>thread</em><sub>0</sub>.
{
base_type::up(input,prev);
}
/**
* \brief The thread block rotates its [<em>blocked arrangement</em>](index.html#sec5sec3) of \p input items, shifting it up by one item. All threads receive the \p input provided by <em>thread</em><sub><tt>BLOCK_THREADS-1</tt></sub>.
*
* \par
* - \blocked
* - \granularity
* - \smemreuse
*/
template <int ITEMS_PER_THREAD>
HIPCUB_DEVICE inline void Up(
T (&input)[ITEMS_PER_THREAD], ///< [in] The calling thread's input items
T (&prev)[ITEMS_PER_THREAD], ///< [out] The corresponding predecessor items (may be aliased to \p input). The item \p prev[0] is not updated for <em>thread</em><sub>0</sub>.
T &block_suffix) ///< [out] The item \p input[ITEMS_PER_THREAD-1] from <em>thread</em><sub><tt>BLOCK_THREADS-1</tt></sub>, provided to all threads
{
base_type::up(input,prev,block_suffix);
}
/**
* \brief The thread block rotates its [<em>blocked arrangement</em>](index.html#sec5sec3) of \p input items, shifting it down by one item
*
* \par
* - \blocked
* - \granularity
* - \smemreuse
*/
template <int ITEMS_PER_THREAD>
HIPCUB_DEVICE inline void Down(
T (&input)[ITEMS_PER_THREAD], ///< [in] The calling thread's input items
        T (&next)[ITEMS_PER_THREAD])    ///< [out] The corresponding successor items (may be aliased to \p input). The value \p next[ITEMS_PER_THREAD-1] is not updated for <em>thread</em><sub>BLOCK_THREADS-1</sub>.
{
base_type::down(input,next);
}
/**
* \brief The thread block rotates its [<em>blocked arrangement</em>](index.html#sec5sec3) of input items, shifting it down by one item. All threads receive \p input[0] provided by <em>thread</em><sub><tt>0</tt></sub>.
*
* \par
* - \blocked
* - \granularity
* - \smemreuse
*/
template <int ITEMS_PER_THREAD>
HIPCUB_DEVICE inline void Down(
T (&input)[ITEMS_PER_THREAD], ///< [in] The calling thread's input items
        T (&next)[ITEMS_PER_THREAD],    ///< [out] The corresponding successor items (may be aliased to \p input). The value \p next[ITEMS_PER_THREAD-1] is not updated for <em>thread</em><sub>BLOCK_THREADS-1</sub>.
T &block_prefix) ///< [out] The item \p input[0] from <em>thread</em><sub><tt>0</tt></sub>, provided to all threads
{
base_type::down(input,next,block_prefix);
}
private:
HIPCUB_DEVICE inline
TempStorage& private_storage()
{
HIPCUB_SHARED_MEMORY TempStorage private_storage;
return private_storage;
}
};
END_HIPCUB_NAMESPACE
#endif // HIPCUB_ROCPRIM_BLOCK_BLOCK_SHUFFLE_HPP_
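// Usage sketch (illustrative only, not part of the header above): each thread fetches its right
// neighbour's item via Offset. The kernel name and block size are placeholders.
__global__ void BlockShuffleExampleKernel(int *d_data)
{
    using BlockShuffleT = cub::BlockShuffle<int, 128>;
    __shared__ typename BlockShuffleT::TempStorage temp_storage;
    int item = d_data[threadIdx.x];
    // Threads whose source index falls outside the block keep their own value,
    // since output is only updated when the source thread exists
    int neighbour = item;
    BlockShuffleT(temp_storage).Offset(item, neighbour, 1);
    d_data[threadIdx.x] = neighbour;
}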
/******************************************************************************
* Copyright (c) 2010-2011, Duane Merrill. All rights reserved.
* Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved.
* Modifications Copyright (c) 2017-2020, Advanced Micro Devices, Inc. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of the NVIDIA CORPORATION nor the
* names of its contributors may be used to endorse or promote products
* derived from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
* DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
******************************************************************************/
#ifndef HIPCUB_ROCPRIM_BLOCK_BLOCK_STORE_HPP_
#define HIPCUB_ROCPRIM_BLOCK_BLOCK_STORE_HPP_
#include <type_traits>
#include "../config.hpp"
#include "block_store_func.hpp"
#include <cub/rocprim/block/block_store.hpp>
BEGIN_HIPCUB_NAMESPACE
namespace detail
{
inline constexpr
typename std::underlying_type<::rocprim::block_store_method>::type
to_BlockStoreAlgorithm_enum(::rocprim::block_store_method v)
{
using utype = std::underlying_type<::rocprim::block_store_method>::type;
return static_cast<utype>(v);
}
}
enum BlockStoreAlgorithm
{
BLOCK_STORE_DIRECT
= detail::to_BlockStoreAlgorithm_enum(::rocprim::block_store_method::block_store_direct),
BLOCK_STORE_STRIPED
= detail::to_BlockStoreAlgorithm_enum(::rocprim::block_store_method::block_store_striped),
BLOCK_STORE_VECTORIZE
= detail::to_BlockStoreAlgorithm_enum(::rocprim::block_store_method::block_store_vectorize),
BLOCK_STORE_TRANSPOSE
= detail::to_BlockStoreAlgorithm_enum(::rocprim::block_store_method::block_store_transpose),
BLOCK_STORE_WARP_TRANSPOSE
= detail::to_BlockStoreAlgorithm_enum(::rocprim::block_store_method::block_store_warp_transpose),
BLOCK_STORE_WARP_TRANSPOSE_TIMESLICED
= detail::to_BlockStoreAlgorithm_enum(::rocprim::block_store_method::block_store_warp_transpose)
};
template<
typename T,
int BLOCK_DIM_X,
int ITEMS_PER_THREAD,
BlockStoreAlgorithm ALGORITHM = BLOCK_STORE_DIRECT,
int BLOCK_DIM_Y = 1,
int BLOCK_DIM_Z = 1,
int ARCH = HIPCUB_ARCH /* ignored */
>
class BlockStore
: private ::rocprim::block_store<
T,
BLOCK_DIM_X,
ITEMS_PER_THREAD,
static_cast<::rocprim::block_store_method>(ALGORITHM),
BLOCK_DIM_Y,
BLOCK_DIM_Z
>
{
static_assert(
BLOCK_DIM_X * BLOCK_DIM_Y * BLOCK_DIM_Z > 0,
"BLOCK_DIM_X * BLOCK_DIM_Y * BLOCK_DIM_Z must be greater than 0"
);
using base_type =
typename ::rocprim::block_store<
T,
BLOCK_DIM_X,
ITEMS_PER_THREAD,
static_cast<::rocprim::block_store_method>(ALGORITHM),
BLOCK_DIM_Y,
BLOCK_DIM_Z
>;
// Reference to temporary storage (usually shared memory)
typename base_type::storage_type& temp_storage_;
public:
using TempStorage = typename base_type::storage_type;
HIPCUB_DEVICE inline
BlockStore() : temp_storage_(private_storage())
{
}
HIPCUB_DEVICE inline
BlockStore(TempStorage& temp_storage) : temp_storage_(temp_storage)
{
}
template<class OutputIteratorT>
HIPCUB_DEVICE inline
void Store(OutputIteratorT block_iter,
T (&items)[ITEMS_PER_THREAD])
{
base_type::store(block_iter, items, temp_storage_);
}
template<class OutputIteratorT>
HIPCUB_DEVICE inline
void Store(OutputIteratorT block_iter,
T (&items)[ITEMS_PER_THREAD],
int valid_items)
{
base_type::store(block_iter, items, valid_items, temp_storage_);
}
private:
HIPCUB_DEVICE inline
TempStorage& private_storage()
{
HIPCUB_SHARED_MEMORY TempStorage private_storage;
return private_storage;
}
};
END_HIPCUB_NAMESPACE
#endif // HIPCUB_ROCPRIM_BLOCK_BLOCK_STORE_HPP_
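// Usage sketch (illustrative only, not part of the header above): storing a blocked arrangement
// through shared memory with BLOCK_STORE_TRANSPOSE, using the guarded overload so a partial final
// tile is handled. Names and sizes are placeholders.
__global__ void BlockStoreExampleKernel(int *d_out, int valid_items)
{
    using BlockStoreT = cub::BlockStore<int, 128, 4, cub::BLOCK_STORE_TRANSPOSE>;
    __shared__ typename BlockStoreT::TempStorage temp_storage;
    int items[4];
    for (int i = 0; i < 4; ++i)
    {
        items[i] = threadIdx.x * 4 + i;  // arbitrary per-thread data
    }
    // Only the first valid_items elements of the tile are written
    BlockStoreT(temp_storage).Store(d_out, items, valid_items);
}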
/******************************************************************************
* Copyright (c) 2010-2011, Duane Merrill. All rights reserved.
* Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved.
* Modifications Copyright (c) 2017-2020, Advanced Micro Devices, Inc. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of the NVIDIA CORPORATION nor the
* names of its contributors may be used to endorse or promote products
* derived from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
* DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
******************************************************************************/
#ifndef HIPCUB_ROCPRIM_BLOCK_BLOCK_STORE_FUNC_HPP_
#define HIPCUB_ROCPRIM_BLOCK_BLOCK_STORE_FUNC_HPP_
#include "../config.hpp"
#include <cub/rocprim/block/block_store_func.hpp>
BEGIN_HIPCUB_NAMESPACE
template<
typename T,
int ITEMS_PER_THREAD,
typename OutputIteratorT
>
HIPCUB_DEVICE inline
void StoreDirectBlocked(int linear_id,
OutputIteratorT block_iter,
T (&items)[ITEMS_PER_THREAD])
{
::rocprim::block_store_direct_blocked(
linear_id, block_iter, items
);
}
template<
typename T,
int ITEMS_PER_THREAD,
typename OutputIteratorT
>
HIPCUB_DEVICE inline
void StoreDirectBlocked(int linear_id,
OutputIteratorT block_iter,
T (&items)[ITEMS_PER_THREAD],
int valid_items)
{
::rocprim::block_store_direct_blocked(
linear_id, block_iter, items, valid_items
);
}
template <
typename T,
int ITEMS_PER_THREAD
>
HIPCUB_DEVICE inline
void StoreDirectBlockedVectorized(int linear_id,
T* block_iter,
T (&items)[ITEMS_PER_THREAD])
{
::rocprim::block_store_direct_blocked_vectorized(
linear_id, block_iter, items
);
}
template<
int BLOCK_THREADS,
typename T,
int ITEMS_PER_THREAD,
typename OutputIteratorT
>
HIPCUB_DEVICE inline
void StoreDirectStriped(int linear_id,
OutputIteratorT block_iter,
T (&items)[ITEMS_PER_THREAD])
{
::rocprim::block_store_direct_striped<BLOCK_THREADS>(
linear_id, block_iter, items
);
}
template<
int BLOCK_THREADS,
typename T,
int ITEMS_PER_THREAD,
typename OutputIteratorT
>
HIPCUB_DEVICE inline
void StoreDirectStriped(int linear_id,
OutputIteratorT block_iter,
T (&items)[ITEMS_PER_THREAD],
int valid_items)
{
::rocprim::block_store_direct_striped<BLOCK_THREADS>(
linear_id, block_iter, items, valid_items
);
}
template<
typename T,
int ITEMS_PER_THREAD,
typename OutputIteratorT
>
HIPCUB_DEVICE inline
void StoreDirectWarpStriped(int linear_id,
OutputIteratorT block_iter,
T (&items)[ITEMS_PER_THREAD])
{
::rocprim::block_store_direct_warp_striped(
linear_id, block_iter, items
);
}
template<
typename T,
int ITEMS_PER_THREAD,
typename OutputIteratorT
>
HIPCUB_DEVICE inline
void StoreDirectWarpStriped(int linear_id,
OutputIteratorT block_iter,
T (&items)[ITEMS_PER_THREAD],
int valid_items)
{
::rocprim::block_store_direct_warp_striped(
linear_id, block_iter, items, valid_items
);
}
END_HIPCUB_NAMESPACE
#endif // HIPCUB_ROCPRIM_BLOCK_BLOCK_STORE_FUNC_HPP_
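// Usage sketch (illustrative only, not part of the header above): with StoreDirectStriped<128>,
// thread t writes items[i] to d_out[i * 128 + t], so consecutive threads touch consecutive
// addresses for each i and stores coalesce. Values are arbitrary.
__global__ void StoreStripedExampleKernel(int *d_out)
{
    int items[4];
    for (int i = 0; i < 4; ++i)
    {
        items[i] = threadIdx.x + i * 128;  // arbitrary per-thread data
    }
    cub::StoreDirectStriped<128>(threadIdx.x, d_out, items);
}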
/******************************************************************************
* Copyright (c) 2011-2020, NVIDIA CORPORATION. All rights reserved.
* Modifications Copyright (c) 2021, Advanced Micro Devices, Inc. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of the NVIDIA CORPORATION nor the
* names of its contributors may be used to endorse or promote products
* derived from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
* DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
******************************************************************************/
/**
* \file
* radix_rank_sort_operations.cuh contains common abstractions, definitions and
* operations used for radix sorting and ranking.
*/
#ifndef HIPCUB_ROCPRIM_BLOCK_RADIX_RANK_SORT_OPERATIONS_HPP_
#define HIPCUB_ROCPRIM_BLOCK_RADIX_RANK_SORT_OPERATIONS_HPP_
#include <type_traits>
#include "../config.hpp"
#include <cub/rocprim/config.hpp>
#include <cub/rocprim/type_traits.hpp>
#include <cub/rocprim/detail/various.hpp>
BEGIN_HIPCUB_NAMESPACE
/** \brief Twiddling keys for radix sort. */
template <bool IS_DESCENDING, typename KeyT>
struct RadixSortTwiddle
{
typedef Traits<KeyT> TraitsT;
typedef typename TraitsT::UnsignedBits UnsignedBits;
static HIPCUB_HOST_DEVICE __forceinline__ UnsignedBits In(UnsignedBits key)
{
key = TraitsT::TwiddleIn(key);
if (IS_DESCENDING) key = ~key;
return key;
}
static HIPCUB_HOST_DEVICE __forceinline__ UnsignedBits Out(UnsignedBits key)
{
if (IS_DESCENDING) key = ~key;
key = TraitsT::TwiddleOut(key);
return key;
}
static HIPCUB_HOST_DEVICE __forceinline__ UnsignedBits DefaultKey()
{
return Out(~UnsignedBits(0));
}
};
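// Usage sketch (illustrative only): In and Out are inverses, so a key round-trips through the
// descending-sort twiddle unchanged. For unsigned integer keys TwiddleIn is the identity, so In
// reduces to a bitwise complement.
__device__ __forceinline__ unsigned int TwiddleRoundTrip(unsigned int key_bits)
{
    using TwiddleT = cub::RadixSortTwiddle<true /* IS_DESCENDING */, unsigned int>;
    unsigned int twiddled = TwiddleT::In(key_bits);  // twiddle, then complement
    return TwiddleT::Out(twiddled);                  // complement, then untwiddle == key_bits
}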
/** \brief Base struct for digit extractor. Contains common code to provide
special handling for floating-point -0.0.
\note This handles correctly both the case when the keys are
bitwise-complemented after twiddling for descending sort (in onesweep) as
well as when the keys are not bit-negated, but the implementation handles
descending sort separately (in other implementations in CUB). Twiddling
alone maps -0.0f to 0x7fffffff and +0.0f to 0x80000000 for float, which are
subsequent bit patterns and bitwise complements of each other. For onesweep,
both -0.0f and +0.0f are mapped to the bit pattern of +0.0f (0x80000000) for
ascending sort, and to the pattern of -0.0f (0x7fffffff) for descending
sort. For all other sorting implementations in CUB, both are always mapped
to +0.0f. Since bit patterns for both -0.0f and +0.0f are next to each other
and only one of them is used, the sorting works correctly. For double, the
same applies, but with 64-bit patterns.
*/
template <typename KeyT>
struct BaseDigitExtractor
{
typedef Traits<KeyT> TraitsT;
typedef typename TraitsT::UnsignedBits UnsignedBits;
enum
{
FLOAT_KEY = TraitsT::CATEGORY == FLOATING_POINT,
};
static __device__ __forceinline__ UnsignedBits ProcessFloatMinusZero(UnsignedBits key)
{
if (!FLOAT_KEY) {
return key;
} else {
UnsignedBits TWIDDLED_MINUS_ZERO_BITS =
TraitsT::TwiddleIn(UnsignedBits(1) << UnsignedBits(8 * sizeof(UnsignedBits) - 1));
UnsignedBits TWIDDLED_ZERO_BITS = TraitsT::TwiddleIn(0);
return key == TWIDDLED_MINUS_ZERO_BITS ? TWIDDLED_ZERO_BITS : key;
}
}
};
/** \brief A wrapper type to extract digits. Uses the BFE intrinsic to extract a
* key from a digit. */
template <typename KeyT>
struct BFEDigitExtractor : BaseDigitExtractor<KeyT>
{
using typename BaseDigitExtractor<KeyT>::UnsignedBits;
uint32_t bit_start, num_bits;
explicit __device__ __forceinline__ BFEDigitExtractor(
uint32_t bit_start = 0, uint32_t num_bits = 0)
: bit_start(bit_start), num_bits(num_bits)
{ }
__device__ __forceinline__ uint32_t Digit(UnsignedBits key)
{
return BFE(this->ProcessFloatMinusZero(key), bit_start, num_bits);
}
};
/** \brief A wrapper type to extract digits. Uses a combination of shift and
* bitwise and to extract digits. */
template <typename KeyT>
struct ShiftDigitExtractor : BaseDigitExtractor<KeyT>
{
using typename BaseDigitExtractor<KeyT>::UnsignedBits;
uint32_t bit_start, mask;
explicit __device__ __forceinline__ ShiftDigitExtractor(
uint32_t bit_start = 0, uint32_t num_bits = 0)
: bit_start(bit_start), mask((1 << num_bits) - 1)
{ }
__device__ __forceinline__ uint32_t Digit(UnsignedBits key)
{
return uint32_t(this->ProcessFloatMinusZero(key) >> UnsignedBits(bit_start)) & mask;
}
};
END_HIPCUB_NAMESPACE
#endif //HIPCUB_ROCPRIM_BLOCK_RADIX_RANK_SORT_OPERATIONS_HPP_
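// Usage sketch (illustrative only, not part of the header above): extracting a radix digit from a
// float key the way the ranking code would. The key is first mapped to an order-preserving bit
// pattern with TwiddleIn; the extractor then isolates num_bits bits starting at bit_start. The
// values bit_start = 8 and num_bits = 4 are arbitrary.
__device__ __forceinline__ uint32_t ExampleFloatDigit(float key)
{
    using UnsignedBits = typename cub::Traits<float>::UnsignedBits;
    UnsignedBits bits = reinterpret_cast<UnsignedBits&>(key);
    bits = cub::Traits<float>::TwiddleIn(bits);
    cub::ShiftDigitExtractor<float> extractor(8, 4);
    // ProcessFloatMinusZero inside Digit makes -0.0f and +0.0f extract identically
    return extractor.Digit(bits);
}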
/******************************************************************************
* Copyright (c) 2010-2011, Duane Merrill. All rights reserved.
* Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved.
* Modifications Copyright (c) 2019-2020, Advanced Micro Devices, Inc. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of the NVIDIA CORPORATION nor the
* names of its contributors may be used to endorse or promote products
* derived from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
* DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
******************************************************************************/
#ifndef HIPCUB_CONFIG_HPP_
#define HIPCUB_CONFIG_HPP_
#include <cuda_runtime.h>
#define HIPCUB_NAMESPACE cub
#define BEGIN_HIPCUB_NAMESPACE \
namespace cub {
#define END_HIPCUB_NAMESPACE \
    } /* cub */
#ifndef HIPCUB_ARCH
#define HIPCUB_ARCH 1
#endif
#define CUB_DEVICE_WARP_THREADS 64
#ifdef __CUDACC__
#define HIPCUB_ROCPRIM_API 1
#define HIPCUB_RUNTIME_FUNCTION __host__
#elif defined(__HIP_PLATFORM_NVIDIA__)
#define HIPCUB_CUB_API 1
#define HIPCUB_RUNTIME_FUNCTION CUB_RUNTIME_FUNCTION
#include <cub/util_arch.cuh>
#define HIPCUB_WARP_THREADS CUB_PTX_WARP_THREADS
#define HIPCUB_DEVICE_WARP_THREADS CUB_PTX_WARP_THREADS
#define HIPCUB_HOST_WARP_THREADS CUB_PTX_WARP_THREADS
#define HIPCUB_ARCH CUB_PTX_ARCH
BEGIN_HIPCUB_NAMESPACE
using namespace cub;
END_HIPCUB_NAMESPACE
#endif
/// Supported warp sizes
#define HIPCUB_WARP_SIZE_32 32u
#define HIPCUB_WARP_SIZE_64 64u
#define HIPCUB_MAX_WARP_SIZE HIPCUB_WARP_SIZE_64
#define HIPCUB_HOST __host__
#define HIPCUB_DEVICE __device__
#define HIPCUB_HOST_DEVICE __host__ __device__
#define HIPCUB_SHARED_MEMORY __shared__
// Helper macros to disable warnings in clang
#ifdef __clang__
#define HIPCUB_PRAGMA_TO_STR(x) _Pragma(#x)
#define HIPCUB_CLANG_SUPPRESS_WARNING_PUSH _Pragma("clang diagnostic push")
#define HIPCUB_CLANG_SUPPRESS_WARNING(w) HIPCUB_PRAGMA_TO_STR(clang diagnostic ignored w)
#define HIPCUB_CLANG_SUPPRESS_WARNING_POP _Pragma("clang diagnostic pop")
#define HIPCUB_CLANG_SUPPRESS_WARNING_WITH_PUSH(w) \
HIPCUB_CLANG_SUPPRESS_WARNING_PUSH HIPCUB_CLANG_SUPPRESS_WARNING(w)
#else // __clang__
#define HIPCUB_CLANG_SUPPRESS_WARNING_PUSH
#define HIPCUB_CLANG_SUPPRESS_WARNING(w)
#define HIPCUB_CLANG_SUPPRESS_WARNING_POP
#define HIPCUB_CLANG_SUPPRESS_WARNING_WITH_PUSH(w)
#endif // __clang__
BEGIN_HIPCUB_NAMESPACE
/// hipCUB error reporting macro (prints error messages to stderr)
#if (defined(DEBUG) || defined(_DEBUG)) && !defined(HIPCUB_STDERR)
#define HIPCUB_STDERR
#endif
inline
cudaError_t Debug(
cudaError_t error,
const char* filename,
int line)
{
(void)filename;
(void)line;
#ifdef HIPCUB_STDERR
if (error)
{
fprintf(stderr, "cuda error %d [%s, %d]: %s\n", error, filename, line, cudaGetErrorString(error));
fflush(stderr);
}
#endif
return error;
}
#ifndef cubDebug
#define cubDebug(e) cub::Debug((cudaError_t) (e), __FILE__, __LINE__)
#endif
END_HIPCUB_NAMESPACE
#endif // HIPCUB_CONFIG_HPP_
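// Usage sketch (illustrative only, not part of the header above): wrapping runtime calls in
// cubDebug so failures are reported (to stderr when HIPCUB_STDERR is defined) while the error
// code still propagates to the caller. The allocation size is arbitrary.
inline cudaError_t AllocateAndClear(void **d_ptr, size_t bytes = 1024)
{
    cudaError_t error = cubDebug(cudaMalloc(d_ptr, bytes));
    if (error != cudaSuccess)
    {
        return error;
    }
    return cubDebug(cudaMemset(*d_ptr, 0, bytes));
}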
/******************************************************************************
* Copyright (c) 2010-2011, Duane Merrill. All rights reserved.
* Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved.
* Modifications Copyright (c) 2017-2020, Advanced Micro Devices, Inc. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of the NVIDIA CORPORATION nor the
* names of its contributors may be used to endorse or promote products
* derived from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
* DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
******************************************************************************/
#ifndef HIPCUB_ROCPRIM_HIPCUB_HPP_
#define HIPCUB_ROCPRIM_HIPCUB_HPP_
#include "config.hpp"
#include "version.cuh"
#include "util_allocator.cuh"
#include "util_type.cuh"
#include "util_ptx.cuh"
#include "thread/thread_operators.cuh"
// Iterator
#include "iterator/arg_index_input_iterator.cuh"
#include "iterator/cache_modified_input_iterator.cuh"
#include "iterator/cache_modified_output_iterator.cuh"
#include "iterator/constant_input_iterator.cuh"
#include "iterator/counting_input_iterator.cuh"
#include "iterator/discard_output_iterator.cuh"
#include "iterator/tex_obj_input_iterator.cuh"
#include "iterator/tex_ref_input_iterator.cuh"
#include "iterator/transform_input_iterator.cuh"
// Warp
#include "warp/warp_exchange.hpp"
#include "warp/warp_load.hpp"
#include "warp/warp_merge_sort.hpp"
#include "warp/warp_reduce.cuh"
#include "warp/warp_scan.cuh"
#include "warp/warp_store.hpp"
// Thread
#include "thread/thread_load.cuh"
#include "thread/thread_operators.cuh"
#include "thread/thread_reduce.cuh"
#include "thread/thread_scan.cuh"
#include "thread/thread_search.cuh"
#include "thread/thread_sort.hpp"
#include "thread/thread_store.cuh"
// Block
#include "block/block_discontinuity.cuh"
#include "block/block_exchange.cuh"
#include "block/block_histogram.cuh"
#include "block/block_load.cuh"
#include "block/block_radix_sort.cuh"
#include "block/block_reduce.cuh"
#include "block/block_scan.cuh"
#include "block/block_store.cuh"
// Device
#include "device/device_adjacent_difference.hpp"
#include "device/device_histogram.cuh"
#include "device/device_radix_sort.cuh"
#include "device/device_reduce.cuh"
#include "device/device_run_length_encode.cuh"
#include "device/device_scan.cuh"
#include "device/device_segmented_radix_sort.cuh"
#include "device/device_segmented_reduce.cuh"
#include "device/device_segmented_sort.hpp"
#include "device/device_select.cuh"
#include "device/device_partition.cuh"
#endif // HIPCUB_ROCPRIM_HIPCUB_HPP_