Commit f8a481f8 authored by zhouxiang

Add the cub header files from dtk

parent 7b7c64c5
/******************************************************************************
* Copyright (c) 2011-2021, NVIDIA CORPORATION. All rights reserved.
* Modifications Copyright (c) 2022, Advanced Micro Devices, Inc. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of the NVIDIA CORPORATION nor the
* names of its contributors may be used to endorse or promote products
* derived from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
* DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
******************************************************************************/
#ifndef HIPCUB_ROCPRIM_DEVICE_DEVICE_ADJACENT_DIFFERENCE_HPP_
#define HIPCUB_ROCPRIM_DEVICE_DEVICE_ADJACENT_DIFFERENCE_HPP_
#include "../config.hpp"
#include <cub/thread/thread_operators.cuh>
#include <cub/rocprim/device/device_adjacent_difference.hpp>
BEGIN_HIPCUB_NAMESPACE
struct DeviceAdjacentDifference
{
template <typename InputIteratorT,
typename OutputIteratorT,
typename DifferenceOpT = ::cub::Difference>
static HIPCUB_RUNTIME_FUNCTION cudaError_t
SubtractLeftCopy(void *d_temp_storage,
std::size_t &temp_storage_bytes,
InputIteratorT d_input,
OutputIteratorT d_output,
std::size_t num_items,
DifferenceOpT difference_op = {},
cudaStream_t stream = 0,
bool debug_synchronous = false)
{
return (cudaError_t)::rocprim::adjacent_difference(
d_temp_storage, temp_storage_bytes, d_input, d_output,
num_items, difference_op, stream, debug_synchronous
);
}
template <typename RandomAccessIteratorT,
typename DifferenceOpT = ::cub::Difference>
static HIPCUB_RUNTIME_FUNCTION cudaError_t
SubtractLeft(void *d_temp_storage,
std::size_t &temp_storage_bytes,
RandomAccessIteratorT d_input,
std::size_t num_items,
DifferenceOpT difference_op = {},
cudaStream_t stream = 0,
bool debug_synchronous = false)
{
return (cudaError_t)::rocprim::adjacent_difference_inplace(
d_temp_storage, temp_storage_bytes, d_input,
num_items, difference_op, stream, debug_synchronous
);
}
template <typename InputIteratorT,
typename OutputIteratorT,
typename DifferenceOpT = ::cub::Difference>
static HIPCUB_RUNTIME_FUNCTION cudaError_t
SubtractRightCopy(void *d_temp_storage,
std::size_t &temp_storage_bytes,
InputIteratorT d_input,
OutputIteratorT d_output,
std::size_t num_items,
DifferenceOpT difference_op = {},
cudaStream_t stream = 0,
bool debug_synchronous = false)
{
return (cudaError_t)::rocprim::adjacent_difference_right(
d_temp_storage, temp_storage_bytes, d_input, d_output,
num_items, difference_op, stream, debug_synchronous
);
}
template <typename RandomAccessIteratorT,
typename DifferenceOpT = ::cub::Difference>
static HIPCUB_RUNTIME_FUNCTION cudaError_t
SubtractRight(void *d_temp_storage,
std::size_t &temp_storage_bytes,
RandomAccessIteratorT d_input,
std::size_t num_items,
DifferenceOpT difference_op = {},
cudaStream_t stream = 0,
bool debug_synchronous = false)
{
return (cudaError_t)::rocprim::adjacent_difference_right_inplace(
d_temp_storage, temp_storage_bytes, d_input,
num_items, difference_op, stream, debug_synchronous
);
}
};
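// ---------------------------------------------------------------------------
// Usage sketch (an illustrative addition, not part of the upstream header).
// Every device-wide call here follows the usual two-pass pattern: a first
// call with d_temp_storage == nullptr only writes the required size into
// temp_storage_bytes; the caller allocates that much device memory and
// repeats the call to do the work. The snippet assumes the CUDA runtime
// allocation API (consistent with the cudaError_t/cudaStream_t signatures
// above) and that these headers open namespace cub, as the ::cub::
// qualifications elsewhere in this commit suggest.
//
// \code
// int *d_in, *d_out;   // device arrays of n ints, assumed already allocated
// int  n = 1024;
// std::size_t temp_bytes = 0;
// void       *d_temp     = nullptr;
// // Pass 1: query temporary storage size only.
// cub::DeviceAdjacentDifference::SubtractLeftCopy(
//     d_temp, temp_bytes, d_in, d_out, n);
// cudaMalloc(&d_temp, temp_bytes);
// // Pass 2: d_out[0] = d_in[0]; d_out[i] = d_in[i] - d_in[i-1] for i > 0.
// cub::DeviceAdjacentDifference::SubtractLeftCopy(
//     d_temp, temp_bytes, d_in, d_out, n);
// cudaFree(d_temp);
// \endcode
// ---------------------------------------------------------------------------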
END_HIPCUB_NAMESPACE
#endif // HIPCUB_ROCPRIM_DEVICE_DEVICE_ADJACENT_DIFFERENCE_HPP_
/******************************************************************************
* Copyright (c) 2010-2011, Duane Merrill. All rights reserved.
* Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved.
* Modifications Copyright (c) 2017-2020, Advanced Micro Devices, Inc. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of the NVIDIA CORPORATION nor the
* names of its contributors may be used to endorse or promote products
* derived from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
* DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
******************************************************************************/
#ifndef HIPCUB_ROCPRIM_DEVICE_DEVICE_HISTOGRAM_HPP_
#define HIPCUB_ROCPRIM_DEVICE_DEVICE_HISTOGRAM_HPP_
#include "../config.hpp"
#include "../util_type.cuh"
#include <cub/rocprim/device/device_histogram.hpp>
BEGIN_HIPCUB_NAMESPACE
struct DeviceHistogram
{
template<
typename SampleIteratorT,
typename CounterT,
typename LevelT,
typename OffsetT
>
HIPCUB_RUNTIME_FUNCTION static
cudaError_t HistogramEven(void * d_temp_storage,
size_t& temp_storage_bytes,
SampleIteratorT d_samples,
CounterT * d_histogram,
int num_levels,
LevelT lower_level,
LevelT upper_level,
OffsetT num_samples,
cudaStream_t stream = 0,
bool debug_synchronous = false)
{
return (cudaError_t)::rocprim::histogram_even(
d_temp_storage, temp_storage_bytes,
d_samples, num_samples,
d_histogram,
num_levels, lower_level, upper_level,
stream, debug_synchronous
);
}
template<
typename SampleIteratorT,
typename CounterT,
typename LevelT,
typename OffsetT
>
HIPCUB_RUNTIME_FUNCTION static
cudaError_t HistogramEven(void * d_temp_storage,
size_t& temp_storage_bytes,
SampleIteratorT d_samples,
CounterT * d_histogram,
int num_levels,
LevelT lower_level,
LevelT upper_level,
OffsetT num_row_samples,
OffsetT num_rows,
size_t row_stride_bytes,
cudaStream_t stream = 0,
bool debug_synchronous = false)
{
return (cudaError_t)::rocprim::histogram_even(
d_temp_storage, temp_storage_bytes,
d_samples, num_row_samples, num_rows, row_stride_bytes,
d_histogram,
num_levels, lower_level, upper_level,
stream, debug_synchronous
);
}
template<
int NUM_CHANNELS,
int NUM_ACTIVE_CHANNELS,
typename SampleIteratorT,
typename CounterT,
typename LevelT,
typename OffsetT
>
HIPCUB_RUNTIME_FUNCTION static
cudaError_t MultiHistogramEven(void * d_temp_storage,
size_t& temp_storage_bytes,
SampleIteratorT d_samples,
CounterT * d_histogram[NUM_ACTIVE_CHANNELS],
int num_levels[NUM_ACTIVE_CHANNELS],
LevelT lower_level[NUM_ACTIVE_CHANNELS],
LevelT upper_level[NUM_ACTIVE_CHANNELS],
OffsetT num_pixels,
cudaStream_t stream = 0,
bool debug_synchronous = false)
{
// rocprim takes the level counts as unsigned int, so widen CUB's int interface.
unsigned int levels[NUM_ACTIVE_CHANNELS];
for(unsigned int channel = 0; channel < NUM_ACTIVE_CHANNELS; channel++)
{
levels[channel] = num_levels[channel];
}
return (cudaError_t)::rocprim::multi_histogram_even<NUM_CHANNELS, NUM_ACTIVE_CHANNELS>(
d_temp_storage, temp_storage_bytes,
d_samples, num_pixels,
d_histogram,
levels, lower_level, upper_level,
stream, debug_synchronous
);
}
template<
int NUM_CHANNELS,
int NUM_ACTIVE_CHANNELS,
typename SampleIteratorT,
typename CounterT,
typename LevelT,
typename OffsetT
>
HIPCUB_RUNTIME_FUNCTION static
cudaError_t MultiHistogramEven(void * d_temp_storage,
size_t& temp_storage_bytes,
SampleIteratorT d_samples,
CounterT * d_histogram[NUM_ACTIVE_CHANNELS],
int num_levels[NUM_ACTIVE_CHANNELS],
LevelT lower_level[NUM_ACTIVE_CHANNELS],
LevelT upper_level[NUM_ACTIVE_CHANNELS],
OffsetT num_row_pixels,
OffsetT num_rows,
size_t row_stride_bytes,
cudaStream_t stream = 0,
bool debug_synchronous = false)
{
unsigned int levels[NUM_ACTIVE_CHANNELS];
for(unsigned int channel = 0; channel < NUM_ACTIVE_CHANNELS; channel++)
{
levels[channel] = num_levels[channel];
}
return (cudaError_t)::rocprim::multi_histogram_even<NUM_CHANNELS, NUM_ACTIVE_CHANNELS>(
d_temp_storage, temp_storage_bytes,
d_samples, num_row_pixels, num_rows, row_stride_bytes,
d_histogram,
levels, lower_level, upper_level,
stream, debug_synchronous
);
}
template<
typename SampleIteratorT,
typename CounterT,
typename LevelT,
typename OffsetT
>
HIPCUB_RUNTIME_FUNCTION static
cudaError_t HistogramRange(void * d_temp_storage,
size_t& temp_storage_bytes,
SampleIteratorT d_samples,
CounterT * d_histogram,
int num_levels,
LevelT * d_levels,
OffsetT num_samples,
cudaStream_t stream = 0,
bool debug_synchronous = false)
{
return (cudaError_t)::rocprim::histogram_range(
d_temp_storage, temp_storage_bytes,
d_samples, num_samples,
d_histogram,
num_levels, d_levels,
stream, debug_synchronous
);
}
template<
typename SampleIteratorT,
typename CounterT,
typename LevelT,
typename OffsetT
>
HIPCUB_RUNTIME_FUNCTION static
cudaError_t HistogramRange(void * d_temp_storage,
size_t& temp_storage_bytes,
SampleIteratorT d_samples,
CounterT * d_histogram,
int num_levels,
LevelT * d_levels,
OffsetT num_row_samples,
OffsetT num_rows,
size_t row_stride_bytes,
cudaStream_t stream = 0,
bool debug_synchronous = false)
{
return (cudaError_t)::rocprim::histogram_range(
d_temp_storage, temp_storage_bytes,
d_samples, num_row_samples, num_rows, row_stride_bytes,
d_histogram,
num_levels, d_levels,
stream, debug_synchronous
);
}
template<
int NUM_CHANNELS,
int NUM_ACTIVE_CHANNELS,
typename SampleIteratorT,
typename CounterT,
typename LevelT,
typename OffsetT
>
HIPCUB_RUNTIME_FUNCTION static
cudaError_t MultiHistogramRange(void * d_temp_storage,
size_t& temp_storage_bytes,
SampleIteratorT d_samples,
CounterT * d_histogram[NUM_ACTIVE_CHANNELS],
int num_levels[NUM_ACTIVE_CHANNELS],
LevelT * d_levels[NUM_ACTIVE_CHANNELS],
OffsetT num_pixels,
cudaStream_t stream = 0,
bool debug_synchronous = false)
{
unsigned int levels[NUM_ACTIVE_CHANNELS];
for(unsigned int channel = 0; channel < NUM_ACTIVE_CHANNELS; channel++)
{
levels[channel] = num_levels[channel];
}
return (cudaError_t)::rocprim::multi_histogram_range<NUM_CHANNELS, NUM_ACTIVE_CHANNELS>(
d_temp_storage, temp_storage_bytes,
d_samples, num_pixels,
d_histogram,
levels, d_levels,
stream, debug_synchronous
);
}
template<
int NUM_CHANNELS,
int NUM_ACTIVE_CHANNELS,
typename SampleIteratorT,
typename CounterT,
typename LevelT,
typename OffsetT
>
HIPCUB_RUNTIME_FUNCTION static
cudaError_t MultiHistogramRange(void * d_temp_storage,
size_t& temp_storage_bytes,
SampleIteratorT d_samples,
CounterT * d_histogram[NUM_ACTIVE_CHANNELS],
int num_levels[NUM_ACTIVE_CHANNELS],
LevelT * d_levels[NUM_ACTIVE_CHANNELS],
OffsetT num_row_pixels,
OffsetT num_rows,
size_t row_stride_bytes,
cudaStream_t stream = 0,
bool debug_synchronous = false)
{
unsigned int levels[NUM_ACTIVE_CHANNELS];
for(unsigned int channel = 0; channel < NUM_ACTIVE_CHANNELS; channel++)
{
levels[channel] = num_levels[channel];
}
return (cudaError_t)::rocprim::multi_histogram_range<NUM_CHANNELS, NUM_ACTIVE_CHANNELS>(
d_temp_storage, temp_storage_bytes,
d_samples, num_row_pixels, num_rows, row_stride_bytes,
d_histogram,
levels, d_levels,
stream, debug_synchronous
);
}
};
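// ---------------------------------------------------------------------------
// Usage sketch (illustrative; same two-pass convention and namespace
// assumption as the adjacent-difference example above). HistogramEven with
// num_levels = N produces N - 1 equal-width bins over
// [lower_level, upper_level); the bin counters are written, not accumulated.
//
// \code
// float *d_samples;  // n samples on the device, assumed to lie in [0, 1)
// int   *d_hist;     // 6 bin counters on the device
// int    n = 1024;
// std::size_t temp_bytes = 0;
// void       *d_temp     = nullptr;
// cub::DeviceHistogram::HistogramEven(
//     d_temp, temp_bytes, d_samples, d_hist,
//     7, 0.0f, 1.0f, n);                       // 7 levels -> 6 bins
// cudaMalloc(&d_temp, temp_bytes);
// cub::DeviceHistogram::HistogramEven(
//     d_temp, temp_bytes, d_samples, d_hist,
//     7, 0.0f, 1.0f, n);
// cudaFree(d_temp);
// \endcode
// ---------------------------------------------------------------------------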
END_HIPCUB_NAMESPACE
#endif // HIPCUB_ROCPRIM_DEVICE_DEVICE_HISTOGRAM_HPP_
/******************************************************************************
* Copyright (c) 2010-2011, Duane Merrill. All rights reserved.
* Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved.
* Modifications Copyright (c) 2017-2021, Advanced Micro Devices, Inc. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of the NVIDIA CORPORATION nor the
* names of its contributors may be used to endorse or promote products
* derived from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
* DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
******************************************************************************/
#ifndef HIPCUB_ROCPRIM_DEVICE_DEVICE_MERGE_SORT_HPP_
#define HIPCUB_ROCPRIM_DEVICE_DEVICE_MERGE_SORT_HPP_
#include "../config.hpp"
#include "../util_type.cuh"
#include <cub/rocprim/device/device_merge_sort.hpp>
BEGIN_HIPCUB_NAMESPACE
struct DeviceMergeSort
{
template<typename KeyIteratorT, typename ValueIteratorT, typename OffsetT, typename CompareOpT>
HIPCUB_RUNTIME_FUNCTION static cudaError_t SortPairs(void * d_temp_storage,
std::size_t & temp_storage_bytes,
KeyIteratorT d_keys,
ValueIteratorT d_items,
OffsetT num_items,
CompareOpT compare_op,
cudaStream_t stream = 0,
bool debug_synchronous = false)
{
// rocprim::merge_sort takes separate input and output iterators; passing the
// same iterators for both performs the sort in place.
return (cudaError_t)::rocprim::merge_sort(d_temp_storage,
temp_storage_bytes,
d_keys,
d_keys,
d_items,
d_items,
num_items,
compare_op,
stream,
debug_synchronous);
}
template<typename KeyInputIteratorT,
typename ValueInputIteratorT,
typename KeyIteratorT,
typename ValueIteratorT,
typename OffsetT,
typename CompareOpT>
HIPCUB_RUNTIME_FUNCTION static cudaError_t SortPairsCopy(void * d_temp_storage,
std::size_t & temp_storage_bytes,
KeyInputIteratorT d_input_keys,
ValueInputIteratorT d_input_items,
KeyIteratorT d_output_keys,
ValueIteratorT d_output_items,
OffsetT num_items,
CompareOpT compare_op,
cudaStream_t stream = 0,
bool debug_synchronous = false)
{
return (cudaError_t)::rocprim::merge_sort(d_temp_storage,
temp_storage_bytes,
d_input_keys,
d_output_keys,
d_input_items,
d_output_items,
num_items,
compare_op,
stream,
debug_synchronous);
}
template<typename KeyIteratorT, typename OffsetT, typename CompareOpT>
HIPCUB_RUNTIME_FUNCTION static cudaError_t SortKeys(void * d_temp_storage,
std::size_t & temp_storage_bytes,
KeyIteratorT d_keys,
OffsetT num_items,
CompareOpT compare_op,
cudaStream_t stream = 0,
bool debug_synchronous = false)
{
return (cudaError_t)::rocprim::merge_sort(
d_temp_storage, temp_storage_bytes,
d_keys, d_keys, num_items,
compare_op, stream, debug_synchronous
);
}
template<typename KeyInputIteratorT,
typename KeyIteratorT,
typename OffsetT,
typename CompareOpT>
HIPCUB_RUNTIME_FUNCTION static cudaError_t SortKeysCopy(void * d_temp_storage,
std::size_t & temp_storage_bytes,
KeyInputIteratorT d_input_keys,
KeyIteratorT d_output_keys,
OffsetT num_items,
CompareOpT compare_op,
cudaStream_t stream = 0,
bool debug_synchronous = false)
{
return (cudaError_t)::rocprim::merge_sort(
d_temp_storage, temp_storage_bytes,
d_input_keys, d_output_keys, num_items,
compare_op, stream, debug_synchronous
);
}
template <typename KeyIteratorT,
typename ValueIteratorT,
typename OffsetT,
typename CompareOpT>
HIPCUB_RUNTIME_FUNCTION static cudaError_t
StableSortPairs(void *d_temp_storage,
std::size_t &temp_storage_bytes,
KeyIteratorT d_keys,
ValueIteratorT d_items,
OffsetT num_items,
CompareOpT compare_op,
cudaStream_t stream = 0,
bool debug_synchronous = false)
{
// rocprim::merge_sort is stable, so the stable variants forward to the same
// call as SortPairs/SortKeys.
return (cudaError_t)::rocprim::merge_sort(d_temp_storage,
temp_storage_bytes,
d_keys,
d_keys,
d_items,
d_items,
num_items,
compare_op,
stream,
debug_synchronous);
}
template<typename KeyIteratorT, typename OffsetT, typename CompareOpT>
HIPCUB_RUNTIME_FUNCTION static cudaError_t StableSortKeys(void * d_temp_storage,
std::size_t & temp_storage_bytes,
KeyIteratorT d_keys,
OffsetT num_items,
CompareOpT compare_op,
cudaStream_t stream = 0,
bool debug_synchronous = false)
{
return (cudaError_t)::rocprim::merge_sort(
d_temp_storage, temp_storage_bytes,
d_keys, d_keys, num_items,
compare_op, stream, debug_synchronous
);
}
};
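// ---------------------------------------------------------------------------
// Usage sketch (illustrative; CustomLess is a hypothetical comparator, and
// the cub:: qualification is assumed as in the earlier sketches). Merge sort
// accepts any strict-weak-ordering comparator, unlike radix sort, which is
// restricted to arithmetic-like key types.
//
// \code
// struct CustomLess
// {
//     __host__ __device__ bool operator()(int a, int b) const { return a < b; }
// };
//
// int   *d_keys;    // n keys on the device, sorted in place
// float *d_values;  // n values, permuted alongside the keys
// int    n = 1024;
// std::size_t temp_bytes = 0;
// void       *d_temp     = nullptr;
// cub::DeviceMergeSort::SortPairs(
//     d_temp, temp_bytes, d_keys, d_values, n, CustomLess());
// cudaMalloc(&d_temp, temp_bytes);
// cub::DeviceMergeSort::SortPairs(
//     d_temp, temp_bytes, d_keys, d_values, n, CustomLess());
// cudaFree(d_temp);
// \endcode
// ---------------------------------------------------------------------------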
END_HIPCUB_NAMESPACE
#endif // HIPCUB_ROCPRIM_DEVICE_DEVICE_MERGE_SORT_HPP_
/******************************************************************************
* Copyright (c) 2010-2011, Duane Merrill. All rights reserved.
* Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved.
* Modifications Copyright (c) 2017-2020, Advanced Micro Devices, Inc. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of the NVIDIA CORPORATION nor the
* names of its contributors may be used to endorse or promote products
* derived from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
* DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
******************************************************************************/
#ifndef HIPCUB_ROCPRIM_DEVICE_PARTITION_HPP_
#define HIPCUB_ROCPRIM_DEVICE_PARTITION_HPP_
#include "../config.hpp"
#include <cub/rocprim/device/device_partition.hpp>
BEGIN_HIPCUB_NAMESPACE
struct DevicePartition
{
template <
typename InputIteratorT,
typename FlagIterator,
typename OutputIteratorT,
typename NumSelectedIteratorT>
HIPCUB_RUNTIME_FUNCTION __forceinline__
static cudaError_t Flagged(
void* d_temp_storage, ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
size_t &temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
InputIteratorT d_in, ///< [in] Pointer to the input sequence of data items
FlagIterator d_flags, ///< [in] Pointer to the input sequence of selection flags
OutputIteratorT d_out, ///< [out] Pointer to the output sequence of partitioned data items
NumSelectedIteratorT d_num_selected_out, ///< [out] Pointer to the output total number of items selected (i.e., the offset of the unselected partition)
int num_items, ///< [in] Total number of items to select from
cudaStream_t stream = 0, ///< [in] <b>[optional]</b> Stream to launch kernels within. Default is stream<sub>0</sub>.
bool debug_synchronous = false) ///< [in] <b>[optional]</b> Whether or not to synchronize the stream after every kernel launch to check for errors. May cause significant slowdown. Default is \p false.
{
return (cudaError_t)rocprim::partition(
d_temp_storage,
temp_storage_bytes,
d_in,
d_flags,
d_out,
d_num_selected_out,
num_items,
stream,
debug_synchronous);
}
template <
typename InputIteratorT,
typename OutputIteratorT,
typename NumSelectedIteratorT,
typename SelectOp>
HIPCUB_RUNTIME_FUNCTION __forceinline__
static cudaError_t If(
void* d_temp_storage, ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
size_t &temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
InputIteratorT d_in, ///< [in] Pointer to the input sequence of data items
OutputIteratorT d_out, ///< [out] Pointer to the output sequence of partitioned data items
NumSelectedIteratorT d_num_selected_out, ///< [out] Pointer to the output total number of items selected (i.e., the offset of the unselected partition)
int num_items, ///< [in] Total number of items to select from
SelectOp select_op, ///< [in] Unary selection operator
cudaStream_t stream = 0, ///< [in] <b>[optional]</b> Stream to launch kernels within. Default is stream<sub>0</sub>.
bool debug_synchronous = false) ///< [in] <b>[optional]</b> Whether or not to synchronize the stream after every kernel launch to check for errors. May cause significant slowdown. Default is \p false.
{
return (cudaError_t)rocprim::partition(
d_temp_storage,
temp_storage_bytes,
d_in,
d_out,
d_num_selected_out,
num_items,
select_op,
stream,
debug_synchronous);
}
template <typename InputIteratorT,
typename FirstOutputIteratorT,
typename SecondOutputIteratorT,
typename UnselectedOutputIteratorT,
typename NumSelectedIteratorT,
typename SelectFirstPartOp,
typename SelectSecondPartOp>
HIPCUB_RUNTIME_FUNCTION __forceinline__ static cudaError_t
If(void *d_temp_storage,
std::size_t &temp_storage_bytes,
InputIteratorT d_in,
FirstOutputIteratorT d_first_part_out,
SecondOutputIteratorT d_second_part_out,
UnselectedOutputIteratorT d_unselected_out,
NumSelectedIteratorT d_num_selected_out,
int num_items,
SelectFirstPartOp select_first_part_op,
SelectSecondPartOp select_second_part_op,
cudaStream_t stream = 0,
bool debug_synchronous = false)
{
return (cudaError_t)rocprim::partition_three_way(
d_temp_storage,
temp_storage_bytes,
d_in,
d_first_part_out,
d_second_part_out,
d_unselected_out,
d_num_selected_out,
num_items,
select_first_part_op,
select_second_part_op,
stream,
debug_synchronous
);
}
};
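// ---------------------------------------------------------------------------
// Usage sketch (illustrative; namespace assumed as above). Flagged compacts
// the items whose flag is nonzero to the front of d_out, with the unselected
// items written after them (CUB documents the unselected tail in reverse
// order); *d_num_selected_out receives the partition boundary.
//
// \code
// int  *d_in;                // n input items on the device
// char *d_flags;             // n selection flags (0 or 1)
// int  *d_out;               // n partitioned output items
// int  *d_num_selected_out;  // single device counter
// int   n = 1024;
// std::size_t temp_bytes = 0;
// void       *d_temp     = nullptr;
// cub::DevicePartition::Flagged(
//     d_temp, temp_bytes, d_in, d_flags, d_out, d_num_selected_out, n);
// cudaMalloc(&d_temp, temp_bytes);
// cub::DevicePartition::Flagged(
//     d_temp, temp_bytes, d_in, d_flags, d_out, d_num_selected_out, n);
// cudaFree(d_temp);
// \endcode
// ---------------------------------------------------------------------------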
END_HIPCUB_NAMESPACE
#endif
/******************************************************************************
* Copyright (c) 2010-2011, Duane Merrill. All rights reserved.
* Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved.
* Modifications Copyright (c) 2017-2020, Advanced Micro Devices, Inc. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of the NVIDIA CORPORATION nor the
* names of its contributors may be used to endorse or promote products
* derived from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
* DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
******************************************************************************/
#ifndef HIPCUB_ROCPRIM_DEVICE_DEVICE_RADIX_SORT_HPP_
#define HIPCUB_ROCPRIM_DEVICE_DEVICE_RADIX_SORT_HPP_
#include "../config.hpp"
#include "../util_type.cuh"
#include <cub/rocprim/device/device_radix_sort.hpp>
BEGIN_HIPCUB_NAMESPACE
struct DeviceRadixSort
{
template<typename KeyT, typename ValueT, typename NumItemsT>
HIPCUB_RUNTIME_FUNCTION static
cudaError_t SortPairs(void * d_temp_storage,
size_t& temp_storage_bytes,
const KeyT * d_keys_in,
KeyT * d_keys_out,
const ValueT * d_values_in,
ValueT * d_values_out,
NumItemsT num_items,
int begin_bit = 0,
int end_bit = sizeof(KeyT) * 8,
cudaStream_t stream = 0,
bool debug_synchronous = false)
{
return (cudaError_t)::rocprim::radix_sort_pairs(
d_temp_storage, temp_storage_bytes,
d_keys_in, d_keys_out, d_values_in, d_values_out, num_items,
begin_bit, end_bit,
stream, debug_synchronous
);
}
template<typename KeyT, typename ValueT, typename NumItemsT>
HIPCUB_RUNTIME_FUNCTION static
cudaError_t SortPairs(void * d_temp_storage,
size_t& temp_storage_bytes,
DoubleBuffer<KeyT>& d_keys,
DoubleBuffer<ValueT>& d_values,
NumItemsT num_items,
int begin_bit = 0,
int end_bit = sizeof(KeyT) * 8,
cudaStream_t stream = 0,
bool debug_synchronous = false)
{
::rocprim::double_buffer<KeyT> d_keys_db = detail::to_double_buffer(d_keys);
::rocprim::double_buffer<ValueT> d_values_db = detail::to_double_buffer(d_values);
cudaError_t error = (cudaError_t)::rocprim::radix_sort_pairs(
d_temp_storage, temp_storage_bytes,
d_keys_db, d_values_db, num_items,
begin_bit, end_bit,
stream, debug_synchronous
);
detail::update_double_buffer(d_keys, d_keys_db);
detail::update_double_buffer(d_values, d_values_db);
return error;
}
template<typename KeyT, typename ValueT, typename NumItemsT>
HIPCUB_RUNTIME_FUNCTION static
cudaError_t SortPairsDescending(void * d_temp_storage,
size_t& temp_storage_bytes,
const KeyT * d_keys_in,
KeyT * d_keys_out,
const ValueT * d_values_in,
ValueT * d_values_out,
NumItemsT num_items,
int begin_bit = 0,
int end_bit = sizeof(KeyT) * 8,
cudaStream_t stream = 0,
bool debug_synchronous = false)
{
return (cudaError_t)::rocprim::radix_sort_pairs_desc(
d_temp_storage, temp_storage_bytes,
d_keys_in, d_keys_out, d_values_in, d_values_out, num_items,
begin_bit, end_bit,
stream, debug_synchronous
);
}
template<typename KeyT, typename ValueT, typename NumItemsT>
HIPCUB_RUNTIME_FUNCTION static
cudaError_t SortPairsDescending(void * d_temp_storage,
size_t& temp_storage_bytes,
DoubleBuffer<KeyT>& d_keys,
DoubleBuffer<ValueT>& d_values,
NumItemsT num_items,
int begin_bit = 0,
int end_bit = sizeof(KeyT) * 8,
cudaStream_t stream = 0,
bool debug_synchronous = false)
{
::rocprim::double_buffer<KeyT> d_keys_db = detail::to_double_buffer(d_keys);
::rocprim::double_buffer<ValueT> d_values_db = detail::to_double_buffer(d_values);
cudaError_t error = (cudaError_t)::rocprim::radix_sort_pairs_desc(
d_temp_storage, temp_storage_bytes,
d_keys_db, d_values_db, num_items,
begin_bit, end_bit,
stream, debug_synchronous
);
detail::update_double_buffer(d_keys, d_keys_db);
detail::update_double_buffer(d_values, d_values_db);
return error;
}
template<typename KeyT, typename NumItemsT>
HIPCUB_RUNTIME_FUNCTION static
cudaError_t SortKeys(void * d_temp_storage,
size_t& temp_storage_bytes,
const KeyT * d_keys_in,
KeyT * d_keys_out,
NumItemsT num_items,
int begin_bit = 0,
int end_bit = sizeof(KeyT) * 8,
cudaStream_t stream = 0,
bool debug_synchronous = false)
{
return (cudaError_t)::rocprim::radix_sort_keys(
d_temp_storage, temp_storage_bytes,
d_keys_in, d_keys_out, num_items,
begin_bit, end_bit,
stream, debug_synchronous
);
}
template<typename KeyT, typename NumItemsT>
HIPCUB_RUNTIME_FUNCTION static
cudaError_t SortKeys(void * d_temp_storage,
size_t& temp_storage_bytes,
DoubleBuffer<KeyT>& d_keys,
NumItemsT num_items,
int begin_bit = 0,
int end_bit = sizeof(KeyT) * 8,
cudaStream_t stream = 0,
bool debug_synchronous = false)
{
::rocprim::double_buffer<KeyT> d_keys_db = detail::to_double_buffer(d_keys);
cudaError_t error = (cudaError_t)::rocprim::radix_sort_keys(
d_temp_storage, temp_storage_bytes,
d_keys_db, num_items,
begin_bit, end_bit,
stream, debug_synchronous
);
detail::update_double_buffer(d_keys, d_keys_db);
return error;
}
template<typename KeyT, typename NumItemsT>
HIPCUB_RUNTIME_FUNCTION static
cudaError_t SortKeysDescending(void * d_temp_storage,
size_t& temp_storage_bytes,
const KeyT * d_keys_in,
KeyT * d_keys_out,
NumItemsT num_items,
int begin_bit = 0,
int end_bit = sizeof(KeyT) * 8,
cudaStream_t stream = 0,
bool debug_synchronous = false)
{
return (cudaError_t)::rocprim::radix_sort_keys_desc(
d_temp_storage, temp_storage_bytes,
d_keys_in, d_keys_out, num_items,
begin_bit, end_bit,
stream, debug_synchronous
);
}
template<typename KeyT, typename NumItemsT>
HIPCUB_RUNTIME_FUNCTION static
cudaError_t SortKeysDescending(void * d_temp_storage,
size_t& temp_storage_bytes,
DoubleBuffer<KeyT>& d_keys,
NumItemsT num_items,
int begin_bit = 0,
int end_bit = sizeof(KeyT) * 8,
cudaStream_t stream = 0,
bool debug_synchronous = false)
{
::rocprim::double_buffer<KeyT> d_keys_db = detail::to_double_buffer(d_keys);
cudaError_t error = (cudaError_t)::rocprim::radix_sort_keys_desc(
d_temp_storage, temp_storage_bytes,
d_keys_db, num_items,
begin_bit, end_bit,
stream, debug_synchronous
);
detail::update_double_buffer(d_keys, d_keys_db);
return error;
}
};
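// ---------------------------------------------------------------------------
// Usage sketch (illustrative; namespace assumed as above). The DoubleBuffer
// overloads ping-pong between two caller-provided buffers, which saves a
// separate output allocation; afterwards d_keys.Current() points at the
// sorted data, which is why the wrappers above copy the rocprim
// double_buffer state back into the hipCUB DoubleBuffer.
//
// \code
// unsigned int *d_key_buf, *d_key_alt_buf;  // two n-element device buffers
// int n = 1024;
// cub::DoubleBuffer<unsigned int> d_keys(d_key_buf, d_key_alt_buf);
// std::size_t temp_bytes = 0;
// void       *d_temp     = nullptr;
// cub::DeviceRadixSort::SortKeys(d_temp, temp_bytes, d_keys, n);
// cudaMalloc(&d_temp, temp_bytes);
// cub::DeviceRadixSort::SortKeys(d_temp, temp_bytes, d_keys, n);
// unsigned int *d_sorted = d_keys.Current();  // sorted keys live here
// cudaFree(d_temp);
// \endcode
// ---------------------------------------------------------------------------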
END_HIPCUB_NAMESPACE
#endif // HIPCUB_ROCPRIM_DEVICE_DEVICE_RADIX_SORT_HPP_
/******************************************************************************
* Copyright (c) 2010-2011, Duane Merrill. All rights reserved.
* Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved.
* Modifications Copyright (c) 2017-2020, Advanced Micro Devices, Inc. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of the NVIDIA CORPORATION nor the
* names of its contributors may be used to endorse or promote products
* derived from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
* DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
******************************************************************************/
#ifndef HIPCUB_ROCPRIM_DEVICE_DEVICE_REDUCE_HPP_
#define HIPCUB_ROCPRIM_DEVICE_DEVICE_REDUCE_HPP_
#include <limits>
#include <iterator>
#include <cuda_fp16.h> // __half
#include <thrust/system/cuda/cuda_bfloat16.h> // cuda_bfloat16
#include "../config.hpp"
#include "../iterator/arg_index_input_iterator.cuh"
#include "../thread/thread_operators.cuh"
#include <cub/rocprim/device/device_reduce.hpp>
#include <cub/rocprim/device/device_reduce_by_key.hpp>
BEGIN_HIPCUB_NAMESPACE
namespace detail
{
template<class T>
inline
T get_lowest_value()
{
return std::numeric_limits<T>::lowest();
}
template<>
inline
__half get_lowest_value<__half>()
{
    // 0xfbff is the bit pattern of -65504, the lowest finite __half;
    // std::numeric_limits provides no specialization for __half.
    unsigned short lowest_half = 0xfbff;
    __half lowest_value = *reinterpret_cast<__half*>(&lowest_half);
    return lowest_value;
}
template<>
inline
cuda_bfloat16 get_lowest_value<cuda_bfloat16>()
{
    // Lowest finite bfloat16 value (approximately -3.39e+38).
    return cuda_bfloat16(-3.38953138925e+38f);
}
template<class T>
inline
T get_max_value()
{
return std::numeric_limits<T>::max();
}
template<>
inline
__half get_max_value<__half>()
{
    // 0x7bff is the bit pattern of +65504, the greatest finite __half.
    unsigned short max_half = 0x7bff;
    __half max_value = *reinterpret_cast<__half*>(&max_half);
    return max_value;
}
template<>
inline
cuda_bfloat16 get_max_value<cuda_bfloat16>()
{
    // Greatest finite bfloat16 value (approximately 3.39e+38).
    return cuda_bfloat16(3.38953138925e+38f);
}
} // end detail namespace
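// ---------------------------------------------------------------------------
// The bit patterns used above are the extreme finite half-precision values:
// 0x7bff encodes +65504 and 0xfbff encodes -65504. std::numeric_limits cannot
// supply them because __half is not a standard arithmetic type. A host-side
// sanity check, sketched under the assumption that <cuda_fp16.h>, <cstring>,
// and <cassert> are available:
//
// \code
// unsigned short bits = 0x7bff;
// __half h;
// std::memcpy(&h, &bits, sizeof(h));    // type-pun without aliasing UB
// assert(__half2float(h) == 65504.0f);  // greatest finite __half
// \endcode
// ---------------------------------------------------------------------------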
class DeviceReduce
{
public:
template <
typename InputIteratorT,
typename OutputIteratorT,
typename ReduceOpT,
typename T
>
HIPCUB_RUNTIME_FUNCTION static
cudaError_t Reduce(void *d_temp_storage,
size_t &temp_storage_bytes,
InputIteratorT d_in,
OutputIteratorT d_out,
int num_items,
ReduceOpT reduction_op,
T init,
cudaStream_t stream = 0,
bool debug_synchronous = false)
{
return (cudaError_t)::rocprim::reduce(
d_temp_storage, temp_storage_bytes,
d_in, d_out, init, num_items,
::cub::detail::convert_result_type<InputIteratorT, OutputIteratorT>(reduction_op),
stream, debug_synchronous
);
}
template <
typename InputIteratorT,
typename OutputIteratorT
>
HIPCUB_RUNTIME_FUNCTION static
cudaError_t Sum(void *d_temp_storage,
size_t &temp_storage_bytes,
InputIteratorT d_in,
OutputIteratorT d_out,
int num_items,
cudaStream_t stream = 0,
bool debug_synchronous = false)
{
using T = typename std::iterator_traits<InputIteratorT>::value_type;
return Reduce(
d_temp_storage, temp_storage_bytes,
d_in, d_out, num_items, ::cub::Sum(), T(0),
stream, debug_synchronous
);
}
template <
typename InputIteratorT,
typename OutputIteratorT
>
HIPCUB_RUNTIME_FUNCTION static
cudaError_t Min(void *d_temp_storage,
size_t &temp_storage_bytes,
InputIteratorT d_in,
OutputIteratorT d_out,
int num_items,
cudaStream_t stream = 0,
bool debug_synchronous = false)
{
using T = typename std::iterator_traits<InputIteratorT>::value_type;
return Reduce(
d_temp_storage, temp_storage_bytes,
d_in, d_out, num_items, ::cub::Min(), detail::get_max_value<T>(),
stream, debug_synchronous
);
}
template <
typename InputIteratorT,
typename OutputIteratorT
>
HIPCUB_RUNTIME_FUNCTION static
cudaError_t ArgMin(void *d_temp_storage,
size_t &temp_storage_bytes,
InputIteratorT d_in,
OutputIteratorT d_out,
int num_items,
cudaStream_t stream = 0,
bool debug_synchronous = false)
{
using OffsetT = int;
using T = typename std::iterator_traits<InputIteratorT>::value_type;
using O = typename std::iterator_traits<OutputIteratorT>::value_type;
using OutputTupleT =
typename std::conditional<
std::is_same<O, void>::value,
KeyValuePair<OffsetT, T>,
O
>::type;
using OutputValueT = typename OutputTupleT::Value;
using IteratorT = ArgIndexInputIterator<InputIteratorT, OffsetT, OutputValueT>;
IteratorT d_indexed_in(d_in);
// Sentinel for empty input: out-of-range index 1 paired with the maximum value.
OutputTupleT init(1, detail::get_max_value<T>());
return Reduce(
d_temp_storage, temp_storage_bytes,
d_indexed_in, d_out, num_items, ::cub::ArgMin(), init,
stream, debug_synchronous
);
}
template <
typename InputIteratorT,
typename OutputIteratorT
>
HIPCUB_RUNTIME_FUNCTION static
cudaError_t Max(void *d_temp_storage,
size_t &temp_storage_bytes,
InputIteratorT d_in,
OutputIteratorT d_out,
int num_items,
cudaStream_t stream = 0,
bool debug_synchronous = false)
{
using T = typename std::iterator_traits<InputIteratorT>::value_type;
return Reduce(
d_temp_storage, temp_storage_bytes,
d_in, d_out, num_items, ::cub::Max(), detail::get_lowest_value<T>(),
stream, debug_synchronous
);
}
template <
typename InputIteratorT,
typename OutputIteratorT
>
HIPCUB_RUNTIME_FUNCTION static
cudaError_t ArgMax(void *d_temp_storage,
size_t &temp_storage_bytes,
InputIteratorT d_in,
OutputIteratorT d_out,
int num_items,
cudaStream_t stream = 0,
bool debug_synchronous = false)
{
using OffsetT = int;
using T = typename std::iterator_traits<InputIteratorT>::value_type;
using O = typename std::iterator_traits<OutputIteratorT>::value_type;
using OutputTupleT =
typename std::conditional<
std::is_same<O, void>::value,
KeyValuePair<OffsetT, T>,
O
>::type;
using OutputValueT = typename OutputTupleT::Value;
using IteratorT = ArgIndexInputIterator<InputIteratorT, OffsetT, OutputValueT>;
IteratorT d_indexed_in(d_in);
// Sentinel for empty input: out-of-range index 1 paired with the lowest value.
OutputTupleT init(1, detail::get_lowest_value<T>());
return Reduce(
d_temp_storage, temp_storage_bytes,
d_indexed_in, d_out, num_items, ::cub::ArgMax(), init,
stream, debug_synchronous
);
}
template<
typename KeysInputIteratorT,
typename UniqueOutputIteratorT,
typename ValuesInputIteratorT,
typename AggregatesOutputIteratorT,
typename NumRunsOutputIteratorT,
typename ReductionOpT
>
HIPCUB_RUNTIME_FUNCTION static
cudaError_t ReduceByKey(void * d_temp_storage,
size_t& temp_storage_bytes,
KeysInputIteratorT d_keys_in,
UniqueOutputIteratorT d_unique_out,
ValuesInputIteratorT d_values_in,
AggregatesOutputIteratorT d_aggregates_out,
NumRunsOutputIteratorT d_num_runs_out,
ReductionOpT reduction_op,
int num_items,
cudaStream_t stream = 0,
bool debug_synchronous = false)
{
using key_compare_op =
::rocprim::equal_to<typename std::iterator_traits<KeysInputIteratorT>::value_type>;
return (cudaError_t)::rocprim::reduce_by_key(
d_temp_storage, temp_storage_bytes,
d_keys_in, d_values_in, num_items,
d_unique_out, d_aggregates_out, d_num_runs_out,
::cub::detail::convert_result_type<ValuesInputIteratorT, AggregatesOutputIteratorT>(reduction_op),
key_compare_op(),
stream, debug_synchronous
);
}
};
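// ---------------------------------------------------------------------------
// Usage sketch (illustrative; namespace assumed as above). ArgMax reduces to
// a KeyValuePair holding the index and value of the maximum; an empty input
// yields the sentinel {1, lowest} built from the init value above.
//
// \code
// float *d_in;                           // n input values on the device
// cub::KeyValuePair<int, float> *d_out;  // single device result
// int n = 1024;
// std::size_t temp_bytes = 0;
// void       *d_temp     = nullptr;
// cub::DeviceReduce::ArgMax(d_temp, temp_bytes, d_in, d_out, n);
// cudaMalloc(&d_temp, temp_bytes);
// cub::DeviceReduce::ArgMax(d_temp, temp_bytes, d_in, d_out, n);
// // After copying the result to the host, .key is the argmax index and
// // .value the maximum itself.
// cudaFree(d_temp);
// \endcode
// ---------------------------------------------------------------------------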
END_HIPCUB_NAMESPACE
#endif // HIPCUB_ROCPRIM_DEVICE_DEVICE_REDUCE_HPP_
/******************************************************************************
* Copyright (c) 2010-2011, Duane Merrill. All rights reserved.
* Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved.
* Modifications Copyright (c) 2017-2020, Advanced Micro Devices, Inc. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of the NVIDIA CORPORATION nor the
* names of its contributors may be used to endorse or promote products
* derived from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
* DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
******************************************************************************/
#ifndef HIPCUB_ROCPRIM_DEVICE_DEVICE_RUN_LENGTH_ENCODE_HPP_
#define HIPCUB_ROCPRIM_DEVICE_DEVICE_RUN_LENGTH_ENCODE_HPP_
#include "../config.hpp"
#include <cub/rocprim/device/device_run_length_encode.hpp>
BEGIN_HIPCUB_NAMESPACE
class DeviceRunLengthEncode
{
public:
template<
typename InputIteratorT,
typename UniqueOutputIteratorT,
typename LengthsOutputIteratorT,
typename NumRunsOutputIteratorT
>
HIPCUB_RUNTIME_FUNCTION static
cudaError_t Encode(void * d_temp_storage,
size_t& temp_storage_bytes,
InputIteratorT d_in,
UniqueOutputIteratorT d_unique_out,
LengthsOutputIteratorT d_counts_out,
NumRunsOutputIteratorT d_num_runs_out,
int num_items,
cudaStream_t stream = 0,
bool debug_synchronous = false)
{
return (cudaError_t)::rocprim::run_length_encode(
d_temp_storage, temp_storage_bytes,
d_in, num_items,
d_unique_out, d_counts_out, d_num_runs_out,
stream, debug_synchronous
);
}
template<
typename InputIteratorT,
typename OffsetsOutputIteratorT,
typename LengthsOutputIteratorT,
typename NumRunsOutputIteratorT
>
HIPCUB_RUNTIME_FUNCTION static
cudaError_t NonTrivialRuns(void * d_temp_storage,
size_t& temp_storage_bytes,
InputIteratorT d_in,
OffsetsOutputIteratorT d_offsets_out,
LengthsOutputIteratorT d_lengths_out,
NumRunsOutputIteratorT d_num_runs_out,
int num_items,
cudaStream_t stream = 0,
bool debug_synchronous = false)
{
return (cudaError_t)::rocprim::run_length_encode_non_trivial_runs(
d_temp_storage, temp_storage_bytes,
d_in, num_items,
d_offsets_out, d_lengths_out, d_num_runs_out,
stream, debug_synchronous
);
}
};
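// ---------------------------------------------------------------------------
// Usage sketch (illustrative; namespace assumed as above). Encode compresses
// consecutive runs: for input [0, 0, 1, 1, 1, 2] it writes uniques [0, 1, 2],
// counts [2, 3, 1], and sets the run counter to 3.
//
// \code
// int *d_in;            // n input items on the device
// int *d_unique_out;    // up to n unique run values
// int *d_counts_out;    // up to n run lengths
// int *d_num_runs_out;  // single device counter
// int  n = 1024;
// std::size_t temp_bytes = 0;
// void       *d_temp     = nullptr;
// cub::DeviceRunLengthEncode::Encode(
//     d_temp, temp_bytes, d_in, d_unique_out, d_counts_out, d_num_runs_out, n);
// cudaMalloc(&d_temp, temp_bytes);
// cub::DeviceRunLengthEncode::Encode(
//     d_temp, temp_bytes, d_in, d_unique_out, d_counts_out, d_num_runs_out, n);
// cudaFree(d_temp);
// \endcode
// ---------------------------------------------------------------------------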
END_HIPCUB_NAMESPACE
#endif // HIPCUB_ROCPRIM_DEVICE_DEVICE_RUN_LENGTH_ENCODE_HPP_
/******************************************************************************
* Copyright (c) 2010-2011, Duane Merrill. All rights reserved.
* Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved.
* Modifications Copyright (c) 2017-2020, Advanced Micro Devices, Inc. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of the NVIDIA CORPORATION nor the
* names of its contributors may be used to endorse or promote products
* derived from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
* DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
******************************************************************************/
#ifndef HIPCUB_ROCPRIM_DEVICE_DEVICE_SCAN_HPP_
#define HIPCUB_ROCPRIM_DEVICE_DEVICE_SCAN_HPP_
#include <iostream>
#include "../config.hpp"
#include "../thread/thread_operators.cuh"
#include <cub/rocprim/device/device_scan.hpp>
#include <cub/rocprim/device/device_scan_by_key.hpp>
BEGIN_HIPCUB_NAMESPACE
class DeviceScan
{
public:
template <
typename InputIteratorT,
typename OutputIteratorT
>
HIPCUB_RUNTIME_FUNCTION static
cudaError_t InclusiveSum(void *d_temp_storage,
size_t &temp_storage_bytes,
InputIteratorT d_in,
OutputIteratorT d_out,
size_t num_items,
cudaStream_t stream = 0,
bool debug_synchronous = false)
{
return InclusiveScan(
d_temp_storage, temp_storage_bytes,
d_in, d_out, ::cub::Sum(), num_items,
stream, debug_synchronous
);
}
template <
typename InputIteratorT,
typename OutputIteratorT,
typename ScanOpT
>
HIPCUB_RUNTIME_FUNCTION static
cudaError_t InclusiveScan(void *d_temp_storage,
size_t &temp_storage_bytes,
InputIteratorT d_in,
OutputIteratorT d_out,
ScanOpT scan_op,
size_t num_items,
cudaStream_t stream = 0,
bool debug_synchronous = false)
{
return (cudaError_t)::rocprim::inclusive_scan(
d_temp_storage, temp_storage_bytes,
d_in, d_out, num_items,
scan_op,
stream, debug_synchronous
);
}
template <
typename InputIteratorT,
typename OutputIteratorT
>
HIPCUB_RUNTIME_FUNCTION static
cudaError_t ExclusiveSum(void *d_temp_storage,
size_t &temp_storage_bytes,
InputIteratorT d_in,
OutputIteratorT d_out,
size_t num_items,
cudaStream_t stream = 0,
bool debug_synchronous = false)
{
using T = typename std::iterator_traits<InputIteratorT>::value_type;
return ExclusiveScan(
d_temp_storage, temp_storage_bytes,
d_in, d_out, ::cub::Sum(), T(0), num_items,
stream, debug_synchronous
);
}
template <
typename InputIteratorT,
typename OutputIteratorT,
typename ScanOpT,
typename InitValueT
>
HIPCUB_RUNTIME_FUNCTION static
cudaError_t ExclusiveScan(void *d_temp_storage,
size_t &temp_storage_bytes,
InputIteratorT d_in,
OutputIteratorT d_out,
ScanOpT scan_op,
InitValueT init_value,
size_t num_items,
cudaStream_t stream = 0,
bool debug_synchronous = false)
{
return (cudaError_t)::rocprim::exclusive_scan(
d_temp_storage, temp_storage_bytes,
d_in, d_out, init_value, num_items,
scan_op,
stream, debug_synchronous
);
}
template <
typename InputIteratorT,
typename OutputIteratorT,
typename ScanOpT,
typename InitValueT,
typename InitValueIterT = InitValueT*
>
HIPCUB_RUNTIME_FUNCTION static
cudaError_t ExclusiveScan(void *d_temp_storage,
size_t &temp_storage_bytes,
InputIteratorT d_in,
OutputIteratorT d_out,
ScanOpT scan_op,
FutureValue<InitValueT, InitValueIterT> init_value,
int num_items,
cudaStream_t stream = 0,
bool debug_synchronous = false)
{
return (cudaError_t)::rocprim::exclusive_scan(
d_temp_storage, temp_storage_bytes,
d_in, d_out, init_value, num_items,
scan_op,
stream, debug_synchronous
);
}
template <
typename KeysInputIteratorT,
typename ValuesInputIteratorT,
typename ValuesOutputIteratorT,
typename EqualityOpT = ::cub::Equality
>
HIPCUB_RUNTIME_FUNCTION static
cudaError_t ExclusiveSumByKey(void *d_temp_storage,
size_t &temp_storage_bytes,
KeysInputIteratorT d_keys_in,
ValuesInputIteratorT d_values_in,
ValuesOutputIteratorT d_values_out,
int num_items,
EqualityOpT equality_op = EqualityOpT(),
cudaStream_t stream = 0,
bool debug_synchronous = false)
{
using in_value_type = typename std::iterator_traits<ValuesInputIteratorT>::value_type;
return (cudaError_t)::rocprim::exclusive_scan_by_key(
d_temp_storage, temp_storage_bytes,
d_keys_in, d_values_in, d_values_out,
static_cast<in_value_type>(0), static_cast<size_t>(num_items),
::cub::Sum(), equality_op, stream, debug_synchronous
);
}
template <
typename KeysInputIteratorT,
typename ValuesInputIteratorT,
typename ValuesOutputIteratorT,
typename ScanOpT,
typename InitValueT,
typename EqualityOpT = ::cub::Equality
>
HIPCUB_RUNTIME_FUNCTION static
cudaError_t ExclusiveScanByKey(void *d_temp_storage,
size_t &temp_storage_bytes,
KeysInputIteratorT d_keys_in,
ValuesInputIteratorT d_values_in,
ValuesOutputIteratorT d_values_out,
ScanOpT scan_op,
InitValueT init_value,
int num_items,
EqualityOpT equality_op = EqualityOpT(),
cudaStream_t stream = 0,
bool debug_synchronous = false)
{
return (cudaError_t)::rocprim::exclusive_scan_by_key(
d_temp_storage, temp_storage_bytes,
d_keys_in, d_values_in, d_values_out,
init_value, static_cast<size_t>(num_items),
scan_op, equality_op, stream, debug_synchronous
);
}
template <
typename KeysInputIteratorT,
typename ValuesInputIteratorT,
typename ValuesOutputIteratorT,
typename EqualityOpT = ::cub::Equality
>
HIPCUB_RUNTIME_FUNCTION static
cudaError_t InclusiveSumByKey(void *d_temp_storage,
size_t &temp_storage_bytes,
KeysInputIteratorT d_keys_in,
ValuesInputIteratorT d_values_in,
ValuesOutputIteratorT d_values_out,
int num_items,
EqualityOpT equality_op = EqualityOpT(),
cudaStream_t stream = 0,
bool debug_synchronous = false)
{
return (cudaError_t)::rocprim::inclusive_scan_by_key(
d_temp_storage, temp_storage_bytes,
d_keys_in, d_values_in, d_values_out,
static_cast<size_t>(num_items), ::cub::Sum(),
equality_op, stream, debug_synchronous
);
}
template <
typename KeysInputIteratorT,
typename ValuesInputIteratorT,
typename ValuesOutputIteratorT,
typename ScanOpT,
typename EqualityOpT = ::cub::Equality
>
HIPCUB_RUNTIME_FUNCTION static
cudaError_t InclusiveScanByKey(void *d_temp_storage,
size_t &temp_storage_bytes,
KeysInputIteratorT d_keys_in,
ValuesInputIteratorT d_values_in,
ValuesOutputIteratorT d_values_out,
ScanOpT scan_op,
int num_items,
EqualityOpT equality_op = EqualityOpT(),
cudaStream_t stream = 0,
bool debug_synchronous = false)
{
return (cudaError_t)::rocprim::inclusive_scan_by_key(
d_temp_storage, temp_storage_bytes,
d_keys_in, d_values_in, d_values_out,
static_cast<size_t>(num_items), scan_op,
equality_op, stream, debug_synchronous
);
}
};
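// ---------------------------------------------------------------------------
// Usage sketch (illustrative; namespace assumed as above). For input
// [8, 6, 7, 5], InclusiveSum yields [8, 14, 21, 26] while ExclusiveSum yields
// [0, 8, 14, 21]: the exclusive variant writes at each position the sum of
// all preceding items only.
//
// \code
// int *d_in, *d_out;  // n-element device arrays
// std::size_t n = 1024;
// std::size_t temp_bytes = 0;
// void       *d_temp     = nullptr;
// cub::DeviceScan::ExclusiveSum(d_temp, temp_bytes, d_in, d_out, n);
// cudaMalloc(&d_temp, temp_bytes);
// cub::DeviceScan::ExclusiveSum(d_temp, temp_bytes, d_in, d_out, n);
// cudaFree(d_temp);
// \endcode
// ---------------------------------------------------------------------------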
END_HIPCUB_NAMESPACE
#endif // HIPCUB_ROCPRIM_DEVICE_DEVICE_SCAN_HPP_
/******************************************************************************
* Copyright (c) 2010-2011, Duane Merrill. All rights reserved.
* Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved.
* Modifications Copyright (c) 2017-2020, Advanced Micro Devices, Inc. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of the NVIDIA CORPORATION nor the
* names of its contributors may be used to endorse or promote products
* derived from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
* DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
******************************************************************************/
#ifndef HIPCUB_ROCPRIM_DEVICE_DEVICE_SEGMENTED_RADIX_SORT_HPP_
#define HIPCUB_ROCPRIM_DEVICE_DEVICE_SEGMENTED_RADIX_SORT_HPP_
#include "../config.hpp"
#include "../util_type.cuh"
#include <cub/rocprim/device/device_segmented_radix_sort.hpp>
BEGIN_HIPCUB_NAMESPACE
struct DeviceSegmentedRadixSort
{
template<typename KeyT, typename ValueT, typename OffsetIteratorT>
HIPCUB_RUNTIME_FUNCTION static
cudaError_t SortPairs(void * d_temp_storage,
size_t& temp_storage_bytes,
const KeyT * d_keys_in,
KeyT * d_keys_out,
const ValueT * d_values_in,
ValueT * d_values_out,
int num_items,
int num_segments,
OffsetIteratorT d_begin_offsets,
OffsetIteratorT d_end_offsets,
int begin_bit = 0,
int end_bit = sizeof(KeyT) * 8,
cudaStream_t stream = 0,
bool debug_synchronous = false)
{
return (cudaError_t)::rocprim::segmented_radix_sort_pairs(
d_temp_storage, temp_storage_bytes,
d_keys_in, d_keys_out, d_values_in, d_values_out, num_items,
num_segments, d_begin_offsets, d_end_offsets,
begin_bit, end_bit,
stream, debug_synchronous
);
}
template<typename KeyT, typename ValueT, typename OffsetIteratorT>
HIPCUB_RUNTIME_FUNCTION static
cudaError_t SortPairs(void * d_temp_storage,
size_t& temp_storage_bytes,
DoubleBuffer<KeyT>& d_keys,
DoubleBuffer<ValueT>& d_values,
int num_items,
int num_segments,
OffsetIteratorT d_begin_offsets,
OffsetIteratorT d_end_offsets,
int begin_bit = 0,
int end_bit = sizeof(KeyT) * 8,
cudaStream_t stream = 0,
bool debug_synchronous = false)
{
::rocprim::double_buffer<KeyT> d_keys_db = detail::to_double_buffer(d_keys);
::rocprim::double_buffer<ValueT> d_values_db = detail::to_double_buffer(d_values);
cudaError_t error = (cudaError_t)::rocprim::segmented_radix_sort_pairs(
d_temp_storage, temp_storage_bytes,
d_keys_db, d_values_db, num_items,
num_segments, d_begin_offsets, d_end_offsets,
begin_bit, end_bit,
stream, debug_synchronous
);
detail::update_double_buffer(d_keys, d_keys_db);
detail::update_double_buffer(d_values, d_values_db);
return error;
}
template<typename KeyT, typename ValueT, typename OffsetIteratorT>
HIPCUB_RUNTIME_FUNCTION static
cudaError_t SortPairsDescending(void * d_temp_storage,
size_t& temp_storage_bytes,
const KeyT * d_keys_in,
KeyT * d_keys_out,
const ValueT * d_values_in,
ValueT * d_values_out,
int num_items,
int num_segments,
OffsetIteratorT d_begin_offsets,
OffsetIteratorT d_end_offsets,
int begin_bit = 0,
int end_bit = sizeof(KeyT) * 8,
cudaStream_t stream = 0,
bool debug_synchronous = false)
{
return (cudaError_t)::rocprim::segmented_radix_sort_pairs_desc(
d_temp_storage, temp_storage_bytes,
d_keys_in, d_keys_out, d_values_in, d_values_out, num_items,
num_segments, d_begin_offsets, d_end_offsets,
begin_bit, end_bit,
stream, debug_synchronous
);
}
template<typename KeyT, typename ValueT, typename OffsetIteratorT>
HIPCUB_RUNTIME_FUNCTION static
cudaError_t SortPairsDescending(void * d_temp_storage,
size_t& temp_storage_bytes,
DoubleBuffer<KeyT>& d_keys,
DoubleBuffer<ValueT>& d_values,
int num_items,
int num_segments,
OffsetIteratorT d_begin_offsets,
OffsetIteratorT d_end_offsets,
int begin_bit = 0,
int end_bit = sizeof(KeyT) * 8,
cudaStream_t stream = 0,
bool debug_synchronous = false)
{
::rocprim::double_buffer<KeyT> d_keys_db = detail::to_double_buffer(d_keys);
::rocprim::double_buffer<ValueT> d_values_db = detail::to_double_buffer(d_values);
cudaError_t error = (cudaError_t)::rocprim::segmented_radix_sort_pairs_desc(
d_temp_storage, temp_storage_bytes,
d_keys_db, d_values_db, num_items,
num_segments, d_begin_offsets, d_end_offsets,
begin_bit, end_bit,
stream, debug_synchronous
);
detail::update_double_buffer(d_keys, d_keys_db);
detail::update_double_buffer(d_values, d_values_db);
return error;
}
template<typename KeyT, typename OffsetIteratorT>
HIPCUB_RUNTIME_FUNCTION static
cudaError_t SortKeys(void * d_temp_storage,
size_t& temp_storage_bytes,
const KeyT * d_keys_in,
KeyT * d_keys_out,
int num_items,
int num_segments,
OffsetIteratorT d_begin_offsets,
OffsetIteratorT d_end_offsets,
int begin_bit = 0,
int end_bit = sizeof(KeyT) * 8,
cudaStream_t stream = 0,
bool debug_synchronous = false)
{
return (cudaError_t)::rocprim::segmented_radix_sort_keys(
d_temp_storage, temp_storage_bytes,
d_keys_in, d_keys_out, num_items,
num_segments, d_begin_offsets, d_end_offsets,
begin_bit, end_bit,
stream, debug_synchronous
);
}
template<typename KeyT, typename OffsetIteratorT>
HIPCUB_RUNTIME_FUNCTION static
cudaError_t SortKeys(void * d_temp_storage,
size_t& temp_storage_bytes,
DoubleBuffer<KeyT>& d_keys,
int num_items,
int num_segments,
OffsetIteratorT d_begin_offsets,
OffsetIteratorT d_end_offsets,
int begin_bit = 0,
int end_bit = sizeof(KeyT) * 8,
cudaStream_t stream = 0,
bool debug_synchronous = false)
{
::rocprim::double_buffer<KeyT> d_keys_db = detail::to_double_buffer(d_keys);
cudaError_t error = (cudaError_t)::rocprim::segmented_radix_sort_keys(
d_temp_storage, temp_storage_bytes,
d_keys_db, num_items,
num_segments, d_begin_offsets, d_end_offsets,
begin_bit, end_bit,
stream, debug_synchronous
);
detail::update_double_buffer(d_keys, d_keys_db);
return error;
}
template<typename KeyT, typename OffsetIteratorT>
HIPCUB_RUNTIME_FUNCTION static
cudaError_t SortKeysDescending(void * d_temp_storage,
size_t& temp_storage_bytes,
const KeyT * d_keys_in,
KeyT * d_keys_out,
int num_items,
int num_segments,
OffsetIteratorT d_begin_offsets,
OffsetIteratorT d_end_offsets,
int begin_bit = 0,
int end_bit = sizeof(KeyT) * 8,
cudaStream_t stream = 0,
bool debug_synchronous = false)
{
return (cudaError_t)::rocprim::segmented_radix_sort_keys_desc(
d_temp_storage, temp_storage_bytes,
d_keys_in, d_keys_out, num_items,
num_segments, d_begin_offsets, d_end_offsets,
begin_bit, end_bit,
stream, debug_synchronous
);
}
template<typename KeyT, typename OffsetIteratorT>
HIPCUB_RUNTIME_FUNCTION static
cudaError_t SortKeysDescending(void * d_temp_storage,
size_t& temp_storage_bytes,
DoubleBuffer<KeyT>& d_keys,
int num_items,
int num_segments,
OffsetIteratorT d_begin_offsets,
OffsetIteratorT d_end_offsets,
int begin_bit = 0,
int end_bit = sizeof(KeyT) * 8,
cudaStream_t stream = 0,
bool debug_synchronous = false)
{
::rocprim::double_buffer<KeyT> d_keys_db = detail::to_double_buffer(d_keys);
cudaError_t error = (cudaError_t)::rocprim::segmented_radix_sort_keys_desc(
d_temp_storage, temp_storage_bytes,
d_keys_db, num_items,
num_segments, d_begin_offsets, d_end_offsets,
begin_bit, end_bit,
stream, debug_synchronous
);
detail::update_double_buffer(d_keys, d_keys_db);
return error;
}
};
END_HIPCUB_NAMESPACE
#endif // HIPCUB_ROCPRIM_DEVICE_DEVICE_SEGMENTED_RADIX_SORT_HPP_
/******************************************************************************
* Copyright (c) 2010-2011, Duane Merrill. All rights reserved.
* Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved.
* Modifications Copyright (c) 2017-2020, Advanced Micro Devices, Inc. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of the NVIDIA CORPORATION nor the
* names of its contributors may be used to endorse or promote products
* derived from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
* DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
******************************************************************************/
#ifndef HIPCUB_ROCPRIM_DEVICE_DEVICE_SEGMENTED_REDUCE_HPP_
#define HIPCUB_ROCPRIM_DEVICE_DEVICE_SEGMENTED_REDUCE_HPP_
#include <limits>
#include <iterator>
#include "../config.hpp"
#include "../thread/thread_operators.cuh"
#include "../iterator/arg_index_input_iterator.cuh"
#include <cub/rocprim/device/device_segmented_reduce.hpp>
BEGIN_HIPCUB_NAMESPACE
struct DeviceSegmentedReduce
{
template<
typename InputIteratorT,
typename OutputIteratorT,
typename OffsetIteratorT,
typename ReductionOp,
typename T
>
HIPCUB_RUNTIME_FUNCTION static
cudaError_t Reduce(void * d_temp_storage,
size_t& temp_storage_bytes,
InputIteratorT d_in,
OutputIteratorT d_out,
int num_segments,
OffsetIteratorT d_begin_offsets,
OffsetIteratorT d_end_offsets,
ReductionOp reduction_op,
T initial_value,
cudaStream_t stream = 0,
bool debug_synchronous = false)
{
return (cudaError_t)::rocprim::segmented_reduce(
d_temp_storage, temp_storage_bytes,
d_in, d_out,
num_segments, d_begin_offsets, d_end_offsets,
::cub::detail::convert_result_type<InputIteratorT, OutputIteratorT>(reduction_op),
initial_value,
stream, debug_synchronous
);
}
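    // Illustrative sketch of the usual two-phase temp-storage pattern for this
    // API (d_in, d_out and d_offsets are hypothetical, pre-allocated device
    // pointers; segment i spans [d_offsets[i], d_offsets[i + 1])):
    //
    //   void  *d_temp_storage     = nullptr;
    //   size_t temp_storage_bytes = 0;
    //   // First call: d_temp_storage == nullptr, only sizes the workspace.
    //   DeviceSegmentedReduce::Reduce(d_temp_storage, temp_storage_bytes,
    //                                 d_in, d_out, num_segments,
    //                                 d_offsets, d_offsets + 1,
    //                                 ::cub::Sum(), 0);
    //   cudaMalloc(&d_temp_storage, temp_storage_bytes);
    //   // Second call: actually runs the segmented reduction.
    //   DeviceSegmentedReduce::Reduce(d_temp_storage, temp_storage_bytes,
    //                                 d_in, d_out, num_segments,
    //                                 d_offsets, d_offsets + 1,
    //                                 ::cub::Sum(), 0);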
template<
typename InputIteratorT,
typename OutputIteratorT,
typename OffsetIteratorT
>
HIPCUB_RUNTIME_FUNCTION static
cudaError_t Sum(void * d_temp_storage,
size_t& temp_storage_bytes,
InputIteratorT d_in,
OutputIteratorT d_out,
int num_segments,
OffsetIteratorT d_begin_offsets,
OffsetIteratorT d_end_offsets,
cudaStream_t stream = 0,
bool debug_synchronous = false)
{
using input_type = typename std::iterator_traits<InputIteratorT>::value_type;
return Reduce(
d_temp_storage, temp_storage_bytes,
d_in, d_out,
num_segments, d_begin_offsets, d_end_offsets,
::cub::Sum(), input_type(),
stream, debug_synchronous
);
}
template<
typename InputIteratorT,
typename OutputIteratorT,
typename OffsetIteratorT
>
HIPCUB_RUNTIME_FUNCTION static
cudaError_t Min(void * d_temp_storage,
size_t& temp_storage_bytes,
InputIteratorT d_in,
OutputIteratorT d_out,
int num_segments,
OffsetIteratorT d_begin_offsets,
OffsetIteratorT d_end_offsets,
cudaStream_t stream = 0,
bool debug_synchronous = false)
{
using input_type = typename std::iterator_traits<InputIteratorT>::value_type;
return Reduce(
d_temp_storage, temp_storage_bytes,
d_in, d_out,
num_segments, d_begin_offsets, d_end_offsets,
::cub::Min(), std::numeric_limits<input_type>::max(),
stream, debug_synchronous
);
}
template<
typename InputIteratorT,
typename OutputIteratorT,
typename OffsetIteratorT
>
HIPCUB_RUNTIME_FUNCTION static
cudaError_t ArgMin(void * d_temp_storage,
size_t& temp_storage_bytes,
InputIteratorT d_in,
OutputIteratorT d_out,
int num_segments,
OffsetIteratorT d_begin_offsets,
OffsetIteratorT d_end_offsets,
cudaStream_t stream = 0,
bool debug_synchronous = false)
{
using OffsetT = int;
using T = typename std::iterator_traits<InputIteratorT>::value_type;
using O = typename std::iterator_traits<OutputIteratorT>::value_type;
using OutputTupleT = typename std::conditional<
std::is_same<O, void>::value,
KeyValuePair<OffsetT, T>,
O
>::type;
using OutputValueT = typename OutputTupleT::Value;
using IteratorT = ArgIndexInputIterator<InputIteratorT, OffsetT, OutputValueT>;
IteratorT d_indexed_in(d_in);
const OutputTupleT init(1, std::numeric_limits<T>::max());
return Reduce(
d_temp_storage, temp_storage_bytes,
d_indexed_in, d_out,
num_segments, d_begin_offsets, d_end_offsets,
::cub::ArgMin(), init,
stream, debug_synchronous
);
}
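    // Illustrative sketch: ArgMin writes one KeyValuePair per segment, with
    // .key holding the minimizing index within the segment and .value the
    // minimum itself. d_in, d_argmin and d_offsets are assumed pre-allocated
    // device pointers (the two-phase sizing call is omitted for brevity):
    //
    //   KeyValuePair<int, float> *d_argmin;  // one pair per segment
    //   DeviceSegmentedReduce::ArgMin(d_temp_storage, temp_storage_bytes,
    //                                 d_in, d_argmin, num_segments,
    //                                 d_offsets, d_offsets + 1);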
template<
typename InputIteratorT,
typename OutputIteratorT,
typename OffsetIteratorT
>
HIPCUB_RUNTIME_FUNCTION static
cudaError_t Max(void * d_temp_storage,
size_t& temp_storage_bytes,
InputIteratorT d_in,
OutputIteratorT d_out,
int num_segments,
OffsetIteratorT d_begin_offsets,
OffsetIteratorT d_end_offsets,
cudaStream_t stream = 0,
bool debug_synchronous = false)
{
using input_type = typename std::iterator_traits<InputIteratorT>::value_type;
return Reduce(
d_temp_storage, temp_storage_bytes,
d_in, d_out,
num_segments, d_begin_offsets, d_end_offsets,
::cub::Max(), std::numeric_limits<input_type>::lowest(),
stream, debug_synchronous
);
}
template<
typename InputIteratorT,
typename OutputIteratorT,
typename OffsetIteratorT
>
HIPCUB_RUNTIME_FUNCTION static
cudaError_t ArgMax(void * d_temp_storage,
size_t& temp_storage_bytes,
InputIteratorT d_in,
OutputIteratorT d_out,
int num_segments,
OffsetIteratorT d_begin_offsets,
OffsetIteratorT d_end_offsets,
cudaStream_t stream = 0,
bool debug_synchronous = false)
{
using OffsetT = int;
using T = typename std::iterator_traits<InputIteratorT>::value_type;
using O = typename std::iterator_traits<OutputIteratorT>::value_type;
using OutputTupleT = typename std::conditional<
std::is_same<O, void>::value,
KeyValuePair<OffsetT, T>,
O
>::type;
using OutputValueT = typename OutputTupleT::Value;
using IteratorT = ArgIndexInputIterator<InputIteratorT, OffsetT, OutputValueT>;
IteratorT d_indexed_in(d_in);
const OutputTupleT init(1, std::numeric_limits<T>::lowest());
return Reduce(
d_temp_storage, temp_storage_bytes,
d_indexed_in, d_out,
num_segments, d_begin_offsets, d_end_offsets,
::cub::ArgMax(), init,
stream, debug_synchronous
);
}
};
END_HIPCUB_NAMESPACE
#endif // HIPCUB_ROCPRIM_DEVICE_DEVICE_SEGMENTED_REDUCE_HPP_
/******************************************************************************
* Copyright (c) 2010-2011, Duane Merrill. All rights reserved.
* Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved.
* Modifications Copyright (c) 2017-2020, Advanced Micro Devices, Inc. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of the NVIDIA CORPORATION nor the
* names of its contributors may be used to endorse or promote products
* derived from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
* DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
******************************************************************************/
#ifndef HIPCUB_ROCPRIM_DEVICE_DEVICE_SEGMENTED_SORT_HPP_
#define HIPCUB_ROCPRIM_DEVICE_DEVICE_SEGMENTED_SORT_HPP_
#include "../config.hpp"
#include "../util_type.cuh"
#include <cub/rocprim/device/device_segmented_radix_sort.hpp>
BEGIN_HIPCUB_NAMESPACE
struct DeviceSegmentedSort
{
template<typename KeyT, typename ValueT, typename OffsetIteratorT>
HIPCUB_RUNTIME_FUNCTION static
cudaError_t SortPairs(void * d_temp_storage,
size_t& temp_storage_bytes,
const KeyT * d_keys_in,
KeyT * d_keys_out,
const ValueT * d_values_in,
ValueT * d_values_out,
int num_items,
int num_segments,
OffsetIteratorT d_begin_offsets,
OffsetIteratorT d_end_offsets,
cudaStream_t stream = 0,
bool debug_synchronous = false)
{
return (cudaError_t)::rocprim::segmented_radix_sort_pairs(
d_temp_storage, temp_storage_bytes,
d_keys_in, d_keys_out, d_values_in, d_values_out, num_items,
num_segments, d_begin_offsets, d_end_offsets,
0, sizeof(KeyT) * 8,
stream, debug_synchronous
);
}
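    // Illustrative sketch: unlike DeviceSegmentedRadixSort, this entry point
    // fixes the bit range to [0, sizeof(KeyT) * 8). d_keys_*, d_values_* and
    // d_offsets are assumed pre-allocated device pointers; as usual, call once
    // with a null workspace to size it, then again to sort:
    //
    //   DeviceSegmentedSort::SortPairs(nullptr, temp_storage_bytes,
    //                                  d_keys_in, d_keys_out,
    //                                  d_values_in, d_values_out,
    //                                  num_items, num_segments,
    //                                  d_offsets, d_offsets + 1);
    //   cudaMalloc(&d_temp_storage, temp_storage_bytes);
    //   DeviceSegmentedSort::SortPairs(d_temp_storage, temp_storage_bytes,
    //                                  d_keys_in, d_keys_out,
    //                                  d_values_in, d_values_out,
    //                                  num_items, num_segments,
    //                                  d_offsets, d_offsets + 1);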
template<typename KeyT, typename ValueT, typename OffsetIteratorT>
HIPCUB_RUNTIME_FUNCTION static
cudaError_t SortPairs(void * d_temp_storage,
size_t& temp_storage_bytes,
DoubleBuffer<KeyT>& d_keys,
DoubleBuffer<ValueT>& d_values,
int num_items,
int num_segments,
OffsetIteratorT d_begin_offsets,
OffsetIteratorT d_end_offsets,
cudaStream_t stream = 0,
bool debug_synchronous = false)
{
::rocprim::double_buffer<KeyT> d_keys_db = detail::to_double_buffer(d_keys);
::rocprim::double_buffer<ValueT> d_values_db = detail::to_double_buffer(d_values);
cudaError_t error = (cudaError_t)::rocprim::segmented_radix_sort_pairs(
d_temp_storage, temp_storage_bytes,
d_keys_db, d_values_db, num_items,
num_segments, d_begin_offsets, d_end_offsets,
0, sizeof(KeyT) * 8,
stream, debug_synchronous
);
detail::update_double_buffer(d_keys, d_keys_db);
detail::update_double_buffer(d_values, d_values_db);
return error;
}
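    // Illustrative sketch of the DoubleBuffer overload: the sort may ping-pong
    // between the two buffers, and update_double_buffer() records which one
    // holds the result, so read the output through Current(). The raw buffer
    // pointers below are assumed pre-allocated device arrays:
    //
    //   DoubleBuffer<float> d_keys(d_key_buf, d_key_alt_buf);
    //   DoubleBuffer<int>   d_values(d_val_buf, d_val_alt_buf);
    //   DeviceSegmentedSort::SortPairs(d_temp_storage, temp_storage_bytes,
    //                                  d_keys, d_values, num_items,
    //                                  num_segments, d_offsets, d_offsets + 1);
    //   float *sorted_keys = d_keys.Current();
    //   int   *sorted_vals = d_values.Current();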
template<typename KeyT, typename ValueT, typename OffsetIteratorT>
HIPCUB_RUNTIME_FUNCTION static
cudaError_t SortPairsDescending(void * d_temp_storage,
size_t& temp_storage_bytes,
const KeyT * d_keys_in,
KeyT * d_keys_out,
const ValueT * d_values_in,
ValueT * d_values_out,
int num_items,
int num_segments,
OffsetIteratorT d_begin_offsets,
OffsetIteratorT d_end_offsets,
cudaStream_t stream = 0,
bool debug_synchronous = false)
{
return (cudaError_t)::rocprim::segmented_radix_sort_pairs_desc(
d_temp_storage, temp_storage_bytes,
d_keys_in, d_keys_out, d_values_in, d_values_out, num_items,
num_segments, d_begin_offsets, d_end_offsets,
0, sizeof(KeyT) * 8,
stream, debug_synchronous
);
}
template<typename KeyT, typename ValueT, typename OffsetIteratorT>
HIPCUB_RUNTIME_FUNCTION static
cudaError_t SortPairsDescending(void * d_temp_storage,
size_t& temp_storage_bytes,
DoubleBuffer<KeyT>& d_keys,
DoubleBuffer<ValueT>& d_values,
int num_items,
int num_segments,
OffsetIteratorT d_begin_offsets,
OffsetIteratorT d_end_offsets,
cudaStream_t stream = 0,
bool debug_synchronous = false)
{
::rocprim::double_buffer<KeyT> d_keys_db = detail::to_double_buffer(d_keys);
::rocprim::double_buffer<ValueT> d_values_db = detail::to_double_buffer(d_values);
cudaError_t error = (cudaError_t)::rocprim::segmented_radix_sort_pairs_desc(
d_temp_storage, temp_storage_bytes,
d_keys_db, d_values_db, num_items,
num_segments, d_begin_offsets, d_end_offsets,
0, sizeof(KeyT) * 8,
stream, debug_synchronous
);
detail::update_double_buffer(d_keys, d_keys_db);
detail::update_double_buffer(d_values, d_values_db);
return error;
}
template<typename KeyT, typename OffsetIteratorT>
HIPCUB_RUNTIME_FUNCTION static
cudaError_t SortKeys(void * d_temp_storage,
size_t& temp_storage_bytes,
const KeyT * d_keys_in,
KeyT * d_keys_out,
int num_items,
int num_segments,
OffsetIteratorT d_begin_offsets,
OffsetIteratorT d_end_offsets,
cudaStream_t stream = 0,
bool debug_synchronous = false)
{
return (cudaError_t)::rocprim::segmented_radix_sort_keys(
d_temp_storage, temp_storage_bytes,
d_keys_in, d_keys_out, num_items,
num_segments, d_begin_offsets, d_end_offsets,
0, sizeof(KeyT) * 8,
stream, debug_synchronous
);
}
template<typename KeyT, typename OffsetIteratorT>
HIPCUB_RUNTIME_FUNCTION static
cudaError_t SortKeys(void * d_temp_storage,
size_t& temp_storage_bytes,
DoubleBuffer<KeyT>& d_keys,
int num_items,
int num_segments,
OffsetIteratorT d_begin_offsets,
OffsetIteratorT d_end_offsets,
cudaStream_t stream = 0,
bool debug_synchronous = false)
{
::rocprim::double_buffer<KeyT> d_keys_db = detail::to_double_buffer(d_keys);
cudaError_t error = (cudaError_t)::rocprim::segmented_radix_sort_keys(
d_temp_storage, temp_storage_bytes,
d_keys_db, num_items,
num_segments, d_begin_offsets, d_end_offsets,
0, sizeof(KeyT) * 8,
stream, debug_synchronous
);
detail::update_double_buffer(d_keys, d_keys_db);
return error;
}
template<typename KeyT, typename OffsetIteratorT>
HIPCUB_RUNTIME_FUNCTION static
cudaError_t SortKeysDescending(void * d_temp_storage,
size_t& temp_storage_bytes,
const KeyT * d_keys_in,
KeyT * d_keys_out,
int num_items,
int num_segments,
OffsetIteratorT d_begin_offsets,
OffsetIteratorT d_end_offsets,
cudaStream_t stream = 0,
bool debug_synchronous = false)
{
return (cudaError_t)::rocprim::segmented_radix_sort_keys_desc(
d_temp_storage, temp_storage_bytes,
d_keys_in, d_keys_out, num_items,
num_segments, d_begin_offsets, d_end_offsets,
0, sizeof(KeyT) * 8,
stream, debug_synchronous
);
}
template<typename KeyT, typename OffsetIteratorT>
HIPCUB_RUNTIME_FUNCTION static
cudaError_t SortKeysDescending(void * d_temp_storage,
size_t& temp_storage_bytes,
DoubleBuffer<KeyT>& d_keys,
int num_items,
int num_segments,
OffsetIteratorT d_begin_offsets,
OffsetIteratorT d_end_offsets,
cudaStream_t stream = 0,
bool debug_synchronous = false)
{
::rocprim::double_buffer<KeyT> d_keys_db = detail::to_double_buffer(d_keys);
cudaError_t error = (cudaError_t)::rocprim::segmented_radix_sort_keys_desc(
d_temp_storage, temp_storage_bytes,
d_keys_db, num_items,
num_segments, d_begin_offsets, d_end_offsets,
0, sizeof(KeyT) * 8,
stream, debug_synchronous
);
detail::update_double_buffer(d_keys, d_keys_db);
return error;
}
template<typename KeyT, typename ValueT, typename OffsetIteratorT>
HIPCUB_RUNTIME_FUNCTION static
cudaError_t StableSortPairs(void * d_temp_storage,
size_t& temp_storage_bytes,
const KeyT * d_keys_in,
KeyT * d_keys_out,
const ValueT * d_values_in,
ValueT * d_values_out,
int num_items,
int num_segments,
OffsetIteratorT d_begin_offsets,
OffsetIteratorT d_end_offsets,
cudaStream_t stream = 0,
bool debug_synchronous = false)
{
return SortPairs(
d_temp_storage, temp_storage_bytes,
d_keys_in, d_keys_out, d_values_in, d_values_out, num_items,
num_segments, d_begin_offsets, d_end_offsets,
stream, debug_synchronous
);
}
template<typename KeyT, typename ValueT, typename OffsetIteratorT>
HIPCUB_RUNTIME_FUNCTION static
cudaError_t StableSortPairs(void * d_temp_storage,
size_t& temp_storage_bytes,
DoubleBuffer<KeyT>& d_keys,
DoubleBuffer<ValueT>& d_values,
int num_items,
int num_segments,
OffsetIteratorT d_begin_offsets,
OffsetIteratorT d_end_offsets,
cudaStream_t stream = 0,
bool debug_synchronous = false)
{
return SortPairs(
d_temp_storage, temp_storage_bytes,
d_keys, d_values, num_items,
num_segments, d_begin_offsets, d_end_offsets,
stream, debug_synchronous
);
}
template<typename KeyT, typename ValueT, typename OffsetIteratorT>
HIPCUB_RUNTIME_FUNCTION static
cudaError_t StableSortPairsDescending(void * d_temp_storage,
size_t& temp_storage_bytes,
const KeyT * d_keys_in,
KeyT * d_keys_out,
const ValueT * d_values_in,
ValueT * d_values_out,
int num_items,
int num_segments,
OffsetIteratorT d_begin_offsets,
OffsetIteratorT d_end_offsets,
cudaStream_t stream = 0,
bool debug_synchronous = false)
{
return SortPairsDescending(
d_temp_storage, temp_storage_bytes,
d_keys_in, d_keys_out, d_values_in, d_values_out, num_items,
num_segments, d_begin_offsets, d_end_offsets,
stream, debug_synchronous
);
}
template<typename KeyT, typename ValueT, typename OffsetIteratorT>
HIPCUB_RUNTIME_FUNCTION static
cudaError_t StableSortPairsDescending(void * d_temp_storage,
size_t& temp_storage_bytes,
DoubleBuffer<KeyT>& d_keys,
DoubleBuffer<ValueT>& d_values,
int num_items,
int num_segments,
OffsetIteratorT d_begin_offsets,
OffsetIteratorT d_end_offsets,
cudaStream_t stream = 0,
bool debug_synchronous = false)
{
return SortPairsDescending(
d_temp_storage, temp_storage_bytes,
d_keys, d_values, num_items,
num_segments, d_begin_offsets, d_end_offsets,
stream, debug_synchronous
);
}
template<typename KeyT, typename OffsetIteratorT>
HIPCUB_RUNTIME_FUNCTION static
cudaError_t StableSortKeys(void * d_temp_storage,
size_t& temp_storage_bytes,
const KeyT * d_keys_in,
KeyT * d_keys_out,
int num_items,
int num_segments,
OffsetIteratorT d_begin_offsets,
OffsetIteratorT d_end_offsets,
cudaStream_t stream = 0,
bool debug_synchronous = false)
{
return SortKeys(
d_temp_storage, temp_storage_bytes,
d_keys_in, d_keys_out, num_items,
num_segments, d_begin_offsets, d_end_offsets,
stream, debug_synchronous
);
}
template<typename KeyT, typename OffsetIteratorT>
HIPCUB_RUNTIME_FUNCTION static
cudaError_t StableSortKeys(void * d_temp_storage,
size_t& temp_storage_bytes,
DoubleBuffer<KeyT>& d_keys,
int num_items,
int num_segments,
OffsetIteratorT d_begin_offsets,
OffsetIteratorT d_end_offsets,
cudaStream_t stream = 0,
bool debug_synchronous = false)
{
return SortKeys(
d_temp_storage, temp_storage_bytes,
d_keys, num_items,
num_segments, d_begin_offsets, d_end_offsets,
stream, debug_synchronous
);
}
template<typename KeyT, typename OffsetIteratorT>
HIPCUB_RUNTIME_FUNCTION static
cudaError_t StableSortKeysDescending(void * d_temp_storage,
size_t& temp_storage_bytes,
const KeyT * d_keys_in,
KeyT * d_keys_out,
int num_items,
int num_segments,
OffsetIteratorT d_begin_offsets,
OffsetIteratorT d_end_offsets,
cudaStream_t stream = 0,
bool debug_synchronous = false)
{
return SortKeysDescending(
d_temp_storage, temp_storage_bytes,
d_keys_in, d_keys_out, num_items,
num_segments, d_begin_offsets, d_end_offsets,
stream, debug_synchronous
);
}
template<typename KeyT, typename OffsetIteratorT>
HIPCUB_RUNTIME_FUNCTION static
cudaError_t StableSortKeysDescending(void * d_temp_storage,
size_t& temp_storage_bytes,
DoubleBuffer<KeyT>& d_keys,
int num_items,
int num_segments,
OffsetIteratorT d_begin_offsets,
OffsetIteratorT d_end_offsets,
cudaStream_t stream = 0,
bool debug_synchronous = false)
{
return SortKeysDescending(
d_temp_storage, temp_storage_bytes,
d_keys, num_items,
num_segments, d_begin_offsets, d_end_offsets,
stream, debug_synchronous
);
}
};
END_HIPCUB_NAMESPACE
#endif // HIPCUB_ROCPRIM_DEVICE_DEVICE_SEGMENTED_SORT_HPP_
/******************************************************************************
* Copyright (c) 2010-2011, Duane Merrill. All rights reserved.
* Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved.
* Modifications Copyright (c) 2017-2020, Advanced Micro Devices, Inc. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of the NVIDIA CORPORATION nor the
* names of its contributors may be used to endorse or promote products
* derived from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
* DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
******************************************************************************/
#ifndef HIPCUB_ROCPRIM_DEVICE_DEVICE_SELECT_HPP_
#define HIPCUB_ROCPRIM_DEVICE_DEVICE_SELECT_HPP_
#include "../config.hpp"
#include "../thread/thread_operators.cuh"
#include <cub/rocprim/device/device_select.hpp>
BEGIN_HIPCUB_NAMESPACE
class DeviceSelect
{
public:
template <
typename InputIteratorT,
typename FlagIterator,
typename OutputIteratorT,
typename NumSelectedIteratorT
>
HIPCUB_RUNTIME_FUNCTION static
cudaError_t Flagged(void *d_temp_storage,
size_t &temp_storage_bytes,
InputIteratorT d_in,
FlagIterator d_flags,
OutputIteratorT d_out,
NumSelectedIteratorT d_num_selected_out,
int num_items,
cudaStream_t stream = 0,
bool debug_synchronous = false)
{
return (cudaError_t)::rocprim::select(
d_temp_storage, temp_storage_bytes,
d_in, d_flags, d_out, d_num_selected_out, num_items,
stream, debug_synchronous
);
}
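    // Illustrative sketch: items whose corresponding flag is non-zero are
    // compacted into d_out, and the count of kept items is written through
    // d_num_selected_out. All pointers are assumed pre-allocated device
    // memory (two-phase sizing call omitted):
    //
    //   // d_in: [1, 2, 3, 4]   d_flags: [1, 0, 0, 1]
    //   DeviceSelect::Flagged(d_temp_storage, temp_storage_bytes,
    //                         d_in, d_flags, d_out, d_num_selected_out,
    //                         num_items);
    //   // d_out -> [1, 4], *d_num_selected_out -> 2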
template <
typename InputIteratorT,
typename OutputIteratorT,
typename NumSelectedIteratorT,
typename SelectOp
>
HIPCUB_RUNTIME_FUNCTION static
cudaError_t If(void *d_temp_storage,
size_t &temp_storage_bytes,
InputIteratorT d_in,
OutputIteratorT d_out,
NumSelectedIteratorT d_num_selected_out,
int num_items,
SelectOp select_op,
cudaStream_t stream = 0,
bool debug_synchronous = false)
{
return (cudaError_t)::rocprim::select(
d_temp_storage, temp_storage_bytes,
d_in, d_out, d_num_selected_out, num_items, select_op,
stream, debug_synchronous
);
}
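    // Illustrative sketch: the selection predicate can be any host/device
    // callable functor; the one below is hypothetical:
    //
    //   struct LessThan
    //   {
    //       int threshold;
    //       __host__ __device__ bool operator()(const int &x) const
    //       {
    //           return x < threshold;
    //       }
    //   };
    //   DeviceSelect::If(d_temp_storage, temp_storage_bytes,
    //                    d_in, d_out, d_num_selected_out, num_items,
    //                    LessThan{100});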
template <
typename InputIteratorT,
typename OutputIteratorT,
typename NumSelectedIteratorT
>
HIPCUB_RUNTIME_FUNCTION static
cudaError_t Unique(void *d_temp_storage,
size_t &temp_storage_bytes,
InputIteratorT d_in,
OutputIteratorT d_out,
NumSelectedIteratorT d_num_selected_out,
int num_items,
cudaStream_t stream = 0,
bool debug_synchronous = false)
{
return (cudaError_t)::rocprim::unique(
d_temp_storage, temp_storage_bytes,
d_in, d_out, d_num_selected_out, num_items, cub::Equality(),
stream, debug_synchronous
);
}
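    // Illustrative sketch: Unique keeps the first item of each run of
    // consecutive equal items (like std::unique); pointers are assumed
    // pre-allocated device memory:
    //
    //   // d_in: [0, 0, 1, 1, 2, 0]
    //   DeviceSelect::Unique(d_temp_storage, temp_storage_bytes,
    //                        d_in, d_out, d_num_selected_out, num_items);
    //   // d_out -> [0, 1, 2, 0], *d_num_selected_out -> 4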
template <
typename KeyIteratorT,
typename ValueIteratorT,
typename OutputKeyIteratorT,
typename OutputValueIteratorT,
typename NumSelectedIteratorT
>
HIPCUB_RUNTIME_FUNCTION static
cudaError_t UniqueByKey(void *d_temp_storage,
size_t &temp_storage_bytes,
KeyIteratorT d_keys_input,
ValueIteratorT d_values_input,
OutputKeyIteratorT d_keys_output,
OutputValueIteratorT d_values_output,
NumSelectedIteratorT d_num_selected_out,
int num_items,
cudaStream_t stream = 0,
bool debug_synchronous = false)
{
return (cudaError_t)::rocprim::unique_by_key(
d_temp_storage, temp_storage_bytes,
d_keys_input, d_values_input,
d_keys_output, d_values_output,
d_num_selected_out, num_items, cub::Equality(),
stream, debug_synchronous
);
}
};
END_HIPCUB_NAMESPACE
#endif // HIPCUB_ROCPRIM_DEVICE_DEVICE_SELECT_HPP_
/******************************************************************************
* Copyright (c) 2010-2011, Duane Merrill. All rights reserved.
* Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved.
* Modifications Copyright (c) 2017-2021, Advanced Micro Devices, Inc. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of the NVIDIA CORPORATION nor the
* names of its contributors may be used to endorse or promote products
* derived from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
* DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
******************************************************************************/
#ifndef HIPCUB_ROCPRIM_DEVICE_DEVICE_SPMV_HPP_
#define HIPCUB_ROCPRIM_DEVICE_DEVICE_SPMV_HPP_
#include "../config.hpp"
#include "../iterator/tex_ref_input_iterator.cuh"
BEGIN_HIPCUB_NAMESPACE
class DeviceSpmv
{
public:
template <
typename ValueT, ///< Matrix and vector value type
typename OffsetT> ///< Signed integer type for sequence offsets
struct SpmvParams
{
ValueT* d_values; ///< Pointer to the array of \p num_nonzeros values of the corresponding nonzero elements of matrix <b>A</b>.
OffsetT* d_row_end_offsets; ///< Pointer to the array of \p m offsets demarcating the end of every row in \p d_column_indices and \p d_values
        OffsetT*        d_column_indices;   ///< Pointer to the array of \p num_nonzeros column-indices of the corresponding nonzero elements of matrix <b>A</b>. (Indices are zero-based.)
ValueT* d_vector_x; ///< Pointer to the array of \p num_cols values corresponding to the dense input vector <em>x</em>
ValueT* d_vector_y; ///< Pointer to the array of \p num_rows values corresponding to the dense output vector <em>y</em>
int num_rows; ///< Number of rows of matrix <b>A</b>.
int num_cols; ///< Number of columns of matrix <b>A</b>.
int num_nonzeros; ///< Number of nonzero elements of matrix <b>A</b>.
ValueT alpha; ///< Alpha multiplicand
ValueT beta; ///< Beta addend-multiplicand
::cub::TexRefInputIterator<ValueT, 66778899, OffsetT> t_vector_x;
};
static constexpr uint32_t CsrMVKernel_MaxThreads = 256;
template <typename ValueT>
static __global__ void
CsrMVKernel(SpmvParams<ValueT, int> spmv_params)
{
__shared__ ValueT partial;
        const int32_t row_id = blockIdx.x;
if(threadIdx.x == 0)
{
partial = spmv_params.beta * spmv_params.d_vector_y[row_id];
}
__syncthreads();
        int32_t row_offset = (row_id == 0) ? (0) : (spmv_params.d_row_end_offsets[row_id - 1]);
        const int32_t row_end = spmv_params.d_row_end_offsets[row_id];
        // Stride the block's threads across the row's nonzeros so the final,
        // partially-full stripe is not skipped by truncating division.
        for(int32_t offset = row_offset + threadIdx.x; offset < row_end; offset += blockDim.x)
        {
            ValueT t_value =
                spmv_params.alpha *
                spmv_params.d_values[offset] *
                spmv_params.d_vector_x[spmv_params.d_column_indices[offset]];
            atomicAdd(&partial, t_value);
        }
        // All threads must reach this barrier uniformly before thread 0
        // publishes the accumulated row result.
        __syncthreads();
        if(threadIdx.x == 0)
        {
            spmv_params.d_vector_y[row_id] = partial;
        }
}
template <typename ValueT>
HIPCUB_RUNTIME_FUNCTION
static cudaError_t CsrMV(
void* d_temp_storage, ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
size_t& temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
ValueT* d_values, ///< [in] Pointer to the array of \p num_nonzeros values of the corresponding nonzero elements of matrix <b>A</b>.
int* d_row_offsets, ///< [in] Pointer to the array of \p m + 1 offsets demarcating the start of every row in \p d_column_indices and \p d_values (with the final entry being equal to \p num_nonzeros)
        int*                 d_column_indices,        ///< [in] Pointer to the array of \p num_nonzeros column-indices of the corresponding nonzero elements of matrix <b>A</b>. (Indices are zero-based.)
ValueT* d_vector_x, ///< [in] Pointer to the array of \p num_cols values corresponding to the dense input vector <em>x</em>
ValueT* d_vector_y, ///< [out] Pointer to the array of \p num_rows values corresponding to the dense output vector <em>y</em>
int num_rows, ///< [in] number of rows of matrix <b>A</b>.
int num_cols, ///< [in] number of columns of matrix <b>A</b>.
int num_nonzeros, ///< [in] number of nonzero elements of matrix <b>A</b>.
cudaStream_t stream = 0, ///< [in] <b>[optional]</b> hip stream to launch kernels within. Default is stream<sub>0</sub>.
bool debug_synchronous = false) ///< [in] <b>[optional]</b> Whether or not to synchronize the stream after every kernel launch to check for errors. May cause significant slowdown. Default is \p false.
{
SpmvParams<ValueT, int> spmv_params;
spmv_params.d_values = d_values;
spmv_params.d_row_end_offsets = d_row_offsets + 1;
spmv_params.d_column_indices = d_column_indices;
spmv_params.d_vector_x = d_vector_x;
spmv_params.d_vector_y = d_vector_y;
spmv_params.num_rows = num_rows;
spmv_params.num_cols = num_cols;
spmv_params.num_nonzeros = num_nonzeros;
spmv_params.alpha = 1.0;
spmv_params.beta = 0.0;
cudaError_t status;
if(d_temp_storage == nullptr)
{
            // Make sure the user won't try to allocate 0 bytes of memory,
            // because hipMalloc returns nullptr when the size is zero.
            temp_storage_bytes = 4;
            return cudaSuccess;
}
else
{
            size_t block_size = min(static_cast<uint32_t>(num_cols), DeviceSpmv::CsrMVKernel_MaxThreads);
size_t grid_size = num_rows;
CsrMVKernel<<<grid_size, block_size, 0, stream>>>(spmv_params);
status = hipGetLastError();
}
return status;
}
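    // Illustrative sketch: computes y = A * x for a CSR matrix (alpha and
    // beta are fixed above to 1 and 0). Array arguments are assumed
    // pre-allocated device pointers; the first call only sets
    // temp_storage_bytes:
    //
    //   DeviceSpmv::CsrMV(nullptr, temp_storage_bytes,
    //                     d_values, d_row_offsets, d_column_indices,
    //                     d_vector_x, d_vector_y,
    //                     num_rows, num_cols, num_nonzeros);
    //   cudaMalloc(&d_temp_storage, temp_storage_bytes);
    //   DeviceSpmv::CsrMV(d_temp_storage, temp_storage_bytes,
    //                     d_values, d_row_offsets, d_column_indices,
    //                     d_vector_x, d_vector_y,
    //                     num_rows, num_cols, num_nonzeros);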
};
END_HIPCUB_NAMESPACE
#endif // HIPCUB_ROCPRIM_DEVICE_DEVICE_SPMV_HPP_
/******************************************************************************
* Copyright (c) 2011, Duane Merrill. All rights reserved.
* Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved.
* Modifications Copyright (c) 2021, Advanced Micro Devices, Inc. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of the NVIDIA CORPORATION nor the
* names of its contributors may be used to endorse or promote products
* derived from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
* DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
******************************************************************************/
#ifndef HIPCUB_ROCPRIM_GRID_GRID_BARRIER_HPP_
#define HIPCUB_ROCPRIM_GRID_GRID_BARRIER_HPP_
#include <type_traits>
#include "../config.hpp"
#include "../thread/thread_load.cuh"
BEGIN_HIPCUB_NAMESPACE
/**
* \addtogroup GridModule
* @{
*/
/**
 * \brief GridBarrier implements a software global barrier among thread blocks within a CUDA grid
*/
class GridBarrier
{
protected :
typedef unsigned int SyncFlag;
// Counters in global device memory
SyncFlag* d_sync;
public:
/**
* Constructor
*/
GridBarrier() : d_sync(NULL) {}
/**
 * \brief Synchronize
*/
__device__ __forceinline__ void Sync() const
{
volatile SyncFlag *d_vol_sync = d_sync;
// Threadfence and syncthreads to make sure global writes are visible before
// thread-0 reports in with its sync counter
__threadfence();
__syncthreads();
if (blockIdx.x == 0)
{
// Report in ourselves
if (threadIdx.x == 0)
{
d_vol_sync[blockIdx.x] = 1;
}
__syncthreads();
// Wait for everyone else to report in
for (uint32_t peer_block = threadIdx.x; peer_block < gridDim.x; peer_block += blockDim.x)
{
while (ThreadLoad<LOAD_CG>(d_sync + peer_block) == 0)
{
__threadfence_block();
}
}
__syncthreads();
// Let everyone know it's safe to proceed
for (uint32_t peer_block = threadIdx.x; peer_block < gridDim.x; peer_block += blockDim.x)
{
d_vol_sync[peer_block] = 0;
}
}
else
{
if (threadIdx.x == 0)
{
// Report in
d_vol_sync[blockIdx.x] = 1;
// Wait for acknowledgment
while (ThreadLoad<LOAD_CG>(d_sync + blockIdx.x) == 1)
{
__threadfence_block();
}
}
__syncthreads();
}
}
};
/**
* \brief GridBarrierLifetime extends GridBarrier to provide lifetime management of the temporary device storage needed for cooperation.
*
* Uses RAII for lifetime, i.e., device resources are reclaimed when
* the destructor is called.
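 *
 * \par
 * A minimal usage sketch (illustrative; \p my_kernel and the launch shape are
 * hypothetical):
 * \code
 * // __global__ void my_kernel(GridBarrier barrier) { ...; barrier.Sync(); ... }
 * GridBarrierLifetime barrier;
 * barrier.Setup(grid_size);                      // lazily allocates and zeroes the sync counters
 * my_kernel<<<grid_size, block_size>>>(barrier); // passed by value as a GridBarrier
 * \endcode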
*/
class GridBarrierLifetime : public GridBarrier
{
protected:
// Number of bytes backed by d_sync
size_t sync_bytes;
public:
/**
* Constructor
*/
GridBarrierLifetime() : GridBarrier(), sync_bytes(0) {}
/**
     * Frees device storage and resets the progress counters
*/
cudaError_t HostReset()
{
cudaError_t retval = cudaSuccess;
if (d_sync)
{
retval = cudaFree(d_sync);
d_sync = NULL;
}
sync_bytes = 0;
return retval;
}
/**
* Destructor
*/
virtual ~GridBarrierLifetime()
{
HostReset();
}
/**
* Sets up the progress counters for the next kernel launch (lazily
* allocating and initializing them if necessary)
*/
cudaError_t Setup(int sweep_grid_size)
{
cudaError_t retval = cudaSuccess;
do {
size_t new_sync_bytes = sweep_grid_size * sizeof(SyncFlag);
if (new_sync_bytes > sync_bytes)
{
if (d_sync)
{
if ((retval = cudaFree(d_sync))) break;
}
sync_bytes = new_sync_bytes;
// Allocate and initialize to zero
if ((retval = cudaMalloc((void**) &d_sync, sync_bytes))) break;
if ((retval = cudaMemset(d_sync, 0, new_sync_bytes))) break;
}
} while (0);
return retval;
}
};
END_HIPCUB_NAMESPACE
#endif // HIPCUB_ROCPRIM_GRID_GRID_BARRIER_HPP_
/******************************************************************************
* Copyright (c) 2011, Duane Merrill. All rights reserved.
* Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved.
* Modifications Copyright (c) 2021, Advanced Micro Devices, Inc. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of the NVIDIA CORPORATION nor the
* names of its contributors may be used to endorse or promote products
* derived from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
* DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
******************************************************************************/
#ifndef HIPCUB_ROCPRIM_GRID_GRID_EVEN_SHARE_HPP_
#define HIPCUB_ROCPRIM_GRID_GRID_EVEN_SHARE_HPP_
#include <type_traits>
#include "../config.hpp"
#include "grid_mapping.cuh"
#include "../util_type.cuh"
BEGIN_HIPCUB_NAMESPACE
/**
* \addtogroup GridModule
* @{
*/
/**
* \brief GridEvenShare is a descriptor utility for distributing input among
* CUDA thread blocks in an "even-share" fashion. Each thread block gets roughly
* the same number of input tiles.
*
* \par Overview
* Each thread block is assigned a consecutive sequence of input tiles. To help
* preserve alignment and eliminate the overhead of guarded loads for all but the
 * last thread block, GridEvenShare assigns one of three different amounts of
* work to a given thread block: "big", "normal", or "last". The "big" workloads
* are one scheduling grain larger than "normal". The "last" work unit for the
* last thread block may be partially-full if the input is not an even multiple of
* the scheduling grain size.
*
* \par
* Before invoking a child grid, a parent thread will typically construct an
* instance of GridEvenShare. The instance can be passed to child thread blocks
* which can initialize their per-thread block offsets using \p BlockInit().
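 *
 * \par
 * A minimal sketch of that pattern (illustrative; \p consume_kernel,
 * \p TILE_ITEMS and \p BLOCK_THREADS are hypothetical):
 * \code
 * GridEvenShare<int> even_share;
 * even_share.DispatchInit(num_items, max_grid_size, TILE_ITEMS);
 * consume_kernel<<<even_share.grid_size, BLOCK_THREADS>>>(d_in, even_share);
 * // Inside the kernel, each block initializes its range and then iterates:
 * //   even_share.BlockInit<TILE_ITEMS, GRID_MAPPING_RAKE>();
 * //   for (int i = even_share.block_offset; i < even_share.block_end;
 * //        i += even_share.block_stride) { ... }
 * \endcode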
*/
template <typename OffsetT>
struct GridEvenShare
{
private:
int total_tiles;
int big_shares;
OffsetT big_share_items;
OffsetT normal_share_items;
OffsetT normal_base_offset;
public:
/// Total number of input items
OffsetT num_items;
/// Grid size in thread blocks
int grid_size;
/// OffsetT into input marking the beginning of the owning thread block's segment of input tiles
OffsetT block_offset;
/// OffsetT into input of marking the end (one-past) of the owning thread block's segment of input tiles
OffsetT block_end;
/// Stride between input tiles
OffsetT block_stride;
/**
* \brief Constructor.
*/
__host__ __device__ __forceinline__ GridEvenShare() :
total_tiles(0),
big_shares(0),
big_share_items(0),
normal_share_items(0),
normal_base_offset(0),
num_items(0),
grid_size(0),
block_offset(0),
block_end(0),
block_stride(0)
{}
/**
* \brief Dispatch initializer. To be called prior to kernel launch.
*/
__host__ __device__ __forceinline__ void DispatchInit(
OffsetT num_items_, ///< Total number of input items
        int max_grid_size,          ///< Maximum grid size allowable (actual grid size may be less if not warranted by the number of input items)
int tile_items) ///< Number of data items per input tile
{
this->block_offset = num_items_; // Initialize past-the-end
this->block_end = num_items_; // Initialize past-the-end
this->num_items = num_items_;
this->total_tiles = static_cast<int>(cub::DivideAndRoundUp(num_items_, tile_items));
this->grid_size = min(total_tiles, max_grid_size);
int avg_tiles_per_block = total_tiles / grid_size;
// leftover grains go to big blocks:
this->big_shares = total_tiles - (avg_tiles_per_block * grid_size);
this->normal_share_items = avg_tiles_per_block * tile_items;
this->normal_base_offset = big_shares * tile_items;
this->big_share_items = normal_share_items + tile_items;
}
/**
* \brief Initializes ranges for the specified thread block index. Specialized
* for a "raking" access pattern in which each thread block is assigned a
* consecutive sequence of input tiles.
*/
template <int TILE_ITEMS>
__device__ __forceinline__ void BlockInit(
int block_id,
Int2Type<GRID_MAPPING_RAKE> /*strategy_tag*/)
{
block_stride = TILE_ITEMS;
if (block_id < big_shares)
{
// This thread block gets a big share of grains (avg_tiles_per_block + 1)
block_offset = (block_id * big_share_items);
block_end = block_offset + big_share_items;
}
else if (block_id < total_tiles)
{
// This thread block gets a normal share of grains (avg_tiles_per_block)
block_offset = normal_base_offset + (block_id * normal_share_items);
block_end = min(num_items, block_offset + normal_share_items);
}
// Else default past-the-end
}
    /**
     * \brief Block-initialization, specialized for a "strip mining" access
     * pattern in which the input tiles assigned to each thread block are
     * separated by a stride equal to the extent of the grid.
     */
template <int TILE_ITEMS>
__device__ __forceinline__ void BlockInit(
int block_id,
Int2Type<GRID_MAPPING_STRIP_MINE> /*strategy_tag*/)
{
block_stride = grid_size * TILE_ITEMS;
block_offset = (block_id * TILE_ITEMS);
block_end = num_items;
}
    /**
     * \brief Block-initialization that dispatches, via an \p Int2Type tag, to
     * the specialization selected by the \p STRATEGY template parameter.
     */
template <
int TILE_ITEMS,
GridMappingStrategy STRATEGY>
__device__ __forceinline__ void BlockInit()
{
BlockInit<TILE_ITEMS>(blockIdx.x, Int2Type<STRATEGY>());
}
/**
* \brief Block-initialization, specialized for a "raking" access
* pattern in which each thread block is assigned a consecutive sequence
* of input tiles.
*/
template <int TILE_ITEMS>
__device__ __forceinline__ void BlockInit(
OffsetT block_offset, ///< [in] Threadblock begin offset (inclusive)
OffsetT block_end) ///< [in] Threadblock end offset (exclusive)
{
this->block_offset = block_offset;
this->block_end = block_end;
this->block_stride = TILE_ITEMS;
}
};
/** @} */ // end group GridModule
END_HIPCUB_NAMESPACE
#endif // HIPCUB_ROCPRIM_GRID_GRID_EVEN_SHARE_HPP_
/******************************************************************************
* Copyright (c) 2011, Duane Merrill. All rights reserved.
* Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of the NVIDIA CORPORATION nor the
* names of its contributors may be used to endorse or promote products
* derived from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
* DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
******************************************************************************/
#ifndef HIPCUB_ROCPRIM_GRID_GRID_MAPPING_HPP_
#define HIPCUB_ROCPRIM_GRID_GRID_MAPPING_HPP_
#include <type_traits>
#include "../config.hpp"
#include "../thread/thread_load.cuh"
BEGIN_HIPCUB_NAMESPACE
/**
* \addtogroup GridModule
* @{
*/
/******************************************************************************
* Mapping policies
*****************************************************************************/
/**
* \brief cub::GridMappingStrategy enumerates alternative strategies for mapping constant-sized tiles of device-wide data onto a grid of CUDA thread blocks.
*/
enum GridMappingStrategy
{
/**
* \brief An a "raking" access pattern in which each thread block is
* assigned a consecutive sequence of input tiles
*
* \par Overview
* The input is evenly partitioned into \p p segments, where \p p is
* constant and corresponds loosely to the number of thread blocks that may
* actively reside on the target device. Each segment is comprised of
* consecutive tiles, where a tile is a small, constant-sized unit of input
* to be processed to completion before the thread block terminates or
* obtains more work. The kernel invokes \p p thread blocks, each
* of which iteratively consumes a segment of <em>n</em>/<em>p</em> elements
* in tile-size increments.
*/
GRID_MAPPING_RAKE,
/**
* \brief An a "strip mining" access pattern in which the input tiles assigned
* to each thread block are separated by a stride equal to the the extent of
* the grid.
*
* \par Overview
* The input is evenly partitioned into \p p sets, where \p p is
* constant and corresponds loosely to the number of thread blocks that may
* actively reside on the target device. Each set is comprised of
* data tiles separated by stride \p tiles, where a tile is a small,
* constant-sized unit of input to be processed to completion before the
* thread block terminates or obtains more work. The kernel invokes \p p
* thread blocks, each of which iteratively consumes a segment of
* <em>n</em>/<em>p</em> elements in tile-size increments.
*/
GRID_MAPPING_STRIP_MINE,
/**
* \brief A dynamic "queue-based" strategy for assigning input tiles to thread blocks.
*
* \par Overview
* The input is treated as a queue to be dynamically consumed by a grid of
* thread blocks. Work is atomically dequeued in tiles, where a tile is a
* unit of input to be processed to completion before the thread block
* terminates or obtains more work. The grid size \p p is constant,
* loosely corresponding to the number of thread blocks that may actively
* reside on the target device.
*/
GRID_MAPPING_DYNAMIC,
};
/** @} */ // end group GridModule
END_HIPCUB_NAMESPACE
#endif // HIPCUB_ROCPRIM_GRID_GRID_MAPPING_HPP_
/******************************************************************************
* Copyright (c) 2011, Duane Merrill. All rights reserved.
* Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved.
* Modifications Copyright (c) 2021, Advanced Micro Devices, Inc. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of the NVIDIA CORPORATION nor the
* names of its contributors may be used to endorse or promote products
* derived from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
* DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
******************************************************************************/
#ifndef HIPCUB_ROCPRIM_GRID_GRID_QUEUE_HPP_
#define HIPCUB_ROCPRIM_GRID_GRID_QUEUE_HPP_
#include <type_traits>
#include "../config.hpp"
BEGIN_HIPCUB_NAMESPACE
/**
* \addtogroup GridModule
* @{
*/
/**
* \brief GridQueue is a descriptor utility for dynamic queue management.
*
* \par Overview
 * GridQueue descriptors provide abstractions for "filling" or
* "draining" globally-shared vectors.
*
* \par
* A "filling" GridQueue works by atomically-adding to a zero-initialized counter,
* returning a unique offset for the calling thread to write its items.
* The GridQueue maintains the total "fill-size". The fill counter must be reset
* using GridQueue::ResetFill by the host or kernel instance prior to the kernel instance that
* will be filling.
*
* \par
 * Similarly, a "draining" GridQueue works by atomically-incrementing a
* zero-initialized counter, returning a unique offset for the calling thread to
* read its items. Threads can safely drain until the array's logical fill-size is
* exceeded. The drain counter must be reset using GridQueue::ResetDrain or
* GridQueue::FillAndResetDrain by the host or kernel instance prior to the kernel instance that
 * will be draining. (For dynamic work distribution of existing data, the corresponding fill-size
* is simply the number of elements in the array.)
*
* \par
* Iterative work management can be implemented simply with a pair of flip-flopping
* work buffers, each with an associated set of fill and drain GridQueue descriptors.
*
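 * \par
 * A minimal drain-side sketch (illustrative; \p d_storage, \p TILE_ITEMS and
 * \p consume() are hypothetical):
 * \code
 * GridQueue<int> queue(d_storage); // at least GridQueue<int>::AllocationSize() bytes
 * queue.FillAndResetDrain(num_items, stream); // host side, before the launch
 * // Device side, each thread block repeatedly claims a tile of work:
 * //   int offset = queue.Drain(TILE_ITEMS);
 * //   while (offset < num_items) { consume(offset); offset = queue.Drain(TILE_ITEMS); }
 * \endcode
 *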
* \tparam OffsetT Signed integer type for global offsets
*/
template <typename OffsetT>
class GridQueue
{
private:
/// Counter indices
enum
{
FILL = 0,
DRAIN = 1,
};
/// Pair of counters
OffsetT *d_counters;
public:
/// Returns the device allocation size in bytes needed to construct a GridQueue instance
__host__ __device__ __forceinline__
static size_t AllocationSize()
{
return sizeof(OffsetT) * 2;
}
/// Constructs an invalid GridQueue descriptor
__host__ __device__ __forceinline__ GridQueue()
:
d_counters(NULL)
{}
/// Constructs a GridQueue descriptor around the device storage allocation
__host__ __device__ __forceinline__ GridQueue(
void *d_storage) ///< Device allocation to back the GridQueue. Must be at least as big as <tt>AllocationSize()</tt>.
:
d_counters((OffsetT*) d_storage)
{}
/// This operation sets the fill-size and resets the drain counter, preparing the GridQueue for draining in the next kernel instance. To be called by the host or by a kernel prior to that which will be draining.
HIPCUB_DEVICE cudaError_t FillAndResetDrain(
OffsetT fill_size,
cudaStream_t stream = 0)
{
cudaError_t result = cudaErrorUnknown;
(void)stream;
d_counters[FILL] = fill_size;
d_counters[DRAIN] = 0;
result = cudaSuccess;
return result;
}
HIPCUB_HOST cudaError_t FillAndResetDrain(
OffsetT fill_size,
cudaStream_t stream = 0)
{
cudaError_t result = cudaErrorUnknown;
OffsetT counters[2];
counters[FILL] = fill_size;
counters[DRAIN] = 0;
result = CubDebug(cudaMemcpyAsync(d_counters, counters, sizeof(OffsetT) * 2, cudaMemcpyHostToDevice, stream));
return result;
}
/// This operation resets the drain so that it may advance to meet the existing fill-size. To be called by the host or by a kernel prior to that which will be draining.
HIPCUB_DEVICE cudaError_t ResetDrain(cudaStream_t stream = 0)
{
cudaError_t result = cudaErrorUnknown;
(void)stream;
d_counters[DRAIN] = 0;
result = cudaSuccess;
return result;
}
HIPCUB_HOST cudaError_t ResetDrain(cudaStream_t stream = 0)
{
cudaError_t result = cudaErrorUnknown;
result = CubDebug(cudaMemsetAsync(d_counters + DRAIN, 0, sizeof(OffsetT), stream));
return result;
}
/// This operation resets the fill counter. To be called by the host or by a kernel prior to that which will be filling.
HIPCUB_DEVICE cudaError_t ResetFill(cudaStream_t stream = 0)
{
cudaError_t result = cudaErrorUnknown;
(void)stream;
d_counters[FILL] = 0;
result = cudaSuccess;
return result;
}
HIPCUB_HOST cudaError_t ResetFill(cudaStream_t stream = 0)
{
cudaError_t result = cudaErrorUnknown;
result = CubDebug(cudaMemsetAsync(d_counters + FILL, 0, sizeof(OffsetT), stream));
return result;
}
/// Returns the fill-size established by the parent or by the previous kernel.
HIPCUB_DEVICE cudaError_t FillSize(
OffsetT &fill_size,
cudaStream_t stream = 0)
{
cudaError_t result = cudaErrorUnknown;
(void)stream;
fill_size = d_counters[FILL];
result = cudaSuccess;
return result;
}
HIPCUB_HOST cudaError_t FillSize(
OffsetT &fill_size,
cudaStream_t stream = 0)
{
cudaError_t result = cudaErrorUnknown;
result = CubDebug(cudaMemcpyAsync(&fill_size, d_counters + FILL, sizeof(OffsetT), cudaMemcpyDeviceToHost, stream));
return result;
}
    /// Drain \p num_items from the queue. Returns offset from which to read items. To be called from a CUDA kernel.
HIPCUB_DEVICE OffsetT Drain(OffsetT num_items)
{
return atomicAdd(d_counters + DRAIN, num_items);
}
    /// Fill \p num_items into the queue. Returns offset from which to write items. To be called from a CUDA kernel.
HIPCUB_DEVICE OffsetT Fill(OffsetT num_items)
{
return atomicAdd(d_counters + FILL, num_items);
}
};
#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document
/**
* Reset grid queue (call with 1 block of 1 thread)
*/
template <typename OffsetT>
__global__ void FillAndResetDrainKernel(
GridQueue<OffsetT> grid_queue,
OffsetT num_items)
{
grid_queue.FillAndResetDrain(num_items);
}
#endif // DOXYGEN_SHOULD_SKIP_THIS
/** @} */ // end group GridModule
END_HIPCUB_NAMESPACE
#endif // HIPCUB_ROCPRIM_GRID_GRID_QUEUE_HPP_
/******************************************************************************
* Copyright (c) 2011, Duane Merrill. All rights reserved.
* Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved.
* Modifications Copyright (c) 2017-2020, Advanced Micro Devices, Inc. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of the NVIDIA CORPORATION nor the
* names of its contributors may be used to endorse or promote products
* derived from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
* DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
******************************************************************************/
#ifndef HIPCUB_ROCPRIM_ITERATOR_ARG_INDEX_INPUT_ITERATOR_HPP_
#define HIPCUB_ROCPRIM_ITERATOR_ARG_INDEX_INPUT_ITERATOR_HPP_
#include <iterator>
#include <iostream>
#include "../config.hpp"
#include <cub/rocprim/iterator/arg_index_iterator.hpp>
#if (THRUST_VERSION >= 100700)
// This iterator is compatible with Thrust API 1.7 and newer
#include <thrust/iterator/iterator_facade.h>
#include <thrust/iterator/iterator_traits.h>
#endif // THRUST_VERSION
BEGIN_HIPCUB_NAMESPACE
#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document
template<
typename InputIterator,
typename Difference = std::ptrdiff_t,
typename Value = typename std::iterator_traits<InputIterator>::value_type
>
using ArgIndexInputIterator = ::rocprim::arg_index_iterator<InputIterator, Difference, Value>;
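/**
 * \par Snippet (illustrative sketch, not part of the original header)
 * Dereferencing the wrapped rocPRIM iterator yields a key/value pair whose
 * \p key is the element's offset and whose \p value is the element itself;
 * the data below is hypothetical.
 * \code
 * float data[] = {8.0f, 6.0f, 7.0f, 5.0f};
 * hipcub::ArgIndexInputIterator<float*> itr(data);
 * auto pair = itr[2];
 * // pair.key == 2, pair.value == 7.0f
 * \endcode
 */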
#endif
END_HIPCUB_NAMESPACE
#endif // HIPCUB_ROCPRIM_ITERATOR_ARG_INDEX_INPUT_ITERATOR_HPP_
/******************************************************************************
* Copyright (c) 2010-2011, Duane Merrill. All rights reserved.
* Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved.
* Modifications Copyright (c) 2017-2020, Advanced Micro Devices, Inc. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of the NVIDIA CORPORATION nor the
* names of its contributors may be used to endorse or promote products
* derived from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
* DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
******************************************************************************/
#ifndef HIPCUB_ROCPRIM_ITERATOR_CACHE_MODIFIED_INPUT_ITERATOR_HPP_
#define HIPCUB_ROCPRIM_ITERATOR_CACHE_MODIFIED_INPUT_ITERATOR_HPP_
#include <iterator>
#include <iostream>
#include "../thread/thread_load.cuh"
#include "../util_type.cuh"
#if (THRUST_VERSION >= 100700)
// This iterator is compatible with Thrust API 1.7 and newer
#include <thrust/iterator/iterator_facade.h>
#include <thrust/iterator/iterator_traits.h>
#endif // THRUST_VERSION
BEGIN_HIPCUB_NAMESPACE
template <
CacheLoadModifier MODIFIER,
typename ValueType,
typename OffsetT = ptrdiff_t>
class CacheModifiedInputIterator
{
public:
// Required iterator traits
typedef CacheModifiedInputIterator self_type; ///< My own type
typedef OffsetT difference_type; ///< Type to express the result of subtracting one iterator from another
typedef ValueType value_type; ///< The type of the element the iterator can point to
typedef ValueType* pointer; ///< The type of a pointer to an element the iterator can point to
typedef ValueType reference; ///< The type of a reference to an element the iterator can point to
typedef std::random_access_iterator_tag iterator_category; ///< The iterator category
public:
/// Wrapped native pointer
ValueType* ptr;
    /// Constructor (templated on the possibly cv-qualified pointer type so that
    /// const-qualified pointers can be wrapped, matching CacheModifiedOutputIterator)
    template <typename QualifiedValueType>
    __host__ __device__ __forceinline__ CacheModifiedInputIterator(
        QualifiedValueType* ptr)                    ///< Native pointer to wrap
    :
        ptr(const_cast<typename std::remove_cv<QualifiedValueType>::type *>(ptr))
    {}
/// Postfix increment
__host__ __device__ __forceinline__ self_type operator++(int)
{
self_type retval = *this;
ptr++;
return retval;
}
/// Prefix increment
__host__ __device__ __forceinline__ self_type operator++()
{
ptr++;
return *this;
}
/// Indirection
__device__ __forceinline__ reference operator*() const
{
return ThreadLoad<MODIFIER>(ptr);
}
/// Addition
template <typename Distance>
__host__ __device__ __forceinline__ self_type operator+(Distance n) const
{
self_type retval(ptr + n);
return retval;
}
/// Addition assignment
template <typename Distance>
__host__ __device__ __forceinline__ self_type& operator+=(Distance n)
{
ptr += n;
return *this;
}
/// Subtraction
template <typename Distance>
__host__ __device__ __forceinline__ self_type operator-(Distance n) const
{
self_type retval(ptr - n);
return retval;
}
/// Subtraction assignment
template <typename Distance>
__host__ __device__ __forceinline__ self_type& operator-=(Distance n)
{
ptr -= n;
return *this;
}
/// Distance
__host__ __device__ __forceinline__ difference_type operator-(self_type other) const
{
return ptr - other.ptr;
}
/// Array subscript
template <typename Distance>
__device__ __forceinline__ reference operator[](Distance n) const
{
return ThreadLoad<MODIFIER>(ptr + n);
}
/// Structure dereference
__device__ __forceinline__ pointer operator->()
{
return &ThreadLoad<MODIFIER>(ptr);
}
/// Equal to
__host__ __device__ __forceinline__ bool operator==(const self_type& rhs)
{
return (ptr == rhs.ptr);
}
/// Not equal to
__host__ __device__ __forceinline__ bool operator!=(const self_type& rhs)
{
return (ptr != rhs.ptr);
}
#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document
/// ostream operator
friend std::ostream& operator<<(std::ostream& os, const self_type& /*itr*/)
{
return os;
}
#endif
};
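/**
 * \par Snippet (illustrative sketch, not part of the original header)
 * Wrapping a raw device pointer so that in-kernel reads are issued through
 * ThreadLoad with the requested modifier; the LOAD_CG choice and the kernel
 * below are assumptions made for illustration only.
 * \code
 * __global__ void ReadKernel(const float *d_in, float *d_out, int n)
 * {
 *     hipcub::CacheModifiedInputIterator<hipcub::LOAD_CG, float> itr(d_in);
 *     int i = blockIdx.x * blockDim.x + threadIdx.x;
 *     if (i < n)
 *         d_out[i] = itr[i];   // load routed through the LOAD_CG modifier
 * }
 * \endcode
 */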
END_HIPCUB_NAMESPACE
#endif // HIPCUB_ROCPRIM_ITERATOR_CACHE_MODIFIED_INPUT_ITERATOR_HPP_
/******************************************************************************
* Copyright (c) 2010-2011, Duane Merrill. All rights reserved.
* Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved.
* Modifications Copyright (c) 2017-2020, Advanced Micro Devices, Inc. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of the NVIDIA CORPORATION nor the
* names of its contributors may be used to endorse or promote products
* derived from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
* DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
******************************************************************************/
#ifndef HIPCUB_ROCPRIM_ITERATOR_CACHE_MODIFIED_OUTPUT_ITERATOR_HPP_
#define HIPCUB_ROCPRIM_ITERATOR_CACHE_MODIFIED_OUTPUT_ITERATOR_HPP_
#include <iterator>
#include <iostream>
#include "../thread/thread_load.cuh"
#include "../thread/thread_store.cuh"
#include "../util_type.cuh"
#if (THRUST_VERSION >= 100700)
// This iterator is compatible with Thrust API 1.7 and newer
#include <thrust/iterator/iterator_facade.h>
#include <thrust/iterator/iterator_traits.h>
#endif // THRUST_VERSION
BEGIN_HIPCUB_NAMESPACE
template <
CacheStoreModifier MODIFIER,
typename ValueType,
typename OffsetT = ptrdiff_t>
class CacheModifiedOutputIterator
{
private:
    // Proxy reference: assignment through it routes the store via ThreadStore
    struct Reference
    {
        ValueType* ptr;
        /// Constructor
        __host__ __device__ __forceinline__ Reference(ValueType* ptr) : ptr(ptr) {}
        /// Assignment applies the cache store modifier
        __device__ __forceinline__ ValueType operator=(ValueType val)
        {
            ThreadStore<MODIFIER>(ptr, val);
            return val;
        }
    };
public:
// Required iterator traits
typedef CacheModifiedOutputIterator self_type; ///< My own type
typedef OffsetT difference_type; ///< Type to express the result of subtracting one iterator from another
typedef void value_type; ///< The type of the element the iterator can point to
typedef void pointer; ///< The type of a pointer to an element the iterator can point to
typedef Reference reference; ///< The type of a reference to an element the iterator can point to
typedef std::random_access_iterator_tag iterator_category; ///< The iterator category
private:
ValueType* ptr;
public:
/// Constructor
template <typename QualifiedValueType>
__host__ __device__ __forceinline__ CacheModifiedOutputIterator(
QualifiedValueType* ptr) ///< Native pointer to wrap
:
ptr(const_cast<typename std::remove_cv<QualifiedValueType>::type *>(ptr))
{}
/// Postfix increment
__host__ __device__ __forceinline__ self_type operator++(int)
{
self_type retval = *this;
ptr++;
return retval;
}
/// Prefix increment
__host__ __device__ __forceinline__ self_type operator++()
{
ptr++;
return *this;
}
/// Indirection
__host__ __device__ __forceinline__ reference operator*() const
{
return Reference(ptr);
}
/// Addition
template <typename Distance>
__host__ __device__ __forceinline__ self_type operator+(Distance n) const
{
self_type retval(ptr + n);
return retval;
}
/// Addition assignment
template <typename Distance>
__host__ __device__ __forceinline__ self_type& operator+=(Distance n)
{
ptr += n;
return *this;
}
/// Subtraction
template <typename Distance>
__host__ __device__ __forceinline__ self_type operator-(Distance n) const
{
self_type retval(ptr - n);
return retval;
}
/// Subtraction assignment
template <typename Distance>
__host__ __device__ __forceinline__ self_type& operator-=(Distance n)
{
ptr -= n;
return *this;
}
/// Distance
__host__ __device__ __forceinline__ difference_type operator-(self_type other) const
{
return ptr - other.ptr;
}
/// Array subscript
template <typename Distance>
__host__ __device__ __forceinline__ reference operator[](Distance n) const
{
return Reference(ptr + n);
}
/// Equal to
__host__ __device__ __forceinline__ bool operator==(const self_type& rhs)
{
return (ptr == rhs.ptr);
}
/// Not equal to
__host__ __device__ __forceinline__ bool operator!=(const self_type& rhs)
{
return (ptr != rhs.ptr);
}
#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document
/// ostream operator
friend std::ostream& operator<<(std::ostream& os, const self_type& itr)
{
(void)itr;
return os;
}
#endif
};
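/**
 * \par Snippet (illustrative sketch, not part of the original header)
 * Assignment through the iterator's proxy reference issues a ThreadStore
 * with the requested modifier; the STORE_CG choice and the kernel below are
 * assumptions made for illustration only.
 * \code
 * __global__ void WriteKernel(float *d_out, int n)
 * {
 *     hipcub::CacheModifiedOutputIterator<hipcub::STORE_CG, float> itr(d_out);
 *     int i = blockIdx.x * blockDim.x + threadIdx.x;
 *     if (i < n)
 *         itr[i] = static_cast<float>(i);   // store routed through STORE_CG
 * }
 * \endcode
 */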
END_HIPCUB_NAMESPACE
#endif // HIPCUB_ROCPRIM_ITERATOR_CACHE_MODIFIED_OUTPUT_ITERATOR_HPP_