Commit 0211193c authored by zhuwenwen

initial llama
/******************************************************************************
* Copyright (c) 2010-2011, Duane Merrill. All rights reserved.
* Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved.
* Modifications Copyright (c) 2017-2021, Advanced Micro Devices, Inc. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of the NVIDIA CORPORATION nor the
* names of its contributors may be used to endorse or promote products
* derived from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
* DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
******************************************************************************/
#ifndef HIPCUB_ROCPRIM_DEVICE_DEVICE_SPMV_HPP_
#define HIPCUB_ROCPRIM_DEVICE_DEVICE_SPMV_HPP_
#include "../config.hpp"
#include "../iterator/tex_ref_input_iterator.cuh"
BEGIN_HIPCUB_NAMESPACE
class DeviceSpmv
{
public:
template <
typename ValueT, ///< Matrix and vector value type
typename OffsetT> ///< Signed integer type for sequence offsets
struct SpmvParams
{
ValueT* d_values; ///< Pointer to the array of \p num_nonzeros values of the corresponding nonzero elements of matrix <b>A</b>.
OffsetT* d_row_end_offsets; ///< Pointer to the array of \p m offsets demarcating the end of every row in \p d_column_indices and \p d_values
OffsetT* d_column_indices; ///< Pointer to the array of \p num_nonzeros column-indices of the corresponding nonzero elements of matrix <b>A</b>. (Indices are zero-based.)
ValueT* d_vector_x; ///< Pointer to the array of \p num_cols values corresponding to the dense input vector <em>x</em>
ValueT* d_vector_y; ///< Pointer to the array of \p num_rows values corresponding to the dense output vector <em>y</em>
int num_rows; ///< Number of rows of matrix <b>A</b>.
int num_cols; ///< Number of columns of matrix <b>A</b>.
int num_nonzeros; ///< Number of nonzero elements of matrix <b>A</b>.
ValueT alpha; ///< Alpha multiplicand
ValueT beta; ///< Beta addend-multiplicand
TexRefInputIterator<ValueT, 66778899, OffsetT> t_vector_x; ///< Texture iterator wrapping the dense input vector <em>x</em>
};
static constexpr uint32_t CsrMVKernel_MaxThreads = 256;
template <typename ValueT>
static __global__ void
CsrMVKernel(SpmvParams<ValueT, int> spmv_params)
{
__shared__ ValueT partial;
const int32_t row_id = hipBlockIdx_x;
if(threadIdx.x == 0)
{
partial = spmv_params.beta * spmv_params.d_vector_y[row_id];
}
__syncthreads();
// Offsets of this row's first nonzero and one-past its last nonzero
const int32_t row_offset = (row_id == 0) ? 0 : spmv_params.d_row_end_offsets[row_id - 1];
const int32_t row_end = spmv_params.d_row_end_offsets[row_id];
// Each thread strides over the row's nonzeros, accumulating the scaled
// products into the block-shared partial sum
for(int32_t offset = row_offset + threadIdx.x; offset < row_end; offset += blockDim.x)
{
ValueT t_value =
spmv_params.alpha *
spmv_params.d_values[offset] *
spmv_params.d_vector_x[spmv_params.d_column_indices[offset]];
atomicAdd(&partial, t_value);
}
// All threads must finish accumulating before thread 0 writes the row result
__syncthreads();
if(threadIdx.x == 0)
{
spmv_params.d_vector_y[row_id] = partial;
}
}
template <typename ValueT>
HIPCUB_RUNTIME_FUNCTION
static cudaError_t CsrMV(
void* d_temp_storage, ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
size_t& temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
ValueT* d_values, ///< [in] Pointer to the array of \p num_nonzeros values of the corresponding nonzero elements of matrix <b>A</b>.
int* d_row_offsets, ///< [in] Pointer to the array of \p m + 1 offsets demarcating the start of every row in \p d_column_indices and \p d_values (with the final entry being equal to \p num_nonzeros)
int* d_column_indices, ///< [in] Pointer to the array of \p num_nonzeros column-indices of the corresponding nonzero elements of matrix <b>A</b>. (Indices are zero-based.)
ValueT* d_vector_x, ///< [in] Pointer to the array of \p num_cols values corresponding to the dense input vector <em>x</em>
ValueT* d_vector_y, ///< [out] Pointer to the array of \p num_rows values corresponding to the dense output vector <em>y</em>
int num_rows, ///< [in] number of rows of matrix <b>A</b>.
int num_cols, ///< [in] number of columns of matrix <b>A</b>.
int num_nonzeros, ///< [in] number of nonzero elements of matrix <b>A</b>.
cudaStream_t stream = 0, ///< [in] <b>[optional]</b> hip stream to launch kernels within. Default is stream<sub>0</sub>.
bool debug_synchronous = false) ///< [in] <b>[optional]</b> Whether or not to synchronize the stream after every kernel launch to check for errors. May cause significant slowdown. Default is \p false.
{
SpmvParams<ValueT, int> spmv_params;
spmv_params.d_values = d_values;
spmv_params.d_row_end_offsets = d_row_offsets + 1;
spmv_params.d_column_indices = d_column_indices;
spmv_params.d_vector_x = d_vector_x;
spmv_params.d_vector_y = d_vector_y;
spmv_params.num_rows = num_rows;
spmv_params.num_cols = num_cols;
spmv_params.num_nonzeros = num_nonzeros;
spmv_params.alpha = 1.0;
spmv_params.beta = 0.0;
cudaError_t status;
if(d_temp_storage == nullptr)
{
// Make sure user won't try to allocate 0 bytes memory, because
// hipMalloc will return nullptr when size is zero.
temp_storage_bytes = 4;
return cudaSuccess;
}
else
{
size_t block_size = min(static_cast<uint32_t>(num_cols), DeviceSpmv::CsrMVKernel_MaxThreads);
size_t grid_size = num_rows;
CsrMVKernel<<<grid_size, block_size, 0, stream>>>(spmv_params);
status = cudaGetLastError();
}
return status;
}
};
END_HIPCUB_NAMESPACE
#endif // HIPCUB_ROCPRIM_DEVICE_DEVICE_SPMV_HPP_
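// ---------------------------------------------------------------------------
// Usage sketch (illustrative annotation, not part of the committed sources):
// CsrMV follows the usual CUB two-phase temporary-storage idiom -- a first
// call with a null d_temp_storage only reports the required allocation size,
// and a second call launches the kernel. The d_* buffers below are
// hypothetical, assumed device-resident and already populated; alpha = 1 and
// beta = 0 are hard-coded by this backend.
//
//   size_t temp_storage_bytes = 0;
//   void* d_temp_storage = nullptr;
//   hipcub::DeviceSpmv::CsrMV(d_temp_storage, temp_storage_bytes,
//                             d_values, d_row_offsets, d_column_indices,
//                             d_x, d_y, num_rows, num_cols, num_nonzeros);
//   cudaMalloc(&d_temp_storage, temp_storage_bytes);
//   hipcub::DeviceSpmv::CsrMV(d_temp_storage, temp_storage_bytes,
//                             d_values, d_row_offsets, d_column_indices,
//                             d_x, d_y, num_rows, num_cols, num_nonzeros);
//   cudaFree(d_temp_storage);
// ---------------------------------------------------------------------------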
/******************************************************************************
* Copyright (c) 2011, Duane Merrill. All rights reserved.
* Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved.
* Modifications Copyright (c) 2021, Advanced Micro Devices, Inc. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of the NVIDIA CORPORATION nor the
* names of its contributors may be used to endorse or promote products
* derived from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
* DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
******************************************************************************/
#ifndef HIPCUB_ROCPRIM_GRID_GRID_BARRIER_HPP_
#define HIPCUB_ROCPRIM_GRID_GRID_BARRIER_HPP_
#include <type_traits>
#include "../config.hpp"
#include "../thread/thread_load.cuh"
BEGIN_HIPCUB_NAMESPACE
/**
* \addtogroup GridModule
* @{
*/
/**
* \brief GridBarrier implements a software global barrier among thread blocks within a CUDA grid
*/
class GridBarrier
{
protected :
typedef unsigned int SyncFlag;
// Counters in global device memory
SyncFlag* d_sync;
public:
/**
* Constructor
*/
GridBarrier() : d_sync(NULL) {}
/**
 * Synchronize
 */
__device__ __forceinline__ void Sync() const
{
volatile SyncFlag *d_vol_sync = d_sync;
// Threadfence and syncthreads to make sure global writes are visible before
// thread-0 reports in with its sync counter
__threadfence();
__syncthreads();
if (blockIdx.x == 0)
{
// Report in ourselves
if (threadIdx.x == 0)
{
d_vol_sync[blockIdx.x] = 1;
}
__syncthreads();
// Wait for everyone else to report in
for (uint32_t peer_block = threadIdx.x; peer_block < gridDim.x; peer_block += blockDim.x)
{
while (ThreadLoad<LOAD_CG>(d_sync + peer_block) == 0)
{
__threadfence_block();
}
}
__syncthreads();
// Let everyone know it's safe to proceed
for (uint32_t peer_block = threadIdx.x; peer_block < gridDim.x; peer_block += blockDim.x)
{
d_vol_sync[peer_block] = 0;
}
}
else
{
if (threadIdx.x == 0)
{
// Report in
d_vol_sync[blockIdx.x] = 1;
// Wait for acknowledgment
while (ThreadLoad<LOAD_CG>(d_sync + blockIdx.x) == 1)
{
__threadfence_block();
}
}
__syncthreads();
}
}
};
/**
* \brief GridBarrierLifetime extends GridBarrier to provide lifetime management of the temporary device storage needed for cooperation.
*
* Uses RAII for lifetime, i.e., device resources are reclaimed when
* the destructor is called.
*/
class GridBarrierLifetime : public GridBarrier
{
protected:
// Number of bytes backed by d_sync
size_t sync_bytes;
public:
/**
* Constructor
*/
GridBarrierLifetime() : GridBarrier(), sync_bytes(0) {}
/**
* DeviceFrees and resets the progress counters
*/
cudaError_t HostReset()
{
cudaError_t retval = cudaSuccess;
if (d_sync)
{
retval = cudaFree(d_sync);
d_sync = NULL;
}
sync_bytes = 0;
return retval;
}
/**
* Destructor
*/
virtual ~GridBarrierLifetime()
{
HostReset();
}
/**
* Sets up the progress counters for the next kernel launch (lazily
* allocating and initializing them if necessary)
*/
cudaError_t Setup(int sweep_grid_size)
{
cudaError_t retval = cudaSuccess;
do {
size_t new_sync_bytes = sweep_grid_size * sizeof(SyncFlag);
if (new_sync_bytes > sync_bytes)
{
if (d_sync)
{
if ((retval = cudaFree(d_sync))) break;
}
sync_bytes = new_sync_bytes;
// Allocate and initialize to zero
if ((retval = cudaMalloc((void**) &d_sync, sync_bytes))) break;
if ((retval = cudaMemset(d_sync, 0, new_sync_bytes))) break;
}
} while (0);
return retval;
}
};
END_HIPCUB_NAMESPACE
#endif // HIPCUB_ROCPRIM_GRID_GRID_BARRIER_HPP_
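// ---------------------------------------------------------------------------
// Usage sketch (illustrative annotation, not part of the committed sources):
// a GridBarrierLifetime is sized for the launch on the host, then every block
// of the kernel can rendezvous at Sync(). This is only safe when all blocks
// of the grid are co-resident on the device. Kernel and variable names are
// hypothetical.
//
//   __global__ void TwoPhaseKernel(hipcub::GridBarrier barrier)
//   {
//       // ... phase 1: write results visible to other blocks ...
//       barrier.Sync();  // every block waits here
//       // ... phase 2: read results produced by other blocks ...
//   }
//
//   hipcub::GridBarrierLifetime barrier;
//   barrier.Setup(grid_size);  // lazily allocates and zeroes the sync flags
//   TwoPhaseKernel<<<grid_size, block_size>>>(barrier);
// ---------------------------------------------------------------------------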
/******************************************************************************
* Copyright (c) 2011, Duane Merrill. All rights reserved.
* Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved.
* Modifications Copyright (c) 2021, Advanced Micro Devices, Inc. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of the NVIDIA CORPORATION nor the
* names of its contributors may be used to endorse or promote products
* derived from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
* DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
******************************************************************************/
#ifndef HIPCUB_ROCPRIM_GRID_GRID_EVEN_SHARE_HPP_
#define HIPCUB_ROCPRIM_GRID_GRID_EVEN_SHARE_HPP_
#include <type_traits>
#include "../config.hpp"
#include "grid_mapping.cuh"
#include "../util_type.cuh"
BEGIN_HIPCUB_NAMESPACE
/**
* \addtogroup GridModule
* @{
*/
/**
* \brief GridEvenShare is a descriptor utility for distributing input among
* CUDA thread blocks in an "even-share" fashion. Each thread block gets roughly
* the same number of input tiles.
*
* \par Overview
* Each thread block is assigned a consecutive sequence of input tiles. To help
* preserve alignment and eliminate the overhead of guarded loads for all but the
* last thread block, GridEvenShare assigns one of three different amounts of
* work to a given thread block: "big", "normal", or "last". The "big" workloads
* are one scheduling grain larger than "normal". The "last" work unit for the
* last thread block may be partially-full if the input is not an even multiple of
* the scheduling grain size.
*
* \par
* Before invoking a child grid, a parent thread will typically construct an
* instance of GridEvenShare. The instance can be passed to child thread blocks
* which can initialize their per-thread block offsets using \p BlockInit().
*/
template <typename OffsetT>
struct GridEvenShare
{
private:
int total_tiles;
int big_shares;
OffsetT big_share_items;
OffsetT normal_share_items;
OffsetT normal_base_offset;
public:
/// Total number of input items
OffsetT num_items;
/// Grid size in thread blocks
int grid_size;
/// OffsetT into input marking the beginning of the owning thread block's segment of input tiles
OffsetT block_offset;
/// OffsetT into input marking the end (one-past) of the owning thread block's segment of input tiles
OffsetT block_end;
/// Stride between input tiles
OffsetT block_stride;
/**
* \brief Constructor.
*/
__host__ __device__ __forceinline__ GridEvenShare() :
total_tiles(0),
big_shares(0),
big_share_items(0),
normal_share_items(0),
normal_base_offset(0),
num_items(0),
grid_size(0),
block_offset(0),
block_end(0),
block_stride(0)
{}
/**
* \brief Dispatch initializer. To be called prior to kernel launch.
*/
__host__ __device__ __forceinline__ void DispatchInit(
OffsetT num_items_, ///< Total number of input items
int max_grid_size, ///< Maximum grid size allowable (actual grid size may be less if not warranted by the number of input items)
int tile_items) ///< Number of data items per input tile
{
this->block_offset = num_items_; // Initialize past-the-end
this->block_end = num_items_; // Initialize past-the-end
this->num_items = num_items_;
this->total_tiles = static_cast<int>(cub::DivideAndRoundUp(num_items_, tile_items));
this->grid_size = min(total_tiles, max_grid_size);
int avg_tiles_per_block = total_tiles / grid_size;
// leftover grains go to big blocks:
this->big_shares = total_tiles - (avg_tiles_per_block * grid_size);
this->normal_share_items = avg_tiles_per_block * tile_items;
this->normal_base_offset = big_shares * tile_items;
this->big_share_items = normal_share_items + tile_items;
}
/**
* \brief Initializes ranges for the specified thread block index. Specialized
* for a "raking" access pattern in which each thread block is assigned a
* consecutive sequence of input tiles.
*/
template <int TILE_ITEMS>
__device__ __forceinline__ void BlockInit(
int block_id,
Int2Type<GRID_MAPPING_RAKE> /*strategy_tag*/)
{
block_stride = TILE_ITEMS;
if (block_id < big_shares)
{
// This thread block gets a big share of grains (avg_tiles_per_block + 1)
block_offset = (block_id * big_share_items);
block_end = block_offset + big_share_items;
}
else if (block_id < total_tiles)
{
// This thread block gets a normal share of grains (avg_tiles_per_block)
block_offset = normal_base_offset + (block_id * normal_share_items);
block_end = min(num_items, block_offset + normal_share_items);
}
// Else default past-the-end
}
/**
 * \brief Block-initialization, specialized for a "strip mining" access
 * pattern in which the input tiles assigned to each thread block are
 * separated by a stride equal to the extent of the grid.
 */
template <int TILE_ITEMS>
__device__ __forceinline__ void BlockInit(
int block_id,
Int2Type<GRID_MAPPING_STRIP_MINE> /*strategy_tag*/)
{
block_stride = grid_size * TILE_ITEMS;
block_offset = (block_id * TILE_ITEMS);
block_end = num_items;
}
/**
 * \brief Block-initialization, dispatching on the grid mapping strategy
 * given by \p STRATEGY.
 */
template <
int TILE_ITEMS,
GridMappingStrategy STRATEGY>
__device__ __forceinline__ void BlockInit()
{
BlockInit<TILE_ITEMS>(blockIdx.x, Int2Type<STRATEGY>());
}
/**
* \brief Block-initialization, specialized for a "raking" access
* pattern in which each thread block is assigned a consecutive sequence
* of input tiles.
*/
template <int TILE_ITEMS>
__device__ __forceinline__ void BlockInit(
OffsetT block_offset, ///< [in] Threadblock begin offset (inclusive)
OffsetT block_end) ///< [in] Threadblock end offset (exclusive)
{
this->block_offset = block_offset;
this->block_end = block_end;
this->block_stride = TILE_ITEMS;
}
};
/** @} */ // end group GridModule
END_HIPCUB_NAMESPACE
#endif // HIPCUB_ROCPRIM_GRID_GRID_EVEN_SHARE_HPP_
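// ---------------------------------------------------------------------------
// Usage sketch (illustrative annotation, not part of the committed sources):
// the host partitions the input once with DispatchInit(), passes the
// descriptor to the kernel by value, and each block recovers its tile range
// via BlockInit(). TILE_ITEMS, ConsumeTile and the kernel name are
// hypothetical.
//
//   static const int TILE_ITEMS = 512;
//
//   __global__ void ConsumeKernel(hipcub::GridEvenShare<int> even_share)
//   {
//       even_share.BlockInit<TILE_ITEMS, hipcub::GRID_MAPPING_RAKE>();
//       for(int offset = even_share.block_offset;
//           offset < even_share.block_end;
//           offset += even_share.block_stride)
//       {
//           ConsumeTile(offset);  // process one tile starting at offset
//       }
//   }
//
//   hipcub::GridEvenShare<int> even_share;
//   even_share.DispatchInit(num_items, max_grid_size, TILE_ITEMS);
//   ConsumeKernel<<<even_share.grid_size, block_size>>>(even_share);
// ---------------------------------------------------------------------------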
/******************************************************************************
* Copyright (c) 2011, Duane Merrill. All rights reserved.
* Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of the NVIDIA CORPORATION nor the
* names of its contributors may be used to endorse or promote products
* derived from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
* DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
******************************************************************************/
#ifndef HIPCUB_ROCPRIM_GRID_GRID_MAPPING_HPP_
#define HIPCUB_ROCPRIM_GRID_GRID_MAPPING_HPP_
#include <type_traits>
#include "../config.hpp"
#include "../thread/thread_load.cuh"
BEGIN_HIPCUB_NAMESPACE
/**
* \addtogroup GridModule
* @{
*/
/******************************************************************************
* Mapping policies
*****************************************************************************/
/**
* \brief GridMappingStrategy enumerates alternative strategies for mapping constant-sized tiles of device-wide data onto a grid of CUDA thread blocks.
*/
enum GridMappingStrategy
{
/**
* \brief An a "raking" access pattern in which each thread block is
* assigned a consecutive sequence of input tiles
*
* \par Overview
* The input is evenly partitioned into \p p segments, where \p p is
* constant and corresponds loosely to the number of thread blocks that may
* actively reside on the target device. Each segment is comprised of
* consecutive tiles, where a tile is a small, constant-sized unit of input
* to be processed to completion before the thread block terminates or
* obtains more work. The kernel invokes \p p thread blocks, each
* of which iteratively consumes a segment of <em>n</em>/<em>p</em> elements
* in tile-size increments.
*/
GRID_MAPPING_RAKE,
/**
* \brief An a "strip mining" access pattern in which the input tiles assigned
* to each thread block are separated by a stride equal to the the extent of
* the grid.
*
* \par Overview
* The input is evenly partitioned into \p p sets, where \p p is
* constant and corresponds loosely to the number of thread blocks that may
* actively reside on the target device. Each set is comprised of
* data tiles separated by a stride of \p p tiles, where a tile is a small,
* constant-sized unit of input to be processed to completion before the
* thread block terminates or obtains more work. The kernel invokes \p p
* thread blocks, each of which iteratively consumes a segment of
* <em>n</em>/<em>p</em> elements in tile-size increments.
*/
GRID_MAPPING_STRIP_MINE,
/**
* \brief A dynamic "queue-based" strategy for assigning input tiles to thread blocks.
*
* \par Overview
* The input is treated as a queue to be dynamically consumed by a grid of
* thread blocks. Work is atomically dequeued in tiles, where a tile is a
* unit of input to be processed to completion before the thread block
* terminates or obtains more work. The grid size \p p is constant,
* loosely corresponding to the number of thread blocks that may actively
* reside on the target device.
*/
GRID_MAPPING_DYNAMIC,
};
/** @} */ // end group GridModule
END_HIPCUB_NAMESPACE
#endif // HIPCUB_ROCPRIM_GRID_GRID_MAPPING_HPP_
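// ---------------------------------------------------------------------------
// Sketch of the two static mappings above, from one thread block's point of
// view (hypothetical helpers, tile-granular offsets):
//
//   // GRID_MAPPING_RAKE: block b consumes one consecutive segment of tiles
//   for(offset = segment_begin(b); offset < segment_end(b); offset += TILE_ITEMS)
//       consume_tile(offset);
//
//   // GRID_MAPPING_STRIP_MINE: block b's tiles are strided by the whole grid
//   for(offset = b * TILE_ITEMS; offset < num_items; offset += grid_size * TILE_ITEMS)
//       consume_tile(offset);
// ---------------------------------------------------------------------------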
/******************************************************************************
* Copyright (c) 2011, Duane Merrill. All rights reserved.
* Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved.
* Modifications Copyright (c) 2021, Advanced Micro Devices, Inc. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of the NVIDIA CORPORATION nor the
* names of its contributors may be used to endorse or promote products
* derived from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
* DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
******************************************************************************/
#ifndef HIPCUB_ROCPRIM_GRID_GRID_QUEUE_HPP_
#define HIPCUB_ROCPRIM_GRID_GRID_QUEUE_HPP_
#include <type_traits>
#include "../config.hpp"
BEGIN_HIPCUB_NAMESPACE
/**
* \addtogroup GridModule
* @{
*/
/**
* \brief GridQueue is a descriptor utility for dynamic queue management.
*
* \par Overview
* GridQueue descriptors provide abstractions for "filling" or
* "draining" globally-shared vectors.
*
* \par
* A "filling" GridQueue works by atomically-adding to a zero-initialized counter,
* returning a unique offset for the calling thread to write its items.
* The GridQueue maintains the total "fill-size". The fill counter must be reset
* using GridQueue::ResetFill by the host or kernel instance prior to the kernel instance that
* will be filling.
*
* \par
* Similarly, a "draining" GridQueue works by works by atomically-incrementing a
* zero-initialized counter, returning a unique offset for the calling thread to
* read its items. Threads can safely drain until the array's logical fill-size is
* exceeded. The drain counter must be reset using GridQueue::ResetDrain or
* GridQueue::FillAndResetDrain by the host or kernel instance prior to the kernel instance that
* will be draining. (For dynamic work distribution of existing data, the corresponding fill-size
* is simply the number of elements in the array.)
*
* \par
* Iterative work management can be implemented simply with a pair of flip-flopping
* work buffers, each with an associated set of fill and drain GridQueue descriptors.
*
* \tparam OffsetT Signed integer type for global offsets
*/
template <typename OffsetT>
class GridQueue
{
private:
/// Counter indices
enum
{
FILL = 0,
DRAIN = 1,
};
/// Pair of counters
OffsetT *d_counters;
public:
/// Returns the device allocation size in bytes needed to construct a GridQueue instance
__host__ __device__ __forceinline__
static size_t AllocationSize()
{
return sizeof(OffsetT) * 2;
}
/// Constructs an invalid GridQueue descriptor
__host__ __device__ __forceinline__ GridQueue()
:
d_counters(NULL)
{}
/// Constructs a GridQueue descriptor around the device storage allocation
__host__ __device__ __forceinline__ GridQueue(
void *d_storage) ///< Device allocation to back the GridQueue. Must be at least as big as <tt>AllocationSize()</tt>.
:
d_counters((OffsetT*) d_storage)
{}
/// This operation sets the fill-size and resets the drain counter, preparing the GridQueue for draining in the next kernel instance. To be called by the host or by a kernel prior to that which will be draining.
HIPCUB_DEVICE cudaError_t FillAndResetDrain(
OffsetT fill_size,
cudaStream_t stream = 0)
{
cudaError_t result = cudaErrorUnknown;
(void)stream;
d_counters[FILL] = fill_size;
d_counters[DRAIN] = 0;
result = cudaSuccess;
return result;
}
HIPCUB_HOST cudaError_t FillAndResetDrain(
OffsetT fill_size,
cudaStream_t stream = 0)
{
cudaError_t result = cudaErrorUnknown;
OffsetT counters[2];
counters[FILL] = fill_size;
counters[DRAIN] = 0;
result = CubDebug(cudaMemcpyAsync(d_counters, counters, sizeof(OffsetT) * 2, cudaMemcpyHostToDevice, stream));
return result;
}
/// This operation resets the drain so that it may advance to meet the existing fill-size. To be called by the host or by a kernel prior to that which will be draining.
HIPCUB_DEVICE cudaError_t ResetDrain(cudaStream_t stream = 0)
{
cudaError_t result = cudaErrorUnknown;
(void)stream;
d_counters[DRAIN] = 0;
result = cudaSuccess;
return result;
}
HIPCUB_HOST cudaError_t ResetDrain(cudaStream_t stream = 0)
{
cudaError_t result = cudaErrorUnknown;
result = CubDebug(cudaMemsetAsync(d_counters + DRAIN, 0, sizeof(OffsetT), stream));
return result;
}
/// This operation resets the fill counter. To be called by the host or by a kernel prior to that which will be filling.
HIPCUB_DEVICE cudaError_t ResetFill(cudaStream_t stream = 0)
{
cudaError_t result = cudaErrorUnknown;
(void)stream;
d_counters[FILL] = 0;
result = cudaSuccess;
return result;
}
HIPCUB_HOST cudaError_t ResetFill(cudaStream_t stream = 0)
{
cudaError_t result = cudaErrorUnknown;
result = CubDebug(cudaMemsetAsync(d_counters + FILL, 0, sizeof(OffsetT), stream));
return result;
}
/// Returns the fill-size established by the parent or by the previous kernel.
HIPCUB_DEVICE cudaError_t FillSize(
OffsetT &fill_size,
cudaStream_t stream = 0)
{
cudaError_t result = cudaErrorUnknown;
(void)stream;
fill_size = d_counters[FILL];
result = cudaSuccess;
return result;
}
HIPCUB_HOST cudaError_t FillSize(
OffsetT &fill_size,
cudaStream_t stream = 0)
{
cudaError_t result = cudaErrorUnknown;
result = CubDebug(cudaMemcpyAsync(&fill_size, d_counters + FILL, sizeof(OffsetT), cudaMemcpyDeviceToHost, stream));
return result;
}
/// Drain \p num_items from the queue. Returns offset from which to read items. To be called from a CUDA kernel.
HIPCUB_DEVICE OffsetT Drain(OffsetT num_items)
{
return atomicAdd(d_counters + DRAIN, num_items);
}
/// Fill \p num_items into the queue. Returns offset from which to write items. To be called from a CUDA kernel.
HIPCUB_DEVICE OffsetT Fill(OffsetT num_items)
{
return atomicAdd(d_counters + FILL, num_items);
}
};
#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document
/**
* Reset grid queue (call with 1 block of 1 thread)
*/
template <typename OffsetT>
__global__ void FillAndResetDrainKernel(
GridQueue<OffsetT> grid_queue,
OffsetT num_items)
{
grid_queue.FillAndResetDrain(num_items);
}
#endif // DOXYGEN_SHOULD_SKIP_THIS
/** @} */ // end group GridModule
END_HIPCUB_NAMESPACE
#endif // HIPCUB_ROCPRIM_GRID_GRID_QUEUE_HPP_
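// ---------------------------------------------------------------------------
// Usage sketch (illustrative annotation, not part of the committed sources):
// the host establishes the fill-size, then worker blocks dequeue tiles until
// the queue is exhausted. TILE_ITEMS and the kernel name are hypothetical.
//
//   __global__ void DrainKernel(hipcub::GridQueue<int> queue, int num_items)
//   {
//       __shared__ int block_offset;
//       while(true)
//       {
//           if(threadIdx.x == 0)
//           {
//               block_offset = queue.Drain(TILE_ITEMS);  // dequeue one tile
//           }
//           __syncthreads();
//           if(block_offset >= num_items) break;  // queue exhausted
//           // ... process items in [block_offset, min(block_offset + TILE_ITEMS, num_items)) ...
//           __syncthreads();  // keep block_offset stable until all threads are done
//       }
//   }
//
//   void* d_queue_storage;  // at least GridQueue<int>::AllocationSize() bytes
//   cudaMalloc(&d_queue_storage, hipcub::GridQueue<int>::AllocationSize());
//   hipcub::GridQueue<int> queue(d_queue_storage);
//   queue.FillAndResetDrain(num_items);  // host overload: async copy of both counters
//   DrainKernel<<<grid_size, block_size>>>(queue, num_items);
// ---------------------------------------------------------------------------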
/******************************************************************************
* Copyright (c) 2011, Duane Merrill. All rights reserved.
* Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved.
* Modifications Copyright (c) 2017-2020, Advanced Micro Devices, Inc. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of the NVIDIA CORPORATION nor the
* names of its contributors may be used to endorse or promote products
* derived from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
* DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
******************************************************************************/
#ifndef HIPCUB_ROCPRIM_ITERATOR_ARG_INDEX_INPUT_ITERATOR_HPP_
#define HIPCUB_ROCPRIM_ITERATOR_ARG_INDEX_INPUT_ITERATOR_HPP_
#include <iterator>
#include <iostream>
#include "../config.hpp"
#include <cub/rocprim/iterator/arg_index_iterator.hpp>
#if (THRUST_VERSION >= 100700)
// This iterator is compatible with Thrust API 1.7 and newer
#include <thrust/iterator/iterator_facade.h>
#include <thrust/iterator/iterator_traits.h>
#endif // THRUST_VERSION
BEGIN_HIPCUB_NAMESPACE
#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document
template<
typename InputIterator,
typename Difference = std::ptrdiff_t,
typename Value = typename std::iterator_traits<InputIterator>::value_type
>
using ArgIndexInputIterator = ::rocprim::arg_index_iterator<InputIterator, Difference, Value>;
#endif
END_HIPCUB_NAMESPACE
#endif // HIPCUB_ROCPRIM_ITERATOR_ARG_INDEX_INPUT_ITERATOR_HPP_
/******************************************************************************
* Copyright (c) 2010-2011, Duane Merrill. All rights reserved.
* Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved.
* Modifications Copyright (c) 2017-2020, Advanced Micro Devices, Inc. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of the NVIDIA CORPORATION nor the
* names of its contributors may be used to endorse or promote products
* derived from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
* DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
******************************************************************************/
#ifndef HIPCUB_ROCPRIM_ITERATOR_CACHE_MODIFIED_INPUT_ITERATOR_HPP_
#define HIPCUB_ROCPRIM_ITERATOR_CACHE_MODIFIED_INPUT_ITERATOR_HPP_
#include <iterator>
#include <iostream>
#include "../thread/thread_load.cuh"
#include "../util_type.cuh"
#if (THRUST_VERSION >= 100700)
// This iterator is compatible with Thrust API 1.7 and newer
#include <thrust/iterator/iterator_facade.h>
#include <thrust/iterator/iterator_traits.h>
#endif // THRUST_VERSION
BEGIN_HIPCUB_NAMESPACE
template <
CacheLoadModifier MODIFIER,
typename ValueType,
typename OffsetT = ptrdiff_t>
class CacheModifiedInputIterator
{
public:
// Required iterator traits
typedef CacheModifiedInputIterator self_type; ///< My own type
typedef OffsetT difference_type; ///< Type to express the result of subtracting one iterator from another
typedef ValueType value_type; ///< The type of the element the iterator can point to
typedef ValueType* pointer; ///< The type of a pointer to an element the iterator can point to
typedef ValueType reference; ///< The type of a reference to an element the iterator can point to
typedef std::random_access_iterator_tag iterator_category; ///< The iterator category
public:
/// Wrapped native pointer
ValueType* ptr;
/// Constructor
__host__ __device__ __forceinline__ CacheModifiedInputIterator(
ValueType* ptr) ///< Native pointer to wrap
:
ptr(const_cast<typename std::remove_cv<ValueType>::type *>(ptr))
{}
/// Postfix increment
__host__ __device__ __forceinline__ self_type operator++(int)
{
self_type retval = *this;
ptr++;
return retval;
}
/// Prefix increment
__host__ __device__ __forceinline__ self_type operator++()
{
ptr++;
return *this;
}
/// Indirection
__device__ __forceinline__ reference operator*() const
{
return ThreadLoad<MODIFIER>(ptr);
}
/// Addition
template <typename Distance>
__host__ __device__ __forceinline__ self_type operator+(Distance n) const
{
self_type retval(ptr + n);
return retval;
}
/// Addition assignment
template <typename Distance>
__host__ __device__ __forceinline__ self_type& operator+=(Distance n)
{
ptr += n;
return *this;
}
/// Subtraction
template <typename Distance>
__host__ __device__ __forceinline__ self_type operator-(Distance n) const
{
self_type retval(ptr - n);
return retval;
}
/// Subtraction assignment
template <typename Distance>
__host__ __device__ __forceinline__ self_type& operator-=(Distance n)
{
ptr -= n;
return *this;
}
/// Distance
__host__ __device__ __forceinline__ difference_type operator-(self_type other) const
{
return ptr - other.ptr;
}
/// Array subscript
template <typename Distance>
__device__ __forceinline__ reference operator[](Distance n) const
{
return ThreadLoad<MODIFIER>(ptr + n);
}
/// Structure dereference
__device__ __forceinline__ pointer operator->()
{
return &ThreadLoad<MODIFIER>(ptr);
}
/// Equal to
__host__ __device__ __forceinline__ bool operator==(const self_type& rhs)
{
return (ptr == rhs.ptr);
}
/// Not equal to
__host__ __device__ __forceinline__ bool operator!=(const self_type& rhs)
{
return (ptr != rhs.ptr);
}
#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document
/// ostream operator
friend std::ostream& operator<<(std::ostream& os, const self_type& /*itr*/)
{
return os;
}
#endif
};
END_HIPCUB_NAMESPACE
#endif // HIPCUB_ROCPRIM_ITERATOR_CACHE_MODIFIED_INPUT_ITERATOR_HPP_
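// ---------------------------------------------------------------------------
// Usage sketch (illustrative annotation, not part of the committed sources):
// wrapping a raw device pointer so that every dereference goes through
// ThreadLoad with the chosen cache modifier (LOAD_CG is used here as an
// example). d_in and d_out are hypothetical device buffers.
//
//   __global__ void ScaleKernel(float* d_in, float* d_out, int n)
//   {
//       hipcub::CacheModifiedInputIterator<hipcub::LOAD_CG, float> in(d_in);
//       int i = blockIdx.x * blockDim.x + threadIdx.x;
//       if(i < n)
//       {
//           d_out[i] = in[i] * 2.0f;  // in[i] issues ThreadLoad<LOAD_CG>
//       }
//   }
// ---------------------------------------------------------------------------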
/******************************************************************************
* Copyright (c) 2010-2011, Duane Merrill. All rights reserved.
* Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved.
* Modifications Copyright (c) 2017-2020, Advanced Micro Devices, Inc. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of the NVIDIA CORPORATION nor the
* names of its contributors may be used to endorse or promote products
* derived from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
* DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
******************************************************************************/
#ifndef HIPCUB_ROCPRIM_ITERATOR_CACHE_MODIFIED_OUTPUT_ITERATOR_HPP_
#define HIPCUB_ROCPRIM_ITERATOR_CACHE_MODIFIED_OUTPUT_ITERATOR_HPP_
#include <iterator>
#include <iostream>
#include "../thread/thread_load.cuh"
#include "../thread/thread_store.cuh"
#include "../util_type.cuh"
#if (THRUST_VERSION >= 100700)
// This iterator is compatible with Thrust API 1.7 and newer
#include <thrust/iterator/iterator_facade.h>
#include <thrust/iterator/iterator_traits.h>
#endif // THRUST_VERSION
BEGIN_HIPCUB_NAMESPACE
template <
CacheStoreModifier MODIFIER,
typename ValueType,
typename OffsetT = ptrdiff_t>
class CacheModifiedOutputIterator
{
private:
// Proxy object
struct Reference
{
ValueType* ptr;
/// Constructor
__host__ __device__ __forceinline__ Reference(ValueType* ptr) : ptr(ptr) {}
/// Assignment
__device__ __forceinline__ ValueType operator =(ValueType val)
{
ThreadStore<MODIFIER>(ptr, val);
return val;
}
};
public:
// Required iterator traits
typedef CacheModifiedOutputIterator self_type; ///< My own type
typedef OffsetT difference_type; ///< Type to express the result of subtracting one iterator from another
typedef void value_type; ///< The type of the element the iterator can point to
typedef void pointer; ///< The type of a pointer to an element the iterator can point to
typedef Reference reference; ///< The type of a reference to an element the iterator can point to
typedef std::random_access_iterator_tag iterator_category; ///< The iterator category
private:
ValueType* ptr;
public:
/// Constructor
template <typename QualifiedValueType>
__host__ __device__ __forceinline__ CacheModifiedOutputIterator(
QualifiedValueType* ptr) ///< Native pointer to wrap
:
ptr(const_cast<typename std::remove_cv<QualifiedValueType>::type *>(ptr))
{}
/// Postfix increment
__host__ __device__ __forceinline__ self_type operator++(int)
{
self_type retval = *this;
ptr++;
return retval;
}
/// Prefix increment
__host__ __device__ __forceinline__ self_type operator++()
{
ptr++;
return *this;
}
/// Indirection
__host__ __device__ __forceinline__ reference operator*() const
{
return Reference(ptr);
}
/// Addition
template <typename Distance>
__host__ __device__ __forceinline__ self_type operator+(Distance n) const
{
self_type retval(ptr + n);
return retval;
}
/// Addition assignment
template <typename Distance>
__host__ __device__ __forceinline__ self_type& operator+=(Distance n)
{
ptr += n;
return *this;
}
/// Subtraction
template <typename Distance>
__host__ __device__ __forceinline__ self_type operator-(Distance n) const
{
self_type retval(ptr - n);
return retval;
}
/// Subtraction assignment
template <typename Distance>
__host__ __device__ __forceinline__ self_type& operator-=(Distance n)
{
ptr -= n;
return *this;
}
/// Distance
__host__ __device__ __forceinline__ difference_type operator-(self_type other) const
{
return ptr - other.ptr;
}
/// Array subscript
template <typename Distance>
__host__ __device__ __forceinline__ reference operator[](Distance n) const
{
return Reference(ptr + n);
}
/// Equal to
__host__ __device__ __forceinline__ bool operator==(const self_type& rhs)
{
return (ptr == rhs.ptr);
}
/// Not equal to
__host__ __device__ __forceinline__ bool operator!=(const self_type& rhs)
{
return (ptr != rhs.ptr);
}
#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document
/// ostream operator
friend std::ostream& operator<<(std::ostream& os, const self_type& itr)
{
(void)itr;
return os;
}
#endif
};
END_HIPCUB_NAMESPACE
#endif // HIPCUB_ROCPRIM_ITERATOR_CACHE_MODIFIED_OUTPUT_ITERATOR_HPP_
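// ---------------------------------------------------------------------------
// Usage sketch (illustrative annotation, not part of the committed sources):
// assignments through the iterator's proxy Reference become ThreadStore calls
// with the chosen modifier (STORE_WB is used here as an example). d_out is a
// hypothetical device buffer.
//
//   __global__ void FillKernel(float* d_out, int n)
//   {
//       hipcub::CacheModifiedOutputIterator<hipcub::STORE_WB, float> out(d_out);
//       int i = blockIdx.x * blockDim.x + threadIdx.x;
//       if(i < n)
//       {
//           out[i] = float(i);  // proxy assignment issues ThreadStore<STORE_WB>
//       }
//   }
// ---------------------------------------------------------------------------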
/******************************************************************************
* Copyright (c) 2010-2011, Duane Merrill. All rights reserved.
* Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved.
* Modifications Copyright (c) 2017-2020, Advanced Micro Devices, Inc. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of the NVIDIA CORPORATION nor the
* names of its contributors may be used to endorse or promote products
* derived from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
* DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
******************************************************************************/
#ifndef HIPCUB_ROCPRIM_ITERATOR_CONSTANT_INPUT_ITERATOR_HPP_
#define HIPCUB_ROCPRIM_ITERATOR_CONSTANT_INPUT_ITERATOR_HPP_
#include <iterator>
#include <iostream>
#include "../config.hpp"
#include <cub/rocprim/iterator/constant_iterator.hpp>
#if (THRUST_VERSION >= 100700)
// This iterator is compatible with Thrust API 1.7 and newer
#include <thrust/iterator/iterator_facade.h>
#include <thrust/iterator/iterator_traits.h>
#endif // THRUST_VERSION
BEGIN_HIPCUB_NAMESPACE
#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document
template<
typename ValueType,
typename OffsetT = std::ptrdiff_t
>
using ConstantInputIterator = ::rocprim::constant_iterator<ValueType, OffsetT>;
#endif
END_HIPCUB_NAMESPACE
#endif // HIPCUB_ROCPRIM_ITERATOR_CONSTANT_INPUT_ITERATOR_HPP_
/******************************************************************************
* Copyright (c) 2010-2011, Duane Merrill. All rights reserved.
* Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved.
* Modifications Copyright (c) 2017-2020, Advanced Micro Devices, Inc. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of the NVIDIA CORPORATION nor the
* names of its contributors may be used to endorse or promote products
* derived from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
* DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
******************************************************************************/
#ifndef HIPCUB_ROCPRIM_ITERATOR_COUNTING_INPUT_ITERATOR_HPP_
#define HIPCUB_ROCPRIM_ITERATOR_COUNTING_INPUT_ITERATOR_HPP_
#include <iterator>
#include <iostream>
#include "../config.hpp"
#include <cub/rocprim/iterator/counting_iterator.hpp>
#if (THRUST_VERSION >= 100700)
// This iterator is compatible with Thrust API 1.7 and newer
#include <thrust/iterator/iterator_facade.h>
#include <thrust/iterator/iterator_traits.h>
#endif // THRUST_VERSION
BEGIN_HIPCUB_NAMESPACE
#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document
template<
typename ValueType,
typename OffsetT = std::ptrdiff_t
>
using CountingInputIterator = ::rocprim::counting_iterator<ValueType, OffsetT>;
#endif
END_HIPCUB_NAMESPACE
#endif // HIPCUB_ROCPRIM_ITERATOR_COUNTING_INPUT_ITERATOR_HPP_
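// ---------------------------------------------------------------------------
// Usage sketch (illustrative annotation, not part of the committed sources):
// since the alias maps onto rocprim::counting_iterator, an index sequence can
// feed device algorithms without materializing an index buffer.
//
//   hipcub::CountingInputIterator<int> itr(0);  // yields 0, 1, 2, ...
//   int a = itr[0];    // 0
//   int b = itr[10];   // 10
// ---------------------------------------------------------------------------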
/******************************************************************************
* Copyright (c) 2010-2011, Duane Merrill. All rights reserved.
* Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved.
* Modifications Copyright (c) 2020, Advanced Micro Devices, Inc. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of the NVIDIA CORPORATION nor the
* names of its contributors may be used to endorse or promote products
* derived from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
* DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
******************************************************************************/
#ifndef HIPCUB_ROCPRIM_ITERATOR_DISCARD_OUTPUT_ITERATOR_HPP_
#define HIPCUB_ROCPRIM_ITERATOR_DISCARD_OUTPUT_ITERATOR_HPP_
#include <iterator>
#include <iostream>
#include "../config.hpp"
#if (THRUST_VERSION >= 100700)
// This iterator is compatible with Thrust API 1.7 and newer
#include <thrust/iterator/iterator_facade.h>
#include <thrust/iterator/iterator_traits.h>
#endif // THRUST_VERSION
BEGIN_HIPCUB_NAMESPACE
/**
* \addtogroup UtilIterator
* @{
*/
/**
* \brief A discard iterator
*/
template <typename OffsetT = ptrdiff_t>
class DiscardOutputIterator
{
public:
// Required iterator traits
typedef DiscardOutputIterator self_type; ///< My own type
typedef OffsetT difference_type; ///< Type to express the result of subtracting one iterator from another
typedef void value_type; ///< The type of the element the iterator can point to
typedef void pointer; ///< The type of a pointer to an element the iterator can point to
typedef void reference; ///< The type of a reference to an element the iterator can point to
#if (THRUST_VERSION >= 100700)
// Use Thrust's iterator categories so we can use these iterators in Thrust 1.7 (or newer) methods
typedef typename thrust::detail::iterator_facade_category<
thrust::any_system_tag,
thrust::random_access_traversal_tag,
value_type,
reference
>::type iterator_category; ///< The iterator category
#else
typedef std::random_access_iterator_tag iterator_category; ///< The iterator category
#endif // THRUST_VERSION
private:
OffsetT offset;
public:
/// Constructor
__host__ __device__ __forceinline__ DiscardOutputIterator(
OffsetT offset = 0) ///< Base offset
:
offset(offset)
{}
/// Postfix increment
__host__ __device__ __forceinline__ self_type operator++(int)
{
self_type retval = *this;
offset++;
return retval;
}
/// Prefix increment
__host__ __device__ __forceinline__ self_type operator++()
{
offset++;
return *this;
}
/// Indirection
__host__ __device__ __forceinline__ self_type& operator*()
{
// return self reference, which can be assigned to anything
return *this;
}
/// Addition
template <typename Distance>
__host__ __device__ __forceinline__ self_type operator+(Distance n) const
{
self_type retval(offset + n);
return retval;
}
/// Addition assignment
template <typename Distance>
__host__ __device__ __forceinline__ self_type& operator+=(Distance n)
{
offset += n;
return *this;
}
/// Subtraction
template <typename Distance>
__host__ __device__ __forceinline__ self_type operator-(Distance n) const
{
self_type retval(offset - n);
return retval;
}
/// Subtraction assignment
template <typename Distance>
__host__ __device__ __forceinline__ self_type& operator-=(Distance n)
{
offset -= n;
return *this;
}
/// Distance
__host__ __device__ __forceinline__ difference_type operator-(self_type other) const
{
return offset - other.offset;
}
/// Array subscript
template <typename Distance>
__host__ __device__ __forceinline__ self_type& operator[](Distance)
{
// return self reference, which can be assigned to anything
return *this;
}
/// Structure dereference
__host__ __device__ __forceinline__ pointer operator->()
{
return;
}
/// Assignment to anything else (no-op)
template<typename T>
__host__ __device__ __forceinline__ void operator=(T const&)
{}
/// Cast to void* operator
__host__ __device__ __forceinline__ operator void*() const { return NULL; }
/// Equal to
__host__ __device__ __forceinline__ bool operator==(const self_type& rhs)
{
return (offset == rhs.offset);
}
/// Not equal to
__host__ __device__ __forceinline__ bool operator!=(const self_type& rhs)
{
return (offset != rhs.offset);
}
/// ostream operator
friend std::ostream& operator<<(std::ostream& os, const self_type& itr)
{
os << "[" << itr.offset << "]";
return os;
}
};
END_HIPCUB_NAMESPACE
#endif // HIPCUB_ROCPRIM_ITERATOR_DISCARD_OUTPUT_ITERATOR_HPP_
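// ---------------------------------------------------------------------------
// Usage sketch (illustrative annotation, not part of the committed sources):
// every write through the iterator is a no-op, so it can stand in for an
// output stream that an algorithm insists on producing but the caller does
// not need.
//
//   hipcub::DiscardOutputIterator<> discard;
//   *discard = 42;     // discarded
//   discard[100] = 7;  // discarded
//   // e.g. pass `discard` as the unwanted values-output of a device routine
// ---------------------------------------------------------------------------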
/******************************************************************************
* Copyright (c) 2010-2011, Duane Merrill. All rights reserved.
* Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved.
* Modifications Copyright (c) 2017-2021, Advanced Micro Devices, Inc. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of the NVIDIA CORPORATION nor the
* names of its contributors may be used to endorse or promote products
* derived from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
* DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
******************************************************************************/
#ifndef HIPCUB_ROCPRIM_ITERATOR_TEX_OBJ_INPUT_ITERATOR_HPP_
#define HIPCUB_ROCPRIM_ITERATOR_TEX_OBJ_INPUT_ITERATOR_HPP_
#include <iterator>
#include <iostream>
#include "../config.hpp"
#if (THRUST_VERSION >= 100700)
// This iterator is compatible with Thrust API 1.7 and newer
#include <thrust/iterator/iterator_facade.h>
#include <thrust/iterator/iterator_traits.h>
#endif // THRUST_VERSION
#include <cub/rocprim/iterator/texture_cache_iterator.hpp>
BEGIN_HIPCUB_NAMESPACE
template<
typename T,
typename OffsetT = std::ptrdiff_t
>
class TexObjInputIterator : public ::rocprim::texture_cache_iterator<T, OffsetT>
{
public:
template<class Qualified>
inline
cudaError_t BindTexture(Qualified* ptr,
size_t bytes = size_t(-1),
size_t texture_offset = 0)
{
return (cudaError_t)::rocprim::texture_cache_iterator<T, OffsetT>::bind_texture(ptr, bytes, texture_offset);
}
inline cudaError_t UnbindTexture()
{
return (cudaError_t)::rocprim::texture_cache_iterator<T, OffsetT>::unbind_texture();
}
HIPCUB_HOST_DEVICE inline
~TexObjInputIterator() = default;
HIPCUB_HOST_DEVICE inline
TexObjInputIterator() : ::rocprim::texture_cache_iterator<T, OffsetT>()
{
}
HIPCUB_HOST_DEVICE inline
    TexObjInputIterator(const ::rocprim::texture_cache_iterator<T, OffsetT>& other)
: ::rocprim::texture_cache_iterator<T, OffsetT>(other)
{
}
};
END_HIPCUB_NAMESPACE
#endif // HIPCUB_ROCPRIM_ITERATOR_TEX_OBJ_INPUT_ITERATOR_HPP_
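// A minimal usage sketch of the BindTexture()/UnbindTexture() pair above; the pointer d_in and
// the element count num_items are hypothetical, and error checking is omitted.
//
// hipcub::TexObjInputIterator<float> itr;
// itr.BindTexture(d_in, num_items * sizeof(float)); // bind before launching the kernel
// // ... inside a kernel, itr[i] reads d_in[i] through the texture cache ...
// itr.UnbindTexture();                              // unbind once the kernel has completed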
/******************************************************************************
* Copyright (c) 2011, Duane Merrill. All rights reserved.
* Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved.
* Modifications Copyright (c) 2017-2021, Advanced Micro Devices, Inc. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of the NVIDIA CORPORATION nor the
* names of its contributors may be used to endorse or promote products
* derived from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
* DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
******************************************************************************/
#ifndef HIPCUB_ROCPRIM_ITERATOR_TEX_REF_INPUT_ITERATOR_HPP_
#define HIPCUB_ROCPRIM_ITERATOR_TEX_REF_INPUT_ITERATOR_HPP_
#include <iterator>
#include <iostream>
#include "../config.hpp"
#if (THRUST_VERSION >= 100700) // This iterator is compatible with Thrust API 1.7 and newer
#include <thrust/iterator/iterator_facade.h>
#include <thrust/iterator/iterator_traits.h>
#endif // THRUST_VERSION
#include <rocprim/iterator/texture_cache_iterator.hpp>
BEGIN_HIPCUB_NAMESPACE
template<
typename T,
int UNIQUE_ID, // Unused parameter for compatibility with original definition in cub
typename OffsetT = std::ptrdiff_t
>
class TexRefInputIterator : public ::rocprim::texture_cache_iterator<T, OffsetT>
{
public:
template<class Qualified>
inline
cudaError_t BindTexture(Qualified* ptr,
size_t bytes = size_t(-1),
size_t texture_offset = 0)
{
return (cudaError_t)::rocprim::texture_cache_iterator<T, OffsetT>::bind_texture(ptr, bytes, texture_offset);
}
inline cudaError_t UnbindTexture()
{
return (cudaError_t)::rocprim::texture_cache_iterator<T, OffsetT>::unbind_texture();
}
HIPCUB_HOST_DEVICE inline
~TexRefInputIterator() = default;
HIPCUB_HOST_DEVICE inline
TexRefInputIterator() : ::rocprim::texture_cache_iterator<T, OffsetT>()
{
}
HIPCUB_HOST_DEVICE inline
    TexRefInputIterator(const ::rocprim::texture_cache_iterator<T, OffsetT>& other)
: ::rocprim::texture_cache_iterator<T, OffsetT>(other)
{
}
};
END_HIPCUB_NAMESPACE
#endif // HIPCUB_ROCPRIM_ITERATOR_TEX_REF_INPUT_ITERATOR_HPP_
/******************************************************************************
* Copyright (c) 2010-2011, Duane Merrill. All rights reserved.
* Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved.
* Modifications Copyright (c) 2017-2020, Advanced Micro Devices, Inc. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of the NVIDIA CORPORATION nor the
* names of its contributors may be used to endorse or promote products
* derived from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
* DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
******************************************************************************/
#ifndef HIPCUB_ROCPRIM_ITERATOR_TRANSFORM_INPUT_ITERATOR_HPP_
#define HIPCUB_ROCPRIM_ITERATOR_TRANSFORM_INPUT_ITERATOR_HPP_
#include <iterator>
#include <iostream>
#include "../config.hpp"
#include <rocprim/iterator/transform_iterator.hpp>
#if (THRUST_VERSION >= 100700)
// This iterator is compatible with Thrust API 1.7 and newer
#include <thrust/iterator/iterator_facade.h>
#include <thrust/iterator/iterator_traits.h>
#endif // THRUST_VERSION
BEGIN_HIPCUB_NAMESPACE
#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document
template<
typename ValueType,
typename ConversionOp,
typename InputIteratorT,
typename OffsetT = std::ptrdiff_t // ignored
>
using TransformInputIterator = ::rocprim::transform_iterator<InputIteratorT, ConversionOp, ValueType>;
#endif
END_HIPCUB_NAMESPACE
#endif // HIPCUB_ROCPRIM_ITERATOR_TRANSFORM_INPUT_ITERATOR_HPP_
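// A minimal usage sketch of the alias above; the functor square and the device pointer d_in
// are hypothetical.
//
// struct square
// {
//     __host__ __device__ float operator()(float x) const { return x * x; }
// };
// ...
// hipcub::TransformInputIterator<float, square, float*> itr(d_in, square());
// // itr can be passed to device algorithms in place of d_in; dereferencing
// // itr[i] yields d_in[i] * d_in[i], computed on the fly.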
/******************************************************************************
* Copyright (c) 2011, Duane Merrill. All rights reserved.
* Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved.
* Modifications Copyright (c) 2022, Advanced Micro Devices, Inc. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of the NVIDIA CORPORATION nor the
* names of its contributors may be used to endorse or promote products
* derived from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
* DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
******************************************************************************/
#ifndef ROCPRIM_BLOCK_BLOCK_ADJACENT_DIFFERENCE_HPP_
#define ROCPRIM_BLOCK_BLOCK_ADJACENT_DIFFERENCE_HPP_
#include "detail/block_adjacent_difference_impl.hpp"
#include "../config.hpp"
#include "../detail/various.hpp"
/// \addtogroup blockmodule
/// @{
BEGIN_ROCPRIM_NAMESPACE
/// \brief The \p block_adjacent_difference class is a block level parallel primitive which provides
/// methods for applying binary functions to pairs of consecutive items partitioned across a thread
/// block.
///
/// \tparam T - the input type.
/// \tparam BlockSize - the number of threads in a block.
///
/// \par Overview
/// * There are two types of flags:
/// * Head flags.
/// * Tail flags.
/// * The above flags are used to differentiate items from their predecessors or successors.
/// * E.g. Head flags are convenient for differentiating disjoint data segments as part of a
/// segmented reduction/scan.
///
/// \par Examples
/// \parblock
/// In the examples, discontinuity operations are performed on a block of 128 threads, using type
/// \p int.
///
/// \code{.cpp}
/// __global__ void example_kernel(...)
/// {
/// // specialize discontinuity for int and a block of 128 threads
/// using block_adjacent_difference_int = rocprim::block_adjacent_difference<int, 128>;
/// // allocate storage in shared memory
/// __shared__ block_adjacent_difference_int::storage_type storage;
///
/// // segment of consecutive items to be used
/// int input[8];
/// ...
/// int head_flags[8];
/// block_adjacent_difference_int b_discontinuity;
/// using flag_op_type = typename rocprim::greater<int>;
/// b_discontinuity.flag_heads(head_flags, input, flag_op_type(), storage);
/// ...
/// }
/// \endcode
/// \endparblock
template<
class T,
unsigned int BlockSizeX,
unsigned int BlockSizeY = 1,
unsigned int BlockSizeZ = 1
>
class block_adjacent_difference
#ifndef DOXYGEN_SHOULD_SKIP_THIS // hide implementation detail from documentation
: private detail::block_adjacent_difference_impl<T, BlockSizeX, BlockSizeY, BlockSizeZ>
#endif // DOXYGEN_SHOULD_SKIP_THIS
{
using base_type = detail::block_adjacent_difference_impl<T, BlockSizeX, BlockSizeY, BlockSizeZ>;
static constexpr unsigned BlockSize = base_type::BlockSize;
// Struct used for creating a raw_storage object for this primitive's temporary storage.
struct storage_type_
{
typename base_type::storage_type left;
typename base_type::storage_type right;
};
public:
/// \brief Struct used to allocate a temporary memory that is required for thread
/// communication during operations provided by related parallel primitive.
///
    /// Depending on the implementation, the operations exposed by the parallel primitive may
    /// require temporary storage for thread communication. The storage should be allocated
    /// using the <tt>__shared__</tt> keyword. It can be aliased to externally allocated
    /// memory, or be a part of a union type with other storage types to increase shared
    /// memory reusability.
#ifndef DOXYGEN_SHOULD_SKIP_THIS // hides storage_type implementation for Doxygen
using storage_type = detail::raw_storage<storage_type_>;
#else
using storage_type = storage_type_;
#endif
/// \brief Tags \p head_flags that indicate discontinuities between items partitioned
/// across the thread block, where the first item has no reference and is always
/// flagged.
/// \deprecated The flags API of block_adjacent_difference is deprecated,
/// use subtract_left() or block_discontinuity::flag_heads() instead.
/// \tparam ItemsPerThread - [inferred] the number of items to be processed by
/// each thread.
/// \tparam Flag - [inferred] the flag type.
/// \tparam FlagOp - [inferred] type of binary function used for flagging.
///
/// \param [out] head_flags - array that contains the head flags.
/// \param [in] input - array that data is loaded from.
/// \param [in] flag_op - binary operation function object that will be used for flagging.
/// The signature of the function should be equivalent to the following:
    /// <tt>bool f(const T &a, const T &b);</tt> or <tt>bool f(const T& a, const T& b, unsigned int b_index);</tt>.
/// The signature does not need to have <tt>const &</tt>, but function object
/// must not modify the objects passed to it.
/// \param [in] storage - reference to a temporary storage object of type storage_type.
///
/// \par Storage reuse
/// Synchronization barrier should be placed before \p storage is reused
/// or repurposed: \p __syncthreads() or \p rocprim::syncthreads().
///
/// \par Example.
/// \code{.cpp}
/// __global__ void example_kernel(...)
/// {
/// // specialize discontinuity for int and a block of 128 threads
/// using block_adjacent_difference_int = rocprim::block_adjacent_difference<int, 128>;
/// // allocate storage in shared memory
/// __shared__ block_adjacent_difference_int::storage_type storage;
///
/// // segment of consecutive items to be used
/// int input[8];
/// ...
/// int head_flags[8];
/// block_adjacent_difference_int b_discontinuity;
/// using flag_op_type = typename rocprim::greater<int>;
/// b_discontinuity.flag_heads(head_flags, input, flag_op_type(), storage);
/// ...
/// }
/// \endcode
template<unsigned int ItemsPerThread, class Flag, class FlagOp>
    [[deprecated("The flags API of block_adjacent_difference is deprecated. "
"Use subtract_left or block_discontinuity.flag_heads instead.")]]
ROCPRIM_DEVICE ROCPRIM_INLINE
void flag_heads(Flag (&head_flags)[ItemsPerThread],
const T (&input)[ItemsPerThread],
FlagOp flag_op,
storage_type& storage)
{
static constexpr auto as_flags = true;
static constexpr auto reversed = true;
static constexpr auto with_predecessor = false;
base_type::template apply_left<as_flags, reversed, with_predecessor>(
input, head_flags, flag_op, input[0] /* predecessor */, storage.get().left);
}
/// \overload
/// \deprecated The flags API of block_adjacent_difference is deprecated,
/// use subtract_left() or block_discontinuity::flag_heads() instead.
/// This overload does not take a reference to temporary storage, instead it is declared as
/// part of the function itself. Note that this does NOT decrease the shared memory requirements
/// of a kernel using this function.
template<unsigned int ItemsPerThread, class Flag, class FlagOp>
    [[deprecated("The flags API of block_adjacent_difference is deprecated. "
"Use subtract_left or block_discontinuity.flag_heads instead.")]]
ROCPRIM_DEVICE ROCPRIM_FORCE_INLINE
void flag_heads(Flag (&head_flags)[ItemsPerThread],
const T (&input)[ItemsPerThread],
FlagOp flag_op)
{
ROCPRIM_SHARED_MEMORY storage_type storage;
flag_heads(head_flags, input, flag_op, storage);
}
/// \brief Tags \p head_flags that indicate discontinuities between items partitioned
/// across the thread block, where the first item of the first thread is compared against
/// a \p tile_predecessor_item.
/// \deprecated The flags API of block_adjacent_difference is deprecated,
/// use subtract_left() or block_discontinuity::flag_heads() instead.
///
/// \tparam ItemsPerThread - [inferred] the number of items to be processed by
/// each thread.
/// \tparam Flag - [inferred] the flag type.
/// \tparam FlagOp - [inferred] type of binary function used for flagging.
///
/// \param [out] head_flags - array that contains the head flags.
/// \param [in] tile_predecessor_item - first tile item from thread to be compared
/// against.
/// \param [in] input - array that data is loaded from.
/// \param [in] flag_op - binary operation function object that will be used for flagging.
/// The signature of the function should be equivalent to the following:
    /// <tt>bool f(const T &a, const T &b);</tt> or <tt>bool f(const T& a, const T& b, unsigned int b_index);</tt>.
/// The signature does not need to have <tt>const &</tt>, but function object
/// must not modify the objects passed to it.
/// \param [in] storage - reference to a temporary storage object of type storage_type.
///
/// \par Storage reuse
/// Synchronization barrier should be placed before \p storage is reused
/// or repurposed: \p __syncthreads() or \p rocprim::syncthreads().
///
/// \par Example.
/// \code{.cpp}
/// __global__ void example_kernel(...)
/// {
/// // specialize discontinuity for int and a block of 128 threads
/// using block_adjacent_difference_int = rocprim::block_adjacent_difference<int, 128>;
/// // allocate storage in shared memory
/// __shared__ block_adjacent_difference_int::storage_type storage;
///
/// // segment of consecutive items to be used
/// int input[8];
/// int tile_item = 0;
/// if (threadIdx.x == 0)
/// {
/// tile_item = ...
/// }
/// ...
/// int head_flags[8];
/// block_adjacent_difference_int b_discontinuity;
/// using flag_op_type = typename rocprim::greater<int>;
/// b_discontinuity.flag_heads(head_flags, tile_item, input, flag_op_type(),
/// storage);
/// ...
/// }
/// \endcode
template<unsigned int ItemsPerThread, class Flag, class FlagOp>
    [[deprecated("The flags API of block_adjacent_difference is deprecated. "
"Use subtract_left or block_discontinuity.flag_heads instead.")]]
ROCPRIM_DEVICE ROCPRIM_INLINE
void flag_heads(Flag (&head_flags)[ItemsPerThread],
T tile_predecessor_item,
const T (&input)[ItemsPerThread],
FlagOp flag_op,
storage_type& storage)
{
static constexpr auto as_flags = true;
static constexpr auto reversed = true;
static constexpr auto with_predecessor = true;
base_type::template apply_left<as_flags, reversed, with_predecessor>(
input, head_flags, flag_op, tile_predecessor_item, storage.get().left);
}
/// \overload
/// \deprecated The flags API of block_adjacent_difference is deprecated,
/// use subtract_left() or block_discontinuity::flag_heads() instead.
///
/// This overload does not accept a reference to temporary storage, instead it is declared as
/// part of the function itself. Note that this does NOT decrease the shared memory requirements
/// of a kernel using this function.
template<unsigned int ItemsPerThread, class Flag, class FlagOp>
    [[deprecated("The flags API of block_adjacent_difference is deprecated. "
"Use subtract_left or block_discontinuity.flag_heads instead.")]]
ROCPRIM_DEVICE ROCPRIM_FORCE_INLINE
void flag_heads(Flag (&head_flags)[ItemsPerThread],
T tile_predecessor_item,
const T (&input)[ItemsPerThread],
FlagOp flag_op)
{
ROCPRIM_SHARED_MEMORY storage_type storage;
flag_heads(head_flags, tile_predecessor_item, input, flag_op, storage);
}
/// \brief Tags \p tail_flags that indicate discontinuities between items partitioned
/// across the thread block, where the last item has no reference and is always
/// flagged.
/// \deprecated The flags API of block_adjacent_difference is deprecated,
/// use subtract_right() or block_discontinuity::flag_tails() instead.
///
/// \tparam ItemsPerThread - [inferred] the number of items to be processed by
/// each thread.
/// \tparam Flag - [inferred] the flag type.
/// \tparam FlagOp - [inferred] type of binary function used for flagging.
///
/// \param [out] tail_flags - array that contains the tail flags.
/// \param [in] input - array that data is loaded from.
/// \param [in] flag_op - binary operation function object that will be used for flagging.
/// The signature of the function should be equivalent to the following:
    /// <tt>bool f(const T &a, const T &b);</tt> or <tt>bool f(const T& a, const T& b, unsigned int b_index);</tt>.
/// The signature does not need to have <tt>const &</tt>, but function object
/// must not modify the objects passed to it.
/// \param [in] storage - reference to a temporary storage object of type storage_type.
///
/// \par Storage reuse
/// Synchronization barrier should be placed before \p storage is reused
/// or repurposed: \p __syncthreads() or \p rocprim::syncthreads().
///
/// \par Example.
/// \code{.cpp}
/// __global__ void example_kernel(...)
/// {
/// // specialize discontinuity for int and a block of 128 threads
/// using block_adjacent_difference_int = rocprim::block_adjacent_difference<int, 128>;
/// // allocate storage in shared memory
/// __shared__ block_adjacent_difference_int::storage_type storage;
///
/// // segment of consecutive items to be used
/// int input[8];
/// ...
/// int tail_flags[8];
/// block_adjacent_difference_int b_discontinuity;
/// using flag_op_type = typename rocprim::greater<int>;
/// b_discontinuity.flag_tails(tail_flags, input, flag_op_type(), storage);
/// ...
/// }
/// \endcode
template<unsigned int ItemsPerThread, class Flag, class FlagOp>
    [[deprecated("The flags API of block_adjacent_difference is deprecated. "
"Use subtract_right or block_discontinuity.flag_tails instead.")]]
ROCPRIM_DEVICE ROCPRIM_INLINE
void flag_tails(Flag (&tail_flags)[ItemsPerThread],
const T (&input)[ItemsPerThread],
FlagOp flag_op,
storage_type& storage)
{
static constexpr auto as_flags = true;
static constexpr auto reversed = true;
static constexpr auto with_successor = false;
base_type::template apply_right<as_flags, reversed, with_successor>(
input, tail_flags, flag_op, input[0] /* successor */, storage.get().right);
}
/// \overload
/// \deprecated The flags API of block_adjacent_difference is deprecated,
/// use subtract_right() or block_discontinuity::flag_tails() instead.
///
/// This overload does not accept a reference to temporary storage, instead it is declared as
/// part of the function itself. Note that this does NOT decrease the shared memory requirements
/// of a kernel using this function.
template<unsigned int ItemsPerThread, class Flag, class FlagOp>
    [[deprecated("The flags API of block_adjacent_difference is deprecated. "
"Use subtract_right or block_discontinuity.flag_tails instead.")]]
ROCPRIM_DEVICE ROCPRIM_FORCE_INLINE
void flag_tails(Flag (&tail_flags)[ItemsPerThread],
const T (&input)[ItemsPerThread],
FlagOp flag_op)
{
ROCPRIM_SHARED_MEMORY storage_type storage;
flag_tails(tail_flags, input, flag_op, storage);
}
/// \brief Tags \p tail_flags that indicate discontinuities between items partitioned
/// across the thread block, where the last item of the last thread is compared against
/// a \p tile_successor_item.
/// \deprecated The flags API of block_adjacent_difference is deprecated,
/// use subtract_right() or block_discontinuity::flag_tails() instead.
///
/// \tparam ItemsPerThread - [inferred] the number of items to be processed by
/// each thread.
/// \tparam Flag - [inferred] the flag type.
/// \tparam FlagOp - [inferred] type of binary function used for flagging.
///
/// \param [out] tail_flags - array that contains the tail flags.
/// \param [in] tile_successor_item - last tile item from thread to be compared
/// against.
/// \param [in] input - array that data is loaded from.
/// \param [in] flag_op - binary operation function object that will be used for flagging.
/// The signature of the function should be equivalent to the following:
    /// <tt>bool f(const T &a, const T &b);</tt> or <tt>bool f(const T& a, const T& b, unsigned int b_index);</tt>.
/// The signature does not need to have <tt>const &</tt>, but function object
/// must not modify the objects passed to it.
/// \param [in] storage - reference to a temporary storage object of type storage_type.
///
/// \par Storage reuse
/// Synchronization barrier should be placed before \p storage is reused
/// or repurposed: \p __syncthreads() or \p rocprim::syncthreads().
///
/// \par Example.
/// \code{.cpp}
/// __global__ void example_kernel(...)
/// {
/// // specialize discontinuity for int and a block of 128 threads
/// using block_adjacent_difference_int = rocprim::block_adjacent_difference<int, 128>;
/// // allocate storage in shared memory
/// __shared__ block_adjacent_difference_int::storage_type storage;
///
/// // segment of consecutive items to be used
/// int input[8];
/// int tile_item = 0;
/// if (threadIdx.x == 0)
/// {
/// tile_item = ...
/// }
/// ...
/// int tail_flags[8];
/// block_adjacent_difference_int b_discontinuity;
/// using flag_op_type = typename rocprim::greater<int>;
/// b_discontinuity.flag_tails(tail_flags, tile_item, input, flag_op_type(),
/// storage);
/// ...
/// }
/// \endcode
template<unsigned int ItemsPerThread, class Flag, class FlagOp>
    [[deprecated("The flags API of block_adjacent_difference is deprecated. "
"Use subtract_right or block_discontinuity.flag_tails instead.")]]
ROCPRIM_DEVICE ROCPRIM_INLINE
void flag_tails(Flag (&tail_flags)[ItemsPerThread],
T tile_successor_item,
const T (&input)[ItemsPerThread],
FlagOp flag_op,
storage_type& storage)
{
static constexpr auto as_flags = true;
static constexpr auto reversed = true;
static constexpr auto with_successor = true;
base_type::template apply_right<as_flags, reversed, with_successor>(
input, tail_flags, flag_op, tile_successor_item, storage.get().right);
}
/// \overload
/// \deprecated The flags API of block_adjacent_difference is deprecated,
/// use subtract_right() or block_discontinuity::flag_tails() instead.
///
/// This overload does not accept a reference to temporary storage, instead it is declared as
/// part of the function itself. Note that this does NOT decrease the shared memory requirements
/// of a kernel using this function.
template<unsigned int ItemsPerThread, class Flag, class FlagOp>
    [[deprecated("The flags API of block_adjacent_difference is deprecated. "
"Use subtract_right or block_discontinuity.flag_tails instead.")]]
ROCPRIM_DEVICE ROCPRIM_FORCE_INLINE
void flag_tails(Flag (&tail_flags)[ItemsPerThread],
T tile_successor_item,
const T (&input)[ItemsPerThread],
FlagOp flag_op)
{
ROCPRIM_SHARED_MEMORY storage_type storage;
flag_tails(tail_flags, tile_successor_item, input, flag_op, storage);
}
    /// \brief Tags both \p head_flags and \p tail_flags that indicate discontinuities
    /// between items partitioned across the thread block.
    /// \deprecated The flags API of block_adjacent_difference is deprecated,
    /// use block_discontinuity::flag_heads_and_tails() instead.
///
/// \tparam ItemsPerThread - [inferred] the number of items to be processed by
/// each thread.
/// \tparam Flag - [inferred] the flag type.
/// \tparam FlagOp - [inferred] type of binary function used for flagging.
///
/// \param [out] head_flags - array that contains the head flags.
/// \param [out] tail_flags - array that contains the tail flags.
/// \param [in] input - array that data is loaded from.
/// \param [in] flag_op - binary operation function object that will be used for flagging.
/// The signature of the function should be equivalent to the following:
    /// <tt>bool f(const T &a, const T &b);</tt> or <tt>bool f(const T& a, const T& b, unsigned int b_index);</tt>.
/// The signature does not need to have <tt>const &</tt>, but function object
/// must not modify the objects passed to it.
/// \param [in] storage - reference to a temporary storage object of type storage_type.
///
/// \par Storage reuse
/// Synchronization barrier should be placed before \p storage is reused
/// or repurposed: \p __syncthreads() or \p rocprim::syncthreads().
///
/// \par Example.
/// \code{.cpp}
/// __global__ void example_kernel(...)
/// {
/// // specialize discontinuity for int and a block of 128 threads
/// using block_adjacent_difference_int = rocprim::block_adjacent_difference<int, 128>;
/// // allocate storage in shared memory
/// __shared__ block_adjacent_difference_int::storage_type storage;
///
/// // segment of consecutive items to be used
/// int input[8];
/// ...
/// int head_flags[8];
/// int tail_flags[8];
/// block_adjacent_difference_int b_discontinuity;
/// using flag_op_type = typename rocprim::greater<int>;
/// b_discontinuity.flag_heads_and_tails(head_flags, tail_flags, input,
/// flag_op_type(), storage);
/// ...
/// }
/// \endcode
template<unsigned int ItemsPerThread, class Flag, class FlagOp>
    [[deprecated("The flags API of block_adjacent_difference is deprecated. "
"Use block_discontinuity.flag_heads_and_tails instead.")]]
ROCPRIM_DEVICE ROCPRIM_INLINE
void flag_heads_and_tails(Flag (&head_flags)[ItemsPerThread],
Flag (&tail_flags)[ItemsPerThread],
const T (&input)[ItemsPerThread],
FlagOp flag_op,
storage_type& storage)
{
static constexpr auto as_flags = true;
static constexpr auto reversed = true;
static constexpr auto with_predecessor = false;
static constexpr auto with_successor = false;
// Copy items in case head_flags is aliased with input
T items[ItemsPerThread];
ROCPRIM_UNROLL
for(unsigned int i = 0; i < ItemsPerThread; ++i) {
items[i] = input[i];
}
base_type::template apply_left<as_flags, reversed, with_predecessor>(
items, head_flags, flag_op, items[0] /*predecessor*/, storage.get().left);
base_type::template apply_right<as_flags, reversed, with_successor>(
items, tail_flags, flag_op, items[0] /*successor*/, storage.get().right);
}
/// \overload
/// \deprecated The flags API of block_adjacent_difference is deprecated,
/// use block_discontinuity::flag_heads_and_tails() instead.
///
/// This overload does not accept a reference to temporary storage, instead it is declared as
/// part of the function itself. Note that this does NOT decrease the shared memory requirements
/// of a kernel using this function.
template<unsigned int ItemsPerThread, class Flag, class FlagOp>
    [[deprecated("The flags API of block_adjacent_difference is deprecated. "
"Use block_discontinuity.flag_heads_and_tails instead.")]]
ROCPRIM_DEVICE ROCPRIM_FORCE_INLINE
void flag_heads_and_tails(Flag (&head_flags)[ItemsPerThread],
Flag (&tail_flags)[ItemsPerThread],
const T (&input)[ItemsPerThread],
FlagOp flag_op)
{
ROCPRIM_SHARED_MEMORY storage_type storage;
flag_heads_and_tails(head_flags, tail_flags, input, flag_op, storage);
}
    /// \brief Tags both \p head_flags and \p tail_flags that indicate discontinuities
/// between items partitioned across the thread block, where the last item of the
/// last thread is compared against a \p tile_successor_item.
/// \deprecated The flags API of block_adjacent_difference is deprecated,
/// use block_discontinuity::flag_heads_and_tails() instead.
///
/// \tparam ItemsPerThread - [inferred] the number of items to be processed by
/// each thread.
/// \tparam Flag - [inferred] the flag type.
/// \tparam FlagOp - [inferred] type of binary function used for flagging.
///
/// \param [out] head_flags - array that contains the head flags.
/// \param [out] tail_flags - array that contains the tail flags.
/// \param [in] tile_successor_item - last tile item from thread to be compared
/// against.
/// \param [in] input - array that data is loaded from.
/// \param [in] flag_op - binary operation function object that will be used for flagging.
/// The signature of the function should be equivalent to the following:
    /// <tt>bool f(const T &a, const T &b);</tt> or <tt>bool f(const T& a, const T& b, unsigned int b_index);</tt>.
/// The signature does not need to have <tt>const &</tt>, but function object
/// must not modify the objects passed to it.
/// \param [in] storage - reference to a temporary storage object of type storage_type.
///
/// \par Storage reuse
/// Synchronization barrier should be placed before \p storage is reused
/// or repurposed: \p __syncthreads() or \p rocprim::syncthreads().
///
/// \par Example.
/// \code{.cpp}
/// __global__ void example_kernel(...)
/// {
/// // specialize discontinuity for int and a block of 128 threads
/// using block_adjacent_difference_int = rocprim::block_adjacent_difference<int, 128>;
/// // allocate storage in shared memory
/// __shared__ block_adjacent_difference_int::storage_type storage;
///
/// // segment of consecutive items to be used
/// int input[8];
/// int tile_item = 0;
/// if (threadIdx.x == 0)
/// {
/// tile_item = ...
/// }
/// ...
/// int head_flags[8];
/// int tail_flags[8];
/// block_adjacent_difference_int b_discontinuity;
/// using flag_op_type = typename rocprim::greater<int>;
/// b_discontinuity.flag_heads_and_tails(head_flags, tail_flags, tile_item,
/// input, flag_op_type(),
/// storage);
/// ...
/// }
/// \endcode
template<unsigned int ItemsPerThread, class Flag, class FlagOp>
    [[deprecated("The flags API of block_adjacent_difference is deprecated. "
"Use block_discontinuity.flag_heads_and_tails instead.")]]
ROCPRIM_DEVICE ROCPRIM_INLINE
void flag_heads_and_tails(Flag (&head_flags)[ItemsPerThread],
Flag (&tail_flags)[ItemsPerThread],
T tile_successor_item,
const T (&input)[ItemsPerThread],
FlagOp flag_op,
storage_type& storage)
{
static constexpr auto as_flags = true;
static constexpr auto reversed = true;
static constexpr auto with_predecessor = false;
static constexpr auto with_successor = true;
// Copy items in case head_flags is aliased with input
T items[ItemsPerThread];
ROCPRIM_UNROLL
for(unsigned int i = 0; i < ItemsPerThread; ++i) {
items[i] = input[i];
}
base_type::template apply_left<as_flags, reversed, with_predecessor>(
items, head_flags, flag_op, items[0] /*predecessor*/, storage.get().left);
base_type::template apply_right<as_flags, reversed, with_successor>(
items, tail_flags, flag_op, tile_successor_item, storage.get().right);
}
/// \overload
/// \deprecated The flags API of block_adjacent_difference is deprecated,
/// use block_discontinuity::flag_heads_and_tails() instead.
///
/// This overload does not accept a reference to temporary storage, instead it is declared as
/// part of the function itself. Note that this does NOT decrease the shared memory requirements
/// of a kernel using this function.
template<unsigned int ItemsPerThread, class Flag, class FlagOp>
    [[deprecated("The flags API of block_adjacent_difference is deprecated. "
"Use block_discontinuity.flag_heads_and_tails instead.")]]
ROCPRIM_DEVICE ROCPRIM_FORCE_INLINE
void flag_heads_and_tails(Flag (&head_flags)[ItemsPerThread],
Flag (&tail_flags)[ItemsPerThread],
T tile_successor_item,
const T (&input)[ItemsPerThread],
FlagOp flag_op)
{
ROCPRIM_SHARED_MEMORY storage_type storage;
flag_heads_and_tails(head_flags, tail_flags, tile_successor_item, input, flag_op, storage);
}
    /// \brief Tags both \p head_flags and \p tail_flags that indicate discontinuities
/// between items partitioned across the thread block, where the first item of the
/// first thread is compared against a \p tile_predecessor_item.
/// \deprecated The flags API of block_adjacent_difference is deprecated,
/// use block_discontinuity::flag_heads_and_tails() instead.
///
/// \tparam ItemsPerThread - [inferred] the number of items to be processed by
/// each thread.
/// \tparam Flag - [inferred] the flag type.
/// \tparam FlagOp - [inferred] type of binary function used for flagging.
///
/// \param [out] head_flags - array that contains the head flags.
/// \param [in] tile_predecessor_item - first tile item from thread to be compared
/// against.
/// \param [out] tail_flags - array that contains the tail flags.
/// \param [in] input - array that data is loaded from.
/// \param [in] flag_op - binary operation function object that will be used for flagging.
/// The signature of the function should be equivalent to the following:
    /// <tt>bool f(const T &a, const T &b);</tt> or <tt>bool f(const T& a, const T& b, unsigned int b_index);</tt>.
/// The signature does not need to have <tt>const &</tt>, but function object
/// must not modify the objects passed to it.
/// \param [in] storage - reference to a temporary storage object of type storage_type.
///
/// \par Storage reuse
/// Synchronization barrier should be placed before \p storage is reused
/// or repurposed: \p __syncthreads() or \p rocprim::syncthreads().
///
/// \par Example.
/// \code{.cpp}
/// __global__ void example_kernel(...)
/// {
/// // specialize discontinuity for int and a block of 128 threads
/// using block_adjacent_difference_int = rocprim::block_adjacent_difference<int, 128>;
/// // allocate storage in shared memory
/// __shared__ block_adjacent_difference_int::storage_type storage;
///
/// // segment of consecutive items to be used
/// int input[8];
/// int tile_item = 0;
/// if (threadIdx.x == 0)
/// {
/// tile_item = ...
/// }
/// ...
/// int head_flags[8];
/// int tail_flags[8];
/// block_adjacent_difference_int b_discontinuity;
/// using flag_op_type = typename rocprim::greater<int>;
/// b_discontinuity.flag_heads_and_tails(head_flags, tile_item, tail_flags,
/// input, flag_op_type(),
/// storage);
/// ...
/// }
/// \endcode
template<unsigned int ItemsPerThread, class Flag, class FlagOp>
    [[deprecated("The flags API of block_adjacent_difference is deprecated. "
"Use block_discontinuity.flag_heads_and_tails instead.")]]
ROCPRIM_DEVICE ROCPRIM_INLINE
void flag_heads_and_tails(Flag (&head_flags)[ItemsPerThread],
T tile_predecessor_item,
Flag (&tail_flags)[ItemsPerThread],
const T (&input)[ItemsPerThread],
FlagOp flag_op,
storage_type& storage)
{
static constexpr auto as_flags = true;
static constexpr auto reversed = true;
static constexpr auto with_predecessor = true;
static constexpr auto with_successor = false;
// Copy items in case head_flags is aliased with input
T items[ItemsPerThread];
ROCPRIM_UNROLL
for(unsigned int i = 0; i < ItemsPerThread; ++i) {
items[i] = input[i];
}
base_type::template apply_left<as_flags, reversed, with_predecessor>(
items, head_flags, flag_op, tile_predecessor_item, storage.get().left);
base_type::template apply_right<as_flags, reversed, with_successor>(
items, tail_flags, flag_op, items[0] /*successor*/, storage.get().right);
}
/// \overload
/// \deprecated The flags API of block_adjacent_difference is deprecated,
/// use block_discontinuity::flag_heads_and_tails() instead.
///
/// This overload does not accept a reference to temporary storage, instead it is declared as
/// part of the function itself. Note that this does NOT decrease the shared memory requirements
/// of a kernel using this function.
template<unsigned int ItemsPerThread, class Flag, class FlagOp>
    [[deprecated("The flags API of block_adjacent_difference is deprecated. "
"Use block_discontinuity.flag_heads_and_tails instead.")]]
ROCPRIM_DEVICE ROCPRIM_FORCE_INLINE
void flag_heads_and_tails(Flag (&head_flags)[ItemsPerThread],
T tile_predecessor_item,
Flag (&tail_flags)[ItemsPerThread],
const T (&input)[ItemsPerThread],
FlagOp flag_op)
{
ROCPRIM_SHARED_MEMORY storage_type storage;
flag_heads_and_tails(head_flags, tile_predecessor_item, tail_flags, input, flag_op, storage);
}
    /// \brief Tags both \p head_flags and \p tail_flags that indicate discontinuities
    /// between items partitioned across the thread block, where the first and last items of
    /// the first and last threads are compared against a \p tile_predecessor_item and
    /// a \p tile_successor_item, respectively.
/// \deprecated The flags API of block_adjacent_difference is deprecated,
/// use block_discontinuity::flag_heads_and_tails() instead.
///
/// \tparam ItemsPerThread - [inferred] the number of items to be processed by
/// each thread.
/// \tparam Flag - [inferred] the flag type.
/// \tparam FlagOp - [inferred] type of binary function used for flagging.
///
/// \param [out] head_flags - array that contains the head flags.
/// \param [in] tile_predecessor_item - first tile item from thread to be compared
/// against.
/// \param [out] tail_flags - array that contains the tail flags.
/// \param [in] tile_successor_item - last tile item from thread to be compared
/// against.
/// \param [in] input - array that data is loaded from.
/// \param [in] flag_op - binary operation function object that will be used for flagging.
/// The signature of the function should be equivalent to the following:
    /// <tt>bool f(const T &a, const T &b);</tt> or <tt>bool f(const T& a, const T& b, unsigned int b_index);</tt>.
/// The signature does not need to have <tt>const &</tt>, but function object
/// must not modify the objects passed to it.
/// \param [in] storage - reference to a temporary storage object of type storage_type.
///
/// \par Storage reuse
/// Synchronization barrier should be placed before \p storage is reused
/// or repurposed: \p __syncthreads() or \p rocprim::syncthreads().
///
/// \par Example.
/// \code{.cpp}
/// __global__ void example_kernel(...)
/// {
/// // specialize discontinuity for int and a block of 128 threads
/// using block_adjacent_difference_int = rocprim::block_adjacent_difference<int, 128>;
/// // allocate storage in shared memory
/// __shared__ block_adjacent_difference_int::storage_type storage;
///
/// // segment of consecutive items to be used
/// int input[8];
/// int tile_predecessor_item = 0;
/// int tile_successor_item = 0;
/// if (threadIdx.x == 0)
/// {
/// tile_predecessor_item = ...
/// tile_successor_item = ...
/// }
/// ...
/// int head_flags[8];
/// int tail_flags[8];
/// block_adjacent_difference_int b_discontinuity;
/// using flag_op_type = typename rocprim::greater<int>;
/// b_discontinuity.flag_heads_and_tails(head_flags, tile_predecessor_item,
/// tail_flags, tile_successor_item,
/// input, flag_op_type(),
/// storage);
/// ...
/// }
/// \endcode
template<unsigned int ItemsPerThread, class Flag, class FlagOp>
    [[deprecated("The flags API of block_adjacent_difference is deprecated. "
"Use block_discontinuity.flag_heads_and_tails instead.")]]
ROCPRIM_DEVICE ROCPRIM_INLINE
void flag_heads_and_tails(Flag (&head_flags)[ItemsPerThread],
T tile_predecessor_item,
Flag (&tail_flags)[ItemsPerThread],
T tile_successor_item,
const T (&input)[ItemsPerThread],
FlagOp flag_op,
storage_type& storage)
{
static constexpr auto as_flags = true;
static constexpr auto reversed = true;
static constexpr auto with_predecessor = true;
static constexpr auto with_successor = true;
// Copy items in case head_flags is aliased with input
T items[ItemsPerThread];
ROCPRIM_UNROLL
for(unsigned int i = 0; i < ItemsPerThread; ++i) {
items[i] = input[i];
}
base_type::template apply_left<as_flags, reversed, with_predecessor>(
items, head_flags, flag_op, tile_predecessor_item, storage.get().left);
base_type::template apply_right<as_flags, reversed, with_successor>(
items, tail_flags, flag_op, tile_successor_item, storage.get().right);
}
/// \overload
/// \deprecated The flags API of block_adjacent_difference is deprecated,
/// use block_discontinuity::flag_heads_and_tails() instead.
///
/// This overload does not accept a reference to temporary storage, instead it is declared as
/// part of the function itself. Note that this does NOT decrease the shared memory requirements
/// of a kernel using this function.
template<unsigned int ItemsPerThread, class Flag, class FlagOp>
    [[deprecated("The flags API of block_adjacent_difference is deprecated. "
"Use block_discontinuity.flag_heads_and_tails instead.")]]
ROCPRIM_DEVICE ROCPRIM_FORCE_INLINE
void flag_heads_and_tails(Flag (&head_flags)[ItemsPerThread],
T tile_predecessor_item,
Flag (&tail_flags)[ItemsPerThread],
T tile_successor_item,
const T (&input)[ItemsPerThread],
FlagOp flag_op)
{
ROCPRIM_SHARED_MEMORY storage_type storage;
flag_heads_and_tails(
head_flags, tile_predecessor_item, tail_flags, tile_successor_item,
input, flag_op, storage
);
}
/// \brief Apply a function to each consecutive pair of elements partitioned across threads in
/// the block and write the output to the position of the left item.
///
    /// The first item in the first thread is copied from the input; for the remaining items
    /// the following code applies.
/// \code
/// // For each i in [1, block_size * ItemsPerThread) across threads in a block
/// output[i] = op(input[i], input[i-1]);
/// \endcode
///
/// \tparam Output - [inferred] the type of output, must be assignable from the result of `op`
/// \tparam ItemsPerThread - [inferred] the number of items processed by each thread
/// \tparam BinaryFunction - [inferred] the type of the function to apply
/// \param [in] input - array that data is loaded from partitioned across the threads in the block
/// \param [out] output - array where the result of function application will be written to
/// \param [in] op - binary function applied to the items.
/// The signature of the function should be equivalent to the following:
    /// `T f(const T &a, const T &b)`. The signature does not need to have
    /// `const &`, but the function object must not modify the objects passed to it.
/// \param storage reference to a temporary storage object of type #storage_type
/// \par Storage reuse
/// Synchronization barrier should be placed before `storage` is reused
/// or repurposed: `__syncthreads()` or \link syncthreads() rocprim::syncthreads() \endlink.
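    ///
    /// \par Example.
    /// A minimal sketch; the block size of 128, 8 items per thread and
    /// rocprim::minus<int> as the difference operator are illustrative choices, not
    /// requirements of this method.
    /// \code{.cpp}
    /// __global__ void example_kernel(...)
    /// {
    ///     // specialize the primitive for int and a block of 128 threads
    ///     using block_adjacent_difference_int = rocprim::block_adjacent_difference<int, 128>;
    ///     // allocate storage in shared memory
    ///     __shared__ block_adjacent_difference_int::storage_type storage;
    ///
    ///     int input[8];
    ///     ...
    ///     int output[8];
    ///     block_adjacent_difference_int adjacent_difference;
    ///     // output[i] = input[i] - input[i-1]; the very first item is copied through
    ///     adjacent_difference.subtract_left(input, output, rocprim::minus<int>(), storage);
    ///     ...
    /// }
    /// \endcode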
template <typename Output, unsigned int ItemsPerThread, typename BinaryFunction>
ROCPRIM_DEVICE ROCPRIM_INLINE void subtract_left(const T (&input)[ItemsPerThread],
Output (&output)[ItemsPerThread],
const BinaryFunction op,
storage_type& storage)
{
static constexpr auto as_flags = false;
static constexpr auto reversed = true;
static constexpr auto with_predecessor = false;
base_type::template apply_left<as_flags, reversed, with_predecessor>(
input, output, op, input[0] /* predecessor */, storage.get().left);
}
/// \brief Apply a function to each consecutive pair of elements partitioned across threads in
/// the block and write the output to the position of the left item, with an explicit item before
/// the tile.
///
/// \code
/// // For the first item on the first thread use the tile predecessor
/// output[0] = op(input[0], tile_predecessor)
/// // For other items, i in [1, block_size * ItemsPerThread) across threads in a block
/// output[i] = op(input[i], input[i-1]);
/// \endcode
///
/// \tparam Output - [inferred] the type of output, must be assignable from the result of `op`
/// \tparam ItemsPerThread - [inferred] the number of items processed by each thread
/// \tparam BinaryFunction - [inferred] the type of the function to apply
/// \param [in] input - array that data is loaded from partitioned across the threads in the block
/// \param [out] output - array where the result of function application will be written to
/// \param [in] op - binary function applied to the items.
/// The signature of the function should be equivalent to the following:
    /// `T f(const T &a, const T &b)`. The signature does not need to have
    /// `const &`, but the function object must not modify the objects passed to it.
    /// \param [in] tile_predecessor - the item before the tile; it will be used as the second
    /// argument of the first application of `op`
/// \param storage - reference to a temporary storage object of type #storage_type
/// \par Storage reuse
/// Synchronization barrier should be placed before `storage` is reused
/// or repurposed: `__syncthreads()` or \link syncthreads() rocprim::syncthreads() \endlink.
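    ///
    /// \par Example.
    /// A minimal sketch with an explicit predecessor; the block size, items per thread,
    /// rocprim::minus<int> and the way \p tile_predecessor is obtained are illustrative
    /// choices.
    /// \code{.cpp}
    /// __global__ void example_kernel(...)
    /// {
    ///     using block_adjacent_difference_int = rocprim::block_adjacent_difference<int, 128>;
    ///     __shared__ block_adjacent_difference_int::storage_type storage;
    ///
    ///     int input[8];
    ///     int tile_predecessor = 0;
    ///     if (threadIdx.x == 0)
    ///     {
    ///         tile_predecessor = ... // e.g. the last item of the preceding tile
    ///     }
    ///     ...
    ///     int output[8];
    ///     block_adjacent_difference_int adjacent_difference;
    ///     // on the first thread: output[0] = op(input[0], tile_predecessor)
    ///     adjacent_difference.subtract_left(input, output, rocprim::minus<int>(),
    ///                                       tile_predecessor, storage);
    ///     ...
    /// }
    /// \endcode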
template <typename Output, unsigned int ItemsPerThread, typename BinaryFunction>
ROCPRIM_DEVICE ROCPRIM_INLINE void subtract_left(const T (&input)[ItemsPerThread],
Output (&output)[ItemsPerThread],
const BinaryFunction op,
const T tile_predecessor,
storage_type& storage)
{
static constexpr auto as_flags = false;
static constexpr auto reversed = true;
static constexpr auto with_predecessor = true;
base_type::template apply_left<as_flags, reversed, with_predecessor>(
input, output, op, tile_predecessor, storage.get().left);
}
/// \brief Apply a function to each consecutive pair of elements partitioned across threads in
/// the block and write the output to the position of the left item, in a partial tile.
///
/// \code
/// output[0] = input[0]
/// // For each item i in [1, valid_items) across threads in a block
/// output[i] = op(input[i], input[i-1]);
/// // Just copy "invalid" items in [valid_items, block_size * ItemsPerThread)
/// output[i] = input[i]
/// \endcode
///
/// \tparam Output - [inferred] the type of output, must be assignable from the result of `op`
/// \tparam ItemsPerThread - [inferred] the number of items processed by each thread
/// \tparam BinaryFunction - [inferred] the type of the function to apply
/// \param [in] input - array that data is loaded from partitioned across the threads in the block
/// \param [out] output - array where the result of function application will be written to
/// \param [in] op - binary function applied to the items.
/// The signature of the function should be equivalent to the following:
    /// `T f(const T &a, const T &b)`. The signature does not need to have
    /// `const &`, but the function object must not modify the objects passed to it.
/// \param [in] valid_items - number of items in the block which are considered "valid" and will
    /// be used. Must be less than or equal to `BlockSize` * `ItemsPerThread`
/// \param storage - reference to a temporary storage object of type #storage_type
/// \par Storage reuse
/// Synchronization barrier should be placed before `storage` is reused
/// or repurposed: `__syncthreads()` or \link syncthreads() rocprim::syncthreads() \endlink.
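    ///
    /// \par Example.
    /// A minimal sketch of a partial tile; the block size, items per thread and the source of
    /// \p valid_items are illustrative choices.
    /// \code{.cpp}
    /// __global__ void example_kernel(...)
    /// {
    ///     using block_adjacent_difference_int = rocprim::block_adjacent_difference<int, 128>;
    ///     __shared__ block_adjacent_difference_int::storage_type storage;
    ///
    ///     int input[8];
    ///     unsigned int valid_items = ... // e.g. the size of the last, incomplete tile
    ///     ...
    ///     int output[8];
    ///     block_adjacent_difference_int adjacent_difference;
    ///     // items at positions >= valid_items are copied through unchanged
    ///     adjacent_difference.subtract_left_partial(input, output, rocprim::minus<int>(),
    ///                                               valid_items, storage);
    ///     ...
    /// }
    /// \endcode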
template <typename Output, unsigned int ItemsPerThread, typename BinaryFunction>
ROCPRIM_DEVICE ROCPRIM_INLINE void subtract_left_partial(const T (&input)[ItemsPerThread],
Output (&output)[ItemsPerThread],
const BinaryFunction op,
const unsigned int valid_items,
storage_type& storage)
{
static constexpr auto as_flags = false;
static constexpr auto reversed = true;
static constexpr auto with_predecessor = false;
base_type::template apply_left_partial<as_flags, reversed, with_predecessor>(
input, output, op, input[0] /* predecessor */, valid_items, storage.get().left);
}
/// \brief Apply a function to each consecutive pair of elements partitioned across threads in
/// the block and write the output to the position of the left item, in a partial tile with a
/// predecessor.
///
/// This combines subtract_left_partial() with a tile predecessor.
/// \tparam Output - [inferred] the type of output, must be assignable from the result of `op`
/// \tparam ItemsPerThread - [inferred] the number of items processed by each thread
/// \tparam BinaryFunction - [inferred] the type of the function to apply
/// \param [in] input - array that data is loaded from partitioned across the threads in the block
/// \param [out] output - array where the result of function application will be written to
/// \param [in] op - binary function applied to the items.
/// The signature of the function should be equivalent to the following:
    /// `T f(const T &a, const T &b)`. The signature does not need to have
    /// `const &`, but the function object must not modify the objects passed to it.
    /// \param [in] tile_predecessor - the item before the tile; it will be used as the second
    /// argument of the first application of `op`
/// \param [in] valid_items - number of items in the block which are considered "valid" and will
    /// be used. Must be less than or equal to `BlockSize` * `ItemsPerThread`
/// \param storage - reference to a temporary storage object of type #storage_type
/// \par Storage reuse
/// Synchronization barrier should be placed before `storage` is reused
/// or repurposed: `__syncthreads()` or \link syncthreads() rocprim::syncthreads() \endlink.
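    ///
    /// \par Example.
    /// A minimal sketch combining a partial tile with an explicit predecessor; block size,
    /// items per thread and the sources of \p tile_predecessor and \p valid_items are
    /// illustrative choices.
    /// \code{.cpp}
    /// __global__ void example_kernel(...)
    /// {
    ///     using block_adjacent_difference_int = rocprim::block_adjacent_difference<int, 128>;
    ///     __shared__ block_adjacent_difference_int::storage_type storage;
    ///
    ///     int input[8];
    ///     int tile_predecessor = 0;
    ///     if (threadIdx.x == 0)
    ///     {
    ///         tile_predecessor = ...
    ///     }
    ///     unsigned int valid_items = ...
    ///     ...
    ///     int output[8];
    ///     block_adjacent_difference_int adjacent_difference;
    ///     adjacent_difference.subtract_left_partial(input, output, rocprim::minus<int>(),
    ///                                               tile_predecessor, valid_items, storage);
    ///     ...
    /// }
    /// \endcode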
template <typename Output, unsigned int ItemsPerThread, typename BinaryFunction>
ROCPRIM_DEVICE ROCPRIM_INLINE void subtract_left_partial(const T (&input)[ItemsPerThread],
Output (&output)[ItemsPerThread],
const BinaryFunction op,
const T tile_predecessor,
const unsigned int valid_items,
storage_type& storage)
{
static constexpr auto as_flags = false;
static constexpr auto reversed = true;
static constexpr auto with_predecessor = true;
base_type::template apply_left_partial<as_flags, reversed, with_predecessor>(
input, output, op, tile_predecessor, valid_items, storage.get().left);
}
/// \brief Apply a function to each consecutive pair of elements partitioned across threads in
/// the block and write the output to the position of the right item.
///
    /// The last item in the last thread is copied from the input; for the remaining items
    /// the following code applies.
/// \code
/// // For each i in [0, block_size * ItemsPerThread - 1) across threads in a block
/// output[i] = op(input[i], input[i+1]);
/// \endcode
///
/// \tparam Output - [inferred] the type of output, must be assignable from the result of `op`
/// \tparam ItemsPerThread - [inferred] the number of items processed by each thread
/// \tparam BinaryFunction - [inferred] the type of the function to apply
/// \param [in] input - array that data is loaded from partitioned across the threads in the block
/// \param [out] output - array where the result of function application will be written to
/// \param [in] op - binary function applied to the items.
/// The signature of the function should be equivalent to the following:
    /// `T f(const T &a, const T &b)`. The signature does not need to have
    /// `const &`, but the function object must not modify the objects passed to it.
/// \param storage - reference to a temporary storage object of type #storage_type
/// \par Storage reuse
/// Synchronization barrier should be placed before `storage` is reused
/// or repurposed: `__syncthreads()` or \link syncthreads() rocprim::syncthreads() \endlink.
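    ///
    /// \par Example.
    /// A minimal sketch; block size, items per thread and rocprim::minus<int> are
    /// illustrative choices.
    /// \code{.cpp}
    /// __global__ void example_kernel(...)
    /// {
    ///     using block_adjacent_difference_int = rocprim::block_adjacent_difference<int, 128>;
    ///     __shared__ block_adjacent_difference_int::storage_type storage;
    ///
    ///     int input[8];
    ///     ...
    ///     int output[8];
    ///     block_adjacent_difference_int adjacent_difference;
    ///     // output[i] = input[i] - input[i+1]; the very last item is copied through
    ///     adjacent_difference.subtract_right(input, output, rocprim::minus<int>(), storage);
    ///     ...
    /// }
    /// \endcode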
template <typename Output, unsigned int ItemsPerThread, typename BinaryFunction>
ROCPRIM_DEVICE ROCPRIM_INLINE void subtract_right(const T (&input)[ItemsPerThread],
Output (&output)[ItemsPerThread],
const BinaryFunction op,
storage_type& storage)
{
static constexpr auto as_flags = false;
static constexpr auto reversed = false;
static constexpr auto with_successor = false;
base_type::template apply_right<as_flags, reversed, with_successor>(
input, output, op, input[0] /* successor */, storage.get().right);
}
/// \brief Apply a function to each consecutive pair of elements partitioned across threads in
/// the block and write the output to the position of the right item, with an explicit item after
/// the tile.
///
/// \code
    /// // For each item i in [0, block_size * ItemsPerThread - 1) across threads in a block
/// output[i] = op(input[i], input[i+1]);
/// // For the last item on the last thread use the tile successor
/// output[block_size * ItemsPerThread - 1] =
/// op(input[block_size * ItemsPerThread - 1], tile_successor)
/// \endcode
///
/// \tparam Output - [inferred] the type of output, must be assignable from the result of `op`
/// \tparam ItemsPerThread - [inferred] the number of items processed by each thread
/// \tparam BinaryFunction - [inferred] the type of the function to apply
/// \param [in] input - array that data is loaded from partitioned across the threads in the block
/// \param [out] output - array where the result of function application will be written to
/// \param [in] op - binary function applied to the items.
/// The signature of the function should be equivalent to the following:
    /// `T f(const T &a, const T &b)`. The signature does not need to have
    /// `const &`, but the function object must not modify the objects passed to it.
    /// \param [in] tile_successor - the item after the tile; it will be used as the second
    /// argument of the last application of `op`
/// \param storage - reference to a temporary storage object of type #storage_type
/// \par Storage reuse
/// Synchronization barrier should be placed before `storage` is reused
/// or repurposed: `__syncthreads()` or \link syncthreads() rocprim::syncthreads() \endlink.
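    ///
    /// \par Example.
    /// A minimal sketch with an explicit successor; block size, items per thread and the way
    /// \p tile_successor is obtained are illustrative choices.
    /// \code{.cpp}
    /// __global__ void example_kernel(...)
    /// {
    ///     using block_adjacent_difference_int = rocprim::block_adjacent_difference<int, 128>;
    ///     __shared__ block_adjacent_difference_int::storage_type storage;
    ///
    ///     int input[8];
    ///     int tile_successor = 0;
    ///     if (threadIdx.x == 127) // last thread of the 128-thread block in this sketch
    ///     {
    ///         tile_successor = ... // e.g. the first item of the following tile
    ///     }
    ///     ...
    ///     int output[8];
    ///     block_adjacent_difference_int adjacent_difference;
    ///     // on the last thread: output[7] = op(input[7], tile_successor)
    ///     adjacent_difference.subtract_right(input, output, rocprim::minus<int>(),
    ///                                        tile_successor, storage);
    ///     ...
    /// }
    /// \endcode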
template <typename Output, unsigned int ItemsPerThread, typename BinaryFunction>
ROCPRIM_DEVICE ROCPRIM_INLINE void subtract_right(const T (&input)[ItemsPerThread],
Output (&output)[ItemsPerThread],
const BinaryFunction op,
const T tile_successor,
storage_type& storage)
{
static constexpr auto as_flags = false;
static constexpr auto reversed = false;
static constexpr auto with_successor = true;
base_type::template apply_right<as_flags, reversed, with_successor>(
input, output, op, tile_successor, storage.get().right);
}
/// \brief Apply a function to each consecutive pair of elements partitioned across threads in
/// the block and write the output to the position of the right item, in a partial tile.
///
/// \code
/// // For each item i in [0, valid_items) across threads in a block
/// output[i] = op(input[i], input[i + 1]);
/// // Just copy "invalid" items in [valid_items, block_size * ItemsPerThread)
/// output[i] = input[i]
/// \endcode
///
/// \tparam Output - [inferred] the type of output, must be assignable from the result of `op`
/// \tparam ItemsPerThread - [inferred] the number of items processed by each thread
/// \tparam BinaryFunction - [inferred] the type of the function to apply
/// \param [in] input - array that data is loaded from partitioned across the threads in the block
/// \param [out] output - array where the result of function application will be written to
/// \param [in] op - binary function applied to the items.
/// The signature of the function should be equivalent to the following:
    /// `T f(const T &a, const T &b)`. The signature does not need to have
    /// `const &`, but the function object must not modify the objects passed to it.
/// \param [in] valid_items - number of items in the block which are considered "valid" and will
/// be used. Must be less or equal to `BlockSize` * `ItemsPerThread`
/// \param storage - reference to a temporary storage object of type #storage_type
/// \par Storage reuse
/// Synchronization barrier should be placed before `storage` is reused
/// or repurposed: `__syncthreads()` or \link syncthreads() rocprim::syncthreads() \endlink.
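///
/// \par Example.
/// A minimal sketch of a partial-tile difference (the kernel body and the number of
/// valid items are illustrative only):
/// \code{.cpp}
/// __global__ void example_kernel(...)
/// {
/// // specialize block_adjacent_difference for int and a block of 128 threads
/// using block_adjacent_difference_int = rocprim::block_adjacent_difference<int, 128>;
/// // allocate storage in shared memory
/// __shared__ block_adjacent_difference_int::storage_type storage;
///
/// int input[8];
/// int output[8];
/// ...
/// block_adjacent_difference_int adjacent_difference;
/// // e.g. only the first 100 of the 128 * 8 items hold valid data
/// adjacent_difference.subtract_right_partial(input, output, rocprim::minus<int>(),
/// 100, storage);
/// ...
/// }
/// \endcode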
template <typename Output, unsigned int ItemsPerThread, typename BinaryFunction>
ROCPRIM_DEVICE ROCPRIM_INLINE void subtract_right_partial(const T (&input)[ItemsPerThread],
Output (&output)[ItemsPerThread],
const BinaryFunction op,
const unsigned int valid_items,
storage_type& storage)
{
static constexpr auto as_flags = false;
static constexpr auto reversed = false;
base_type::template apply_right_partial<as_flags, reversed>(
input, output, op, valid_items, storage.get().right);
}
};
END_ROCPRIM_NAMESPACE
/// @}
// end of group blockmodule
#endif // ROCPRIM_BLOCK_BLOCK_ADJACENT_DIFFERENCE_HPP_
// Copyright (c) 2017-2022 Advanced Micro Devices, Inc. All rights reserved.
//
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to deal
// in the Software without restriction, including without limitation the rights
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
// copies of the Software, and to permit persons to whom the Software is
// furnished to do so, subject to the following conditions:
//
// The above copyright notice and this permission notice shall be included in
// all copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
// THE SOFTWARE.
#ifndef ROCPRIM_BLOCK_BLOCK_DISCONTINUITY_HPP_
#define ROCPRIM_BLOCK_BLOCK_DISCONTINUITY_HPP_
#include "detail/block_adjacent_difference_impl.hpp"
#include "../config.hpp"
#include "../detail/various.hpp"
/// \addtogroup blockmodule
/// @{
BEGIN_ROCPRIM_NAMESPACE
/// \brief The \p block_discontinuity class is a block level parallel primitive which provides
/// methods for flagging items that mark discontinuities within an ordered set of items across
/// threads in a block.
///
/// \tparam T - the input type.
/// \tparam BlockSizeX - the number of threads in a block's x dimension.
/// \tparam BlockSizeY - the number of threads in a block's y dimension, defaults to 1.
/// \tparam BlockSizeZ - the number of threads in a block's z dimension, defaults to 1.
///
/// \par Overview
/// * There are two types of flags:
/// * Head flags.
/// * Tail flags.
/// * The above flags are used to differentiate items from their predecessors or successors.
/// * E.g. Head flags are convenient for differentiating disjoint data segments as part of a
/// segmented reduction/scan.
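///
/// \par
/// A sketch of the flag semantics (with \p flag_op applied to each neighbouring pair;
/// the first head flag and the last tail flag are always set):
/// \code
/// head_flags[i] = (i == 0) ? 1 : flag_op(input[i - 1], input[i]);
/// tail_flags[i] = (i == last) ? 1 : flag_op(input[i], input[i + 1]);
/// \endcode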
///
/// \par Examples
/// \parblock
/// In the examples, the discontinuity operation is performed on a block of 128 threads,
/// using type \p int.
///
/// \code{.cpp}
/// __global__ void example_kernel(...)
/// {
/// // specialize discontinuity for int and a block of 128 threads
/// using block_discontinuity_int = rocprim::block_discontinuity<int, 128>;
/// // allocate storage in shared memory
/// __shared__ block_discontinuity_int::storage_type storage;
///
/// // segment of consecutive items to be used
/// int input[8];
/// ...
/// int head_flags[8];
/// block_discontinuity_int b_discontinuity;
/// using flag_op_type = typename rocprim::greater<int>;
/// b_discontinuity.flag_heads(head_flags, input, flag_op_type(), storage);
/// ...
/// }
/// \endcode
/// \endparblock
template<
class T,
unsigned int BlockSizeX,
unsigned int BlockSizeY = 1,
unsigned int BlockSizeZ = 1
>
class block_discontinuity
#ifndef DOXYGEN_SHOULD_SKIP_THIS // hide implementation detail from documentation
: private detail::block_adjacent_difference_impl<T, BlockSizeX, BlockSizeY, BlockSizeZ>
#endif // DOXYGEN_SHOULD_SKIP_THIS
{
using base_type = detail::block_adjacent_difference_impl<T, BlockSizeX, BlockSizeY, BlockSizeZ>;
static constexpr unsigned BlockSize = base_type::BlockSize;
// Struct used for creating a raw_storage object for this primitive's temporary storage.
struct storage_type_
{
typename base_type::storage_type left;
typename base_type::storage_type right;
};
public:
/// \brief Struct used to allocate a temporary memory that is required for thread
/// communication during operations provided by related parallel primitive.
///
/// Depending on the implementation the operations exposed by the parallel primitive may
/// require temporary storage for thread communication. The storage should be allocated
/// using the <tt>__shared__</tt> keyword. It can be aliased to
/// an externally allocated memory, or be a part of a union type with other storage types
/// to increase shared memory reusability.
#ifndef DOXYGEN_SHOULD_SKIP_THIS // hides storage_type implementation for Doxygen
using storage_type = detail::raw_storage<storage_type_>;
#else
using storage_type = storage_type_;
#endif
/// \brief Tags \p head_flags that indicate discontinuities between items partitioned
/// across the thread block, where the first item has no reference and is always
/// flagged.
///
/// \tparam ItemsPerThread - [inferred] the number of items to be processed by
/// each thread.
/// \tparam Flag - [inferred] the flag type.
/// \tparam FlagOp - [inferred] type of binary function used for flagging.
///
/// \param [out] head_flags - array that contains the head flags.
/// \param [in] input - array that data is loaded from.
/// \param [in] flag_op - binary operation function object that will be used for flagging.
/// The signature of the function should be equivalent to the following:
/// <tt>bool f(const T &a, const T &b);</tt> or <tt>bool f(const T &a, const T &b, unsigned int b_index);</tt>.
/// The signature does not need to have <tt>const &</tt>, but the function object
/// must not modify the objects passed to it.
/// \param [in] storage - reference to a temporary storage object of type storage_type.
///
/// \par Storage reuse
/// A synchronization barrier should be placed before \p storage is reused
/// or repurposed: \p __syncthreads() or \p rocprim::syncthreads().
///
/// \par Example.
/// \code{.cpp}
/// __global__ void example_kernel(...)
/// {
/// // specialize discontinuity for int and a block of 128 threads
/// using block_discontinuity_int = rocprim::block_discontinuity<int, 128>;
/// // allocate storage in shared memory
/// __shared__ block_discontinuity_int::storage_type storage;
///
/// // segment of consecutive items to be used
/// int input[8];
/// ...
/// int head_flags[8];
/// block_discontinuity_int b_discontinuity;
/// using flag_op_type = typename rocprim::greater<int>;
/// b_discontinuity.flag_heads(head_flags, input, flag_op_type(), storage);
/// ...
/// }
/// \endcode
template<unsigned int ItemsPerThread, class Flag, class FlagOp>
ROCPRIM_DEVICE ROCPRIM_INLINE
void flag_heads(Flag (&head_flags)[ItemsPerThread],
const T (&input)[ItemsPerThread],
FlagOp flag_op,
storage_type& storage)
{
static constexpr auto as_flags = true;
static constexpr auto reversed = false;
static constexpr auto with_predecessor = false;
base_type::template apply_left<as_flags, reversed, with_predecessor>(
input, head_flags, flag_op, input[0] /* predecessor */, storage.get().left);
}
/// \overload
/// This overload does not take a reference to temporary storage, instead it is declared as
/// part of the function itself. Note that this does NOT decrease the shared memory requirements
/// of a kernel using this function.
template<unsigned int ItemsPerThread, class Flag, class FlagOp>
ROCPRIM_DEVICE ROCPRIM_FORCE_INLINE
void flag_heads(Flag (&head_flags)[ItemsPerThread],
const T (&input)[ItemsPerThread],
FlagOp flag_op)
{
ROCPRIM_SHARED_MEMORY storage_type storage;
flag_heads(head_flags, input, flag_op, storage);
}
/// \brief Tags \p head_flags that indicate discontinuities between items partitioned
/// across the thread block, where the first item of the first thread is compared against
/// a \p tile_predecessor_item.
///
/// \tparam ItemsPerThread - [inferred] the number of items to be processed by
/// each thread.
/// \tparam Flag - [inferred] the flag type.
/// \tparam FlagOp - [inferred] type of binary function used for flagging.
///
/// \param [out] head_flags - array that contains the head flags.
/// \param [in] tile_predecessor_item - the item preceding the tile, compared against the
/// first item of the first thread.
/// \param [in] input - array that data is loaded from.
/// \param [in] flag_op - binary operation function object that will be used for flagging.
/// The signature of the function should be equivalent to the following:
/// <tt>bool f(const T &a, const T &b);</tt> or <tt>bool f(const T &a, const T &b, unsigned int b_index);</tt>.
/// The signature does not need to have <tt>const &</tt>, but the function object
/// must not modify the objects passed to it.
/// \param [in] storage - reference to a temporary storage object of type storage_type.
///
/// \par Storage reuse
/// A synchronization barrier should be placed before \p storage is reused
/// or repurposed: \p __syncthreads() or \p rocprim::syncthreads().
///
/// \par Example.
/// \code{.cpp}
/// __global__ void example_kernel(...)
/// {
/// // specialize discontinuity for int and a block of 128 threads
/// using block_discontinuity_int = rocprim::block_discontinuity<int, 128>;
/// // allocate storage in shared memory
/// __shared__ block_discontinuity_int::storage_type storage;
///
/// // segment of consecutive items to be used
/// int input[8];
/// int tile_item = 0;
/// if (threadIdx.x == 0)
/// {
/// tile_item = ...
/// }
/// ...
/// int head_flags[8];
/// block_discontinuity_int b_discontinuity;
/// using flag_op_type = typename rocprim::greater<int>;
/// b_discontinuity.flag_heads(head_flags, tile_item, input, flag_op_type(),
/// storage);
/// ...
/// }
/// \endcode
template<unsigned int ItemsPerThread, class Flag, class FlagOp>
ROCPRIM_DEVICE ROCPRIM_INLINE
void flag_heads(Flag (&head_flags)[ItemsPerThread],
T tile_predecessor_item,
const T (&input)[ItemsPerThread],
FlagOp flag_op,
storage_type& storage)
{
static constexpr auto as_flags = true;
static constexpr auto reversed = false;
static constexpr auto with_predecessor = true;
base_type::template apply_left<as_flags, reversed, with_predecessor>(
input, head_flags, flag_op, tile_predecessor_item, storage.get().left);
}
/// \overload
/// This overload does not accept a reference to temporary storage, instead it is declared as
/// part of the function itself. Note that this does NOT decrease the shared memory requirements
/// of a kernel using this function.
template<unsigned int ItemsPerThread, class Flag, class FlagOp>
ROCPRIM_DEVICE ROCPRIM_FORCE_INLINE
void flag_heads(Flag (&head_flags)[ItemsPerThread],
T tile_predecessor_item,
const T (&input)[ItemsPerThread],
FlagOp flag_op)
{
ROCPRIM_SHARED_MEMORY storage_type storage;
flag_heads(head_flags, tile_predecessor_item, input, flag_op, storage);
}
/// \brief Tags \p tail_flags that indicate discontinuities between items partitioned
/// across the thread block, where the last item has no reference and is always
/// flagged.
///
/// \tparam ItemsPerThread - [inferred] the number of items to be processed by
/// each thread.
/// \tparam Flag - [inferred] the flag type.
/// \tparam FlagOp - [inferred] type of binary function used for flagging.
///
/// \param [out] tail_flags - array that contains the tail flags.
/// \param [in] input - array that data is loaded from.
/// \param [in] flag_op - binary operation function object that will be used for flagging.
/// The signature of the function should be equivalent to the following:
/// <tt>bool f(const T &a, const T &b);</tt> or <tt>bool f(const T &a, const T &b, unsigned int b_index);</tt>.
/// The signature does not need to have <tt>const &</tt>, but the function object
/// must not modify the objects passed to it.
/// \param [in] storage - reference to a temporary storage object of type storage_type.
///
/// \par Storage reuse
/// A synchronization barrier should be placed before \p storage is reused
/// or repurposed: \p __syncthreads() or \p rocprim::syncthreads().
///
/// \par Example.
/// \code{.cpp}
/// __global__ void example_kernel(...)
/// {
/// // specialize discontinuity for int and a block of 128 threads
/// using block_discontinuity_int = rocprim::block_discontinuity<int, 128>;
/// // allocate storage in shared memory
/// __shared__ block_discontinuity_int::storage_type storage;
///
/// // segment of consecutive items to be used
/// int input[8];
/// ...
/// int tail_flags[8];
/// block_discontinuity_int b_discontinuity;
/// using flag_op_type = typename rocprim::greater<int>;
/// b_discontinuity.flag_tails(tail_flags, input, flag_op_type(), storage);
/// ...
/// }
/// \endcode
template<unsigned int ItemsPerThread, class Flag, class FlagOp>
ROCPRIM_DEVICE ROCPRIM_INLINE
void flag_tails(Flag (&tail_flags)[ItemsPerThread],
const T (&input)[ItemsPerThread],
FlagOp flag_op,
storage_type& storage)
{
static constexpr auto as_flags = true;
static constexpr auto reversed = false;
static constexpr auto with_successor = false;
base_type::template apply_right<as_flags, reversed, with_successor>(
input, tail_flags, flag_op, input[0] /* successor */, storage.get().right);
}
/// \overload
/// This overload does not accept a reference to temporary storage, instead it is declared as
/// part of the function itself. Note that this does NOT decrease the shared memory requirements
/// of a kernel using this function.
template<unsigned int ItemsPerThread, class Flag, class FlagOp>
ROCPRIM_DEVICE ROCPRIM_FORCE_INLINE
void flag_tails(Flag (&tail_flags)[ItemsPerThread],
const T (&input)[ItemsPerThread],
FlagOp flag_op)
{
ROCPRIM_SHARED_MEMORY storage_type storage;
flag_tails(tail_flags, input, flag_op, storage);
}
/// \brief Tags \p tail_flags that indicate discontinuities between items partitioned
/// across the thread block, where the last item of the last thread is compared against
/// a \p tile_successor_item.
///
/// \tparam ItemsPerThread - [inferred] the number of items to be processed by
/// each thread.
/// \tparam Flag - [inferred] the flag type.
/// \tparam FlagOp - [inferred] type of binary function used for flagging.
///
/// \param [out] tail_flags - array that contains the tail flags.
/// \param [in] tile_successor_item - the item following the tile, compared against the
/// last item of the last thread.
/// \param [in] input - array that data is loaded from.
/// \param [in] flag_op - binary operation function object that will be used for flagging.
/// The signature of the function should be equivalent to the following:
/// <tt>bool f(const T &a, const T &b);</tt> or <tt>bool f(const T &a, const T &b, unsigned int b_index);</tt>.
/// The signature does not need to have <tt>const &</tt>, but the function object
/// must not modify the objects passed to it.
/// \param [in] storage - reference to a temporary storage object of type storage_type.
///
/// \par Storage reuse
/// A synchronization barrier should be placed before \p storage is reused
/// or repurposed: \p __syncthreads() or \p rocprim::syncthreads().
///
/// \par Example.
/// \code{.cpp}
/// __global__ void example_kernel(...)
/// {
/// // specialize discontinuity for int and a block of 128 threads
/// using block_discontinuity_int = rocprim::block_discontinuity<int, 128>;
/// // allocate storage in shared memory
/// __shared__ block_discontinuity_int::storage_type storage;
///
/// // segment of consecutive items to be used
/// int input[8];
/// int tile_item = 0;
/// if (threadIdx.x == 0)
/// {
/// tile_item = ...
/// }
/// ...
/// int tail_flags[8];
/// block_discontinuity_int b_discontinuity;
/// using flag_op_type = typename rocprim::greater<int>;
/// b_discontinuity.flag_tails(tail_flags, tile_item, input, flag_op_type(),
/// storage);
/// ...
/// }
/// \endcode
template<unsigned int ItemsPerThread, class Flag, class FlagOp>
ROCPRIM_DEVICE ROCPRIM_INLINE
void flag_tails(Flag (&tail_flags)[ItemsPerThread],
T tile_successor_item,
const T (&input)[ItemsPerThread],
FlagOp flag_op,
storage_type& storage)
{
static constexpr auto as_flags = true;
static constexpr auto reversed = false;
static constexpr auto with_successor = true;
base_type::template apply_right<as_flags, reversed, with_successor>(
input, tail_flags, flag_op, tile_successor_item, storage.get().right);
}
/// \overload
/// This overload does not accept a reference to temporary storage, instead it is declared as
/// part of the function itself. Note that this does NOT decrease the shared memory requirements
/// of a kernel using this function.
template<unsigned int ItemsPerThread, class Flag, class FlagOp>
ROCPRIM_DEVICE ROCPRIM_FORCE_INLINE
void flag_tails(Flag (&tail_flags)[ItemsPerThread],
T tile_successor_item,
const T (&input)[ItemsPerThread],
FlagOp flag_op)
{
ROCPRIM_SHARED_MEMORY storage_type storage;
flag_tails(tail_flags, tile_successor_item, input, flag_op, storage);
}
/// \brief Tags both \p head_flags and \p tail_flags that indicate discontinuities
/// between items partitioned across the thread block.
///
/// \tparam ItemsPerThread - [inferred] the number of items to be processed by
/// each thread.
/// \tparam Flag - [inferred] the flag type.
/// \tparam FlagOp - [inferred] type of binary function used for flagging.
///
/// \param [out] head_flags - array that contains the head flags.
/// \param [out] tail_flags - array that contains the tail flags.
/// \param [in] input - array that data is loaded from.
/// \param [in] flag_op - binary operation function object that will be used for flagging.
/// The signature of the function should be equivalent to the following:
/// <tt>bool f(const T &a, const T &b);</tt> or <tt>bool f(const T &a, const T &b, unsigned int b_index);</tt>.
/// The signature does not need to have <tt>const &</tt>, but the function object
/// must not modify the objects passed to it.
/// \param [in] storage - reference to a temporary storage object of type storage_type.
///
/// \par Storage reuse
/// A synchronization barrier should be placed before \p storage is reused
/// or repurposed: \p __syncthreads() or \p rocprim::syncthreads().
///
/// \par Example.
/// \code{.cpp}
/// __global__ void example_kernel(...)
/// {
/// // specialize discontinuity for int and a block of 128 threads
/// using block_discontinuity_int = rocprim::block_discontinuity<int, 128>;
/// // allocate storage in shared memory
/// __shared__ block_discontinuity_int::storage_type storage;
///
/// // segment of consecutive items to be used
/// int input[8];
/// ...
/// int head_flags[8];
/// int tail_flags[8];
/// block_discontinuity_int b_discontinuity;
/// using flag_op_type = typename rocprim::greater<int>;
/// b_discontinuity.flag_heads_and_tails(head_flags, tail_flags, input,
/// flag_op_type(), storage);
/// ...
/// }
/// \endcode
template<unsigned int ItemsPerThread, class Flag, class FlagOp>
ROCPRIM_DEVICE ROCPRIM_INLINE
void flag_heads_and_tails(Flag (&head_flags)[ItemsPerThread],
Flag (&tail_flags)[ItemsPerThread],
const T (&input)[ItemsPerThread],
FlagOp flag_op,
storage_type& storage)
{
static constexpr auto as_flags = true;
static constexpr auto reversed = false;
static constexpr auto with_predecessor = false;
static constexpr auto with_successor = false;
// Copy items in case head_flags is aliased with input
T items[ItemsPerThread];
ROCPRIM_UNROLL
for(unsigned int i = 0; i < ItemsPerThread; ++i) {
items[i] = input[i];
}
base_type::template apply_left<as_flags, reversed, with_predecessor>(
items, head_flags, flag_op, items[0] /*predecessor*/, storage.get().left);
base_type::template apply_right<as_flags, reversed, with_successor>(
items, tail_flags, flag_op, items[0] /*successor*/, storage.get().right);
}
/// \overload
/// This overload does not accept a reference to temporary storage, instead it is declared as
/// part of the function itself. Note that this does NOT decrease the shared memory requirements
/// of a kernel using this function.
template<unsigned int ItemsPerThread, class Flag, class FlagOp>
ROCPRIM_DEVICE ROCPRIM_FORCE_INLINE
void flag_heads_and_tails(Flag (&head_flags)[ItemsPerThread],
Flag (&tail_flags)[ItemsPerThread],
const T (&input)[ItemsPerThread],
FlagOp flag_op)
{
ROCPRIM_SHARED_MEMORY storage_type storage;
flag_heads_and_tails(head_flags, tail_flags, input, flag_op, storage);
}
/// \brief Tags both \p head_flags and \p tail_flags that indicate discontinuities
/// between items partitioned across the thread block, where the last item of the
/// last thread is compared against a \p tile_successor_item.
///
/// \tparam ItemsPerThread - [inferred] the number of items to be processed by
/// each thread.
/// \tparam Flag - [inferred] the flag type.
/// \tparam FlagOp - [inferred] type of binary function used for flagging.
///
/// \param [out] head_flags - array that contains the head flags.
/// \param [out] tail_flags - array that contains the tail flags.
/// \param [in] tile_successor_item - the item following the tile, compared against the
/// last item of the last thread.
/// \param [in] input - array that data is loaded from.
/// \param [in] flag_op - binary operation function object that will be used for flagging.
/// The signature of the function should be equivalent to the following:
/// <tt>bool f(const T &a, const T &b);</tt> or <tt>bool f(const T &a, const T &b, unsigned int b_index);</tt>.
/// The signature does not need to have <tt>const &</tt>, but the function object
/// must not modify the objects passed to it.
/// \param [in] storage - reference to a temporary storage object of type storage_type.
///
/// \par Storage reuse
/// A synchronization barrier should be placed before \p storage is reused
/// or repurposed: \p __syncthreads() or \p rocprim::syncthreads().
///
/// \par Example.
/// \code{.cpp}
/// __global__ void example_kernel(...)
/// {
/// // specialize discontinuity for int and a block of 128 threads
/// using block_discontinuity_int = rocprim::block_discontinuity<int, 128>;
/// // allocate storage in shared memory
/// __shared__ block_discontinuity_int::storage_type storage;
///
/// // segment of consecutive items to be used
/// int input[8];
/// int tile_item = 0;
/// if (threadIdx.x == 0)
/// {
/// tile_item = ...
/// }
/// ...
/// int head_flags[8];
/// int tail_flags[8];
/// block_discontinuity_int b_discontinuity;
/// using flag_op_type = typename rocprim::greater<int>;
/// b_discontinuity.flag_heads_and_tails(head_flags, tail_flags, tile_item,
/// input, flag_op_type(),
/// storage);
/// ...
/// }
/// \endcode
template<unsigned int ItemsPerThread, class Flag, class FlagOp>
ROCPRIM_DEVICE ROCPRIM_INLINE
void flag_heads_and_tails(Flag (&head_flags)[ItemsPerThread],
Flag (&tail_flags)[ItemsPerThread],
T tile_successor_item,
const T (&input)[ItemsPerThread],
FlagOp flag_op,
storage_type& storage)
{
static constexpr auto as_flags = true;
static constexpr auto reversed = false;
static constexpr auto with_predecessor = false;
static constexpr auto with_successor = true;
// Copy items in case head_flags is aliased with input
T items[ItemsPerThread];
ROCPRIM_UNROLL
for(unsigned int i = 0; i < ItemsPerThread; ++i) {
items[i] = input[i];
}
base_type::template apply_left<as_flags, reversed, with_predecessor>(
items, head_flags, flag_op, items[0] /*predecessor*/, storage.get().left);
base_type::template apply_right<as_flags, reversed, with_successor>(
items, tail_flags, flag_op, tile_successor_item, storage.get().right);
}
/// \overload
/// This overload does not accept a reference to temporary storage, instead it is declared as
/// part of the function itself. Note that this does NOT decrease the shared memory requirements
/// of a kernel using this function.
template<unsigned int ItemsPerThread, class Flag, class FlagOp>
ROCPRIM_DEVICE ROCPRIM_FORCE_INLINE
void flag_heads_and_tails(Flag (&head_flags)[ItemsPerThread],
Flag (&tail_flags)[ItemsPerThread],
T tile_successor_item,
const T (&input)[ItemsPerThread],
FlagOp flag_op)
{
ROCPRIM_SHARED_MEMORY storage_type storage;
flag_heads_and_tails(head_flags, tail_flags, tile_successor_item, input, flag_op, storage);
}
/// \brief Tags both \p head_flags and \p tail_flags that indicate discontinuities
/// between items partitioned across the thread block, where the first item of the
/// first thread is compared against a \p tile_predecessor_item.
///
/// \tparam ItemsPerThread - [inferred] the number of items to be processed by
/// each thread.
/// \tparam Flag - [inferred] the flag type.
/// \tparam FlagOp - [inferred] type of binary function used for flagging.
///
/// \param [out] head_flags - array that contains the head flags.
/// \param [in] tile_predecessor_item - the item preceding the tile, compared against the
/// first item of the first thread.
/// \param [out] tail_flags - array that contains the tail flags.
/// \param [in] input - array that data is loaded from.
/// \param [in] flag_op - binary operation function object that will be used for flagging.
/// The signature of the function should be equivalent to the following:
/// <tt>bool f(const T &a, const T &b);</tt> or <tt>bool f(const T &a, const T &b, unsigned int b_index);</tt>.
/// The signature does not need to have <tt>const &</tt>, but the function object
/// must not modify the objects passed to it.
/// \param [in] storage - reference to a temporary storage object of type storage_type.
///
/// \par Storage reuse
/// A synchronization barrier should be placed before \p storage is reused
/// or repurposed: \p __syncthreads() or \p rocprim::syncthreads().
///
/// \par Example.
/// \code{.cpp}
/// __global__ void example_kernel(...)
/// {
/// // specialize discontinuity for int and a block of 128 threads
/// using block_discontinuity_int = rocprim::block_discontinuity<int, 128>;
/// // allocate storage in shared memory
/// __shared__ block_discontinuity_int::storage_type storage;
///
/// // segment of consecutive items to be used
/// int input[8];
/// int tile_item = 0;
/// if (threadIdx.x == 0)
/// {
/// tile_item = ...
/// }
/// ...
/// int head_flags[8];
/// int tail_flags[8];
/// block_discontinuity_int b_discontinuity;
/// using flag_op_type = typename rocprim::greater<int>;
/// b_discontinuity.flag_heads_and_tails(head_flags, tile_item, tail_flags,
/// input, flag_op_type(),
/// storage);
/// ...
/// }
/// \endcode
template<unsigned int ItemsPerThread, class Flag, class FlagOp>
ROCPRIM_DEVICE ROCPRIM_INLINE
void flag_heads_and_tails(Flag (&head_flags)[ItemsPerThread],
T tile_predecessor_item,
Flag (&tail_flags)[ItemsPerThread],
const T (&input)[ItemsPerThread],
FlagOp flag_op,
storage_type& storage)
{
static constexpr auto as_flags = true;
static constexpr auto reversed = false;
static constexpr auto with_predecessor = true;
static constexpr auto with_successor = false;
// Copy items in case head_flags is aliased with input
T items[ItemsPerThread];
ROCPRIM_UNROLL
for(unsigned int i = 0; i < ItemsPerThread; ++i) {
items[i] = input[i];
}
base_type::template apply_left<as_flags, reversed, with_predecessor>(
items, head_flags, flag_op, tile_predecessor_item, storage.get().left);
base_type::template apply_right<as_flags, reversed, with_successor>(
items, tail_flags, flag_op, items[0] /*successor*/, storage.get().right);
}
/// \overload
/// This overload does not accept a reference to temporary storage, instead it is declared as
/// part of the function itself. Note that this does NOT decrease the shared memory requirements
/// of a kernel using this function.
template<unsigned int ItemsPerThread, class Flag, class FlagOp>
ROCPRIM_DEVICE ROCPRIM_FORCE_INLINE
void flag_heads_and_tails(Flag (&head_flags)[ItemsPerThread],
T tile_predecessor_item,
Flag (&tail_flags)[ItemsPerThread],
const T (&input)[ItemsPerThread],
FlagOp flag_op)
{
ROCPRIM_SHARED_MEMORY storage_type storage;
flag_heads_and_tails(head_flags, tile_predecessor_item, tail_flags, input, flag_op, storage);
}
/// \brief Tags both \p head_flags and \p tail_flags that indicate discontinuities
/// between items partitioned across the thread block, where the first and last items of
/// the first and last threads are compared against a \p tile_predecessor_item and
/// a \p tile_successor_item.
///
/// \tparam ItemsPerThread - [inferred] the number of items to be processed by
/// each thread.
/// \tparam Flag - [inferred] the flag type.
/// \tparam FlagOp - [inferred] type of binary function used for flagging.
///
/// \param [out] head_flags - array that contains the head flags.
/// \param [in] tile_predecessor_item - the item preceding the tile, compared against the
/// first item of the first thread.
/// \param [out] tail_flags - array that contains the tail flags.
/// \param [in] tile_successor_item - the item following the tile, compared against the
/// last item of the last thread.
/// \param [in] input - array that data is loaded from.
/// \param [in] flag_op - binary operation function object that will be used for flagging.
/// The signature of the function should be equivalent to the following:
/// <tt>bool f(const T &a, const T &b);</tt> or <tt>bool f(const T &a, const T &b, unsigned int b_index);</tt>.
/// The signature does not need to have <tt>const &</tt>, but the function object
/// must not modify the objects passed to it.
/// \param [in] storage - reference to a temporary storage object of type storage_type.
///
/// \par Storage reuse
/// A synchronization barrier should be placed before \p storage is reused
/// or repurposed: \p __syncthreads() or \p rocprim::syncthreads().
///
/// \par Example.
/// \code{.cpp}
/// __global__ void example_kernel(...)
/// {
/// // specialize discontinuity for int and a block of 128 threads
/// using block_discontinuity_int = rocprim::block_discontinuity<int, 128>;
/// // allocate storage in shared memory
/// __shared__ block_discontinuity_int::storage_type storage;
///
/// // segment of consecutive items to be used
/// int input[8];
/// int tile_predecessor_item = 0;
/// int tile_successor_item = 0;
/// if (threadIdx.x == 0)
/// {
/// tile_predecessor_item = ...
/// tile_successor_item = ...
/// }
/// ...
/// int head_flags[8];
/// int tail_flags[8];
/// block_discontinuity_int b_discontinuity;
/// using flag_op_type = typename rocprim::greater<int>;
/// b_discontinuity.flag_heads_and_tails(head_flags, tile_predecessor_item,
/// tail_flags, tile_successor_item,
/// input, flag_op_type(),
/// storage);
/// ...
/// }
/// \endcode
template<unsigned int ItemsPerThread, class Flag, class FlagOp>
ROCPRIM_DEVICE ROCPRIM_INLINE
void flag_heads_and_tails(Flag (&head_flags)[ItemsPerThread],
T tile_predecessor_item,
Flag (&tail_flags)[ItemsPerThread],
T tile_successor_item,
const T (&input)[ItemsPerThread],
FlagOp flag_op,
storage_type& storage)
{
static constexpr auto as_flags = true;
static constexpr auto reversed = false;
static constexpr auto with_predecessor = true;
static constexpr auto with_successor = true;
// Copy items in case head_flags is aliased with input
T items[ItemsPerThread];
ROCPRIM_UNROLL
for(unsigned int i = 0; i < ItemsPerThread; ++i) {
items[i] = input[i];
}
base_type::template apply_left<as_flags, reversed, with_predecessor>(
items, head_flags, flag_op, tile_predecessor_item, storage.get().left);
base_type::template apply_right<as_flags, reversed, with_successor>(
items, tail_flags, flag_op, tile_successor_item, storage.get().right);
}
/// \overload
/// This overload does not accept a reference to temporary storage, instead it is declared as
/// part of the function itself. Note that this does NOT decrease the shared memory requirements
/// of a kernel using this function.
template<unsigned int ItemsPerThread, class Flag, class FlagOp>
ROCPRIM_DEVICE ROCPRIM_FORCE_INLINE
void flag_heads_and_tails(Flag (&head_flags)[ItemsPerThread],
T tile_predecessor_item,
Flag (&tail_flags)[ItemsPerThread],
T tile_successor_item,
const T (&input)[ItemsPerThread],
FlagOp flag_op)
{
ROCPRIM_SHARED_MEMORY storage_type storage;
flag_heads_and_tails(
head_flags, tile_predecessor_item, tail_flags, tile_successor_item,
input, flag_op, storage
);
}
};
END_ROCPRIM_NAMESPACE
/// @}
// end of group blockmodule
#endif // ROCPRIM_BLOCK_BLOCK_DISCONTINUITY_HPP_
// Copyright (c) 2017-2021 Advanced Micro Devices, Inc. All rights reserved.
//
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to deal
// in the Software without restriction, including without limitation the rights
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
// copies of the Software, and to permit persons to whom the Software is
// furnished to do so, subject to the following conditions:
//
// The above copyright notice and this permission notice shall be included in
// all copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
// THE SOFTWARE.
#ifndef ROCPRIM_BLOCK_BLOCK_EXCHANGE_HPP_
#define ROCPRIM_BLOCK_BLOCK_EXCHANGE_HPP_
#include "../config.hpp"
#include "../detail/various.hpp"
#include "../intrinsics.hpp"
#include "../functional.hpp"
#include "../types.hpp"
/// \addtogroup blockmodule
/// @{
BEGIN_ROCPRIM_NAMESPACE
/// \brief The \p block_exchange class is a block level parallel primitive which provides
/// methods for rearranging items partitioned across threads in a block.
///
/// \tparam T - the input type.
/// \tparam BlockSizeX - the number of threads in a block's x dimension.
/// \tparam ItemsPerThread - the number of items contributed by each thread.
/// \tparam BlockSizeY - the number of threads in a block's y dimension, defaults to 1.
/// \tparam BlockSizeZ - the number of threads in a block's z dimension, defaults to 1.
///
/// \par Overview
/// * The \p block_exchange class supports the following rearrangement methods:
/// * Transposing a blocked arrangement to a striped arrangement.
/// * Transposing a striped arrangement to a blocked arrangement.
/// * Transposing a blocked arrangement to a warp-striped arrangement.
/// * Transposing a warp-striped arrangement to a blocked arrangement.
/// * Scattering items to a blocked arrangement.
/// * Scattering items to a striped arrangement.
/// * Data is automatically padded to avoid LDS (shared memory) bank conflicts.
///
/// \par Examples
/// \parblock
/// In the examples, the exchange operation is performed on a block of 128 threads,
/// using type \p int with 8 items per thread.
///
/// \code{.cpp}
/// __global__ void example_kernel(...)
/// {
/// // specialize block_exchange for int, block of 128 threads and 8 items per thread
/// using block_exchange_int = rocprim::block_exchange<int, 128, 8>;
/// // allocate storage in shared memory
/// __shared__ block_exchange_int::storage_type storage;
///
/// int items[8];
/// ...
/// block_exchange_int b_exchange;
/// b_exchange.blocked_to_striped(items, items, storage);
/// ...
/// }
/// \endcode
/// \endparblock
template<
class T,
unsigned int BlockSizeX,
unsigned int ItemsPerThread,
unsigned int BlockSizeY = 1,
unsigned int BlockSizeZ = 1
>
class block_exchange
{
static constexpr unsigned int BlockSize = BlockSizeX * BlockSizeY * BlockSizeZ;
// Select warp size
static constexpr unsigned int warp_size =
detail::get_min_warp_size(BlockSize, ::rocprim::device_warp_size());
// Number of warps in block
static constexpr unsigned int warps_no = (BlockSize + warp_size - 1) / warp_size;
// Minimize LDS bank conflicts for power-of-two strides, i.e. when items are accessed
// using the `thread_id * ItemsPerThread` pattern and ItemsPerThread is a power of two
// (all exchanges from/to blocked).
static constexpr bool has_bank_conflicts =
ItemsPerThread >= 2 && ::rocprim::detail::is_power_of_two(ItemsPerThread);
static constexpr unsigned int banks_no = ::rocprim::detail::get_lds_banks_no();
static constexpr unsigned int bank_conflicts_padding =
has_bank_conflicts ? (BlockSize * ItemsPerThread / banks_no) : 0;
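// e.g. for BlockSize = 128, ItemsPerThread = 8 and 32 LDS banks, the buffer is padded
// by 128 * 8 / 32 = 32 extra items.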
// Struct used for creating a raw_storage object for this primitive's temporary storage.
struct storage_type_
{
T buffer[BlockSize * ItemsPerThread + bank_conflicts_padding];
};
public:
/// \brief Struct used to allocate a temporary memory that is required for thread
/// communication during operations provided by related parallel primitive.
///
/// Depending on the implementation the operations exposed by the parallel primitive may
/// require temporary storage for thread communication. The storage should be allocated
/// using the <tt>__shared__</tt> keyword. It can be aliased to
/// an externally allocated memory, or be a part of a union type with other storage types
/// to increase shared memory reusability.
#ifndef DOXYGEN_SHOULD_SKIP_THIS // hides storage_type implementation for Doxygen
using storage_type = detail::raw_storage<storage_type_>;
#else
using storage_type = storage_type_; // only for Doxygen
#endif
/// \brief Transposes a blocked arrangement of items to a striped arrangement
/// across the thread block.
///
/// \tparam U - [inferred] the output type.
///
/// \param [in] input - array that data is loaded from.
/// \param [out] output - array that data is loaded to.
template<class U>
ROCPRIM_DEVICE ROCPRIM_FORCE_INLINE
void blocked_to_striped(const T (&input)[ItemsPerThread],
U (&output)[ItemsPerThread])
{
ROCPRIM_SHARED_MEMORY storage_type storage;
blocked_to_striped(input, output, storage);
}
/// \brief Transposes a blocked arrangement of items to a striped arrangement
/// across the thread block, using temporary storage.
///
/// \tparam U - [inferred] the output type.
///
/// \param [in] input - array that data is loaded from.
/// \param [out] output - array that data is loaded to.
/// \param [in] storage - reference to a temporary storage object of type storage_type.
///
/// \par Storage reuse
/// A synchronization barrier should be placed before \p storage is reused
/// or repurposed: \p __syncthreads() or \p rocprim::syncthreads().
///
/// \par Example.
/// \code{.cpp}
/// __global__ void example_kernel(...)
/// {
/// // specialize block_exchange for int, block of 128 threads and 8 items per thread
/// using block_exchange_int = rocprim::block_exchange<int, 128, 8>;
/// // allocate storage in shared memory
/// __shared__ block_exchange_int::storage_type storage;
///
/// int items[8];
/// ...
/// block_exchange_int b_exchange;
/// b_exchange.blocked_to_striped(items, items, storage);
/// ...
/// }
/// \endcode
template<class U>
ROCPRIM_DEVICE ROCPRIM_INLINE
void blocked_to_striped(const T (&input)[ItemsPerThread],
U (&output)[ItemsPerThread],
storage_type& storage)
{
const unsigned int flat_id = ::rocprim::flat_block_thread_id<BlockSizeX, BlockSizeY, BlockSizeZ>();
storage_type_& storage_ = storage.get();
for(unsigned int i = 0; i < ItemsPerThread; i++)
{
storage_.buffer[index(flat_id * ItemsPerThread + i)] = input[i];
}
::rocprim::syncthreads();
for(unsigned int i = 0; i < ItemsPerThread; i++)
{
output[i] = storage_.buffer[index(i * BlockSize + flat_id)];
}
}
/// \brief Transposes a striped arrangement of items to a blocked arrangement
/// across the thread block.
///
/// \tparam U - [inferred] the output type.
///
/// \param [in] input - array that data is loaded from.
/// \param [out] output - array that data is loaded to.
template<class U>
ROCPRIM_DEVICE ROCPRIM_FORCE_INLINE
void striped_to_blocked(const T (&input)[ItemsPerThread],
U (&output)[ItemsPerThread])
{
ROCPRIM_SHARED_MEMORY storage_type storage;
striped_to_blocked(input, output, storage);
}
/// \brief Transposes a striped arrangement of items to a blocked arrangement
/// across the thread block, using temporary storage.
///
/// \tparam U - [inferred] the output type.
///
/// \param [in] input - array that data is loaded from.
/// \param [out] output - array that data is loaded to.
/// \param [in] storage - reference to a temporary storage object of type storage_type.
///
/// \par Storage reuse
/// A synchronization barrier should be placed before \p storage is reused
/// or repurposed: \p __syncthreads() or \p rocprim::syncthreads().
///
/// \par Example.
/// \code{.cpp}
/// __global__ void example_kernel(...)
/// {
/// // specialize block_exchange for int, block of 128 threads and 8 items per thread
/// using block_exchange_int = rocprim::block_exchange<int, 128, 8>;
/// // allocate storage in shared memory
/// __shared__ block_exchange_int::storage_type storage;
///
/// int items[8];
/// ...
/// block_exchange_int b_exchange;
/// b_exchange.striped_to_blocked(items, items, storage);
/// ...
/// }
/// \endcode
template<class U>
ROCPRIM_DEVICE ROCPRIM_INLINE
void striped_to_blocked(const T (&input)[ItemsPerThread],
U (&output)[ItemsPerThread],
storage_type& storage)
{
const unsigned int flat_id = ::rocprim::flat_block_thread_id<BlockSizeX, BlockSizeY, BlockSizeZ>();
storage_type_& storage_ = storage.get();
for(unsigned int i = 0; i < ItemsPerThread; i++)
{
storage_.buffer[index(i * BlockSize + flat_id)] = input[i];
}
::rocprim::syncthreads();
for(unsigned int i = 0; i < ItemsPerThread; i++)
{
output[i] = storage_.buffer[index(flat_id * ItemsPerThread + i)];
}
}
/// \brief Transposes a blocked arrangement of items to a warp-striped arrangement
/// across the thread block.
///
/// \tparam U - [inferred] the output type.
///
/// \param [in] input - array that data is loaded from.
/// \param [out] output - array that data is loaded to.
template<class U>
ROCPRIM_DEVICE ROCPRIM_FORCE_INLINE
void blocked_to_warp_striped(const T (&input)[ItemsPerThread],
U (&output)[ItemsPerThread])
{
ROCPRIM_SHARED_MEMORY storage_type storage;
blocked_to_warp_striped(input, output, storage);
}
/// \brief Transposes a blocked arrangement of items to a warp-striped arrangement
/// across the thread block, using temporary storage.
///
/// \tparam U - [inferred] the output type.
///
/// \param [in] input - array that data is loaded from.
/// \param [out] output - array that data is loaded to.
/// \param [in] storage - reference to a temporary storage object of type storage_type.
///
/// \par Storage reuse
/// A synchronization barrier should be placed before \p storage is reused
/// or repurposed: \p __syncthreads() or \p rocprim::syncthreads().
///
/// \par Example.
/// \code{.cpp}
/// __global__ void example_kernel(...)
/// {
/// // specialize block_exchange for int, block of 128 threads and 8 items per thread
/// using block_exchange_int = rocprim::block_exchange<int, 128, 8>;
/// // allocate storage in shared memory
/// __shared__ block_exchange_int::storage_type storage;
///
/// int items[8];
/// ...
/// block_exchange_int b_exchange;
/// b_exchange.blocked_to_warp_striped(items, items, storage);
/// ...
/// }
/// \endcode
template<class U>
ROCPRIM_DEVICE ROCPRIM_INLINE
void blocked_to_warp_striped(const T (&input)[ItemsPerThread],
U (&output)[ItemsPerThread],
storage_type& storage)
{
constexpr unsigned int items_per_warp = warp_size * ItemsPerThread;
const unsigned int lane_id = ::rocprim::lane_id();
const unsigned int warp_id = ::rocprim::warp_id<BlockSizeX, BlockSizeY, BlockSizeZ>();
const unsigned int current_warp_size = get_current_warp_size();
const unsigned int offset = warp_id * items_per_warp;
storage_type_& storage_ = storage.get();
for(unsigned int i = 0; i < ItemsPerThread; i++)
{
storage_.buffer[index(offset + lane_id * ItemsPerThread + i)] = input[i];
}
::rocprim::wave_barrier();
for(unsigned int i = 0; i < ItemsPerThread; i++)
{
output[i] = storage_.buffer[index(offset + i * current_warp_size + lane_id)];
}
}
/// \brief Transposes a warp-striped arrangement of items to a blocked arrangement
/// across the thread block.
///
/// \tparam U - [inferred] the output type.
///
/// \param [in] input - array that data is loaded from.
/// \param [out] output - array that data is loaded to.
template<class U>
ROCPRIM_DEVICE ROCPRIM_FORCE_INLINE
void warp_striped_to_blocked(const T (&input)[ItemsPerThread],
U (&output)[ItemsPerThread])
{
ROCPRIM_SHARED_MEMORY storage_type storage;
warp_striped_to_blocked(input, output, storage);
}
/// \brief Transposes a warp-striped arrangement of items to a blocked arrangement
/// across the thread block, using temporary storage.
///
/// \tparam U - [inferred] the output type.
///
/// \param [in] input - array that data is loaded from.
/// \param [out] output - array that data is loaded to.
/// \param [in] storage - reference to a temporary storage object of type storage_type.
///
/// \par Storage reuse
/// A synchronization barrier should be placed before \p storage is reused
/// or repurposed: \p __syncthreads() or \p rocprim::syncthreads().
///
/// \par Example.
/// \code{.cpp}
/// __global__ void example_kernel(...)
/// {
/// // specialize block_exchange for int, block of 128 threads and 8 items per thread
/// using block_exchange_int = rocprim::block_exchange<int, 128, 8>;
/// // allocate storage in shared memory
/// __shared__ block_exchange_int::storage_type storage;
///
/// int items[8];
/// ...
/// block_exchange_int b_exchange;
/// b_exchange.warp_striped_to_blocked(items, items, storage);
/// ...
/// }
/// \endcode
template<class U>
ROCPRIM_DEVICE ROCPRIM_INLINE
void warp_striped_to_blocked(const T (&input)[ItemsPerThread],
U (&output)[ItemsPerThread],
storage_type& storage)
{
constexpr unsigned int items_per_warp = warp_size * ItemsPerThread;
const unsigned int lane_id = ::rocprim::lane_id();
const unsigned int warp_id = ::rocprim::warp_id<BlockSizeX, BlockSizeY, BlockSizeZ>();
const unsigned int current_warp_size = get_current_warp_size();
const unsigned int offset = warp_id * items_per_warp;
storage_type_& storage_ = storage.get();
for(unsigned int i = 0; i < ItemsPerThread; i++)
{
storage_.buffer[index(offset + i * current_warp_size + lane_id)] = input[i];
}
::rocprim::wave_barrier();
for(unsigned int i = 0; i < ItemsPerThread; i++)
{
output[i] = storage_.buffer[index(offset + lane_id * ItemsPerThread + i)];
}
}
/// \brief Scatters items to a blocked arrangement based on their ranks
/// across the thread block.
///
/// \tparam U - [inferred] the output type.
/// \tparam Offset - [inferred] the rank type.
///
/// \param [in] input - array that data is loaded from.
/// \param [out] output - array that data is loaded to.
/// \param [in] ranks - array that has rank of data.
template<class U, class Offset>
ROCPRIM_DEVICE ROCPRIM_FORCE_INLINE
void scatter_to_blocked(const T (&input)[ItemsPerThread],
U (&output)[ItemsPerThread],
const Offset (&ranks)[ItemsPerThread])
{
ROCPRIM_SHARED_MEMORY storage_type storage;
scatter_to_blocked(input, output, ranks, storage);
}
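/// \brief Gathers items from a striped arrangement based on their ranks
/// across the thread block.
///
/// \tparam U - [inferred] the output type.
/// \tparam Offset - [inferred] the rank type.
///
/// \param [in] input - array that data is loaded from.
/// \param [out] output - array that data is loaded to.
/// \param [in] ranks - array that has rank of data.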
template<class U, class Offset>
ROCPRIM_DEVICE ROCPRIM_FORCE_INLINE
void gather_from_striped(const T (&input)[ItemsPerThread],
U (&output)[ItemsPerThread],
const Offset (&ranks)[ItemsPerThread])
{
ROCPRIM_SHARED_MEMORY storage_type storage;
gather_from_striped(input, output, ranks, storage);
}
/// \brief Scatters items to a blocked arrangement based on their ranks
/// across the thread block, using temporary storage.
///
/// \tparam U - [inferred] the output type.
/// \tparam Offset - [inferred] the rank type.
///
/// \param [in] input - array that data is loaded from.
/// \param [out] output - array that data is loaded to.
/// \param [in] ranks - array that has rank of data.
/// \param [in] storage - reference to a temporary storage object of type storage_type.
///
/// \par Storage reuse
/// A synchronization barrier should be placed before \p storage is reused
/// or repurposed: \p __syncthreads() or \p rocprim::syncthreads().
///
/// \par Example.
/// \code{.cpp}
/// __global__ void example_kernel(...)
/// {
/// // specialize block_exchange for int, block of 128 threads and 8 items per thread
/// using block_exchange_int = rocprim::block_exchange<int, 128, 8>;
/// // allocate storage in shared memory
/// __shared__ block_exchange_int::storage_type storage;
///
/// int items[8];
/// int ranks[8];
/// ...
/// block_exchange_int b_exchange;
/// b_exchange.scatter_to_blocked(items, items, ranks, storage);
/// ...
/// }
/// \endcode
template<class U, class Offset>
ROCPRIM_DEVICE ROCPRIM_INLINE
void scatter_to_blocked(const T (&input)[ItemsPerThread],
U (&output)[ItemsPerThread],
const Offset (&ranks)[ItemsPerThread],
storage_type& storage)
{
const unsigned int flat_id = ::rocprim::flat_block_thread_id<BlockSizeX, BlockSizeY, BlockSizeZ>();
storage_type_& storage_ = storage.get();
for(unsigned int i = 0; i < ItemsPerThread; i++)
{
const Offset rank = ranks[i];
storage_.buffer[index(rank)] = input[i];
}
::rocprim::syncthreads();
for(unsigned int i = 0; i < ItemsPerThread; i++)
{
output[i] = storage_.buffer[index(flat_id * ItemsPerThread + i)];
}
}
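/// \brief Gathers items from a striped arrangement based on their ranks
/// across the thread block, using temporary storage.
///
/// \tparam U - [inferred] the output type.
/// \tparam Offset - [inferred] the rank type.
///
/// \param [in] input - array that data is loaded from.
/// \param [out] output - array that data is loaded to.
/// \param [in] ranks - array that has rank of data.
/// \param [in] storage - reference to a temporary storage object of type storage_type.
///
/// \par Storage reuse
/// A synchronization barrier should be placed before \p storage is reused
/// or repurposed: \p __syncthreads() or \p rocprim::syncthreads().
///
/// \par Example.
/// A minimal sketch, following the pattern of the other exchange methods:
/// \code{.cpp}
/// __global__ void example_kernel(...)
/// {
/// // specialize block_exchange for int, block of 128 threads and 8 items per thread
/// using block_exchange_int = rocprim::block_exchange<int, 128, 8>;
/// // allocate storage in shared memory
/// __shared__ block_exchange_int::storage_type storage;
///
/// int items[8];
/// int ranks[8];
/// ...
/// block_exchange_int b_exchange;
/// b_exchange.gather_from_striped(items, items, ranks, storage);
/// ...
/// }
/// \endcode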
template <class U, class Offset>
ROCPRIM_DEVICE ROCPRIM_INLINE
void gather_from_striped(const T (&input)[ItemsPerThread],
U (&output)[ItemsPerThread],
const Offset (&ranks)[ItemsPerThread],
storage_type& storage)
{
const unsigned int flat_id = ::rocprim::flat_block_thread_id<BlockSizeX, BlockSizeY, BlockSizeZ>();
storage_type_& storage_ = storage.get();
for(unsigned int i = 0; i < ItemsPerThread; i++)
{
storage_.buffer[index(i * BlockSize + flat_id)] = input[i];
}
::rocprim::syncthreads();
for(unsigned int i = 0; i < ItemsPerThread; i++)
{
const Offset rank = ranks[i];
output[i] = storage_.buffer[index(rank)];
}
}
/// \brief Scatters items to a striped arrangement based on their ranks
/// across the thread block.
///
/// \tparam U - [inferred] the output type.
/// \tparam Offset - [inferred] the rank type.
///
/// \param [in] input - array that data is loaded from.
/// \param [out] output - array that data is loaded to.
/// \param [in] ranks - array that has rank of data.
template<class U, class Offset>
ROCPRIM_DEVICE ROCPRIM_FORCE_INLINE
void scatter_to_striped(const T (&input)[ItemsPerThread],
U (&output)[ItemsPerThread],
const Offset (&ranks)[ItemsPerThread])
{
ROCPRIM_SHARED_MEMORY storage_type storage;
scatter_to_striped(input, output, ranks, storage);
}
/// \brief Scatters items to a striped arrangement based on their ranks
/// across the thread block, using temporary storage.
///
/// \tparam U - [inferred] the output type.
/// \tparam Offset - [inferred] the rank type.
///
/// \param [in] input - array that data is loaded from.
/// \param [out] output - array that data is loaded to.
/// \param [in] ranks - array that has rank of data.
/// \param [in] storage - reference to a temporary storage object of type storage_type.
///
/// \par Storage reuse
/// A synchronization barrier should be placed before \p storage is reused
/// or repurposed: \p __syncthreads() or \p rocprim::syncthreads().
///
/// \par Example.
/// \code{.cpp}
/// __global__ void example_kernel(...)
/// {
/// // specialize block_exchange for int, block of 128 threads and 8 items per thread
/// using block_exchange_int = rocprim::block_exchange<int, 128, 8>;
/// // allocate storage in shared memory
/// __shared__ block_exchange_int::storage_type storage;
///
/// int items[8];
/// int ranks[8];
/// ...
/// block_exchange_int b_exchange;
/// b_exchange.scatter_to_striped(items, items, ranks, storage);
/// ...
/// }
/// \endcode
template<class U, class Offset>
ROCPRIM_DEVICE ROCPRIM_INLINE
void scatter_to_striped(const T (&input)[ItemsPerThread],
U (&output)[ItemsPerThread],
const Offset (&ranks)[ItemsPerThread],
storage_type& storage)
{
const unsigned int flat_id = ::rocprim::flat_block_thread_id<BlockSizeX, BlockSizeY, BlockSizeZ>();
storage_type_& storage_ = storage.get();
for(unsigned int i = 0; i < ItemsPerThread; i++)
{
const Offset rank = ranks[i];
storage_.buffer[rank] = input[i];
}
::rocprim::syncthreads();
for(unsigned int i = 0; i < ItemsPerThread; i++)
{
output[i] = storage_.buffer[i * BlockSize + flat_id];
}
}
/// \brief Scatters items to a striped arrangement based on their ranks
/// across the thread block, guarded by rank.
///
/// \par Overview
/// * Items with rank -1 are not scattered.
///
/// \tparam U - [inferred] the output type.
/// \tparam Offset - [inferred] the rank type.
///
/// \param [in] input - array that data is loaded from.
/// \param [out] output - array that data is loaded to.
/// \param [in] ranks - array that has rank of data.
template<class U, class Offset>
ROCPRIM_DEVICE ROCPRIM_FORCE_INLINE
void scatter_to_striped_guarded(const T (&input)[ItemsPerThread],
U (&output)[ItemsPerThread],
const Offset (&ranks)[ItemsPerThread])
{
ROCPRIM_SHARED_MEMORY storage_type storage;
scatter_to_striped_guarded(input, output, ranks, storage);
}
/// \brief Scatters items to a striped arrangement based on their ranks
/// across the thread block, guarded by rank, using temporary storage.
///
/// \par Overview
/// * Items with rank -1 are not scattered.
///
/// \tparam U - [inferred] the output type.
/// \tparam Offset - [inferred] the rank type.
///
/// \param [in] input - array that data is loaded from.
/// \param [out] output - array that data is loaded to.
/// \param [in] ranks - array that has rank of data.
/// \param [in] storage - reference to a temporary storage object of type storage_type.
///
/// \par Storage reuse
/// A synchronization barrier should be placed before \p storage is reused
/// or repurposed: \p __syncthreads() or \p rocprim::syncthreads().
///
/// \par Example.
/// \code{.cpp}
/// __global__ void example_kernel(...)
/// {
/// // specialize block_exchange for int, block of 128 threads and 8 items per thread
/// using block_exchange_int = rocprim::block_exchange<int, 128, 8>;
/// // allocate storage in shared memory
/// __shared__ block_exchange_int::storage_type storage;
///
/// int items[8];
/// int ranks[8];
/// ...
/// block_exchange_int b_exchange;
/// b_exchange.scatter_to_striped_guarded(items, items, ranks, storage);
/// ...
/// }
/// \endcode
template<class U, class Offset>
ROCPRIM_DEVICE ROCPRIM_INLINE
void scatter_to_striped_guarded(const T (&input)[ItemsPerThread],
U (&output)[ItemsPerThread],
const Offset (&ranks)[ItemsPerThread],
storage_type& storage)
{
const unsigned int flat_id = ::rocprim::flat_block_thread_id<BlockSizeX, BlockSizeY, BlockSizeZ>();
storage_type_& storage_ = storage.get();
for(unsigned int i = 0; i < ItemsPerThread; i++)
{
const Offset rank = ranks[i];
if(rank >= 0)
{
storage_.buffer[rank] = input[i];
}
}
::rocprim::syncthreads();
for(unsigned int i = 0; i < ItemsPerThread; i++)
{
output[i] = storage_.buffer[i * BlockSize + flat_id];
}
}
/// \brief Scatters items to a striped arrangement based on their ranks
/// across the thread block, with a flag to denote validity.
///
/// \tparam U - [inferred] the output type.
/// \tparam Offset - [inferred] the rank type.
/// \tparam ValidFlag - [inferred] the validity flag type.
///
/// \param [in] input - array that data is loaded from.
/// \param [out] output - array that data is loaded to.
/// \param [in] ranks - array that has rank of data.
/// \param [in] is_valid - array that has flags to denote validity.
template<class U, class Offset, class ValidFlag>
ROCPRIM_DEVICE ROCPRIM_FORCE_INLINE
void scatter_to_striped_flagged(const T (&input)[ItemsPerThread],
U (&output)[ItemsPerThread],
const Offset (&ranks)[ItemsPerThread],
const ValidFlag (&is_valid)[ItemsPerThread])
{
ROCPRIM_SHARED_MEMORY storage_type storage;
scatter_to_striped_flagged(input, output, ranks, is_valid, storage);
}
/// \brief Scatters items to a striped arrangement based on their ranks
/// across the thread block, with a flag to denote validity, using temporary
/// storage.
///
/// \tparam U - [inferred] the output type.
/// \tparam Offset - [inferred] the rank type.
/// \tparam ValidFlag - [inferred] the validity flag type.
///
/// \param [in] input - array that data is loaded from.
/// \param [out] output - array that data is loaded to.
/// \param [in] ranks - array that has rank of data.
/// \param [in] is_valid - array that has flags to denote validity.
/// \param [in] storage - reference to a temporary storage object of type storage_type.
///
/// \par Storage reuse
/// A synchronization barrier should be placed before \p storage is reused
/// or repurposed: \p __syncthreads() or \p rocprim::syncthreads().
///
/// \par Example.
/// \code{.cpp}
/// __global__ void example_kernel(...)
/// {
/// // specialize block_exchange for int, block of 128 threads and 8 items per thread
/// using block_exchange_int = rocprim::block_exchange<int, 128, 8>;
/// // allocate storage in shared memory
/// __shared__ block_exchange_int::storage_type storage;
///
/// int items[8];
/// int ranks[8];
/// int flags[8];
/// ...
/// block_exchange_int b_exchange;
/// b_exchange.scatter_to_striped_flagged(items, items, ranks, flags, storage);
/// ...
/// }
/// \endcode
template<class U, class Offset, class ValidFlag>
ROCPRIM_DEVICE ROCPRIM_INLINE
void scatter_to_striped_flagged(const T (&input)[ItemsPerThread],
U (&output)[ItemsPerThread],
const Offset (&ranks)[ItemsPerThread],
const ValidFlag (&is_valid)[ItemsPerThread],
storage_type& storage)
{
const unsigned int flat_id = ::rocprim::flat_block_thread_id<BlockSizeX, BlockSizeY, BlockSizeZ>();
storage_type_& storage_ = storage.get();
for(unsigned int i = 0; i < ItemsPerThread; i++)
{
const Offset rank = ranks[i];
if(is_valid[i])
{
storage_.buffer[rank] = input[i];
}
}
::rocprim::syncthreads();
for(unsigned int i = 0; i < ItemsPerThread; i++)
{
output[i] = storage_.buffer[i * BlockSize + flat_id];
}
}
private:
ROCPRIM_DEVICE ROCPRIM_INLINE
unsigned int get_current_warp_size() const
{
const unsigned int warp_id = ::rocprim::warp_id<BlockSizeX, BlockSizeY, BlockSizeZ>();
return (warp_id == warps_no - 1)
? (BlockSize % warp_size > 0 ? BlockSize % warp_size : warp_size)
: warp_size;
}
// Change index to minimize LDS bank conflicts if necessary
ROCPRIM_DEVICE ROCPRIM_INLINE
unsigned int index(unsigned int n)
{
// Move every 32-bank wide "row" (32 banks * 4 bytes) by one item
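// e.g. with 32 banks: n = 31 -> 31, n = 32 -> 33, n = 64 -> 66, so items that
// would otherwise map to the same bank land in different banks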
return has_bank_conflicts ? (n + n / banks_no) : n;
}
};
END_ROCPRIM_NAMESPACE
/// @}
// end of group blockmodule
#endif // ROCPRIM_BLOCK_BLOCK_EXCHANGE_HPP_
// Copyright (c) 2017-2021 Advanced Micro Devices, Inc. All rights reserved.
//
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to deal
// in the Software without restriction, including without limitation the rights
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
// copies of the Software, and to permit persons to whom the Software is
// furnished to do so, subject to the following conditions:
//
// The above copyright notice and this permission notice shall be included in
// all copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
// THE SOFTWARE.
#ifndef ROCPRIM_BLOCK_BLOCK_HISTOGRAM_HPP_
#define ROCPRIM_BLOCK_BLOCK_HISTOGRAM_HPP_
#include <type_traits>
#include "../config.hpp"
#include "../detail/various.hpp"
#include "../intrinsics.hpp"
#include "../functional.hpp"
#include "detail/block_histogram_atomic.hpp"
#include "detail/block_histogram_sort.hpp"
BEGIN_ROCPRIM_NAMESPACE
/// \addtogroup blockmodule
/// @{
/// \brief Available algorithms for block_histogram primitive.
enum class block_histogram_algorithm
{
/// Atomic addition is used to update bin count directly.
/// \par Performance Notes:
/// * Performance is dependent on hardware implementation of atomic addition.
/// * Performance may decrease for non-uniform random input distributions
/// where many concurrent updates may be made to the same bin counter.
using_atomic,
/// A two-phase operation is used:
/// * Data is sorted using radix-sort.
/// * "Runs" of same-valued keys are detected using discontinuity; run-lengths
/// are bin counts.
/// \par Performance Notes:
/// * Performance is consistent regardless of sample bin distribution.
using_sort,
/// \brief Default block_histogram algorithm.
default_algorithm = using_atomic,
};
namespace detail
{
// Selects the block histogram implementation type based on the passed
// block_histogram_algorithm enum
template<block_histogram_algorithm Algorithm>
struct select_block_histogram_impl;
template<>
struct select_block_histogram_impl<block_histogram_algorithm::using_atomic>
{
template<class T, unsigned int BlockSizeX, unsigned int BlockSizeY, unsigned int BlockSizeZ, unsigned int ItemsPerThread, unsigned int Bins>
using type = block_histogram_atomic<T, BlockSizeX, BlockSizeY, BlockSizeZ, ItemsPerThread, Bins>;
};
template<>
struct select_block_histogram_impl<block_histogram_algorithm::using_sort>
{
template<class T, unsigned int BlockSizeX, unsigned int BlockSizeY, unsigned int BlockSizeZ, unsigned int ItemsPerThread, unsigned int Bins>
using type = block_histogram_sort<T, BlockSizeX, BlockSizeY, BlockSizeZ, ItemsPerThread, Bins>;
};
} // end namespace detail
/// \brief The block_histogram class is a block level parallel primitive which provides methods
/// for constructing block-wide histograms from items partitioned across threads in a block.
///
/// \tparam T - the input/output type.
/// \tparam BlockSize - the number of threads in a block.
/// \tparam ItemsPerThread - the number of items to be processed by each thread.
/// \tparam Bins - the number of bins within the histogram.
/// \tparam Algorithm - selected histogram algorithm, block_histogram_algorithm::default_algorithm by default.
///
/// \par Overview
/// * block_histogram has two alternative implementations: \p block_histogram_algorithm::using_atomic
/// and \p block_histogram_algorithm::using_sort.
///
/// \par Examples
/// \parblock
/// In the examples a histogram operation is performed on a block of 192 threads; each thread
/// provides two \p int values, and the result is returned using the same variable as the input.
///
/// \code{.cpp}
/// __global__ void example_kernel(...)
/// {
/// // specialize block_histogram for int, logical block of 192 threads,
/// // 2 items per thread and a bin size of 192.
/// using block_histogram_int = rocprim::block_histogram<int, 192, 2, 192>;
/// // allocate storage in shared memory
/// __shared__ block_histogram_int::storage_type storage;
/// __shared__ int hist[192];
///
/// int value[2];
/// ...
/// // execute histogram
/// block_histogram_int().histogram(
/// value, // input
/// hist, // output
/// storage
/// );
/// ...
/// }
/// \endcode
/// \endparblock
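/// \par
/// The sort-based implementation can be selected explicitly through the \p Algorithm
/// template parameter (an illustrative sketch):
/// \code{.cpp}
/// using block_histogram_int =
/// rocprim::block_histogram<int, 192, 2, 192, rocprim::block_histogram_algorithm::using_sort>;
/// \endcode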
template<
class T,
unsigned int BlockSizeX,
unsigned int ItemsPerThread,
unsigned int Bins,
block_histogram_algorithm Algorithm = block_histogram_algorithm::default_algorithm,
unsigned int BlockSizeY = 1,
unsigned int BlockSizeZ = 1
>
class block_histogram
#ifndef DOXYGEN_SHOULD_SKIP_THIS
: private detail::select_block_histogram_impl<Algorithm>::template type<T, BlockSizeX, BlockSizeY, BlockSizeZ, ItemsPerThread, Bins>
#endif
{
using base_type = typename detail::select_block_histogram_impl<Algorithm>::template type<T, BlockSizeX, BlockSizeY, BlockSizeZ, ItemsPerThread, Bins>;
static constexpr unsigned int BlockSize = BlockSizeX * BlockSizeY * BlockSizeZ;
public:
/// \brief Struct used to allocate temporary memory that is required for thread
/// communication during operations provided by the related parallel primitive.
///
/// Depending on the implementation, the operations exposed by the parallel primitive may
/// require temporary storage for thread communication. The storage should be allocated
/// using the keyword <tt>__shared__</tt>. It can be aliased to
/// externally allocated memory, or be a part of a union type with other storage types
/// to increase shared memory reusability.
using storage_type = typename base_type::storage_type;
/// \brief Initialize histogram counters to zero.
///
/// \tparam Counter - [inferred] counter type of histogram.
///
/// \param [out] hist - histogram bin count.
template<class Counter>
ROCPRIM_DEVICE ROCPRIM_INLINE
void init_histogram(Counter hist[Bins])
{
const auto flat_tid = ::rocprim::flat_block_thread_id<BlockSizeX, BlockSizeY, BlockSizeZ>();
ROCPRIM_UNROLL
for(unsigned int offset = 0; offset < Bins; offset += BlockSize)
{
const unsigned int offset_tid = offset + flat_tid;
if(offset_tid < Bins)
{
hist[offset_tid] = Counter();
}
}
}
/// \brief Update an existing block-wide histogram. Each thread composites an array of
/// input elements.
///
/// \tparam Counter - [inferred] counter type of histogram.
///
/// \param [in] input - reference to an array containing thread input values.
/// \param [out] hist - histogram bin count.
/// \param [in] storage - reference to a temporary storage object of type storage_type.
///
/// \par Storage reuse
/// A synchronization barrier should be placed before \p storage is reused
/// or repurposed: \p __syncthreads() or \p rocprim::syncthreads().
///
/// \par Examples
/// \parblock
/// In the examples a histogram operation is performed on a block of 192 threads; each thread
/// provides two \p int values, and the result is returned using the same variable as the input.
///
/// \code{.cpp}
/// __global__ void example_kernel(...)
/// {
/// // specialize block_histogram for int, logical block of 192 threads,
/// // 2 items per thread and a bin size of 192.
/// using block_histogram_int = rocprim::block_histogram<int, 192, 2, 192>;
/// // allocate storage in shared memory
/// __shared__ block_histogram_int::storage_type storage;
/// __shared__ int hist[192];
///
/// int value[2];
/// ...
/// // initialize histogram
/// block_histogram_int().init_histogram(
/// hist // output
/// );
///
/// rocprim::syncthreads();
///
/// // update histogram
/// block_histogram_int().composite(
/// value, // input
/// hist, // output
/// storage
/// );
/// ...
/// }
/// \endcode
/// \endparblock
template<class Counter>
ROCPRIM_DEVICE ROCPRIM_INLINE
void composite(T (&input)[ItemsPerThread],
Counter hist[Bins],
storage_type& storage)
{
base_type::composite(input, hist, storage);
}
/// \overload
/// \brief Update an existing block-wide histogram. Each thread composites an array of
/// input elements.
///
/// * This overload does not accept a storage argument. Required shared memory is
/// allocated by the method itself.
///
/// \tparam Counter - [inferred] counter type of histogram.
///
/// \param [in] input - reference to an array containing thread input values.
/// \param [out] hist - histogram bin count.
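/// \par Example.
/// A minimal illustrative sketch:
/// \code{.cpp}
/// __global__ void example_kernel(...)
/// {
/// using block_histogram_int = rocprim::block_histogram<int, 192, 2, 192>;
/// __shared__ int hist[192];
///
/// int value[2];
/// ...
/// block_histogram_int().init_histogram(hist);
/// rocprim::syncthreads();
/// // shared memory is allocated internally by this overload
/// block_histogram_int().composite(value, hist);
/// ...
/// }
/// \endcode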
template<class Counter>
ROCPRIM_DEVICE ROCPRIM_FORCE_INLINE
void composite(T (&input)[ItemsPerThread],
Counter hist[Bins])
{
base_type::composite(input, hist);
}
/// \brief Construct a new block-wide histogram. Each thread contributes an array of
/// input elements.
///
/// \tparam Counter - [inferred] counter type of histogram.
///
/// \param [in] input - reference to an array containing thread input values.
/// \param [out] hist - histogram bin count.
/// \param [in] storage - reference to a temporary storage object of type storage_type.
///
/// \par Storage reuse
/// A synchronization barrier should be placed before \p storage is reused
/// or repurposed: \p __syncthreads() or \p rocprim::syncthreads().
///
/// \par Examples
/// \parblock
/// In the examples a histogram operation is performed on a block of 192 threads; each thread
/// provides two \p int values, and the result is returned using the same variable as the input.
///
/// \code{.cpp}
/// __global__ void example_kernel(...)
/// {
/// // specialize block_histogram for int, logical block of 192 threads,
/// // 2 items per thread and a bin size of 192.
/// using block_histogram_int = rocprim::block_histogram<int, 192, 2, 192>;
/// // allocate storage in shared memory
/// __shared__ block_histogram_int::storage_type storage;
/// __shared__ int hist[192];
///
/// int value[2];
/// ...
/// // execute histogram
/// block_histogram_int().histogram(
/// value, // input
/// hist, // output
/// storage
/// );
/// ...
/// }
/// \endcode
/// \endparblock
template<class Counter>
ROCPRIM_DEVICE ROCPRIM_INLINE
void histogram(T (&input)[ItemsPerThread],
Counter hist[Bins],
storage_type& storage)
{
init_histogram(hist);
::rocprim::syncthreads();
composite(input, hist, storage);
}
/// \overload
/// \brief Construct a new block-wide histogram. Each thread contributes an array of
/// input elements.
///
/// * This overload does not accept a storage argument. Required shared memory is
/// allocated by the method itself.
///
/// \tparam Counter - [inferred] counter type of histogram.
///
/// \param [in] input - reference to an array containing thread input values.
/// \param [out] hist - histogram bin count.
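/// \par Example.
/// A minimal illustrative sketch:
/// \code{.cpp}
/// __global__ void example_kernel(...)
/// {
/// using block_histogram_int = rocprim::block_histogram<int, 192, 2, 192>;
/// __shared__ int hist[192];
///
/// int value[2];
/// ...
/// // shared memory is allocated internally by this overload
/// block_histogram_int().histogram(value, hist);
/// ...
/// }
/// \endcode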
template<class Counter>
ROCPRIM_DEVICE ROCPRIM_FORCE_INLINE
void histogram(T (&input)[ItemsPerThread],
Counter hist[Bins])
{
init_histogram(hist);
::rocprim::syncthreads();
composite(input, hist);
}
};
END_ROCPRIM_NAMESPACE
/// @}
// end of group blockmodule
#endif // ROCPRIM_BLOCK_BLOCK_HISTOGRAM_HPP_
// Copyright (c) 2017-2021 Advanced Micro Devices, Inc. All rights reserved.
//
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to deal
// in the Software without restriction, including without limitation the rights
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
// copies of the Software, and to permit persons to whom the Software is
// furnished to do so, subject to the following conditions:
//
// The above copyright notice and this permission notice shall be included in
// all copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
// THE SOFTWARE.
#ifndef ROCPRIM_BLOCK_BLOCK_LOAD_HPP_
#define ROCPRIM_BLOCK_BLOCK_LOAD_HPP_
#include "../config.hpp"
#include "../detail/various.hpp"
#include "../intrinsics.hpp"
#include "../functional.hpp"
#include "../types.hpp"
#include "block_load_func.hpp"
#include "block_exchange.hpp"
BEGIN_ROCPRIM_NAMESPACE
/// \addtogroup blockmodule
/// @{
/// \brief \p block_load_method enumerates the methods available to load data
/// from continuous memory into a blocked arrangement of items across the thread block
enum class block_load_method
{
/// Data from continuous memory is loaded into a blocked arrangement of items.
/// \par Performance Notes:
/// * Performance decreases with increasing number of items per thread (stride
/// between reads), because of reduced memory coalescing.
block_load_direct,
/// A striped arrangement of data is read directly from memory.
block_load_striped,
/// Data from continuous memory is loaded into a blocked arrangement of items
/// using vectorization as an optimization.
/// \par Performance Notes:
/// * Performance remains high due to increased memory coalescing, provided that
/// vectorization requirements are fulfilled. Otherwise, performance will default
/// to \p block_load_direct.
/// \par Requirements:
/// * The input offset (\p block_input) must be quad-item aligned.
/// * The following conditions will prevent vectorization and switch to default
/// \p block_load_direct:
/// * \p ItemsPerThread is odd.
/// * The datatype \p T is not a primitive or a HIP vector type (e.g. \p int2,
/// \p int4, etc.).
block_load_vectorize,
/// A striped arrangement of data from continuous memory is locally transposed
/// into a blocked arrangement of items.
/// \par Performance Notes:
/// * Performance remains high due to increased memory coalescing, regardless of the
/// number of items per thread.
/// * Performance may be better compared to \p block_load_direct and
/// \p block_load_vectorize due to reordering on local memory.
block_load_transpose,
/// A warp-striped arrangement of data from continuous memory is locally transposed
/// into a blocked arrangement of items.
/// \par Requirements:
/// * The number of threads in the block must be a multiple of the size of hardware warp.
/// \par Performance Notes:
/// * Performance remains high due to increased memory coalescing, regardless of the
/// number of items per thread.
/// * Performance may be better compared to \p block_load_direct and
/// \p block_load_vectorize due to reordering on local memory.
block_load_warp_transpose,
/// Defaults to \p block_load_direct.
default_method = block_load_direct
};
/// \brief The \p block_load class is a block level parallel primitive which provides methods
/// for loading data from continuous memory into a blocked arrangement of items across the thread
/// block.
///
/// \tparam T - the input/output type.
/// \tparam BlockSize - the number of threads in a block.
/// \tparam ItemsPerThread - the number of items to be processed by
/// each thread.
/// \tparam Method - the method to load data.
///
/// \par Overview
/// * The \p block_load class has a number of different methods to load data:
/// * [block_load_direct](\ref ::block_load_method::block_load_direct)
/// * [block_load_striped](\ref ::block_load_method::block_load_striped)
/// * [block_load_vectorize](\ref ::block_load_method::block_load_vectorize)
/// * [block_load_transpose](\ref ::block_load_method::block_load_transpose)
/// * [block_load_warp_transpose](\ref ::block_load_method::block_load_warp_transpose)
///
/// \par Example:
/// \parblock
/// In the examples a load operation is performed on a block of 128 threads, using type
/// \p int and 8 items per thread.
///
/// \code{.cpp}
/// __global__ void example_kernel(int * input, ...)
/// {
/// const int offset = blockIdx.x * 128 * 8;
/// int items[8];
/// rocprim::block_load<int, 128, 8, load_method> blockload;
/// blockload.load(input + offset, items);
/// ...
/// }
/// \endcode
/// \endparblock
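/// \par
/// In the example above \p load_method is a placeholder; an explicit choice could be
/// (an illustrative sketch):
/// \code{.cpp}
/// using block_load_int =
/// rocprim::block_load<int, 128, 8, rocprim::block_load_method::block_load_transpose>;
/// \endcode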
template<
class T,
unsigned int BlockSizeX,
unsigned int ItemsPerThread,
block_load_method Method = block_load_method::block_load_direct,
unsigned int BlockSizeY = 1,
unsigned int BlockSizeZ = 1
>
class block_load
{
private:
using storage_type_ = typename ::rocprim::detail::empty_storage_type;
public:
/// \brief Struct used to allocate temporary memory that is required for thread
/// communication during operations provided by the related parallel primitive.
///
/// Depending on the implementation, the operations exposed by the parallel primitive may
/// require temporary storage for thread communication. The storage should be allocated
/// using the keyword \p __shared__. It can be aliased to
/// externally allocated memory, or be a part of a union with other storage types
/// to increase shared memory reusability.
#ifndef DOXYGEN_SHOULD_SKIP_THIS // hides storage_type implementation for Doxygen
using storage_type = typename ::rocprim::detail::empty_storage_type;
#else
using storage_type = storage_type_; // only for Doxygen
#endif
/// \brief Loads data from continuous memory into an arrangement of items across the
/// thread block.
///
/// \tparam InputIterator - [inferred] an iterator type for input (can be a simple
/// pointer).
///
/// \param [in] block_input - the input iterator from the thread block to load from.
/// \param [out] items - array that data is loaded to.
///
/// \par Overview
/// * The type \p T must be such that an object of type \p InputIterator
/// can be dereferenced and then implicitly converted to \p T.
template<class InputIterator>
ROCPRIM_DEVICE ROCPRIM_INLINE
void load(InputIterator block_input,
T (&items)[ItemsPerThread])
{
using value_type = typename std::iterator_traits<InputIterator>::value_type;
static_assert(std::is_convertible<value_type, T>::value,
"The type T must be such that an object of type InputIterator "
"can be dereferenced and then implicitly converted to T.");
const unsigned int flat_id = ::rocprim::flat_block_thread_id<BlockSizeX, BlockSizeY, BlockSizeZ>();
block_load_direct_blocked(flat_id, block_input, items);
}
/// \brief Loads data from continuous memory into an arrangement of items across the
/// thread block, which is guarded by range \p valid.
///
/// \tparam InputIterator - [inferred] an iterator type for input (can be a simple
/// pointer).
///
/// \param [in] block_input - the input iterator from the thread block to load from.
/// \param [out] items - array that data is loaded to.
/// \param [in] valid - maximum range of valid numbers to load.
///
/// \par Overview
/// * The type \p T must be such that an object of type \p InputIterator
/// can be dereferenced and then implicitly converted to \p T.
template<class InputIterator>
ROCPRIM_DEVICE ROCPRIM_INLINE
void load(InputIterator block_input,
T (&items)[ItemsPerThread],
unsigned int valid)
{
using value_type = typename std::iterator_traits<InputIterator>::value_type;
static_assert(std::is_convertible<value_type, T>::value,
"The type T must be such that an object of type InputIterator "
"can be dereferenced and then implicitly converted to T.");
const unsigned int flat_id = ::rocprim::flat_block_thread_id<BlockSizeX, BlockSizeY, BlockSizeZ>();
block_load_direct_blocked(flat_id, block_input, items, valid);
}
/// \brief Loads data from continuous memory into an arrangement of items across the
/// thread block, which is guarded by range with a fall-back value for out-of-bound
/// elements.
///
/// \tparam InputIterator - [inferred] an iterator type for input (can be a simple
/// pointer).
/// \tparam Default - [inferred] The data type of the default value.
///
/// \param [in] block_input - the input iterator from the thread block to load from.
/// \param [out] items - array that data is loaded to.
/// \param [in] valid - maximum range of valid numbers to load.
/// \param [in] out_of_bounds - default value assigned to out-of-bound items.
///
/// \par Overview
/// * The type \p T must be such that an object of type \p InputIterator
/// can be dereferenced and then implicitly converted to \p T.
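/// \par Example.
/// A minimal illustrative sketch (\p input and \p valid are placeholders):
/// \code{.cpp}
/// __global__ void example_kernel(int * input, ...)
/// {
/// const int offset = blockIdx.x * 128 * 8;
/// const unsigned int valid = ...; // number of in-range items for this block
/// int items[8];
/// rocprim::block_load<int, 128, 8> bload;
/// // out-of-range items are filled with 0
/// bload.load(input + offset, items, valid, 0);
/// ...
/// }
/// \endcode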
template<
class InputIterator,
class Default
>
ROCPRIM_DEVICE ROCPRIM_INLINE
void load(InputIterator block_input,
T (&items)[ItemsPerThread],
unsigned int valid,
Default out_of_bounds)
{
using value_type = typename std::iterator_traits<InputIterator>::value_type;
static_assert(std::is_convertible<value_type, T>::value,
"The type T must be such that an object of type InputIterator "
"can be dereferenced and then implicitly converted to T.");
const unsigned int flat_id = ::rocprim::flat_block_thread_id<BlockSizeX, BlockSizeY, BlockSizeZ>();
block_load_direct_blocked(flat_id, block_input, items, valid,
out_of_bounds);
}
/// \brief Loads data from continuous memory into an arrangement of items across the
/// thread block, using temporary storage.
///
/// \tparam InputIterator - [inferred] an iterator type for input (can be a simple
/// pointer).
///
/// \param [in] block_input - the input iterator from the thread block to load from.
/// \param [out] items - array that data is loaded to.
/// \param [in] storage - temporary storage for inputs.
///
/// \par Overview
/// * The type \p T must be such that an object of type \p InputIterator
/// can be dereferenced and then implicitly converted to \p T.
///
/// \par Storage reuse
/// A synchronization barrier should be placed before \p storage is reused
/// or repurposed: \p __syncthreads() or \p rocprim::syncthreads().
///
/// \par Example.
/// \code{.cpp}
/// __global__ void example_kernel(...)
/// {
/// int items[8];
/// using block_load_int = rocprim::block_load<int, 128, 8>;
/// block_load_int bload;
/// __shared__ typename block_load_int::storage_type storage;
/// bload.load(..., items, storage);
/// ...
/// }
/// \endcode
template<class InputIterator>
ROCPRIM_DEVICE ROCPRIM_INLINE
void load(InputIterator block_input,
T (&items)[ItemsPerThread],
storage_type& storage)
{
using value_type = typename std::iterator_traits<InputIterator>::value_type;
static_assert(std::is_convertible<value_type, T>::value,
"The type T must be such that an object of type InputIterator "
"can be dereferenced and then implicitly converted to T.");
(void) storage;
load(block_input, items);
}
/// \brief Loads data from continuous memory into an arrangement of items across the
/// thread block, which is guarded by range \p valid, using temporary storage.
///
/// \tparam InputIterator - [inferred] an iterator type for input (can be a simple
/// pointer).
///
/// \param [in] block_input - the input iterator from the thread block to load from.
/// \param [out] items - array that data is loaded to.
/// \param [in] valid - maximum range of valid numbers to load.
/// \param [in] storage - temporary storage for inputs.
///
/// \par Overview
/// * The type \p T must be such that an object of type \p InputIterator
/// can be dereferenced and then implicitly converted to \p T.
///
/// \par Storage reuse
/// A synchronization barrier should be placed before \p storage is reused
/// or repurposed: \p __syncthreads() or \p rocprim::syncthreads().
///
/// \par Example.
/// \code{.cpp}
/// __global__ void example_kernel(...)
/// {
/// int items[8];
/// using block_load_int = rocprim::block_load<int, 128, 8>;
/// block_load_int bload;
/// tile_static typename block_load_int::storage_type storage;
/// bload.load(..., items, valid, storage);
/// ...
/// }
/// \endcode
template<class InputIterator>
ROCPRIM_DEVICE ROCPRIM_INLINE
void load(InputIterator block_input,
T (&items)[ItemsPerThread],
unsigned int valid,
storage_type& storage)
{
using value_type = typename std::iterator_traits<InputIterator>::value_type;
static_assert(std::is_convertible<value_type, T>::value,
"The type T must be such that an object of type InputIterator "
"can be dereferenced and then implicitly converted to T.");
(void) storage;
load(block_input, items, valid);
}
/// \brief Loads data from continuous memory into an arrangement of items across the
/// thread block, which is guarded by range with a fall-back value for out-of-bound
/// elements, using temporary storage.
///
/// \tparam InputIterator - [inferred] an iterator type for input (can be a simple
/// pointer).
/// \tparam Default - [inferred] The data type of the default value.
///
/// \param [in] block_input - the input iterator from the thread block to load from.
/// \param [out] items - array that data is loaded to.
/// \param [in] valid - maximum range of valid numbers to load.
/// \param [in] out_of_bounds - default value assigned to out-of-bound items.
/// \param [in] storage - temporary storage for inputs.
///
/// \par Overview
/// * The type \p T must be such that an object of type \p InputIterator
/// can be dereferenced and then implicitly converted to \p T.
///
/// \par Storage reuse
/// A synchronization barrier should be placed before \p storage is reused
/// or repurposed: \p __syncthreads() or \p rocprim::syncthreads().
///
/// \par Example.
/// \code{.cpp}
/// __global__ void example_kernel(...)
/// {
/// int items[8];
/// using block_load_int = rocprim::block_load<int, 128, 8>;
/// block_load_int bload;
/// __shared__ typename block_load_int::storage_type storage;
/// bload.load(..., items, valid, out_of_bounds, storage);
/// ...
/// }
/// \endcode
template<
class InputIterator,
class Default
>
ROCPRIM_DEVICE ROCPRIM_INLINE
void load(InputIterator block_input,
T (&items)[ItemsPerThread],
unsigned int valid,
Default out_of_bounds,
storage_type& storage)
{
using value_type = typename std::iterator_traits<InputIterator>::value_type;
static_assert(std::is_convertible<value_type, T>::value,
"The type T must be such that an object of type InputIterator "
"can be dereferenced and then implicitly converted to T.");
(void) storage;
load(block_input, items, valid, out_of_bounds);
}
};
/// @}
// end of group blockmodule
#ifndef DOXYGEN_SHOULD_SKIP_THIS
template<
class T,
unsigned int BlockSizeX,
unsigned int ItemsPerThread,
unsigned int BlockSizeY,
unsigned int BlockSizeZ
>
class block_load<T, BlockSizeX, ItemsPerThread, block_load_method::block_load_striped, BlockSizeY, BlockSizeZ>
{
static constexpr unsigned int BlockSize = BlockSizeX * BlockSizeY * BlockSizeZ;
private:
using storage_type_ = typename ::rocprim::detail::empty_storage_type;
public:
#ifndef DOXYGEN_SHOULD_SKIP_THIS // hides storage_type implementation for Doxygen
using storage_type = typename ::rocprim::detail::empty_storage_type;
#else
using storage_type = storage_type_; // only for Doxygen
#endif
template<class InputIterator>
ROCPRIM_DEVICE inline
void load(InputIterator block_input,
T (&items)[ItemsPerThread])
{
using value_type = typename std::iterator_traits<InputIterator>::value_type;
static_assert(std::is_convertible<value_type, T>::value,
"The type T must be such that an object of type InputIterator "
"can be dereferenced and then implicitly converted to T.");
const unsigned int flat_id = ::rocprim::flat_block_thread_id<BlockSizeX, BlockSizeY, BlockSizeZ>();
block_load_direct_striped<BlockSize>(flat_id, block_input, items);
}
template<class InputIterator>
ROCPRIM_DEVICE inline
void load(InputIterator block_input,
T (&items)[ItemsPerThread],
unsigned int valid)
{
using value_type = typename std::iterator_traits<InputIterator>::value_type;
static_assert(std::is_convertible<value_type, T>::value,
"The type T must be such that an object of type InputIterator "
"can be dereferenced and then implicitly converted to T.");
const unsigned int flat_id = ::rocprim::flat_block_thread_id<BlockSizeX, BlockSizeY, BlockSizeZ>();
block_load_direct_striped<BlockSize>(flat_id, block_input, items, valid);
}
template<
class InputIterator,
class Default
>
ROCPRIM_DEVICE inline
void load(InputIterator block_input,
T (&items)[ItemsPerThread],
unsigned int valid,
Default out_of_bounds)
{
using value_type = typename std::iterator_traits<InputIterator>::value_type;
static_assert(std::is_convertible<value_type, T>::value,
"The type T must be such that an object of type InputIterator "
"can be dereferenced and then implicitly converted to T.");
const unsigned int flat_id = ::rocprim::flat_block_thread_id<BlockSizeX, BlockSizeY, BlockSizeZ>();
block_load_direct_striped<BlockSize>(flat_id, block_input, items, valid,
out_of_bounds);
}
template<class InputIterator>
ROCPRIM_DEVICE inline
void load(InputIterator block_input,
T (&items)[ItemsPerThread],
storage_type& storage)
{
using value_type = typename std::iterator_traits<InputIterator>::value_type;
static_assert(std::is_convertible<value_type, T>::value,
"The type T must be such that an object of type InputIterator "
"can be dereferenced and then implicitly converted to T.");
(void) storage;
const unsigned int flat_id = ::rocprim::flat_block_thread_id<BlockSizeX, BlockSizeY, BlockSizeZ>();
block_load_direct_striped<BlockSize>(flat_id, block_input, items);
}
template<class InputIterator>
ROCPRIM_DEVICE inline
void load(InputIterator block_input,
T (&items)[ItemsPerThread],
unsigned int valid,
storage_type& storage)
{
using value_type = typename std::iterator_traits<InputIterator>::value_type;
static_assert(std::is_convertible<value_type, T>::value,
"The type T must be such that an object of type InputIterator "
"can be dereferenced and then implicitly converted to T.");
(void) storage;
const unsigned int flat_id = ::rocprim::flat_block_thread_id<BlockSizeX, BlockSizeY, BlockSizeZ>();
block_load_direct_striped<BlockSize>(flat_id, block_input, items, valid);
}
template<
class InputIterator,
class Default
>
ROCPRIM_DEVICE inline
void load(InputIterator block_input,
T (&items)[ItemsPerThread],
unsigned int valid,
Default out_of_bounds,
storage_type& storage)
{
using value_type = typename std::iterator_traits<InputIterator>::value_type;
static_assert(std::is_convertible<value_type, T>::value,
"The type T must be such that an object of type InputIterator "
"can be dereferenced and then implicitly converted to T.");
(void) storage;
const unsigned int flat_id = ::rocprim::flat_block_thread_id<BlockSizeX, BlockSizeY, BlockSizeZ>();
block_load_direct_striped<BlockSize>(flat_id, block_input, items, valid,
out_of_bounds);
}
};
template<
class T,
unsigned int BlockSizeX,
unsigned int ItemsPerThread,
unsigned int BlockSizeY,
unsigned int BlockSizeZ
>
class block_load<T, BlockSizeX, ItemsPerThread, block_load_method::block_load_vectorize, BlockSizeY, BlockSizeZ>
{
private:
using storage_type_ = typename ::rocprim::detail::empty_storage_type;
public:
#ifndef DOXYGEN_SHOULD_SKIP_THIS // hides storage_type implementation for Doxygen
using storage_type = typename ::rocprim::detail::empty_storage_type;
#else
using storage_type = storage_type_; // only for Doxygen
#endif
ROCPRIM_DEVICE ROCPRIM_INLINE
void load(T* block_input,
T (&_items)[ItemsPerThread])
{
const unsigned int flat_id = ::rocprim::flat_block_thread_id<BlockSizeX, BlockSizeY, BlockSizeZ>();
block_load_direct_blocked_vectorized(flat_id, block_input, _items);
}
template<class InputIterator, class U>
ROCPRIM_DEVICE ROCPRIM_INLINE
void load(InputIterator block_input,
U (&items)[ItemsPerThread])
{
using value_type = typename std::iterator_traits<InputIterator>::value_type;
static_assert(std::is_convertible<value_type, T>::value,
"The type T must be such that an object of type InputIterator "
"can be dereferenced and then implicitly converted to T.");
const unsigned int flat_id = ::rocprim::flat_block_thread_id<BlockSizeX, BlockSizeY, BlockSizeZ>();
block_load_direct_blocked(flat_id, block_input, items);
}
template<class InputIterator>
ROCPRIM_DEVICE ROCPRIM_INLINE
void load(InputIterator block_input,
T (&items)[ItemsPerThread],
unsigned int valid)
{
using value_type = typename std::iterator_traits<InputIterator>::value_type;
static_assert(std::is_convertible<value_type, T>::value,
"The type T must be such that an object of type InputIterator "
"can be dereferenced and then implicitly converted to T.");
const unsigned int flat_id = ::rocprim::flat_block_thread_id<BlockSizeX, BlockSizeY, BlockSizeZ>();
block_load_direct_blocked(flat_id, block_input, items, valid);
}
template<
class InputIterator,
class Default
>
ROCPRIM_DEVICE ROCPRIM_INLINE
void load(InputIterator block_input,
T (&items)[ItemsPerThread],
unsigned int valid,
Default out_of_bounds)
{
using value_type = typename std::iterator_traits<InputIterator>::value_type;
static_assert(std::is_convertible<value_type, T>::value,
"The type T must be such that an object of type InputIterator "
"can be dereferenced and then implicitly converted to T.");
const unsigned int flat_id = ::rocprim::flat_block_thread_id<BlockSizeX, BlockSizeY, BlockSizeZ>();
block_load_direct_blocked(flat_id, block_input, items, valid,
out_of_bounds);
}
ROCPRIM_DEVICE ROCPRIM_INLINE
void load(T* block_input,
T (&items)[ItemsPerThread],
storage_type& storage)
{
(void) storage;
load(block_input, items);
}
template<class InputIterator, class U>
ROCPRIM_DEVICE ROCPRIM_INLINE
void load(InputIterator block_input,
U (&items)[ItemsPerThread],
storage_type& storage)
{
using value_type = typename std::iterator_traits<InputIterator>::value_type;
static_assert(std::is_convertible<value_type, T>::value,
"The type T must be such that an object of type InputIterator "
"can be dereferenced and then implicitly converted to T.");
(void) storage;
load(block_input, items);
}
template<class InputIterator>
ROCPRIM_DEVICE ROCPRIM_INLINE
void load(InputIterator block_input,
T (&items)[ItemsPerThread],
unsigned int valid,
storage_type& storage)
{
using value_type = typename std::iterator_traits<InputIterator>::value_type;
static_assert(std::is_convertible<value_type, T>::value,
"The type T must be such that an object of type InputIterator "
"can be dereferenced and then implicitly converted to T.");
(void) storage;
load(block_input, items, valid);
}
template<
class InputIterator,
class Default
>
ROCPRIM_DEVICE ROCPRIM_INLINE
void load(InputIterator block_input,
T (&items)[ItemsPerThread],
unsigned int valid,
Default out_of_bounds,
storage_type& storage)
{
using value_type = typename std::iterator_traits<InputIterator>::value_type;
static_assert(std::is_convertible<value_type, T>::value,
"The type T must be such that an object of type InputIterator "
"can be dereferenced and then implicitly converted to T.");
(void) storage;
load(block_input, items, valid, out_of_bounds);
}
};
template<
class T,
unsigned int BlockSizeX,
unsigned int ItemsPerThread,
unsigned int BlockSizeY,
unsigned int BlockSizeZ
>
class block_load<T, BlockSizeX, ItemsPerThread, block_load_method::block_load_transpose, BlockSizeY, BlockSizeZ>
{
static constexpr unsigned int BlockSize = BlockSizeX * BlockSizeY * BlockSizeZ;
private:
using block_exchange_type = block_exchange<T, BlockSize, ItemsPerThread>;
public:
using storage_type = typename block_exchange_type::storage_type;
template<class InputIterator>
ROCPRIM_DEVICE ROCPRIM_FORCE_INLINE
void load(InputIterator block_input,
T (&items)[ItemsPerThread])
{
using value_type = typename std::iterator_traits<InputIterator>::value_type;
static_assert(std::is_convertible<value_type, T>::value,
"The type T must be such that an object of type InputIterator "
"can be dereferenced and then implicitly converted to T.");
ROCPRIM_SHARED_MEMORY storage_type storage;
const unsigned int flat_id = ::rocprim::flat_block_thread_id<BlockSizeX, BlockSizeY, BlockSizeZ>();
block_load_direct_striped<BlockSize>(flat_id, block_input, items);
block_exchange_type().striped_to_blocked(items, items, storage);
}
template<class InputIterator>
ROCPRIM_DEVICE ROCPRIM_FORCE_INLINE
void load(InputIterator block_input,
T (&items)[ItemsPerThread],
unsigned int valid)
{
using value_type = typename std::iterator_traits<InputIterator>::value_type;
static_assert(std::is_convertible<value_type, T>::value,
"The type T must be such that an object of type InputIterator "
"can be dereferenced and then implicitly converted to T.");
ROCPRIM_SHARED_MEMORY storage_type storage;
const unsigned int flat_id = ::rocprim::flat_block_thread_id<BlockSizeX, BlockSizeY, BlockSizeZ>();
block_load_direct_striped<BlockSize>(flat_id, block_input, items, valid);
block_exchange_type().striped_to_blocked(items, items, storage);
}
template<
class InputIterator,
class Default
>
ROCPRIM_DEVICE ROCPRIM_FORCE_INLINE
void load(InputIterator block_input,
T (&items)[ItemsPerThread],
unsigned int valid,
Default out_of_bounds)
{
using value_type = typename std::iterator_traits<InputIterator>::value_type;
static_assert(std::is_convertible<value_type, T>::value,
"The type T must be such that an object of type InputIterator "
"can be dereferenced and then implicitly converted to T.");
ROCPRIM_SHARED_MEMORY storage_type storage;
const unsigned int flat_id = ::rocprim::flat_block_thread_id<BlockSizeX, BlockSizeY, BlockSizeZ>();
block_load_direct_striped<BlockSize>(flat_id, block_input, items, valid,
out_of_bounds);
block_exchange_type().striped_to_blocked(items, items, storage);
}
template<class InputIterator>
ROCPRIM_DEVICE ROCPRIM_INLINE
void load(InputIterator block_input,
T (&items)[ItemsPerThread],
storage_type& storage)
{
using value_type = typename std::iterator_traits<InputIterator>::value_type;
static_assert(std::is_convertible<value_type, T>::value,
"The type T must be such that an object of type InputIterator "
"can be dereferenced and then implicitly converted to T.");
const unsigned int flat_id = ::rocprim::flat_block_thread_id<BlockSizeX, BlockSizeY, BlockSizeZ>();
block_load_direct_striped<BlockSize>(flat_id, block_input, items);
block_exchange_type().striped_to_blocked(items, items, storage);
}
template<class InputIterator>
ROCPRIM_DEVICE ROCPRIM_INLINE
void load(InputIterator block_input,
T (&items)[ItemsPerThread],
unsigned int valid,
storage_type& storage)
{
using value_type = typename std::iterator_traits<InputIterator>::value_type;
static_assert(std::is_convertible<value_type, T>::value,
"The type T must be such that an object of type InputIterator "
"can be dereferenced and then implicitly converted to T.");
const unsigned int flat_id = ::rocprim::flat_block_thread_id<BlockSizeX, BlockSizeY, BlockSizeZ>();
block_load_direct_striped<BlockSize>(flat_id, block_input, items, valid);
block_exchange_type().striped_to_blocked(items, items, storage);
}
template<
class InputIterator,
class Default
>
ROCPRIM_DEVICE ROCPRIM_INLINE
void load(InputIterator block_input,
T (&items)[ItemsPerThread],
unsigned int valid,
Default out_of_bounds,
storage_type& storage)
{
using value_type = typename std::iterator_traits<InputIterator>::value_type;
static_assert(std::is_convertible<value_type, T>::value,
"The type T must be such that an object of type InputIterator "
"can be dereferenced and then implicitly converted to T.");
const unsigned int flat_id = ::rocprim::flat_block_thread_id<BlockSizeX, BlockSizeY, BlockSizeZ>();
block_load_direct_striped<BlockSize>(flat_id, block_input, items, valid,
out_of_bounds);
block_exchange_type().striped_to_blocked(items, items, storage);
}
};
template<
class T,
unsigned int BlockSizeX,
unsigned int ItemsPerThread,
unsigned int BlockSizeY,
unsigned int BlockSizeZ
>
class block_load<T, BlockSizeX, ItemsPerThread, block_load_method::block_load_warp_transpose, BlockSizeY, BlockSizeZ>
{
static constexpr unsigned int BlockSize = BlockSizeX * BlockSizeY * BlockSizeZ;
private:
using block_exchange_type = block_exchange<T, BlockSizeX, ItemsPerThread, BlockSizeY, BlockSizeZ>;
public:
static_assert(BlockSize % ::rocprim::device_warp_size() == 0,
"BlockSize must be a multiple of hardware warpsize");
using storage_type = typename block_exchange_type::storage_type;
template<class InputIterator>
ROCPRIM_DEVICE ROCPRIM_FORCE_INLINE
void load(InputIterator block_input,
T (&items)[ItemsPerThread])
{
using value_type = typename std::iterator_traits<InputIterator>::value_type;
static_assert(std::is_convertible<value_type, T>::value,
"The type T must be such that an object of type InputIterator "
"can be dereferenced and then implicitly converted to T.");
ROCPRIM_SHARED_MEMORY storage_type storage;
const unsigned int flat_id = ::rocprim::flat_block_thread_id<BlockSizeX, BlockSizeY, BlockSizeZ>();
block_load_direct_warp_striped(flat_id, block_input, items);
block_exchange_type().warp_striped_to_blocked(items, items, storage);
}
template<class InputIterator>
ROCPRIM_DEVICE ROCPRIM_FORCE_INLINE
void load(InputIterator block_input,
T (&items)[ItemsPerThread],
unsigned int valid)
{
using value_type = typename std::iterator_traits<InputIterator>::value_type;
static_assert(std::is_convertible<value_type, T>::value,
"The type T must be such that an object of type InputIterator "
"can be dereferenced and then implicitly converted to T.");
ROCPRIM_SHARED_MEMORY storage_type storage;
const unsigned int flat_id = ::rocprim::flat_block_thread_id<BlockSizeX, BlockSizeY, BlockSizeZ>();
block_load_direct_warp_striped(flat_id, block_input, items, valid);
block_exchange_type().warp_striped_to_blocked(items, items, storage);
}
template<
class InputIterator,
class Default
>
ROCPRIM_DEVICE ROCPRIM_FORCE_INLINE
void load(InputIterator block_input,
T (&items)[ItemsPerThread],
unsigned int valid,
Default out_of_bounds)
{
using value_type = typename std::iterator_traits<InputIterator>::value_type;
static_assert(std::is_convertible<value_type, T>::value,
"The type T must be such that an object of type InputIterator "
"can be dereferenced and then implicitly converted to T.");
ROCPRIM_SHARED_MEMORY storage_type storage;
const unsigned int flat_id = ::rocprim::flat_block_thread_id<BlockSizeX, BlockSizeY, BlockSizeZ>();
block_load_direct_warp_striped(flat_id, block_input, items, valid,
out_of_bounds);
block_exchange_type().warp_striped_to_blocked(items, items, storage);
}
template<class InputIterator>
ROCPRIM_DEVICE ROCPRIM_INLINE
void load(InputIterator block_input,
T (&items)[ItemsPerThread],
storage_type& storage)
{
using value_type = typename std::iterator_traits<InputIterator>::value_type;
static_assert(std::is_convertible<value_type, T>::value,
"The type T must be such that an object of type InputIterator "
"can be dereferenced and then implicitly converted to T.");
const unsigned int flat_id = ::rocprim::flat_block_thread_id<BlockSizeX, BlockSizeY, BlockSizeZ>();
block_load_direct_warp_striped(flat_id, block_input, items);
block_exchange_type().warp_striped_to_blocked(items, items, storage);
}
template<class InputIterator>
ROCPRIM_DEVICE ROCPRIM_INLINE
void load(InputIterator block_input,
T (&items)[ItemsPerThread],
unsigned int valid,
storage_type& storage)
{
using value_type = typename std::iterator_traits<InputIterator>::value_type;
static_assert(std::is_convertible<value_type, T>::value,
"The type T must be such that an object of type InputIterator "
"can be dereferenced and then implicitly converted to T.");
const unsigned int flat_id = ::rocprim::flat_block_thread_id<BlockSizeX, BlockSizeY, BlockSizeZ>();
block_load_direct_warp_striped(flat_id, block_input, items, valid);
block_exchange_type().warp_striped_to_blocked(items, items, storage);
}
template<
class InputIterator,
class Default
>
ROCPRIM_DEVICE ROCPRIM_INLINE
void load(InputIterator block_input,
T (&items)[ItemsPerThread],
unsigned int valid,
Default out_of_bounds,
storage_type& storage)
{
using value_type = typename std::iterator_traits<InputIterator>::value_type;
static_assert(std::is_convertible<value_type, T>::value,
"The type T must be such that an object of type InputIterator "
"can be dereferenced and then implicitly converted to T.");
const unsigned int flat_id = ::rocprim::flat_block_thread_id<BlockSizeX, BlockSizeY, BlockSizeZ>();
block_load_direct_warp_striped(flat_id, block_input, items, valid,
out_of_bounds);
block_exchange_type().warp_striped_to_blocked(items, items, storage);
}
};
#endif // DOXYGEN_SHOULD_SKIP_THIS
END_ROCPRIM_NAMESPACE
#endif // ROCPRIM_BLOCK_BLOCK_LOAD_HPP_
// Copyright (c) 2017-2021 Advanced Micro Devices, Inc. All rights reserved.
//
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to deal
// in the Software without restriction, including without limitation the rights
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
// copies of the Software, and to permit persons to whom the Software is
// furnished to do so, subject to the following conditions:
//
// The above copyright notice and this permission notice shall be included in
// all copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
// THE SOFTWARE.
#ifndef ROCPRIM_BLOCK_BLOCK_LOAD_FUNC_HPP_
#define ROCPRIM_BLOCK_BLOCK_LOAD_FUNC_HPP_
#include "../config.hpp"
#include "../detail/various.hpp"
#include "../intrinsics.hpp"
#include "../functional.hpp"
#include "../types.hpp"
BEGIN_ROCPRIM_NAMESPACE
/// \addtogroup blockmodule
/// @{
/// \brief Loads data from continuous memory into a blocked arrangement of items
/// across the thread block.
///
/// The block arrangement is assumed to be (block-threads * \p ItemsPerThread) items
/// across a thread block. Each thread uses a \p flat_id to load a range of
/// \p ItemsPerThread elements into \p items.
///
/// \tparam InputIterator - [inferred] an iterator type for input (can be a simple
/// pointer).
/// \tparam T - [inferred] the data type
/// \tparam ItemsPerThread - [inferred] the number of items to be processed by
/// each thread
///
/// \param flat_id - a local flat 1D thread id in a block (tile) for the calling thread
/// \param block_input - the input iterator from the thread block to load from
/// \param items - array that data is loaded to
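/// \par Example.
/// An illustrative sketch (the kernel signature is a placeholder):
/// \code{.cpp}
/// __global__ void example_kernel(int * input, ...)
/// {
/// const unsigned int flat_id = rocprim::flat_block_thread_id<128, 1, 1>();
/// int items[4];
/// // thread i loads input[block_offset + i*4] .. input[block_offset + i*4 + 3]
/// rocprim::block_load_direct_blocked(flat_id, input + blockIdx.x * 128 * 4, items);
/// ...
/// }
/// \endcode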
template<
class InputIterator,
class T,
unsigned int ItemsPerThread
>
ROCPRIM_DEVICE ROCPRIM_INLINE
void block_load_direct_blocked(unsigned int flat_id,
InputIterator block_input,
T (&items)[ItemsPerThread])
{
unsigned int offset = flat_id * ItemsPerThread;
InputIterator thread_iter = block_input + offset;
ROCPRIM_UNROLL
for (unsigned int item = 0; item < ItemsPerThread; item++)
{
items[item] = thread_iter[item];
}
}
/// \brief Loads data from continuous memory into a blocked arrangement of items
/// across the thread block, which is guarded by range \p valid.
///
/// The block arrangement is assumed to be (block-threads * \p ItemsPerThread) items
/// across a thread block. Each thread uses a \p flat_id to load a range of
/// \p ItemsPerThread elements into \p items.
///
/// \tparam InputIterator - [inferred] an iterator type for input (can be a simple
/// pointer).
/// \tparam T - [inferred] the data type
/// \tparam ItemsPerThread - [inferred] the number of items to be processed by
/// each thread
///
/// \param flat_id - a local flat 1D thread id in a block (tile) for the calling thread
/// \param block_input - the input iterator from the thread block to load from
/// \param items - array that data is loaded to
/// \param valid - maximum range of valid numbers to load
template<
class InputIterator,
class T,
unsigned int ItemsPerThread
>
ROCPRIM_DEVICE ROCPRIM_INLINE
void block_load_direct_blocked(unsigned int flat_id,
InputIterator block_input,
T (&items)[ItemsPerThread],
unsigned int valid)
{
unsigned int offset = flat_id * ItemsPerThread;
InputIterator thread_iter = block_input + offset;
ROCPRIM_UNROLL
for (unsigned int item = 0; item < ItemsPerThread; item++)
{
if (item + offset < valid)
{
items[item] = thread_iter[item];
}
}
}
/// \brief Loads data from continuous memory into a blocked arrangement of items
/// across the thread block, which is guarded by range with a fall-back value
/// for out-of-bound elements.
///
/// The block arrangement is assumed to be (block-threads * \p ItemsPerThread) items
/// across a thread block. Each thread uses a \p flat_id to load a range of
/// \p ItemsPerThread elements into \p items.
///
/// \tparam InputIterator - [inferred] an iterator type for input (can be a simple
/// pointer).
/// \tparam T - [inferred] the data type
/// \tparam ItemsPerThread - [inferred] the number of items to be processed by
/// each thread
/// \tparam Default - [inferred] The data type of the default value
///
/// \param flat_id - a local flat 1D thread id in a block (tile) for the calling thread
/// \param block_input - the input iterator from the thread block to load from
/// \param items - array that data is loaded to
/// \param valid - maximum range of valid numbers to load
/// \param out_of_bounds - default value assigned to out-of-bound items
template<
class InputIterator,
class T,
unsigned int ItemsPerThread,
class Default
>
ROCPRIM_DEVICE ROCPRIM_INLINE
void block_load_direct_blocked(unsigned int flat_id,
InputIterator block_input,
T (&items)[ItemsPerThread],
unsigned int valid,
Default out_of_bounds)
{
ROCPRIM_UNROLL
for (unsigned int item = 0; item < ItemsPerThread; item++)
{
items[item] = static_cast<T>(out_of_bounds);
}
// TODO: Consider using std::fill for HIP-CPU, as it uses memset() where appropriate
block_load_direct_blocked(flat_id, block_input, items, valid);
}
/// \brief Loads data from continuous memory into a blocked arrangement of items
/// across the thread block.
///
/// The block arrangement is assumed to be (block-threads * \p ItemsPerThread) items
/// across a thread block. Each thread uses a \p flat_id to load a range of
/// \p ItemsPerThread elements into \p items.
///
/// The input offset (\p block_input + offset) must be quad-item aligned.
///
/// The following conditions will prevent vectorization and switch to default
/// block_load_direct_blocked:
/// * \p ItemsPerThread is odd.
/// * The datatype \p T is not a primitive or a HIP vector type (e.g. \p int2,
/// \p int4, etc.).
///
/// \tparam T - [inferred] the input data type
/// \tparam U - [inferred] the output data type
/// \tparam ItemsPerThread - [inferred] the number of items to be processed by
/// each thread
///
/// The type \p T must be such that it can be implicitly converted to \p U.
///
/// \param flat_id - a local flat 1D thread id in a block (tile) for the calling thread
/// \param block_input - the input iterator from the thread block to load from
/// \param items - array that data is loaded to
template<
class T,
class U,
unsigned int ItemsPerThread
>
ROCPRIM_DEVICE ROCPRIM_INLINE
auto
block_load_direct_blocked_vectorized(unsigned int flat_id,
T* block_input,
U (&items)[ItemsPerThread]) -> typename std::enable_if<detail::is_vectorizable<T, ItemsPerThread>::value>::type
{
typedef typename detail::match_vector_type<T, ItemsPerThread>::type vector_type;
constexpr unsigned int vectors_per_thread = (sizeof(T) * ItemsPerThread) / sizeof(vector_type);
vector_type vector_items[vectors_per_thread];
const vector_type* vector_ptr = reinterpret_cast<const vector_type*>(block_input) +
(flat_id * vectors_per_thread);
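// load wide, aligned vector chunks first; the second loop unpacks them into
// the scalar items array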
ROCPRIM_UNROLL
for (unsigned int item = 0; item < vectors_per_thread; item++)
{
vector_items[item] = *(vector_ptr + item);
}
ROCPRIM_UNROLL
for (unsigned int item = 0; item < ItemsPerThread; item++)
{
items[item] = *(reinterpret_cast<T*>(vector_items) + item);
}
}
template<
class T,
class U,
unsigned int ItemsPerThread
>
ROCPRIM_DEVICE ROCPRIM_INLINE
auto
block_load_direct_blocked_vectorized(unsigned int flat_id,
T* block_input,
U (&items)[ItemsPerThread]) -> typename std::enable_if<!detail::is_vectorizable<T, ItemsPerThread>::value>::type
{
block_load_direct_blocked(flat_id, block_input, items);
}
/// \brief Loads data from continuous memory into a striped arrangement of items
/// across the thread block.
///
/// The striped arrangement is assumed to be (\p BlockSize * \p ItemsPerThread) items
/// across a thread block. Each thread uses a \p flat_id to load a range of
/// \p ItemsPerThread elements into \p items.
///
/// \tparam BlockSize - the number of threads in a block
/// \tparam InputIterator - [inferred] an iterator type for input (can be a simple
/// pointer).
/// \tparam T - [inferred] the data type
/// \tparam ItemsPerThread - [inferred] the number of items to be processed by
/// each thread
///
/// \param flat_id - a local flat 1D thread id in a block (tile) for the calling thread
/// \param block_input - the input iterator from the thread block to load from
/// \param items - array that data is loaded to
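/// \par Example.
/// An illustrative sketch: with \p BlockSize = 128, thread \p t loads elements
/// \p t, \p t+128, \p t+256, ... of the block's range.
/// \code{.cpp}
/// __global__ void example_kernel(int * input, ...)
/// {
/// const unsigned int flat_id = rocprim::flat_block_thread_id<128, 1, 1>();
/// int items[4];
/// rocprim::block_load_direct_striped<128>(flat_id, input + blockIdx.x * 128 * 4, items);
/// ...
/// }
/// \endcode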
template<
unsigned int BlockSize,
class InputIterator,
class T,
unsigned int ItemsPerThread
>
ROCPRIM_DEVICE ROCPRIM_INLINE
void block_load_direct_striped(unsigned int flat_id,
InputIterator block_input,
T (&items)[ItemsPerThread])
{
InputIterator thread_iter = block_input + flat_id;
ROCPRIM_UNROLL
for (unsigned int item = 0; item < ItemsPerThread; item++)
{
items[item] = thread_iter[item * BlockSize];
}
}
/// \brief Loads data from continuous memory into a striped arrangement of items
/// across the thread block, which is guarded by range \p valid.
///
/// The striped arrangement is assumed to be (\p BlockSize * \p ItemsPerThread) items
/// across a thread block. Each thread uses a \p flat_id to load a range of
/// \p ItemsPerThread elements into \p items.
///
/// \tparam BlockSize - the number of threads in a block
/// \tparam InputIterator - [inferred] an iterator type for input (can be a simple
/// pointer).
/// \tparam T - [inferred] the data type
/// \tparam ItemsPerThread - [inferred] the number of items to be processed by
/// each thread
///
/// \param flat_id - a local flat 1D thread id in a block (tile) for the calling thread
/// \param block_input - the input iterator from the thread block to load from
/// \param items - array that data is loaded to
/// \param valid - maximum range of valid numbers to load
template<
unsigned int BlockSize,
class InputIterator,
class T,
unsigned int ItemsPerThread
>
ROCPRIM_DEVICE ROCPRIM_INLINE
void block_load_direct_striped(unsigned int flat_id,
InputIterator block_input,
T (&items)[ItemsPerThread],
unsigned int valid)
{
InputIterator thread_iter = block_input + flat_id;
ROCPRIM_UNROLL
for (unsigned int item = 0; item < ItemsPerThread; item++)
{
unsigned int offset = item * BlockSize;
if (flat_id + offset < valid)
{
items[item] = thread_iter[offset];
}
}
}
/// \brief Loads data from continuous memory into a striped arrangement of items
/// across the thread block, which is guarded by range with a fall-back value
/// for out-of-bound elements.
///
/// The striped arrangement is assumed to be (\p BlockSize * \p ItemsPerThread) items
/// across a thread block. Each thread uses a \p flat_id to load a range of
/// \p ItemsPerThread elements into \p items.
///
/// \tparam BlockSize - the number of threads in a block
/// \tparam InputIterator - [inferred] an iterator type for input (can be a simple
/// pointer).
/// \tparam T - [inferred] the data type
/// \tparam ItemsPerThread - [inferred] the number of items to be processed by
/// each thread
/// \tparam Default - [inferred] The data type of the default value
///
/// \param flat_id - a local flat 1D thread id in a block (tile) for the calling thread
/// \param block_input - the input iterator from the thread block to load from
/// \param items - array that data is loaded to
/// \param valid - maximum range of valid numbers to load
/// \param out_of_bounds - default value assigned to out-of-bound items
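///
/// \par Example
/// A minimal sketch (hypothetical kernel); every out-of-range item is filled with
/// the given default before the guarded load runs:
/// \code{.cpp}
/// __global__ void example_kernel(int* block_input, unsigned int valid_count)
/// {
///     int items[4];
///     // Items past valid_count are set to -1 instead of being left unmodified.
///     rocprim::block_load_direct_striped<128>(hipThreadIdx_x, block_input,
///                                             items, valid_count, -1);
/// }
/// \endcode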
template<
unsigned int BlockSize,
class InputIterator,
class T,
unsigned int ItemsPerThread,
class Default
>
ROCPRIM_DEVICE ROCPRIM_INLINE
void block_load_direct_striped(unsigned int flat_id,
InputIterator block_input,
T (&items)[ItemsPerThread],
unsigned int valid,
Default out_of_bounds)
{
ROCPRIM_UNROLL
for (unsigned int item = 0; item < ItemsPerThread; item++)
{
items[item] = out_of_bounds;
}
block_load_direct_striped<BlockSize>(flat_id, block_input, items, valid);
}

/// \brief Loads data from contiguous memory into a warp-striped arrangement of items
/// across the thread block.
///
/// The warp-striped arrangement is assumed to be (\p WarpSize * \p ItemsPerThread) items
/// across a thread block. Each thread uses a \p flat_id to load a range of
/// \p ItemsPerThread into \p items.
///
/// * The number of threads in the block must be a multiple of \p WarpSize.
/// * The default \p WarpSize is the hardware warp size, which is the optimal value.
/// * \p WarpSize must be a power of two and less than or equal to the hardware
/// warp size.
/// * Using a \p WarpSize smaller than the hardware warp size may result in lower
/// performance.
///
/// \tparam WarpSize - [optional] the number of threads in a warp
/// \tparam InputIterator - [inferred] an iterator type for input (can be a simple
/// pointer)
/// \tparam T - [inferred] the data type
/// \tparam ItemsPerThread - [inferred] the number of items to be processed by
/// each thread
///
/// \param flat_id - a local flat 1D thread id in a block (tile) for the calling thread
/// \param block_input - the input iterator from the thread block to load from
/// \param items - array that data is loaded to
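///
/// \par Example
/// A minimal usage sketch (hypothetical kernel; \p WarpSize is left at its
/// hardware default):
/// \code{.cpp}
/// __global__ void example_kernel(int* block_input)
/// {
///     int items[4];
///     // Lane l of warp w loads block_input[w * WarpSize * 4 + l + k * WarpSize]
///     // for k = 0, 1, 2, 3.
///     rocprim::block_load_direct_warp_striped(hipThreadIdx_x, block_input, items);
/// }
/// \endcode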
template<
unsigned int WarpSize = device_warp_size(),
class InputIterator,
class T,
unsigned int ItemsPerThread
>
ROCPRIM_DEVICE ROCPRIM_INLINE
void block_load_direct_warp_striped(unsigned int flat_id,
InputIterator block_input,
T (&items)[ItemsPerThread])
{
static_assert(detail::is_power_of_two(WarpSize) && WarpSize <= device_warp_size(),
"WarpSize must be a power of two and less than or equal "
"to the hardware warp size.");
unsigned int thread_id = detail::logical_lane_id<WarpSize>();
unsigned int warp_id = flat_id / WarpSize;
unsigned int warp_offset = warp_id * WarpSize * ItemsPerThread;
InputIterator thread_iter = block_input + thread_id + warp_offset;
ROCPRIM_UNROLL
for (unsigned int item = 0; item < ItemsPerThread; item++)
{
items[item] = thread_iter[item * WarpSize];
}
}

/// \brief Loads data from contiguous memory into a warp-striped arrangement of items
/// across the thread block, guarded by the range \p valid.
///
/// The warp-striped arrangement is assumed to be (\p WarpSize * \p ItemsPerThread) items
/// across a thread block. Each thread uses a \p flat_id to load a range of
/// \p ItemsPerThread into \p items.
///
/// * The number of threads in the block must be a multiple of \p WarpSize.
/// * The default \p WarpSize is the hardware warp size, which is the optimal value.
/// * \p WarpSize must be a power of two and less than or equal to the hardware
/// warp size.
/// * Using a \p WarpSize smaller than the hardware warp size may result in lower
/// performance.
///
/// \tparam WarpSize - [optional] the number of threads in a warp
/// \tparam InputIterator - [inferred] an iterator type for input (can be a simple
/// pointer)
/// \tparam T - [inferred] the data type
/// \tparam ItemsPerThread - [inferred] the number of items to be processed by
/// each thread
///
/// \param flat_id - a local flat 1D thread id in a block (tile) for the calling thread
/// \param block_input - the input iterator from the thread block to load from
/// \param items - array that data is loaded to
/// \param valid - the number of valid items to load; items past this range are left unmodified
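///
/// \par Example
/// A minimal sketch of a range-guarded warp-striped load (hypothetical kernel):
/// \code{.cpp}
/// __global__ void example_kernel(int* block_input, unsigned int valid_count)
/// {
///     int items[4];
///     // Only offsets below valid_count are read; other items stay unmodified.
///     rocprim::block_load_direct_warp_striped(hipThreadIdx_x, block_input,
///                                             items, valid_count);
/// }
/// \endcode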
template<
unsigned int WarpSize = device_warp_size(),
class InputIterator,
class T,
unsigned int ItemsPerThread
>
ROCPRIM_DEVICE ROCPRIM_INLINE
void block_load_direct_warp_striped(unsigned int flat_id,
InputIterator block_input,
T (&items)[ItemsPerThread],
unsigned int valid)
{
static_assert(detail::is_power_of_two(WarpSize) && WarpSize <= device_warp_size(),
"WarpSize must be a power of two and less than or equal "
"to the hardware warp size.");
unsigned int thread_id = detail::logical_lane_id<WarpSize>();
unsigned int warp_id = flat_id / WarpSize;
unsigned int warp_offset = warp_id * WarpSize * ItemsPerThread;
InputIterator thread_iter = block_input + thread_id + warp_offset;
ROCPRIM_UNROLL
for (unsigned int item = 0; item < ItemsPerThread; item++)
{
unsigned int offset = item * WarpSize;
if (warp_offset + thread_id + offset < valid)
{
items[item] = thread_iter[offset];
}
}
}

/// \brief Loads data from contiguous memory into a warp-striped arrangement of items
/// across the thread block, guarded by a range, with a fall-back value
/// for out-of-bound elements.
///
/// The warp-striped arrangement is assumed to be (\p WarpSize * \p ItemsPerThread) items
/// across a thread block. Each thread uses a \p flat_id to load a range of
/// \p ItemsPerThread into \p items.
///
/// * The number of threads in the block must be a multiple of \p WarpSize.
/// * The default \p WarpSize is the hardware warp size, which is the optimal value.
/// * \p WarpSize must be a power of two and less than or equal to the hardware
/// warp size.
/// * Using a \p WarpSize smaller than the hardware warp size may result in lower
/// performance.
///
/// \tparam WarpSize - [optional] the number of threads in a warp
/// \tparam InputIterator - [inferred] an iterator type for input (can be a simple
/// pointer)
/// \tparam T - [inferred] the data type
/// \tparam ItemsPerThread - [inferred] the number of items to be processed by
/// each thread
/// \tparam Default - [inferred] the data type of the default value
///
/// \param flat_id - a local flat 1D thread id in a block (tile) for the calling thread
/// \param block_input - the input iterator from the thread block to load from
/// \param items - array that data is loaded to
/// \param valid - the number of valid items to load; items past this range receive \p out_of_bounds
/// \param out_of_bounds - default value assigned to out-of-bound items
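///
/// \par Example
/// A minimal sketch (hypothetical kernel) combining the guard with a fill value:
/// \code{.cpp}
/// __global__ void example_kernel(int* block_input, unsigned int valid_count)
/// {
///     int items[4];
///     // Items past valid_count are initialized to 0 before the guarded load.
///     rocprim::block_load_direct_warp_striped(hipThreadIdx_x, block_input,
///                                             items, valid_count, 0);
/// }
/// \endcode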
template<
unsigned int WarpSize = device_warp_size(),
class InputIterator,
class T,
unsigned int ItemsPerThread,
class Default
>
ROCPRIM_DEVICE ROCPRIM_INLINE
void block_load_direct_warp_striped(unsigned int flat_id,
InputIterator block_input,
T (&items)[ItemsPerThread],
unsigned int valid,
Default out_of_bounds)
{
static_assert(detail::is_power_of_two(WarpSize) && WarpSize <= device_warp_size(),
"WarpSize must be a power of two and less than or equal "
"to the hardware warp size.");
ROCPRIM_UNROLL
for (unsigned int item = 0; item < ItemsPerThread; item++)
{
items[item] = out_of_bounds;
}
block_load_direct_warp_striped<WarpSize>(flat_id, block_input, items, valid);
}
END_ROCPRIM_NAMESPACE
/// @}
// end of group blockmodule
#endif // ROCPRIM_BLOCK_BLOCK_LOAD_FUNC_HPP_