添加dtk中的cub头文件

f8a481f8 · zhouxiang · 7b7c64c5 · f8a481f8 · f8a481f8 · f8a481f8
Commit f8a481f8 authored Oct 13, 2023 by zhouxiang
20 changed files
--- a/3rdparty/cub/iterator/constant_input_iterator.cuh
+++ b/3rdparty/cub/iterator/constant_input_iterator.cuh
+/******************************************************************************
+ * Copyright (c) 2010-2011, Duane Merrill.  All rights reserved.
+ * Copyright (c) 2011-2018, NVIDIA CORPORATION.  All rights reserved.
+ * Modifications Copyright (c) 2017-2020, Advanced Micro Devices, Inc.  All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of the NVIDIA CORPORATION nor the
+ *       names of its contributors may be used to endorse or promote products
+ *       derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ******************************************************************************/
+
+#ifndef HIPCUB_ROCPRIM_ITERATOR_CONSTANT_INPUT_ITERATOR_HPP_
+#define HIPCUB_ROCPRIM_ITERATOR_CONSTANT_INPUT_ITERATOR_HPP_
+
+#include <iterator>
+#include <iostream>
+
+#include "../config.hpp"
+
+#include <cub/rocprim/iterator/constant_iterator.hpp>
+
+#if (THRUST_VERSION >= 100700)
+    // This iterator is compatible with Thrust API 1.7 and newer
+    #include <thrust/iterator/iterator_facade.h>
+    #include <thrust/iterator/iterator_traits.h>
+#endif // THRUST_VERSION
+
+BEGIN_HIPCUB_NAMESPACE
+
+#ifndef DOXYGEN_SHOULD_SKIP_THIS    // Do not document
+
+// Alias that exposes ::rocprim::constant_iterator under the name
+// ConstantInputIterator. ValueType and OffsetT are passed through unchanged,
+// with OffsetT defaulting to std::ptrdiff_t.
+template<typename ValueType, typename OffsetT = std::ptrdiff_t>
+using ConstantInputIterator
+    = ::rocprim::constant_iterator<ValueType, OffsetT>;
+
+#endif
+
+END_HIPCUB_NAMESPACE
+
+#endif // HIPCUB_ROCPRIM_ITERATOR_CONSTANT_INPUT_ITERATOR_HPP_
--- a/3rdparty/cub/iterator/counting_input_iterator.cuh
+++ b/3rdparty/cub/iterator/counting_input_iterator.cuh
+/******************************************************************************
+ * Copyright (c) 2010-2011, Duane Merrill.  All rights reserved.
+ * Copyright (c) 2011-2018, NVIDIA CORPORATION.  All rights reserved.
+ * Modifications Copyright (c) 2017-2020, Advanced Micro Devices, Inc.  All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of the NVIDIA CORPORATION nor the
+ *       names of its contributors may be used to endorse or promote products
+ *       derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ******************************************************************************/
+
+#ifndef HIPCUB_ROCPRIM_ITERATOR_COUNTING_INPUT_ITERATOR_HPP_
+#define HIPCUB_ROCPRIM_ITERATOR_COUNTING_INPUT_ITERATOR_HPP_
+
+#include <iterator>
+#include <iostream>
+
+#include "../config.hpp"
+
+#include <cub/rocprim/iterator/counting_iterator.hpp>
+
+#if (THRUST_VERSION >= 100700)
+    // This iterator is compatible with Thrust API 1.7 and newer
+    #include <thrust/iterator/iterator_facade.h>
+    #include <thrust/iterator/iterator_traits.h>
+#endif // THRUST_VERSION
+
+BEGIN_HIPCUB_NAMESPACE
+
+#ifndef DOXYGEN_SHOULD_SKIP_THIS    // Do not document
+
+// Alias that exposes ::rocprim::counting_iterator under the name
+// CountingInputIterator. ValueType and OffsetT are passed through unchanged,
+// with OffsetT defaulting to std::ptrdiff_t.
+template<typename ValueType, typename OffsetT = std::ptrdiff_t>
+using CountingInputIterator
+    = ::rocprim::counting_iterator<ValueType, OffsetT>;
+
+#endif
+
+END_HIPCUB_NAMESPACE
+
+#endif // HIPCUB_ROCPRIM_ITERATOR_COUNTING_INPUT_ITERATOR_HPP_
--- a/3rdparty/cub/iterator/discard_output_iterator.cuh
+++ b/3rdparty/cub/iterator/discard_output_iterator.cuh
+/******************************************************************************
+ * Copyright (c) 2010-2011, Duane Merrill.  All rights reserved.
+ * Copyright (c) 2011-2018, NVIDIA CORPORATION.  All rights reserved.
+ * Modifications Copyright (c) 2020, Advanced Micro Devices, Inc.  All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of the NVIDIA CORPORATION nor the
+ *       names of its contributors may be used to endorse or promote products
+ *       derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ******************************************************************************/
+
+#ifndef HIPCUB_ROCPRIM_ITERATOR_DISCARD_OUTPUT_ITERATOR_HPP_
+#define HIPCUB_ROCPRIM_ITERATOR_DISCARD_OUTPUT_ITERATOR_HPP_
+
+#include <iterator>
+#include <iostream>
+
+#include "../config.hpp"
+
+#if (THRUST_VERSION >= 100700)
+    // This iterator is compatible with Thrust API 1.7 and newer.
+    // The Thrust headers are included before the namespace is opened so that
+    // their declarations are not nested inside it.
+    #include <thrust/iterator/iterator_facade.h>
+    #include <thrust/iterator/iterator_traits.h>
+#endif // THRUST_VERSION
+
+BEGIN_HIPCUB_NAMESPACE
+
+/**
+ * \addtogroup UtilIterator
+ * @{
+ */
+
+
+/**
+ * \brief A discard iterator
+ *
+ * A random-access style output iterator that only tracks an offset.
+ * Dereferencing (or subscripting) it yields the iterator itself, and assigning
+ * any value to the iterator is a no-op, so everything written through it is
+ * dropped.
+ *
+ * \tparam OffsetT  Type used for the stored offset and for iterator differences.
+ */
+template <typename OffsetT = ptrdiff_t>
+class DiscardOutputIterator
+{
+public:
+
+    // Required iterator traits
+    typedef DiscardOutputIterator   self_type;              ///< My own type
+    typedef OffsetT                 difference_type;        ///< Type to express the result of subtracting one iterator from another
+    typedef void                    value_type;             ///< The type of the element the iterator can point to
+    typedef void                    pointer;                ///< The type of a pointer to an element the iterator can point to
+    typedef void                    reference;              ///< The type of a reference to an element the iterator can point to
+
+#if (THRUST_VERSION >= 100700)
+    // Use Thrust's iterator categories so we can use these iterators in Thrust 1.7 (or newer) methods
+    typedef typename thrust::detail::iterator_facade_category<
+        thrust::any_system_tag,
+        thrust::random_access_traversal_tag,
+        value_type,
+        reference
+      >::type iterator_category;                                        ///< The iterator category
+#else
+    typedef std::random_access_iterator_tag     iterator_category;      ///< The iterator category
+#endif  // THRUST_VERSION
+
+private:
+
+    OffsetT offset;     ///< Current position; the only state the iterator carries
+
+public:
+
+    /// Constructor
+    __host__ __device__ __forceinline__ DiscardOutputIterator(
+        OffsetT offset = 0)     ///< Base offset
+    :
+        offset(offset)
+    {}
+
+    /**
+    * @brief Postfix increment
+    * @return Copy of the iterator as it was before the offset was advanced by one
+    */
+    __host__ __device__ __forceinline__ self_type operator++(int)
+    {
+        self_type retval = *this;
+        offset++;
+        return retval;
+    }
+
+    /**
+    * @brief Prefix increment
+    * @return Copy of the iterator after the offset was advanced by one
+    */
+    __host__ __device__ __forceinline__ self_type operator++()
+    {
+        offset++;
+        return *this;
+    }
+
+    /**
+    * @brief Indirection
+    * @return Reference to this iterator
+    */
+    __host__ __device__ __forceinline__ self_type& operator*()
+    {
+        // return self reference, which can be assigned to anything
+        return *this;
+    }
+
+    /**
+    * @brief Addition
+    * @return New iterator whose offset is this offset plus \p n
+    */
+    template <typename Distance>
+    __host__ __device__ __forceinline__ self_type operator+(Distance n) const
+    {
+        self_type retval(offset + n);
+        return retval;
+    }
+
+    /**
+    * @brief Addition assignment
+    * @return Reference to this iterator after adding \p n to its offset
+    */
+    template <typename Distance>
+    __host__ __device__ __forceinline__ self_type& operator+=(Distance n)
+    {
+        offset += n;
+        return *this;
+    }
+
+    /**
+    * @brief Subtraction
+    * @return New iterator whose offset is this offset minus \p n
+    */
+    template <typename Distance>
+    __host__ __device__ __forceinline__ self_type operator-(Distance n) const
+    {
+        self_type retval(offset - n);
+        return retval;
+    }
+
+    /**
+    * @brief Subtraction assignment
+    * @return Reference to this iterator after subtracting \p n from its offset
+    */
+    template <typename Distance>
+    __host__ __device__ __forceinline__ self_type& operator-=(Distance n)
+    {
+        offset -= n;
+        return *this;
+    }
+
+    /**
+    * @brief Distance
+    * @return This offset minus the offset of \p other
+    */
+    __host__ __device__ __forceinline__ difference_type operator-(self_type other) const
+    {
+        return offset - other.offset;
+    }
+
+    /**
+    * @brief Array subscript
+    * @return Reference to this iterator; the index is ignored
+    */
+    template <typename Distance>
+    __host__ __device__ __forceinline__ self_type& operator[](Distance)
+    {
+        // return self reference, which can be assigned to anything
+        return *this;
+    }
+
+    /// Structure dereference (\p pointer is \p void, so nothing is returned)
+    __host__ __device__ __forceinline__ pointer operator->()
+    {
+        return;
+    }
+
+    /// Assignment to anything else (no-op)
+    template<typename T>
+    __host__ __device__ __forceinline__ void operator=(T const&)
+    {}
+
+    /// Cast to void* operator (always yields a null pointer)
+    __host__ __device__ __forceinline__ operator void*() const { return NULL; }
+
+    /**
+    * @brief Equal to
+    *
+    * Const-qualified so that const iterators are compared by offset. Without
+    * the qualifier such a comparison would go through operator void*() and
+    * compare two null pointers instead.
+    */
+    __host__ __device__ __forceinline__ bool operator==(const self_type& rhs) const
+    {
+        return (offset == rhs.offset);
+    }
+
+    /**
+    * @brief Not equal to
+    *
+    * Const-qualified for the same reason as operator==.
+    */
+    __host__ __device__ __forceinline__ bool operator!=(const self_type& rhs) const
+    {
+        return (offset != rhs.offset);
+    }
+
+    /**
+    * @brief ostream operator
+    *
+    * Writes the offset in the form "[offset]".
+    */
+    friend std::ostream& operator<<(std::ostream& os, const self_type& itr)
+    {
+        os << "[" << itr.offset << "]";
+        return os;
+    }
+};
+
+/** @} */       // end group UtilIterator
+
+END_HIPCUB_NAMESPACE
+
+#endif // HIPCUB_ROCPRIM_ITERATOR_DISCARD_OUTPUT_ITERATOR_HPP_
--- a/3rdparty/cub/iterator/tex_obj_input_iterator.cuh
+++ b/3rdparty/cub/iterator/tex_obj_input_iterator.cuh
+/******************************************************************************
+ * Copyright (c) 2010-2011, Duane Merrill.  All rights reserved.
+ * Copyright (c) 2011-2018, NVIDIA CORPORATION.  All rights reserved.
+ * Modifications Copyright (c) 2017-2021, Advanced Micro Devices, Inc.  All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of the NVIDIA CORPORATION nor the
+ *       names of its contributors may be used to endorse or promote products
+ *       derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ******************************************************************************/
+
+#ifndef HIPCUB_ROCPRIM_ITERATOR_TEX_OBJ_INPUT_ITERATOR_HPP_
+#define HIPCUB_ROCPRIM_ITERATOR_TEX_OBJ_INPUT_ITERATOR_HPP_
+
+#include <iterator>
+#include <iostream>
+
+#include "../config.hpp"
+
+#if (THRUST_VERSION >= 100700)
+    // This iterator is compatible with Thrust API 1.7 and newer
+    #include <thrust/iterator/iterator_facade.h>
+    #include <thrust/iterator/iterator_traits.h>
+#endif // THRUST_VERSION
+
+
+#include <cub/rocprim/iterator/texture_cache_iterator.hpp>
+
+BEGIN_HIPCUB_NAMESPACE
+
+// Thin wrapper that derives from ::rocprim::texture_cache_iterator and adds
+// BindTexture()/UnbindTexture() member functions returning cudaError_t.
+template<typename T, typename OffsetT = std::ptrdiff_t>
+class TexObjInputIterator : public ::rocprim::texture_cache_iterator<T, OffsetT>
+{
+    // Shorthand for the rocPRIM iterator this class derives from.
+    using texture_base_t = ::rocprim::texture_cache_iterator<T, OffsetT>;
+
+public:
+    // Default construction delegates to the base iterator's default constructor.
+    HIPCUB_HOST_DEVICE inline
+    TexObjInputIterator() : texture_base_t()
+    {
+    }
+
+    // Builds the wrapper from a rocPRIM texture cache iterator (taken by value).
+    HIPCUB_HOST_DEVICE inline
+    TexObjInputIterator(const texture_base_t other) : texture_base_t(other)
+    {
+    }
+
+    HIPCUB_HOST_DEVICE inline
+    ~TexObjInputIterator() = default;
+
+    // Forwards all arguments to the base class bind_texture() and casts the
+    // returned status to cudaError_t.
+    template<class Qualified>
+    inline cudaError_t BindTexture(Qualified* ptr,
+                                   size_t bytes = size_t(-1),
+                                   size_t texture_offset = 0)
+    {
+        const cudaError_t status
+            = (cudaError_t)texture_base_t::bind_texture(ptr, bytes, texture_offset);
+        return status;
+    }
+
+    // Forwards to the base class unbind_texture() and casts the returned
+    // status to cudaError_t.
+    inline cudaError_t UnbindTexture()
+    {
+        const cudaError_t status = (cudaError_t)texture_base_t::unbind_texture();
+        return status;
+    }
+};
+
+END_HIPCUB_NAMESPACE
+
+#endif // HIPCUB_ROCPRIM_ITERATOR_TEX_OBJ_INPUT_ITERATOR_HPP_
--- a/3rdparty/cub/iterator/tex_ref_input_iterator.cuh
+++ b/3rdparty/cub/iterator/tex_ref_input_iterator.cuh
+/******************************************************************************
+ * Copyright (c) 2011, Duane Merrill.  All rights reserved.
+ * Copyright (c) 2011-2018, NVIDIA CORPORATION.  All rights reserved.
+ * Modifications Copyright (c) 2017-2021, Advanced Micro Devices, Inc.  All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of the NVIDIA CORPORATION nor the
+ *       names of its contributors may be used to endorse or promote products
+ *       derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ******************************************************************************/
+
+#ifndef HIPCUB_ROCPRIM_ITERATOR_TEX_REF_INPUT_ITERATOR_HPP_
+#define HIPCUB_ROCPRIM_ITERATOR_TEX_REF_INPUT_ITERATOR_HPP_
+
+#include <iterator>
+#include <iostream>
+
+#include "../config.hpp"
+
+#if (THRUST_VERSION >= 100700)    // This iterator is compatible with Thrust API 1.7 and newer
+    #include <thrust/iterator/iterator_facade.h>
+    #include <thrust/iterator/iterator_traits.h>
+#endif // THRUST_VERSION
+
+// Use the vendored copy under cub/rocprim, matching tex_obj_input_iterator.cuh.
+#include <cub/rocprim/iterator/texture_cache_iterator.hpp>
+
+BEGIN_HIPCUB_NAMESPACE
+
+// Thin wrapper that derives from ::rocprim::texture_cache_iterator and adds
+// BindTexture()/UnbindTexture() member functions returning cudaError_t.
+template<
+    typename T,
+    int UNIQUE_ID, // Unused parameter for compatibility with original definition in cub
+    typename OffsetT = std::ptrdiff_t
+>
+class TexRefInputIterator : public ::rocprim::texture_cache_iterator<T, OffsetT>
+{
+    // Shorthand for the rocPRIM iterator this class derives from.
+    using texture_base_t = ::rocprim::texture_cache_iterator<T, OffsetT>;
+
+public:
+    // Default construction delegates to the base iterator's default constructor.
+    HIPCUB_HOST_DEVICE inline
+    TexRefInputIterator() : texture_base_t()
+    {
+    }
+
+    // Builds the wrapper from a rocPRIM texture cache iterator (taken by value).
+    HIPCUB_HOST_DEVICE inline
+    TexRefInputIterator(const texture_base_t other) : texture_base_t(other)
+    {
+    }
+
+    HIPCUB_HOST_DEVICE inline
+    ~TexRefInputIterator() = default;
+
+    // Forwards all arguments to the base class bind_texture() and casts the
+    // returned status to cudaError_t.
+    template<class Qualified>
+    inline cudaError_t BindTexture(Qualified* ptr,
+                                   size_t bytes = size_t(-1),
+                                   size_t texture_offset = 0)
+    {
+        const cudaError_t status
+            = (cudaError_t)texture_base_t::bind_texture(ptr, bytes, texture_offset);
+        return status;
+    }
+
+    // Forwards to the base class unbind_texture() and casts the returned
+    // status to cudaError_t.
+    inline cudaError_t UnbindTexture()
+    {
+        const cudaError_t status = (cudaError_t)texture_base_t::unbind_texture();
+        return status;
+    }
+};
+
+END_HIPCUB_NAMESPACE
+
+#endif // HIPCUB_ROCPRIM_ITERATOR_TEX_REF_INPUT_ITERATOR_HPP_
--- a/3rdparty/cub/iterator/transform_input_iterator.cuh
+++ b/3rdparty/cub/iterator/transform_input_iterator.cuh
+/******************************************************************************
+ * Copyright (c) 2010-2011, Duane Merrill.  All rights reserved.
+ * Copyright (c) 2011-2018, NVIDIA CORPORATION.  All rights reserved.
+ * Modifications Copyright (c) 2017-2020, Advanced Micro Devices, Inc.  All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of the NVIDIA CORPORATION nor the
+ *       names of its contributors may be used to endorse or promote products
+ *       derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ******************************************************************************/
+
+#ifndef HIPCUB_ROCPRIM_ITERATOR_TRANSFORM_INPUT_ITERATOR_HPP_
+#define HIPCUB_ROCPRIM_ITERATOR_TRANSFORM_INPUT_ITERATOR_HPP_
+
+#include <iterator>
+#include <iostream>
+
+#include "../config.hpp"
+
+#include <cub/rocprim/iterator/transform_iterator.hpp>
+
+#if (THRUST_VERSION >= 100700)
+    // This iterator is compatible with Thrust API 1.7 and newer
+    #include <thrust/iterator/iterator_facade.h>
+    #include <thrust/iterator/iterator_traits.h>
+#endif // THRUST_VERSION
+
+
+BEGIN_HIPCUB_NAMESPACE
+
+#ifndef DOXYGEN_SHOULD_SKIP_THIS    // Do not document
+
+// Alias that exposes ::rocprim::transform_iterator under the name
+// TransformInputIterator. The template arguments are reordered when forwarded:
+// (ValueType, ConversionOp, InputIteratorT) here becomes
+// (InputIteratorT, ConversionOp, ValueType) for the rocPRIM iterator.
+// OffsetT is accepted but not forwarded.
+template<typename ValueType,
+         typename ConversionOp,
+         typename InputIteratorT,
+         typename OffsetT = std::ptrdiff_t // ignored
+>
+using TransformInputIterator
+    = ::rocprim::transform_iterator<InputIteratorT, ConversionOp, ValueType>;
+
+#endif
+
+END_HIPCUB_NAMESPACE
+
+#endif // HIPCUB_ROCPRIM_ITERATOR_TRANSFORM_INPUT_ITERATOR_HPP_
--- a/3rdparty/cub/rocprim/block/block_adjacent_difference.hpp
+++ b/3rdparty/cub/rocprim/block/block_adjacent_difference.hpp
+/******************************************************************************
+* Copyright (c) 2011, Duane Merrill.  All rights reserved.
+* Copyright (c) 2011-2018, NVIDIA CORPORATION.  All rights reserved.
+* Modifications Copyright (c) 2022, Advanced Micro Devices, Inc.  All rights reserved.
+*
+* Redistribution and use in source and binary forms, with or without
+* modification, are permitted provided that the following conditions are met:
+*     * Redistributions of source code must retain the above copyright
+*       notice, this list of conditions and the following disclaimer.
+*     * Redistributions in binary form must reproduce the above copyright
+*       notice, this list of conditions and the following disclaimer in the
+*       documentation and/or other materials provided with the distribution.
+*     * Neither the name of the NVIDIA CORPORATION nor the
+*       names of its contributors may be used to endorse or promote products
+*       derived from this software without specific prior written permission.
+*
+* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+* DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+* DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*
+******************************************************************************/
+
+#ifndef ROCPRIM_BLOCK_BLOCK_ADJACENT_DIFFERENCE_HPP_
+#define ROCPRIM_BLOCK_BLOCK_ADJACENT_DIFFERENCE_HPP_
+
+
+#include "detail/block_adjacent_difference_impl.hpp"
+
+#include "../config.hpp"
+#include "../detail/various.hpp"
+
+
+
+/// \addtogroup blockmodule
+/// @{
+
+BEGIN_ROCPRIM_NAMESPACE
+
+/// \brief The \p block_adjacent_difference class is a block level parallel primitive which provides
+/// methods for applying binary functions for pairs of consecutive items partitioned across a thread
+/// block.
+///
+/// \tparam T - the input type.
+/// \tparam BlockSize - the number of threads in a block.
+///
+/// \par Overview
+/// * There are two types of flags:
+///   * Head flags.
+///   * Tail flags.
+/// * The above flags are used to differentiate items from their predecessors or successors.
+/// * E.g. Head flags are convenient for differentiating disjoint data segments as part of a
+/// segmented reduction/scan.
+///
+/// \par Examples
+/// \parblock
+/// In the examples discontinuity operation is performed on block of 128 threads, using type
+/// \p int.
+///
+/// \code{.cpp}
+/// __global__ void example_kernel(...)
+/// {
+///     // specialize discontinuity for int and a block of 128 threads
+///     using block_adjacent_difference_int = rocprim::block_adjacent_difference<int, 128>;
+///     // allocate storage in shared memory
+///     __shared__ block_adjacent_difference_int::storage_type storage;
+///
+///     // segment of consecutive items to be used
+///     int input[8];
+///     ...
+///     int head_flags[8];
+///     block_adjacent_difference_int b_discontinuity;
+///     using flag_op_type = typename rocprim::greater<int>;
+///     b_discontinuity.flag_heads(head_flags, input, flag_op_type(), storage);
+///     ...
+/// }
+/// \endcode
+/// \endparblock
+template<
+    class T,
+    unsigned int BlockSizeX,
+    unsigned int BlockSizeY = 1,
+    unsigned int BlockSizeZ = 1
+>
+class block_adjacent_difference
+#ifndef DOXYGEN_SHOULD_SKIP_THIS // hide implementation detail from documentation
+    : private detail::block_adjacent_difference_impl<T, BlockSizeX, BlockSizeY, BlockSizeZ>
+#endif // DOXYGEN_SHOULD_SKIP_THIS
+{
+    using base_type = detail::block_adjacent_difference_impl<T, BlockSizeX, BlockSizeY, BlockSizeZ>;
+
+    static constexpr unsigned BlockSize = base_type::BlockSize;
+    // Struct used for creating a raw_storage object for this primitive's temporary storage.
+    struct storage_type_
+    {
+        typename base_type::storage_type left;
+        typename base_type::storage_type right;
+    };
+
+public:
+
+    /// \brief Struct used to allocate a temporary memory that is required for thread
+    /// communication during operations provided by related parallel primitive.
+    ///
+    /// Depending on the implementation the operations exposed by parallel primitive may
+    /// require a temporary storage for thread communication. The storage should be allocated
+    /// using keywords <tt>__shared__</tt>. It can be aliased to
+    /// an externally allocated memory, or be a part of a union type with other storage types
+    /// to increase shared memory reusability.
+    #ifndef DOXYGEN_SHOULD_SKIP_THIS // hides storage_type implementation for Doxygen
+    using storage_type = detail::raw_storage<storage_type_>;
+    #else
+    using storage_type = storage_type_;
+    #endif
+
+    /// \brief Tags \p head_flags that indicate discontinuities between items partitioned
+    /// across the thread block, where the first item has no reference and is always
+    /// flagged.
+    /// \deprecated The flags API of block_adjacent_difference is deprecated,
+    /// use subtract_left() or block_discontinuity::flag_heads() instead.
+    /// \tparam ItemsPerThread - [inferred] the number of items to be processed by
+    /// each thread.
+    /// \tparam Flag - [inferred] the flag type.
+    /// \tparam FlagOp - [inferred] type of binary function used for flagging.
+    ///
+    /// \param [out] head_flags - array that contains the head flags.
+    /// \param [in] input - array that data is loaded from.
+    /// \param [in] flag_op - binary operation function object that will be used for flagging.
+    /// The signature of the function should be equivalent to the following:
+    /// <tt>bool f(const T &a, const T &b);</tt> or <tt>bool (const T& a, const T& b, unsigned int b_index);</tt>.
+    /// The signature does not need to have <tt>const &</tt>, but function object
+    /// must not modify the objects passed to it.
+    /// \param [in] storage - reference to a temporary storage object of type storage_type.
+    ///
+    /// \par Storage reuse
+    /// Synchronization barrier should be placed before \p storage is reused
+    /// or repurposed: \p __syncthreads() or \p rocprim::syncthreads().
+    ///
+    /// \par Example.
+    /// \code{.cpp}
+    /// __global__ void example_kernel(...)
+    /// {
+    ///     // specialize discontinuity for int and a block of 128 threads
+    ///     using block_adjacent_difference_int = rocprim::block_adjacent_difference<int, 128>;
+    ///     // allocate storage in shared memory
+    ///     __shared__ block_adjacent_difference_int::storage_type storage;
+    ///
+    ///     // segment of consecutive items to be used
+    ///     int input[8];
+    ///     ...
+    ///     int head_flags[8];
+    ///     block_adjacent_difference_int b_discontinuity;
+    ///     using flag_op_type = typename rocprim::greater<int>;
+    ///     b_discontinuity.flag_heads(head_flags, input, flag_op_type(), storage);
+    ///     ...
+    /// }
+    /// \endcode
+    template<unsigned int ItemsPerThread, class Flag, class FlagOp>
+    [[deprecated("The flags API of block_adjacent_difference is deprecated. "
+                 "Use subtract_left or block_discontinuity.flag_heads instead.")]]
+    ROCPRIM_DEVICE ROCPRIM_INLINE
+    void flag_heads(Flag (&head_flags)[ItemsPerThread],
+                    const T (&input)[ItemsPerThread],
+                    FlagOp flag_op,
+                    storage_type& storage)
+    {
+        // Compile-time switches forwarded as template arguments to
+        // base_type::apply_left.
+        static constexpr auto as_flags         = true;
+        static constexpr auto reversed         = true;
+        static constexpr auto with_predecessor = false;
+        // input[0] is passed only to fill the predecessor argument slot.
+        // NOTE(review): with_predecessor is false, so apply_left presumably
+        // ignores it; confirm in block_adjacent_difference_impl.
+        // Only the `left` half of the temporary storage is handed over.
+        base_type::template apply_left<as_flags, reversed, with_predecessor>(
+            input, head_flags, flag_op, input[0] /* predecessor */, storage.get().left);
+    }
+
+    /// \overload
+    /// \deprecated The flags API of block_adjacent_difference is deprecated,
+    /// use subtract_left() or block_discontinuity::flag_heads() instead.
+    /// This overload does not take a reference to temporary storage, instead it is declared as
+    /// part of the function itself. Note that this does NOT decrease the shared memory requirements
+    /// of a kernel using this function.
+    template<unsigned int ItemsPerThread, class Flag, class FlagOp>
+    [[deprecated("The flags API of block_adjacent_difference is deprecated."
+                 "Use subtract_left or block_discontinuity.flag_heads instead.")]]
+    ROCPRIM_DEVICE ROCPRIM_FORCE_INLINE
+    void flag_heads(Flag (&head_flags)[ItemsPerThread],
+                    const T (&input)[ItemsPerThread],
+                    FlagOp flag_op)
+    {
+        // The temporary storage is declared here instead of being supplied by the
+        // caller; the work itself is done by the overload taking explicit storage.
+        ROCPRIM_SHARED_MEMORY storage_type local_storage;
+        this->flag_heads(head_flags, input, flag_op, local_storage);
+    }
+
+    /// \brief Tags \p head_flags that indicate discontinuities between items partitioned
+    /// across the thread block, where the first item of the first thread is compared against
+    /// a \p tile_predecessor_item.
+    /// \deprecated The flags API of block_adjacent_difference is deprecated,
+    /// use subtract_left() or block_discontinuity::flag_heads() instead.
+    ///
+    /// \tparam ItemsPerThread - [inferred] the number of items to be processed by
+    /// each thread.
+    /// \tparam Flag - [inferred] the flag type.
+    /// \tparam FlagOp - [inferred] type of binary function used for flagging.
+    ///
+    /// \param [out] head_flags - array that contains the head flags.
+    /// \param [in] tile_predecessor_item - first tile item from thread to be compared
+    /// against.
+    /// \param [in] input - array that data is loaded from.
+    /// \param [in] flag_op - binary operation function object that will be used for flagging.
+    /// The signature of the function should be equivalent to the following:
+    /// <tt>bool f(const T &a, const T &b);</tt> or <tt>bool (const T& a, const T& b, unsigned int b_index);</tt>.
+    /// The signature does not need to have <tt>const &</tt>, but function object
+    /// must not modify the objects passed to it.
+    /// \param [in] storage - reference to a temporary storage object of type storage_type.
+    ///
+    /// \par Storage reuse
+    /// Synchronization barrier should be placed before \p storage is reused
+    /// or repurposed: \p __syncthreads() or \p rocprim::syncthreads().
+    ///
+    /// \par Example.
+    /// \code{.cpp}
+    /// __global__ void example_kernel(...)
+    /// {
+    ///     // specialize discontinuity for int and a block of 128 threads
+    ///     using block_adjacent_difference_int = rocprim::block_adjacent_difference<int, 128>;
+    ///     // allocate storage in shared memory
+    ///     __shared__ block_adjacent_difference_int::storage_type storage;
+    ///
+    ///     // segment of consecutive items to be used
+    ///     int input[8];
+    ///     int tile_item = 0;
+    ///     if (threadIdx.x == 0)
+    ///     {
+    ///         tile_item = ...
+    ///     }
+    ///     ...
+    ///     int head_flags[8];
+    ///     block_adjacent_difference_int b_discontinuity;
+    ///     using flag_op_type = typename rocprim::greater<int>;
+    ///     b_discontinuity.flag_heads(head_flags, tile_item, input, flag_op_type(),
+    ///                                storage);
+    ///     ...
+    /// }
+    /// \endcode
+    template<unsigned int ItemsPerThread, class Flag, class FlagOp>
+    [[deprecated("The flags API of block_adjacent_difference is deprecated."
+                 "Use subtract_left or block_discontinuity.flag_heads instead.")]]
+    ROCPRIM_DEVICE ROCPRIM_INLINE
+    void flag_heads(Flag (&head_flags)[ItemsPerThread],
+                    T tile_predecessor_item,
+                    const T (&input)[ItemsPerThread],
+                    FlagOp flag_op,
+                    storage_type& storage)
+    {
+        // Compile-time switches forwarded to base_type::apply_left, in the order
+        // <as_flags, reversed, with_predecessor>.
+        constexpr bool k_as_flags         = true;
+        constexpr bool k_reversed         = true;
+        constexpr bool k_with_predecessor = true;
+
+        // Unlike the overload without a predecessor, the caller-provided
+        // tile_predecessor_item is forwarded to the left pass.
+        base_type::template apply_left<k_as_flags, k_reversed, k_with_predecessor>(
+            input, head_flags, flag_op, tile_predecessor_item, storage.get().left);
+    }
+
+    /// \overload
+    /// \deprecated The flags API of block_adjacent_difference is deprecated,
+    /// use subtract_left() or block_discontinuity::flag_heads() instead.
+    ///
+    /// This overload does not accept a reference to temporary storage, instead it is declared as
+    /// part of the function itself. Note that this does NOT decrease the shared memory requirements
+    /// of a kernel using this function.
+    template<unsigned int ItemsPerThread, class Flag, class FlagOp>
+    [[deprecated("The flags API of block_adjacent_difference is deprecated."
+                 "Use subtract_left or block_discontinuity.flag_heads instead.")]]
+    ROCPRIM_DEVICE ROCPRIM_FORCE_INLINE
+    void flag_heads(Flag (&head_flags)[ItemsPerThread],
+                    T tile_predecessor_item,
+                    const T (&input)[ItemsPerThread],
+                    FlagOp flag_op)
+    {
+        // The temporary storage is declared here instead of being supplied by the
+        // caller; the work itself is done by the overload taking explicit storage.
+        ROCPRIM_SHARED_MEMORY storage_type local_storage;
+        this->flag_heads(head_flags, tile_predecessor_item, input, flag_op, local_storage);
+    }
+
+    /// \brief Tags \p tail_flags that indicate discontinuities between items partitioned
+    /// across the thread block, where the last item has no reference and is always
+    /// flagged.
+    /// \deprecated The flags API of block_adjacent_difference is deprecated,
+    /// use subtract_right() or block_discontinuity::flag_tails() instead.
+    ///
+    /// \tparam ItemsPerThread - [inferred] the number of items to be processed by
+    /// each thread.
+    /// \tparam Flag - [inferred] the flag type.
+    /// \tparam FlagOp - [inferred] type of binary function used for flagging.
+    ///
+    /// \param [out] tail_flags - array that contains the tail flags.
+    /// \param [in] input - array that data is loaded from.
+    /// \param [in] flag_op - binary operation function object that will be used for flagging.
+    /// The signature of the function should be equivalent to the following:
+    /// <tt>bool f(const T &a, const T &b);</tt> or <tt>bool (const T& a, const T& b, unsigned int b_index);</tt>.
+    /// The signature does not need to have <tt>const &</tt>, but function object
+    /// must not modify the objects passed to it.
+    /// \param [in] storage - reference to a temporary storage object of type storage_type.
+    ///
+    /// \par Storage reuse
+    /// Synchronization barrier should be placed before \p storage is reused
+    /// or repurposed: \p __syncthreads() or \p rocprim::syncthreads().
+    ///
+    /// \par Example.
+    /// \code{.cpp}
+    /// __global__ void example_kernel(...)
+    /// {
+    ///     // specialize discontinuity for int and a block of 128 threads
+    ///     using block_adjacent_difference_int = rocprim::block_adjacent_difference<int, 128>;
+    ///     // allocate storage in shared memory
+    ///     __shared__ block_adjacent_difference_int::storage_type storage;
+    ///
+    ///     // segment of consecutive items to be used
+    ///     int input[8];
+    ///     ...
+    ///     int tail_flags[8];
+    ///     block_adjacent_difference_int b_discontinuity;
+    ///     using flag_op_type = typename rocprim::greater<int>;
+    ///     b_discontinuity.flag_tails(tail_flags, input, flag_op_type(), storage);
+    ///     ...
+    /// }
+    /// \endcode
+    template<unsigned int ItemsPerThread, class Flag, class FlagOp>
+    [[deprecated("The flags API of block_adjacent_difference is deprecated."
+                 "Use subtract_right or block_discontinuity.flag_tails instead.")]]
+    ROCPRIM_DEVICE ROCPRIM_INLINE
+    void flag_tails(Flag (&tail_flags)[ItemsPerThread],
+                    const T (&input)[ItemsPerThread],
+                    FlagOp flag_op,
+                    storage_type& storage)
+    {
+        // Compile-time switches forwarded to base_type::apply_right.
+        static constexpr auto as_flags       = true;
+        // The right-hand pass must hand flag_op its operands in natural order
+        // (item first, successor second). Only the left-hand (head) pass uses
+        // reversed = true; it was previously set to true here as well.
+        // NOTE(review): relies on `reversed` swapping the flag_op operands inside
+        // base_type::apply_right -- confirm against the implementation.
+        static constexpr auto reversed       = false;
+        static constexpr auto with_successor = false;
+
+        // No tile successor is supplied in this overload; input[0] only fills the
+        // successor argument (presumably ignored when with_successor is false).
+        base_type::template apply_right<as_flags, reversed, with_successor>(
+            input, tail_flags, flag_op, input[0] /* successor */, storage.get().right);
+    }
+
+    /// \overload
+    /// \deprecated The flags API of block_adjacent_difference is deprecated,
+    /// use subtract_right() or block_discontinuity::flag_tails() instead.
+    ///
+    /// This overload does not accept a reference to temporary storage, instead it is declared as
+    /// part of the function itself. Note that this does NOT decrease the shared memory requirements
+    /// of a kernel using this function.
+    template<unsigned int ItemsPerThread, class Flag, class FlagOp>
+    [[deprecated("The flags API of block_adjacent_difference is deprecated."
+                 "Use subtract_right or block_discontinuity.flag_tails instead.")]]
+    ROCPRIM_DEVICE ROCPRIM_FORCE_INLINE
+    void flag_tails(Flag (&tail_flags)[ItemsPerThread],
+                    const T (&input)[ItemsPerThread],
+                    FlagOp flag_op)
+    {
+        // The temporary storage is declared here instead of being supplied by the
+        // caller; the work itself is done by the overload taking explicit storage.
+        ROCPRIM_SHARED_MEMORY storage_type local_storage;
+        this->flag_tails(tail_flags, input, flag_op, local_storage);
+    }
+
+    /// \brief Tags \p tail_flags that indicate discontinuities between items partitioned
+    /// across the thread block, where the last item of the last thread is compared against
+    /// a \p tile_successor_item.
+    /// \deprecated The flags API of block_adjacent_difference is deprecated,
+    /// use subtract_right() or block_discontinuity::flag_tails() instead.
+    ///
+    /// \tparam ItemsPerThread - [inferred] the number of items to be processed by
+    /// each thread.
+    /// \tparam Flag - [inferred] the flag type.
+    /// \tparam FlagOp - [inferred] type of binary function used for flagging.
+    ///
+    /// \param [out] tail_flags - array that contains the tail flags.
+    /// \param [in] tile_successor_item - last tile item from thread to be compared
+    /// against.
+    /// \param [in] input - array that data is loaded from.
+    /// \param [in] flag_op - binary operation function object that will be used for flagging.
+    /// The signature of the function should be equivalent to the following:
+    /// <tt>bool f(const T &a, const T &b);</tt> or <tt>bool (const T& a, const T& b, unsigned int b_index);</tt>.
+    /// The signature does not need to have <tt>const &</tt>, but function object
+    /// must not modify the objects passed to it.
+    /// \param [in] storage - reference to a temporary storage object of type storage_type.
+    ///
+    /// \par Storage reuse
+    /// Synchronization barrier should be placed before \p storage is reused
+    /// or repurposed: \p __syncthreads() or \p rocprim::syncthreads().
+    ///
+    /// \par Example.
+    /// \code{.cpp}
+    /// __global__ void example_kernel(...)
+    /// {
+    ///     // specialize discontinuity for int and a block of 128 threads
+    ///     using block_adjacent_difference_int = rocprim::block_adjacent_difference<int, 128>;
+    ///     // allocate storage in shared memory
+    ///     __shared__ block_adjacent_difference_int::storage_type storage;
+    ///
+    ///     // segment of consecutive items to be used
+    ///     int input[8];
+    ///     int tile_item = 0;
+    ///     if (threadIdx.x == 0)
+    ///     {
+    ///         tile_item = ...
+    ///     }
+    ///     ...
+    ///     int tail_flags[8];
+    ///     block_adjacent_difference_int b_discontinuity;
+    ///     using flag_op_type = typename rocprim::greater<int>;
+    ///     b_discontinuity.flag_tails(tail_flags, tile_item, input, flag_op_type(),
+    ///                                storage);
+    ///     ...
+    /// }
+    /// \endcode
+    template<unsigned int ItemsPerThread, class Flag, class FlagOp>
+    [[deprecated("The flags API of block_adjacent_difference is deprecated."
+                 "Use subtract_right or block_discontinuity.flag_tails instead.")]]
+    ROCPRIM_DEVICE ROCPRIM_INLINE
+    void flag_tails(Flag (&tail_flags)[ItemsPerThread],
+                    T tile_successor_item,
+                    const T (&input)[ItemsPerThread],
+                    FlagOp flag_op,
+                    storage_type& storage)
+    {
+        // Compile-time switches forwarded to base_type::apply_right.
+        static constexpr auto as_flags       = true;
+        // The right-hand pass must hand flag_op its operands in natural order
+        // (item first, successor second). Only the left-hand (head) pass uses
+        // reversed = true; it was previously set to true here as well.
+        // NOTE(review): relies on `reversed` swapping the flag_op operands inside
+        // base_type::apply_right -- confirm against the implementation.
+        static constexpr auto reversed       = false;
+        static constexpr auto with_successor = true;
+
+        // The caller-provided tile_successor_item is forwarded to the right pass.
+        base_type::template apply_right<as_flags, reversed, with_successor>(
+            input, tail_flags, flag_op, tile_successor_item, storage.get().right);
+    }
+
+    /// \overload
+    /// \deprecated The flags API of block_adjacent_difference is deprecated,
+    /// use subtract_right() or block_discontinuity::flag_tails() instead.
+    ///
+    /// This overload does not accept a reference to temporary storage, instead it is declared as
+    /// part of the function itself. Note that this does NOT decrease the shared memory requirements
+    /// of a kernel using this function.
+    template<unsigned int ItemsPerThread, class Flag, class FlagOp>
+    [[deprecated("The flags API of block_adjacent_difference is deprecated."
+                 "Use subtract_right or block_discontinuity.flag_tails instead.")]]
+    ROCPRIM_DEVICE ROCPRIM_FORCE_INLINE
+    void flag_tails(Flag (&tail_flags)[ItemsPerThread],
+                    T tile_successor_item,
+                    const T (&input)[ItemsPerThread],
+                    FlagOp flag_op)
+    {
+        // The temporary storage is declared here instead of being supplied by the
+        // caller; the work itself is done by the overload taking explicit storage.
+        ROCPRIM_SHARED_MEMORY storage_type local_storage;
+        this->flag_tails(tail_flags, tile_successor_item, input, flag_op, local_storage);
+    }
+
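+    /// \brief Tags both \p head_flags and \p tail_flags that indicate discontinuities
+    /// between items partitioned across the thread block.
+    /// \deprecated The flags API of block_adjacent_difference is deprecated,
+    /// use block_discontinuity::flag_heads_and_tails() instead.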
+    ///
+    /// \tparam ItemsPerThread - [inferred] the number of items to be processed by
+    /// each thread.
+    /// \tparam Flag - [inferred] the flag type.
+    /// \tparam FlagOp - [inferred] type of binary function used for flagging.
+    ///
+    /// \param [out] head_flags - array that contains the head flags.
+    /// \param [out] tail_flags - array that contains the tail flags.
+    /// \param [in] input - array that data is loaded from.
+    /// \param [in] flag_op - binary operation function object that will be used for flagging.
+    /// The signature of the function should be equivalent to the following:
+    /// <tt>bool f(const T &a, const T &b);</tt> or <tt>bool (const T& a, const T& b, unsigned int b_index);</tt>.
+    /// The signature does not need to have <tt>const &</tt>, but function object
+    /// must not modify the objects passed to it.
+    /// \param [in] storage - reference to a temporary storage object of type storage_type.
+    ///
+    /// \par Storage reuse
+    /// Synchronization barrier should be placed before \p storage is reused
+    /// or repurposed: \p __syncthreads() or \p rocprim::syncthreads().
+    ///
+    /// \par Example.
+    /// \code{.cpp}
+    /// __global__ void example_kernel(...)
+    /// {
+    ///     // specialize discontinuity for int and a block of 128 threads
+    ///     using block_adjacent_difference_int = rocprim::block_adjacent_difference<int, 128>;
+    ///     // allocate storage in shared memory
+    ///     __shared__ block_adjacent_difference_int::storage_type storage;
+    ///
+    ///     // segment of consecutive items to be used
+    ///     int input[8];
+    ///     ...
+    ///     int head_flags[8];
+    ///     int tail_flags[8];
+    ///     block_adjacent_difference_int b_discontinuity;
+    ///     using flag_op_type = typename rocprim::greater<int>;
+    ///     b_discontinuity.flag_heads_and_tails(head_flags, tail_flags, input,
+    ///                                          flag_op_type(), storage);
+    ///     ...
+    /// }
+    /// \endcode
+    template<unsigned int ItemsPerThread, class Flag, class FlagOp>
+    [[deprecated("The flags API of block_adjacent_difference is deprecated."
+                 "Use block_discontinuity.flag_heads_and_tails instead.")]]
+    ROCPRIM_DEVICE ROCPRIM_INLINE
+    void flag_heads_and_tails(Flag (&head_flags)[ItemsPerThread],
+                              Flag (&tail_flags)[ItemsPerThread],
+                              const T (&input)[ItemsPerThread],
+                              FlagOp flag_op,
+                              storage_type& storage)
+    {
+        // Compile-time switches forwarded to base_type::apply_left / apply_right.
+        // `reversed` describes the head (left) pass; the tail pass uses !reversed.
+        static constexpr auto as_flags         = true;
+        static constexpr auto reversed         = true;
+        static constexpr auto with_predecessor = false;
+        static constexpr auto with_successor   = false;
+
+        // Copy items in case head_flags is aliased with input: the left pass
+        // writes head_flags before the right pass reads the items again.
+        T items[ItemsPerThread];
+
+        ROCPRIM_UNROLL
+        for(unsigned int i = 0; i < ItemsPerThread; ++i) {
+            items[i] = input[i];
+        }
+
+        // Head flags; items[0] only fills the predecessor argument.
+        base_type::template apply_left<as_flags, reversed, with_predecessor>(
+            items, head_flags, flag_op, items[0] /*predecessor*/, storage.get().left);
+
+        // Tail flags: the operand order is the opposite of the head pass so that
+        // flag_op receives (item, successor), matching flag_tails(). Previously
+        // `reversed` was forwarded unchanged.
+        // NOTE(review): relies on `reversed` swapping the flag_op operands inside
+        // base_type::apply_right -- confirm against the implementation.
+        base_type::template apply_right<as_flags, !reversed, with_successor>(
+            items, tail_flags, flag_op, items[0] /*successor*/, storage.get().right);
+    }
+
+    /// \overload
+    /// \deprecated The flags API of block_adjacent_difference is deprecated,
+    /// use block_discontinuity::flag_heads_and_tails() instead.
+    ///
+    /// This overload does not accept a reference to temporary storage, instead it is declared as
+    /// part of the function itself. Note that this does NOT decrease the shared memory requirements
+    /// of a kernel using this function.
+    template<unsigned int ItemsPerThread, class Flag, class FlagOp>
+    [[deprecated("The flags API of block_adjacent_difference is deprecated."
+                 "Use block_discontinuity.flag_heads_and_tails instead.")]]
+    ROCPRIM_DEVICE ROCPRIM_FORCE_INLINE
+    void flag_heads_and_tails(Flag (&head_flags)[ItemsPerThread],
+                              Flag (&tail_flags)[ItemsPerThread],
+                              const T (&input)[ItemsPerThread],
+                              FlagOp flag_op)
+    {
+        // The temporary storage is declared here instead of being supplied by the
+        // caller; the work itself is done by the overload taking explicit storage.
+        ROCPRIM_SHARED_MEMORY storage_type local_storage;
+        this->flag_heads_and_tails(head_flags, tail_flags, input, flag_op, local_storage);
+    }
+
+    /// \brief Tags both \p head_flags and \p tail_flags that indicate discontinuities
+    /// between items partitioned across the thread block, where the last item of the
+    /// last thread is compared against a \p tile_successor_item.
+    /// \deprecated The flags API of block_adjacent_difference is deprecated,
+    /// use block_discontinuity::flag_heads_and_tails() instead.
+    ///
+    /// \tparam ItemsPerThread - [inferred] the number of items to be processed by
+    /// each thread.
+    /// \tparam Flag - [inferred] the flag type.
+    /// \tparam FlagOp - [inferred] type of binary function used for flagging.
+    ///
+    /// \param [out] head_flags - array that contains the head flags.
+    /// \param [out] tail_flags - array that contains the tail flags.
+    /// \param [in] tile_successor_item - last tile item from thread to be compared
+    /// against.
+    /// \param [in] input - array that data is loaded from.
+    /// \param [in] flag_op - binary operation function object that will be used for flagging.
+    /// The signature of the function should be equivalent to the following:
+    /// <tt>bool f(const T &a, const T &b);</tt> or <tt>bool (const T& a, const T& b, unsigned int b_index);</tt>.
+    /// The signature does not need to have <tt>const &</tt>, but function object
+    /// must not modify the objects passed to it.
+    /// \param [in] storage - reference to a temporary storage object of type storage_type.
+    ///
+    /// \par Storage reuse
+    /// Synchronization barrier should be placed before \p storage is reused
+    /// or repurposed: \p __syncthreads() or \p rocprim::syncthreads().
+    ///
+    /// \par Example.
+    /// \code{.cpp}
+    /// __global__ void example_kernel(...)
+    /// {
+    ///     // specialize discontinuity for int and a block of 128 threads
+    ///     using block_adjacent_difference_int = rocprim::block_adjacent_difference<int, 128>;
+    ///     // allocate storage in shared memory
+    ///     __shared__ block_adjacent_difference_int::storage_type storage;
+    ///
+    ///     // segment of consecutive items to be used
+    ///     int input[8];
+    ///     int tile_item = 0;
+    ///     if (threadIdx.x == 0)
+    ///     {
+    ///         tile_item = ...
+    ///     }
+    ///     ...
+    ///     int head_flags[8];
+    ///     int tail_flags[8];
+    ///     block_adjacent_difference_int b_discontinuity;
+    ///     using flag_op_type = typename rocprim::greater<int>;
+    ///     b_discontinuity.flag_heads_and_tails(head_flags, tail_flags, tile_item,
+    ///                                          input, flag_op_type(),
+    ///                                          storage);
+    ///     ...
+    /// }
+    /// \endcode
+    template<unsigned int ItemsPerThread, class Flag, class FlagOp>
+    [[deprecated("The flags API of block_adjacent_difference is deprecated."
+                 "Use block_discontinuity.flag_heads_and_tails instead.")]]
+    ROCPRIM_DEVICE ROCPRIM_INLINE
+    void flag_heads_and_tails(Flag (&head_flags)[ItemsPerThread],
+                              Flag (&tail_flags)[ItemsPerThread],
+                              T tile_successor_item,
+                              const T (&input)[ItemsPerThread],
+                              FlagOp flag_op,
+                              storage_type& storage)
+    {
+        // Compile-time switches forwarded to base_type::apply_left / apply_right.
+        // `reversed` describes the head (left) pass; the tail pass uses !reversed.
+        static constexpr auto as_flags         = true;
+        static constexpr auto reversed         = true;
+        static constexpr auto with_predecessor = false;
+        static constexpr auto with_successor   = true;
+
+        // Copy items in case head_flags is aliased with input: the left pass
+        // writes head_flags before the right pass reads the items again.
+        T items[ItemsPerThread];
+
+        ROCPRIM_UNROLL
+        for(unsigned int i = 0; i < ItemsPerThread; ++i) {
+            items[i] = input[i];
+        }
+
+        // Head flags; items[0] only fills the predecessor argument.
+        base_type::template apply_left<as_flags, reversed, with_predecessor>(
+            items, head_flags, flag_op, items[0] /*predecessor*/, storage.get().left);
+
+        // Tail flags: the operand order is the opposite of the head pass so that
+        // flag_op receives (item, successor), matching flag_tails(). Previously
+        // `reversed` was forwarded unchanged.
+        // NOTE(review): relies on `reversed` swapping the flag_op operands inside
+        // base_type::apply_right -- confirm against the implementation.
+        base_type::template apply_right<as_flags, !reversed, with_successor>(
+            items, tail_flags, flag_op, tile_successor_item, storage.get().right);
+    }
+
+    /// \overload
+    /// \deprecated The flags API of block_adjacent_difference is deprecated,
+    /// use block_discontinuity::flag_heads_and_tails() instead.
+    ///
+    /// This overload does not accept a reference to temporary storage, instead it is declared as
+    /// part of the function itself. Note that this does NOT decrease the shared memory requirements
+    /// of a kernel using this function.
+    template<unsigned int ItemsPerThread, class Flag, class FlagOp>
+    [[deprecated("The flags API of block_adjacent_difference is deprecated."
+                 "Use block_discontinuity.flag_heads_and_tails instead.")]]
+    ROCPRIM_DEVICE ROCPRIM_FORCE_INLINE
+    void flag_heads_and_tails(Flag (&head_flags)[ItemsPerThread],
+                              Flag (&tail_flags)[ItemsPerThread],
+                              T tile_successor_item,
+                              const T (&input)[ItemsPerThread],
+                              FlagOp flag_op)
+    {
+        // The temporary storage is declared here instead of being supplied by the
+        // caller; the work itself is done by the overload taking explicit storage.
+        ROCPRIM_SHARED_MEMORY storage_type local_storage;
+        this->flag_heads_and_tails(
+            head_flags, tail_flags, tile_successor_item, input, flag_op, local_storage);
+    }
+
+    /// \brief Tags both \p head_flags and \p tail_flags that indicate discontinuities
+    /// between items partitioned across the thread block, where the first item of the
+    /// first thread is compared against a \p tile_predecessor_item.
+    /// \deprecated The flags API of block_adjacent_difference is deprecated,
+    /// use block_discontinuity::flag_heads_and_tails() instead.
+    ///
+    /// \tparam ItemsPerThread - [inferred] the number of items to be processed by
+    /// each thread.
+    /// \tparam Flag - [inferred] the flag type.
+    /// \tparam FlagOp - [inferred] type of binary function used for flagging.
+    ///
+    /// \param [out] head_flags - array that contains the head flags.
+    /// \param [in] tile_predecessor_item - first tile item from thread to be compared
+    /// against.
+    /// \param [out] tail_flags - array that contains the tail flags.
+    /// \param [in] input - array that data is loaded from.
+    /// \param [in] flag_op - binary operation function object that will be used for flagging.
+    /// The signature of the function should be equivalent to the following:
+    /// <tt>bool f(const T &a, const T &b);</tt> or <tt>bool (const T& a, const T& b, unsigned int b_index);</tt>.
+    /// The signature does not need to have <tt>const &</tt>, but function object
+    /// must not modify the objects passed to it.
+    /// \param [in] storage - reference to a temporary storage object of type storage_type.
+    ///
+    /// \par Storage reuse
+    /// Synchronization barrier should be placed before \p storage is reused
+    /// or repurposed: \p __syncthreads() or \p rocprim::syncthreads().
+    ///
+    /// \par Example.
+    /// \code{.cpp}
+    /// __global__ void example_kernel(...)
+    /// {
+    ///     // specialize discontinuity for int and a block of 128 threads
+    ///     using block_adjacent_difference_int = rocprim::block_adjacent_difference<int, 128>;
+    ///     // allocate storage in shared memory
+    ///     __shared__ block_adjacent_difference_int::storage_type storage;
+    ///
+    ///     // segment of consecutive items to be used
+    ///     int input[8];
+    ///     int tile_item = 0;
+    ///     if (threadIdx.x == 0)
+    ///     {
+    ///         tile_item = ...
+    ///     }
+    ///     ...
+    ///     int head_flags[8];
+    ///     int tail_flags[8];
+    ///     block_adjacent_difference_int b_discontinuity;
+    ///     using flag_op_type = typename rocprim::greater<int>;
+    ///     b_discontinuity.flag_heads_and_tails(head_flags, tile_item, tail_flags,
+    ///                                          input, flag_op_type(),
+    ///                                          storage);
+    ///     ...
+    /// }
+    /// \endcode
+    template<unsigned int ItemsPerThread, class Flag, class FlagOp>
+    [[deprecated("The flags API of block_adjacent_difference is deprecated."
+                 "Use block_discontinuity.flag_heads_and_tails instead.")]]
+    ROCPRIM_DEVICE ROCPRIM_INLINE
+    void flag_heads_and_tails(Flag (&head_flags)[ItemsPerThread],
+                              T tile_predecessor_item,
+                              Flag (&tail_flags)[ItemsPerThread],
+                              const T (&input)[ItemsPerThread],
+                              FlagOp flag_op,
+                              storage_type& storage)
+    {
+        // Compile-time switches forwarded to base_type::apply_left / apply_right.
+        // `reversed` describes the head (left) pass; the tail pass uses !reversed.
+        static constexpr auto as_flags         = true;
+        static constexpr auto reversed         = true;
+        static constexpr auto with_predecessor = true;
+        static constexpr auto with_successor   = false;
+
+        // Copy items in case head_flags is aliased with input: the left pass
+        // writes head_flags before the right pass reads the items again.
+        T items[ItemsPerThread];
+
+        ROCPRIM_UNROLL
+        for(unsigned int i = 0; i < ItemsPerThread; ++i) {
+            items[i] = input[i];
+        }
+
+        // Head flags, using the caller-provided tile predecessor.
+        base_type::template apply_left<as_flags, reversed, with_predecessor>(
+            items, head_flags, flag_op, tile_predecessor_item, storage.get().left);
+
+        // Tail flags: the operand order is the opposite of the head pass so that
+        // flag_op receives (item, successor), matching flag_tails(). Previously
+        // `reversed` was forwarded unchanged. items[0] only fills the successor
+        // argument.
+        // NOTE(review): relies on `reversed` swapping the flag_op operands inside
+        // base_type::apply_right -- confirm against the implementation.
+        base_type::template apply_right<as_flags, !reversed, with_successor>(
+            items, tail_flags, flag_op, items[0] /*successor*/, storage.get().right);
+    }
+
+    /// \overload
+    /// \deprecated The flags API of block_adjacent_difference is deprecated,
+    /// use block_discontinuity::flag_heads_and_tails() instead.
+    ///
+    /// This overload does not accept a reference to temporary storage, instead it is declared as
+    /// part of the function itself. Note that this does NOT decrease the shared memory requirements
+    /// of a kernel using this function.
+    template<unsigned int ItemsPerThread, class Flag, class FlagOp>
+    [[deprecated("The flags API of block_adjacent_difference is deprecated."
+                 "Use block_discontinuity.flag_heads_and_tails instead.")]]
+    ROCPRIM_DEVICE ROCPRIM_FORCE_INLINE
+    void flag_heads_and_tails(Flag (&head_flags)[ItemsPerThread],
+                              T tile_predecessor_item,
+                              Flag (&tail_flags)[ItemsPerThread],
+                              const T (&input)[ItemsPerThread],
+                              FlagOp flag_op)
+    {
+        // The temporary storage is declared here instead of being supplied by the
+        // caller; the work itself is done by the overload taking explicit storage.
+        ROCPRIM_SHARED_MEMORY storage_type local_storage;
+        this->flag_heads_and_tails(
+            head_flags, tile_predecessor_item, tail_flags, input, flag_op, local_storage);
+    }
+
+    /// \brief Tags both \p head_flags and \p tail_flags that indicate discontinuities
+    /// between items partitioned across the thread block, where the first item of the
+    /// first thread is compared against a \p tile_predecessor_item and the last item of
+    /// the last thread is compared against a \p tile_successor_item.
+    /// \deprecated The flags API of block_adjacent_difference is deprecated,
+    /// use block_discontinuity::flag_heads_and_tails() instead.
+    ///
+    /// \tparam ItemsPerThread - [inferred] the number of items to be processed by
+    /// each thread.
+    /// \tparam Flag - [inferred] the flag type.
+    /// \tparam FlagOp - [inferred] type of binary function used for flagging.
+    ///
+    /// \param [out] head_flags - array that contains the head flags.
+    /// \param [in] tile_predecessor_item - first tile item from thread to be compared
+    /// against.
+    /// \param [out] tail_flags - array that contains the tail flags.
+    /// \param [in] tile_successor_item - last tile item from thread to be compared
+    /// against.
+    /// \param [in] input - array that data is loaded from.
+    /// \param [in] flag_op - binary operation function object that will be used for flagging.
+    /// The signature of the function should be equivalent to the following:
+    /// <tt>bool f(const T &a, const T &b);</tt> or <tt>bool (const T& a, const T& b, unsigned int b_index);</tt>.
+    /// The signature does not need to have <tt>const &</tt>, but function object
+    /// must not modify the objects passed to it.
+    /// \param [in] storage - reference to a temporary storage object of type storage_type.
+    ///
+    /// \par Storage reuse
+    /// Synchronization barrier should be placed before \p storage is reused
+    /// or repurposed: \p __syncthreads() or \p rocprim::syncthreads().
+    ///
+    /// \par Example.
+    /// \code{.cpp}
+    /// __global__ void example_kernel(...)
+    /// {
+    ///     // specialize discontinuity for int and a block of 128 threads
+    ///     using block_adjacent_difference_int = rocprim::block_adjacent_difference<int, 128>;
+    ///     // allocate storage in shared memory
+    ///     __shared__ block_adjacent_difference_int::storage_type storage;
+    ///
+    ///     // segment of consecutive items to be used
+    ///     int input[8];
+    ///     int tile_predecessor_item = 0;
+    ///     int tile_successor_item = 0;
+    ///     if (threadIdx.x == 0)
+    ///     {
+    ///         tile_predecessor_item = ...
+    ///         tile_successor_item = ...
+    ///     }
+    ///     ...
+    ///     int head_flags[8];
+    ///     int tail_flags[8];
+    ///     block_adjacent_difference_int b_discontinuity;
+    ///     using flag_op_type = typename rocprim::greater<int>;
+    ///     b_discontinuity.flag_heads_and_tails(head_flags, tile_predecessor_item,
+    ///                                          tail_flags, tile_successor_item,
+    ///                                          input, flag_op_type(),
+    ///                                          storage);
+    ///     ...
+    /// }
+    /// \endcode
+    template<unsigned int ItemsPerThread, class Flag, class FlagOp>
+    // Adjacent string literals are concatenated by the compiler, so the first
+    // literal ends with a space to keep the two sentences apart in the diagnostic.
+    [[deprecated("The flags API of block_adjacent_difference is deprecated. "
+                 "Use block_discontinuity.flag_heads_and_tails instead.")]]
+    ROCPRIM_DEVICE ROCPRIM_INLINE
+    void flag_heads_and_tails(Flag (&head_flags)[ItemsPerThread],
+                              T tile_predecessor_item,
+                              Flag (&tail_flags)[ItemsPerThread],
+                              T tile_successor_item,
+                              const T (&input)[ItemsPerThread],
+                              FlagOp flag_op,
+                              storage_type& storage)
+    {
+        // Compile-time switches forwarded as template arguments to
+        // base_type::apply_left and base_type::apply_right.
+        static constexpr auto as_flags         = true;
+        static constexpr auto reversed         = true;
+        static constexpr auto with_predecessor = true;
+        static constexpr auto with_successor   = true;
+
+        // Copy items in case head_flags is aliased with input: head_flags is
+        // written by apply_left before apply_right reads the items again.
+        T items[ItemsPerThread];
+
+        ROCPRIM_UNROLL
+        for(unsigned int i = 0; i < ItemsPerThread; ++i) {
+            items[i] = input[i];
+        }
+
+        // Head flags, seeded with the caller-provided tile predecessor; uses the
+        // `left` member of the temporary storage.
+        base_type::template apply_left<as_flags, reversed, with_predecessor>(
+            items, head_flags, flag_op, tile_predecessor_item, storage.get().left);
+
+        // Tail flags, seeded with the caller-provided tile successor; uses the
+        // separate `right` member of the temporary storage.
+        base_type::template apply_right<as_flags, reversed, with_successor>(
+            items, tail_flags, flag_op, tile_successor_item, storage.get().right);
+    }
+
+    /// \overload
+    /// \deprecated The flags API of block_adjacent_difference is deprecated,
+    /// use block_discontinuity::flag_heads_and_tails() instead.
+    ///
+    /// This overload does not accept a reference to temporary storage, instead it is declared as
+    /// part of the function itself. Note that this does NOT decrease the shared memory requirements
+    /// of a kernel using this function.
+    template<unsigned int ItemsPerThread, class Flag, class FlagOp>
+    // Adjacent string literals are concatenated by the compiler, so the first
+    // literal ends with a space to keep the two sentences apart in the diagnostic.
+    [[deprecated("The flags API of block_adjacent_difference is deprecated. "
+                 "Use block_discontinuity.flag_heads_and_tails instead.")]]
+    ROCPRIM_DEVICE ROCPRIM_FORCE_INLINE
+    void flag_heads_and_tails(Flag (&head_flags)[ItemsPerThread],
+                              T tile_predecessor_item,
+                              Flag (&tail_flags)[ItemsPerThread],
+                              T tile_successor_item,
+                              const T (&input)[ItemsPerThread],
+                              FlagOp flag_op)
+    {
+        // Temporary storage is declared here (via ROCPRIM_SHARED_MEMORY) instead
+        // of being supplied by the caller, then handed to the storage-taking
+        // overload which does the actual work.
+        ROCPRIM_SHARED_MEMORY storage_type storage;
+        flag_heads_and_tails(
+            head_flags, tile_predecessor_item, tail_flags, tile_successor_item,
+            input, flag_op, storage
+        );
+    }
+
+    /// \brief Apply a function to each consecutive pair of elements partitioned across threads in
+    /// the block and write the output to the position of the left item.
+    ///
+    /// The first item in the first thread is copied from the input then for the rest the following
+    /// code applies.
+    /// \code
+    /// // For each i in [1, block_size * ItemsPerThread) across threads in a block
+    /// output[i] = op(input[i], input[i-1]);
+    /// \endcode
+    ///
+    /// \tparam Output - [inferred] the type of output, must be assignable from the result of `op`
+    /// \tparam ItemsPerThread - [inferred] the number of items processed by each thread
+    /// \tparam BinaryFunction - [inferred] the type of the function to apply
+    /// \param [in] input - array that data is loaded from partitioned across the threads in the block
+    /// \param [out] output - array where the result of function application will be written to
+    /// \param [in] op - binary function applied to the items.
+    /// The signature of the function should be equivalent to the following:
+    /// `bool f(const T &a, const T &b)` The signature does not need to have
+    /// `const &` but the function object must not modify the objects passed to it.
+    /// \param storage reference to a temporary storage object of type #storage_type
+    /// \par Storage reuse
+    /// Synchronization barrier should be placed before `storage` is reused
+    /// or repurposed: `__syncthreads()` or \link syncthreads() rocprim::syncthreads() \endlink.
+    template <typename Output, unsigned int ItemsPerThread, typename BinaryFunction>
+    ROCPRIM_DEVICE ROCPRIM_INLINE void subtract_left(const T (&input)[ItemsPerThread],
+                                                     Output (&output)[ItemsPerThread],
+                                                     const BinaryFunction op,
+                                                     storage_type&        storage)
+    {
+        // The caller supplies no tile predecessor, so with_predecessor is off and
+        // input[0] merely fills the predecessor argument slot of apply_left.
+        const T& predecessor_slot = input[0];
+
+        // The work is delegated to the shared base implementation; the template
+        // arguments select its mode at compile time.
+        base_type::template apply_left</* as_flags         = */ false,
+                                       /* reversed         = */ true,
+                                       /* with_predecessor = */ false>(
+            input, output, op, predecessor_slot, storage.get().left);
+    }
+
+    /// \brief Apply a function to each consecutive pair of elements partitioned across threads in
+    /// the block and write the output to the position of the left item, with an explicit item before 
+    /// the tile.
+    ///
+    /// \code
+    /// // For the first item on the first thread use the tile predecessor
+    /// output[0] = op(input[0], tile_predecessor)
+    /// // For other items, i in [1, block_size * ItemsPerThread) across threads in a block
+    /// output[i] = op(input[i], input[i-1]);
+    /// \endcode
+    ///
+    /// \tparam Output - [inferred] the type of output, must be assignable from the result of `op`
+    /// \tparam ItemsPerThread - [inferred] the number of items processed by each thread
+    /// \tparam BinaryFunction - [inferred] the type of the function to apply
+    /// \param [in] input - array that data is loaded from partitioned across the threads in the block
+    /// \param [out] output - array where the result of function application will be written to
+    /// \param [in] op - binary function applied to the items.
+    /// The signature of the function should be equivalent to the following:
+    /// `bool f(const T &a, const T &b)` The signature does not need to have
+    /// `const &` but the function object must not modify the objects passed to it.
+    /// \param [in] tile_predecessor - the item before the tile, will be used as the input 
+    /// of the first application of `op`
+    /// \param storage - reference to a temporary storage object of type #storage_type
+    /// \par Storage reuse
+    /// Synchronization barrier should be placed before `storage` is reused
+    /// or repurposed: `__syncthreads()` or \link syncthreads() rocprim::syncthreads() \endlink.
+    template <typename Output, unsigned int ItemsPerThread, typename BinaryFunction>
+    ROCPRIM_DEVICE ROCPRIM_INLINE void subtract_left(const T (&input)[ItemsPerThread],
+                                                     Output (&output)[ItemsPerThread],
+                                                     const BinaryFunction op,
+                                                     const T              tile_predecessor,
+                                                     storage_type&        storage)
+    {
+        // Same delegation as the predecessor-less overload, except that
+        // with_predecessor is enabled and the caller's tile_predecessor is
+        // forwarded to the base implementation.
+        base_type::template apply_left</* as_flags         = */ false,
+                                       /* reversed         = */ true,
+                                       /* with_predecessor = */ true>(
+            input, output, op, tile_predecessor, storage.get().left);
+    }
+
+    /// \brief Apply a function to each consecutive pair of elements partitioned across threads in
+    /// the block and write the output to the position of the left item, in a partial tile.
+    ///
+    /// \code
+    /// output[0] = input[0]
+    /// // For each item i in [1, valid_items) across threads in a block
+    /// output[i] = op(input[i], input[i-1]);
+    /// // Just copy "invalid" items in [valid_items, block_size * ItemsPerThread)
+    /// output[i] = input[i]
+    /// \endcode
+    ///
+    /// \tparam Output - [inferred] the type of output, must be assignable from the result of `op`
+    /// \tparam ItemsPerThread - [inferred] the number of items processed by each thread
+    /// \tparam BinaryFunction - [inferred] the type of the function to apply
+    /// \param [in] input - array that data is loaded from partitioned across the threads in the block
+    /// \param [out] output - array where the result of function application will be written to
+    /// \param [in] op - binary function applied to the items.
+    /// The signature of the function should be equivalent to the following:
+    /// `bool f(const T &a, const T &b)` The signature does not need to have
+    /// `const &` but the function object must not modify the objects passed to it.
+    /// \param [in] valid_items - number of items in the block which are considered "valid" and will
+    /// be used. Must be less or equal to `BlockSize` * `ItemsPerThread`
+    /// \param storage - reference to a temporary storage object of type #storage_type
+    /// \par Storage reuse
+    /// Synchronization barrier should be placed before `storage` is reused
+    /// or repurposed: `__syncthreads()` or \link syncthreads() rocprim::syncthreads() \endlink.
+    template <typename Output, unsigned int ItemsPerThread, typename BinaryFunction>
+    ROCPRIM_DEVICE ROCPRIM_INLINE void subtract_left_partial(const T (&input)[ItemsPerThread],
+                                                             Output (&output)[ItemsPerThread],
+                                                             const BinaryFunction op,
+                                                             const unsigned int   valid_items,
+                                                             storage_type&        storage)
+    {
+        // No tile predecessor is supplied by the caller, so with_predecessor is
+        // off and input[0] merely fills the predecessor argument slot.
+        const T& predecessor_slot = input[0];
+
+        // Partial-tile variant: valid_items is forwarded unchanged to the base
+        // implementation together with the `left` storage member.
+        base_type::template apply_left_partial</* as_flags         = */ false,
+                                               /* reversed         = */ true,
+                                               /* with_predecessor = */ false>(
+            input, output, op, predecessor_slot, valid_items, storage.get().left);
+    }
+
+    /// \brief Apply a function to each consecutive pair of elements partitioned across threads in
+    /// the block and write the output to the position of the left item, in a partial tile with a
+    /// predecessor.
+    ///
+    /// This combines subtract_left_partial() with a tile predecessor.
+    /// \tparam Output - [inferred] the type of output, must be assignable from the result of `op`
+    /// \tparam ItemsPerThread - [inferred] the number of items processed by each thread
+    /// \tparam BinaryFunction - [inferred] the type of the function to apply
+    /// \param [in] input - array that data is loaded from partitioned across the threads in the block
+    /// \param [out] output - array where the result of function application will be written to
+    /// \param [in] op - binary function applied to the items.
+    /// The signature of the function should be equivalent to the following:
+    /// `bool f(const T &a, const T &b)` The signature does not need to have
+    /// `const &` but the function object must not modify the objects passed to it.
+    /// \param [in] tile_predecessor - the item before the tile, will be used as the input 
+    /// of the first application of `op`
+    /// \param [in] valid_items - number of items in the block which are considered "valid" and will
+    /// be used. Must be less or equal to `BlockSize` * `ItemsPerThread`
+    /// \param storage - reference to a temporary storage object of type #storage_type
+    /// \par Storage reuse
+    /// Synchronization barrier should be placed before `storage` is reused
+    /// or repurposed: `__syncthreads()` or \link syncthreads() rocprim::syncthreads() \endlink.
+    template <typename Output, unsigned int ItemsPerThread, typename BinaryFunction>
+    ROCPRIM_DEVICE ROCPRIM_INLINE void subtract_left_partial(const T (&input)[ItemsPerThread],
+                                                             Output (&output)[ItemsPerThread],
+                                                             const BinaryFunction op,
+                                                             const T              tile_predecessor,
+                                                             const unsigned int   valid_items,
+                                                             storage_type&        storage)
+    {
+        // Partial-tile variant with an explicit predecessor: both the caller's
+        // tile_predecessor and valid_items are forwarded to the base
+        // implementation, with with_predecessor enabled.
+        base_type::template apply_left_partial</* as_flags         = */ false,
+                                               /* reversed         = */ true,
+                                               /* with_predecessor = */ true>(
+            input, output, op, tile_predecessor, valid_items, storage.get().left);
+    }
+
+    /// \brief Apply a function to each consecutive pair of elements partitioned across threads in
+    /// the block and write the output to the position of the right item.
+    ///
+    /// The last item in the last thread is copied from the input then for the rest the following
+    /// code applies.
+    /// \code
+    /// // For each i in [0, block_size * ItemsPerThread - 1) across threads in a block
+    /// output[i] = op(input[i], input[i+1]);
+    /// \endcode
+    ///
+    /// \tparam Output - [inferred] the type of output, must be assignable from the result of `op`
+    /// \tparam ItemsPerThread - [inferred] the number of items processed by each thread
+    /// \tparam BinaryFunction - [inferred] the type of the function to apply
+    /// \param [in] input - array that data is loaded from partitioned across the threads in the block
+    /// \param [out] output - array where the result of function application will be written to
+    /// \param [in] op - binary function applied to the items.
+    /// The signature of the function should be equivalent to the following:
+    /// `bool f(const T &a, const T &b)` The signature does not need to have
+    /// `const &` but the function object must not modify the objects passed to it.
+    /// \param storage - reference to a temporary storage object of type #storage_type
+    /// \par Storage reuse
+    /// Synchronization barrier should be placed before `storage` is reused
+    /// or repurposed: `__syncthreads()` or \link syncthreads() rocprim::syncthreads() \endlink.
+    template <typename Output, unsigned int ItemsPerThread, typename BinaryFunction>
+    ROCPRIM_DEVICE ROCPRIM_INLINE void subtract_right(const T (&input)[ItemsPerThread],
+                                                      Output (&output)[ItemsPerThread],
+                                                      const BinaryFunction op,
+                                                      storage_type&        storage)
+    {
+        // The caller supplies no tile successor, so with_successor is off and
+        // input[0] merely fills the successor argument slot of apply_right.
+        const T& successor_slot = input[0];
+
+        // Right-hand variant: delegates to apply_right and uses the `right`
+        // member of the temporary storage.
+        base_type::template apply_right</* as_flags       = */ false,
+                                        /* reversed       = */ false,
+                                        /* with_successor = */ false>(
+            input, output, op, successor_slot, storage.get().right);
+    }
+
+    /// \brief Apply a function to each consecutive pair of elements partitioned across threads in
+    /// the block and write the output to the position of the right item, with an explicit item after 
+    /// the tile.
+    ///
+    /// \code
+    /// // For each items i in [0, block_size * ItemsPerThread - 1) across threads in a block
+    /// output[i] = op(input[i], input[i+1]);
+    /// // For the last item on the last thread use the tile successor
+    /// output[block_size * ItemsPerThread - 1] =
+    ///      op(input[block_size * ItemsPerThread - 1], tile_successor)
+    /// \endcode
+    ///
+    /// \tparam Output - [inferred] the type of output, must be assignable from the result of `op`
+    /// \tparam ItemsPerThread - [inferred] the number of items processed by each thread
+    /// \tparam BinaryFunction - [inferred] the type of the function to apply
+    /// \param [in] input - array that data is loaded from partitioned across the threads in the block
+    /// \param [out] output - array where the result of function application will be written to
+    /// \param [in] op - binary function applied to the items.
+    /// The signature of the function should be equivalent to the following:
+    /// `bool f(const T &a, const T &b)` The signature does not need to have
+    /// `const &` but the function object must not modify the objects passed to it.
+    /// \param [in] tile_successor - the item after the tile, will be used as the input 
+    /// of the last application of `op`
+    /// \param storage - reference to a temporary storage object of type #storage_type
+    /// \par Storage reuse
+    /// Synchronization barrier should be placed before `storage` is reused
+    /// or repurposed: `__syncthreads()` or \link syncthreads() rocprim::syncthreads() \endlink.
+    template <typename Output, unsigned int ItemsPerThread, typename BinaryFunction>
+    ROCPRIM_DEVICE ROCPRIM_INLINE void subtract_right(const T (&input)[ItemsPerThread],
+                                                      Output (&output)[ItemsPerThread],
+                                                      const BinaryFunction op,
+                                                      const T              tile_successor,
+                                                      storage_type&        storage)
+    {
+        // Same delegation as the successor-less overload, except that
+        // with_successor is enabled and the caller's tile_successor is forwarded
+        // to the base implementation.
+        base_type::template apply_right</* as_flags       = */ false,
+                                        /* reversed       = */ false,
+                                        /* with_successor = */ true>(
+            input, output, op, tile_successor, storage.get().right);
+    }
+
+    /// \brief Apply a function to each consecutive pair of elements partitioned across threads in
+    /// the block and write the output to the position of the right item, in a partial tile.
+    ///
+    /// \code
+    /// // For each item i in [0, valid_items) across threads in a block
+    /// output[i] = op(input[i], input[i + 1]);
+    /// // Just copy "invalid" items in [valid_items, block_size * ItemsPerThread)
+    /// output[i] = input[i]
+    /// \endcode
+    ///
+    /// \tparam Output - [inferred] the type of output, must be assignable from the result of `op`
+    /// \tparam ItemsPerThread - [inferred] the number of items processed by each thread
+    /// \tparam BinaryFunction - [inferred] the type of the function to apply
+    /// \param [in] input - array that data is loaded from partitioned across the threads in the block
+    /// \param [out] output - array where the result of function application will be written to
+    /// \param [in] op - binary function applied to the items.
+    /// The signature of the function should be equivalent to the following:
+    /// `bool f(const T &a, const T &b)` The signature does not need to have
+    /// `const &` but the function object must not modify the objects passed to it.
+    /// \param [in] valid_items - number of items in the block which are considered "valid" and will
+    /// be used. Must be less or equal to `BlockSize` * `ItemsPerThread`
+    /// \param storage - reference to a temporary storage object of type #storage_type
+    /// \par Storage reuse
+    /// Synchronization barrier should be placed before `storage` is reused
+    /// or repurposed: `__syncthreads()` or \link syncthreads() rocprim::syncthreads() \endlink.
+    template <typename Output, unsigned int ItemsPerThread, typename BinaryFunction>
+    ROCPRIM_DEVICE ROCPRIM_INLINE void subtract_right_partial(const T (&input)[ItemsPerThread],
+                                                              Output (&output)[ItemsPerThread],
+                                                              const BinaryFunction op,
+                                                              const unsigned int   valid_items,
+                                                              storage_type&        storage)
+    {
+        // Partial-tile, right-hand variant: only two mode switches are passed to
+        // apply_right_partial, and valid_items is forwarded unchanged along with
+        // the `right` storage member.
+        base_type::template apply_right_partial</* as_flags = */ false,
+                                                /* reversed = */ false>(
+            input, output, op, valid_items, storage.get().right);
+    }
+};
+
+END_ROCPRIM_NAMESPACE
+
+/// @}
+// end of group blockmodule
+
+#endif // ROCPRIM_BLOCK_BLOCK_ADJACENT_DIFFERENCE_HPP_
--- a/3rdparty/cub/rocprim/block/block_discontinuity.hpp
+++ b/3rdparty/cub/rocprim/block/block_discontinuity.hpp
+// Copyright (c) 2017-2022 Advanced Micro Devices, Inc. All rights reserved.
+//
+// Permission is hereby granted, free of charge, to any person obtaining a copy
+// of this software and associated documentation files (the "Software"), to deal
+// in the Software without restriction, including without limitation the rights
+// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+// copies of the Software, and to permit persons to whom the Software is
+// furnished to do so, subject to the following conditions:
+//
+// The above copyright notice and this permission notice shall be included in
+// all copies or substantial portions of the Software.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+// THE SOFTWARE.
+
+#ifndef ROCPRIM_BLOCK_BLOCK_DISCONTINUITY_HPP_
+#define ROCPRIM_BLOCK_BLOCK_DISCONTINUITY_HPP_
+
+
+#include "detail/block_adjacent_difference_impl.hpp"
+
+#include "../config.hpp"
+#include "../detail/various.hpp"
+
+
+
+/// \addtogroup blockmodule
+/// @{
+
+BEGIN_ROCPRIM_NAMESPACE
+
+/// \brief The \p block_discontinuity class is a block level parallel primitive which provides
+/// methods for flagging items that are discontinued within an ordered set of items across
+/// threads in a block.
+///
+/// \tparam T - the input type.
+/// \tparam BlockSize - the number of threads in a block.
+///
+/// \par Overview
+/// * There are two types of flags:
+///   * Head flags.
+///   * Tail flags.
+/// * The above flags are used to differentiate items from their predecessors or successors.
+/// * E.g. Head flags are convenient for differentiating disjoint data segments as part of a
+/// segmented reduction/scan.
+///
+/// \par Examples
+/// \parblock
+/// In the examples discontinuity operation is performed on block of 128 threads, using type
+/// \p int.
+///
+/// \code{.cpp}
+/// __global__ void example_kernel(...)
+/// {
+///     // specialize discontinuity for int and a block of 128 threads
+///     using block_discontinuity_int = rocprim::block_discontinuity<int, 128>;
+///     // allocate storage in shared memory
+///     __shared__ block_discontinuity_int::storage_type storage;
+///
+///     // segment of consecutive items to be used
+///     int input[8];
+///     ...
+///     int head_flags[8];
+///     block_discontinuity_int b_discontinuity;
+///     using flag_op_type = typename rocprim::greater<int>;
+///     b_discontinuity.flag_heads(head_flags, input, flag_op_type(), storage);
+///     ...
+/// }
+/// \endcode
+/// \endparblock
+template<
+    class T,
+    unsigned int BlockSizeX,
+    unsigned int BlockSizeY = 1,
+    unsigned int BlockSizeZ = 1
+>
+class block_discontinuity
+#ifndef DOXYGEN_SHOULD_SKIP_THIS // hide implementation detail from documentation
+    : private detail::block_adjacent_difference_impl<T, BlockSizeX, BlockSizeY, BlockSizeZ>
+#endif // DOXYGEN_SHOULD_SKIP_THIS
+{
+    using base_type = detail::block_adjacent_difference_impl<T, BlockSizeX, BlockSizeY, BlockSizeZ>;
+
+    static constexpr unsigned BlockSize = base_type::BlockSize;
+    // Struct used for creating a raw_storage object for this primitive's temporary storage.
+    struct storage_type_ // two independent base-implementation storages; wrapped in raw_storage by the public storage_type alias
+    {
+        typename base_type::storage_type left;  // handed to base_type::apply_left by the flag_heads overloads
+        typename base_type::storage_type right; // presumably used by the tail-flag (apply_right) methods, which are outside this excerpt - confirm
+    };
+
+public:
+
+    /// \brief Struct used to allocate a temporary memory that is required for thread
+    /// communication during operations provided by related parallel primitive.
+    ///
+    /// Depending on the implementation, the operations exposed by the parallel primitive may
+    /// require a temporary storage for thread communication. The storage should be allocated
+    /// using keywords <tt>__shared__</tt>. It can be aliased to
+    /// an externally allocated memory, or be a part of a union type with other storage types
+    /// to increase shared memory reusability.
+    #ifndef DOXYGEN_SHOULD_SKIP_THIS // hides storage_type implementation for Doxygen
+    using storage_type = detail::raw_storage<storage_type_>; // real alias; methods reach the members through storage.get()
+    #else
+    using storage_type = storage_type_; // simplified alias, only seen when DOXYGEN_SHOULD_SKIP_THIS is defined
+    #endif
+
+    /// \brief Tags \p head_flags that indicate discontinuities between items partitioned
+    /// across the thread block, where the first item has no reference and is always
+    /// flagged.
+    ///
+    /// \tparam ItemsPerThread - [inferred] the number of items to be processed by
+    /// each thread.
+    /// \tparam Flag - [inferred] the flag type.
+    /// \tparam FlagOp - [inferred] type of binary function used for flagging.
+    ///
+    /// \param [out] head_flags - array that contains the head flags.
+    /// \param [in] input - array that data is loaded from.
+    /// \param [in] flag_op - binary operation function object that will be used for flagging.
+    /// The signature of the function should be equivalent to the following:
+    /// <tt>bool f(const T &a, const T &b);</tt> or <tt>bool (const T& a, const T& b, unsigned int b_index);</tt>.
+    /// The signature does not need to have <tt>const &</tt>, but function object
+    /// must not modify the objects passed to it.
+    /// \param [in] storage - reference to a temporary storage object of type storage_type.
+    ///
+    /// \par Storage reuse
+    /// Synchronization barrier should be placed before \p storage is reused
+    /// or repurposed: \p __syncthreads() or \p rocprim::syncthreads().
+    ///
+    /// \par Example.
+    /// \code{.cpp}
+    /// __global__ void example_kernel(...)
+    /// {
+    ///     // specialize discontinuity for int and a block of 128 threads
+    ///     using block_discontinuity_int = rocprim::block_discontinuity<int, 128>;
+    ///     // allocate storage in shared memory
+    ///     __shared__ block_discontinuity_int::storage_type storage;
+    ///
+    ///     // segment of consecutive items to be used
+    ///     int input[8];
+    ///     ...
+    ///     int head_flags[8];
+    ///     block_discontinuity_int b_discontinuity;
+    ///     using flag_op_type = typename rocprim::greater<int>;
+    ///     b_discontinuity.flag_heads(head_flags, input, flag_op_type(), storage);
+    ///     ...
+    /// }
+    /// \endcode
+    template<unsigned int ItemsPerThread, class Flag, class FlagOp>
+    ROCPRIM_DEVICE ROCPRIM_INLINE
+    void flag_heads(Flag (&head_flags)[ItemsPerThread],
+                    const T (&input)[ItemsPerThread],
+                    FlagOp flag_op,
+                    storage_type& storage)
+    {
+        // Head flags are produced by the shared base implementation; the template
+        // arguments select its mode at compile time. No tile predecessor is
+        // supplied by the caller, so with_predecessor is off and input[0] merely
+        // fills the predecessor argument slot.
+        base_type::template apply_left</* as_flags         = */ true,
+                                       /* reversed         = */ false,
+                                       /* with_predecessor = */ false>(
+            input, head_flags, flag_op, input[0], storage.get().left);
+    }
+
+    /// \overload
+    /// This overload does not take a reference to temporary storage, instead it is declared as
+    /// part of the function itself. Note that this does NOT decrease the shared memory requirements
+    /// of a kernel using this function.
+    template<unsigned int ItemsPerThread, class Flag, class FlagOp>
+    ROCPRIM_DEVICE ROCPRIM_FORCE_INLINE
+    void flag_heads(Flag (&head_flags)[ItemsPerThread],
+                    const T (&input)[ItemsPerThread],
+                    FlagOp flag_op)
+    {
+        // Convenience overload: the temporary storage is declared here (via
+        // ROCPRIM_SHARED_MEMORY) rather than supplied by the caller, and the call
+        // is forwarded to the storage-taking overload.
+        ROCPRIM_SHARED_MEMORY storage_type local_storage;
+        flag_heads(head_flags,
+                   input,
+                   flag_op,
+                   local_storage);
+    }
+
+    /// \brief Tags \p head_flags that indicate discontinuities between items partitioned
+    /// across the thread block, where the first item of the first thread is compared against
+    /// a \p tile_predecessor_item.
+    ///
+    /// \tparam ItemsPerThread - [inferred] the number of items to be processed by
+    /// each thread.
+    /// \tparam Flag - [inferred] the flag type.
+    /// \tparam FlagOp - [inferred] type of binary function used for flagging.
+    ///
+    /// \param [out] head_flags - array that contains the head flags.
+    /// \param [in] tile_predecessor_item - first tile item from thread to be compared
+    /// against.
+    /// \param [in] input - array that data is loaded from.
+    /// \param [in] flag_op - binary operation function object that will be used for flagging.
+    /// The signature of the function should be equivalent to the following:
+    /// <tt>bool f(const T &a, const T &b);</tt> or <tt>bool (const T& a, const T& b, unsigned int b_index);</tt>.
+    /// The signature does not need to have <tt>const &</tt>, but function object
+    /// must not modify the objects passed to it.
+    /// \param [in] storage - reference to a temporary storage object of type storage_type.
+    ///
+    /// \par Storage reuse
+    /// Synchronization barrier should be placed before \p storage is reused
+    /// or repurposed: \p __syncthreads() or \p rocprim::syncthreads().
+    ///
+    /// \par Example.
+    /// \code{.cpp}
+    /// __global__ void example_kernel(...)
+    /// {
+    ///     // specialize discontinuity for int and a block of 128 threads
+    ///     using block_discontinuity_int = rocprim::block_discontinuity<int, 128>;
+    ///     // allocate storage in shared memory
+    ///     __shared__ block_discontinuity_int::storage_type storage;
+    ///
+    ///     // segment of consecutive items to be used
+    ///     int input[8];
+    ///     int tile_item = 0;
+    ///     if (threadIdx.x == 0)
+    ///     {
+    ///         tile_item = ...
+    ///     }
+    ///     ...
+    ///     int head_flags[8];
+    ///     block_discontinuity_int b_discontinuity;
+    ///     using flag_op_type = typename rocprim::greater<int>;
+    ///     b_discontinuity.flag_heads(head_flags, tile_item, input, flag_op_type(),
+    ///                                storage);
+    ///     ...
+    /// }
+    /// \endcode
+    template<unsigned int ItemsPerThread, class Flag, class FlagOp>
+    ROCPRIM_DEVICE ROCPRIM_INLINE
+    void flag_heads(Flag (&head_flags)[ItemsPerThread],
+                    T tile_predecessor_item,
+                    const T (&input)[ItemsPerThread],
+                    FlagOp flag_op,
+                    storage_type& storage)
+    {
+        // Same delegation as the predecessor-less overload, except that
+        // with_predecessor is enabled and the caller's tile_predecessor_item is
+        // forwarded to the base implementation.
+        base_type::template apply_left</* as_flags         = */ true,
+                                       /* reversed         = */ false,
+                                       /* with_predecessor = */ true>(
+            input, head_flags, flag_op, tile_predecessor_item, storage.get().left);
+    }
+
+    /// \overload
+    /// This overload does not accept a reference to temporary storage, instead it is declared as
+    /// part of the function itself. Note that this does NOT decrease the shared memory requirements
+    /// of a kernel using this function.
+    template<unsigned int ItemsPerThread, class Flag, class FlagOp>
+    ROCPRIM_DEVICE ROCPRIM_FORCE_INLINE
+    void flag_heads(Flag (&head_flags)[ItemsPerThread],
+                    T tile_predecessor_item,
+                    const T (&input)[ItemsPerThread],
+                    FlagOp flag_op)
+    {
+        // The temporary storage is declared inside this function and then
+        // passed to the overload that takes an explicit storage reference.
+        ROCPRIM_SHARED_MEMORY storage_type tmp_storage;
+        this->flag_heads(head_flags, tile_predecessor_item, input, flag_op, tmp_storage);
+    }
+
+    /// \brief Tags \p tail_flags that indicate discontinuities between items partitioned
+    /// across the thread block, where the last item has no reference and is always
+    /// flagged.
+    ///
+    /// \tparam ItemsPerThread - [inferred] the number of items to be processed by
+    /// each thread.
+    /// \tparam Flag - [inferred] the flag type.
+    /// \tparam FlagOp - [inferred] type of binary function used for flagging.
+    ///
+    /// \param [out] tail_flags - array that contains the tail flags.
+    /// \param [in] input - array that data is loaded from.
+    /// \param [in] flag_op - binary operation function object that will be used for flagging.
+    /// The signature of the function should be equivalent to the following:
+    /// <tt>bool f(const T &a, const T &b);</tt> or <tt>bool (const T& a, const T& b, unsigned int b_index);</tt>.
+    /// The signature does not need to have <tt>const &</tt>, but function object
+    /// must not modify the objects passed to it.
+    /// \param [in] storage - reference to a temporary storage object of type storage_type.
+    ///
+    /// \par Storage reusage
+    /// Synchronization barrier should be placed before \p storage is reused
+    /// or repurposed: \p __syncthreads() or \p rocprim::syncthreads().
+    ///
+    /// \par Example.
+    /// \code{.cpp}
+    /// __global__ void example_kernel(...)
+    /// {
+    ///     // specialize discontinuity for int and a block of 128 threads
+    ///     using block_discontinuity_int = rocprim::block_discontinuity<int, 128>;
+    ///     // allocate storage in shared memory
+    ///     __shared__ block_discontinuity_int::storage_type storage;
+    ///
+    ///     // segment of consecutive items to be used
+    ///     int input[8];
+    ///     ...
+    ///     int tail_flags[8];
+    ///     block_discontinuity_int b_discontinuity;
+    ///     using flag_op_type = typename rocprim::greater<int>;
+    ///     b_discontinuity.flag_tails(tail_flags, input, flag_op_type(), storage);
+    ///     ...
+    /// }
+    /// \endcode
+    template<unsigned int ItemsPerThread, class Flag, class FlagOp>
+    ROCPRIM_DEVICE ROCPRIM_INLINE
+    void flag_tails(Flag (&tail_flags)[ItemsPerThread],
+                    const T (&input)[ItemsPerThread],
+                    FlagOp flag_op,
+                    storage_type& storage)
+    {
+        // Forward to the base class' right pass. No tile successor is provided
+        // by the caller (with_successor is false), so input[0] merely fills the
+        // successor argument.
+        base_type::template apply_right</* as_flags */ true,
+                                        /* reversed */ false,
+                                        /* with_successor */ false>(
+            input, tail_flags, flag_op, input[0] /* successor */, storage.get().right);
+    }
+
+    /// \overload
+    /// This overload does not accept a reference to temporary storage, instead it is declared as
+    /// part of the function itself. Note that this does NOT decrease the shared memory requirements
+    /// of a kernel using this function.
+    template<unsigned int ItemsPerThread, class Flag, class FlagOp>
+    ROCPRIM_DEVICE ROCPRIM_FORCE_INLINE
+    void flag_tails(Flag (&tail_flags)[ItemsPerThread],
+                    const T (&input)[ItemsPerThread],
+                    FlagOp flag_op)
+    {
+        // The temporary storage is declared inside this function and then
+        // passed to the overload that takes an explicit storage reference.
+        ROCPRIM_SHARED_MEMORY storage_type tmp_storage;
+        this->flag_tails(tail_flags, input, flag_op, tmp_storage);
+    }
+
+    /// \brief Tags \p tail_flags that indicate discontinuities between items partitioned
+    /// across the thread block, where the last item of the last thread is compared against
+    /// a \p tile_successor_item.
+    ///
+    /// \tparam ItemsPerThread - [inferred] the number of items to be processed by
+    /// each thread.
+    /// \tparam Flag - [inferred] the flag type.
+    /// \tparam FlagOp - [inferred] type of binary function used for flagging.
+    ///
+    /// \param [out] tail_flags - array that contains the tail flags.
+    /// \param [in] tile_successor_item - last tile item from thread to be compared
+    /// against.
+    /// \param [in] input - array that data is loaded from.
+    /// \param [in] flag_op - binary operation function object that will be used for flagging.
+    /// The signature of the function should be equivalent to the following:
+    /// <tt>bool f(const T &a, const T &b);</tt> or <tt>bool (const T& a, const T& b, unsigned int b_index);</tt>.
+    /// The signature does not need to have <tt>const &</tt>, but function object
+    /// must not modify the objects passed to it.
+    /// \param [in] storage - reference to a temporary storage object of type storage_type.
+    ///
+    /// \par Storage reusage
+    /// Synchronization barrier should be placed before \p storage is reused
+    /// or repurposed: \p __syncthreads() or \p rocprim::syncthreads().
+    ///
+    /// \par Example.
+    /// \code{.cpp}
+    /// __global__ void example_kernel(...)
+    /// {
+    ///     // specialize discontinuity for int and a block of 128 threads
+    ///     using block_discontinuity_int = rocprim::block_discontinuity<int, 128>;
+    ///     // allocate storage in shared memory
+    ///     __shared__ block_discontinuity_int::storage_type storage;
+    ///
+    ///     // segment of consecutive items to be used
+    ///     int input[8];
+    ///     int tile_item = 0;
+    ///     if (threadIdx.x == 127) // last thread of the 128-thread block
+    ///     {
+    ///         tile_item = ...
+    ///     }
+    ///     ...
+    ///     int tail_flags[8];
+    ///     block_discontinuity_int b_discontinuity;
+    ///     using flag_op_type = typename rocprim::greater<int>;
+    ///     b_discontinuity.flag_tails(tail_flags, tile_item, input, flag_op_type(),
+    ///                                storage);
+    ///     ...
+    /// }
+    /// \endcode
+    template<unsigned int ItemsPerThread, class Flag, class FlagOp>
+    ROCPRIM_DEVICE ROCPRIM_INLINE
+    void flag_tails(Flag (&tail_flags)[ItemsPerThread],
+                    T tile_successor_item,
+                    const T (&input)[ItemsPerThread],
+                    FlagOp flag_op,
+                    storage_type& storage)
+    {
+        // Forward to the base class' right pass. The caller-supplied
+        // tile_successor_item is passed as the successor and the `right`
+        // member of the temporary storage is handed to the base implementation.
+        base_type::template apply_right</* as_flags */ true,
+                                        /* reversed */ false,
+                                        /* with_successor */ true>(
+            input, tail_flags, flag_op, tile_successor_item, storage.get().right);
+    }
+
+    /// \overload
+    /// This overload does not accept a reference to temporary storage, instead it is declared as
+    /// part of the function itself. Note that this does NOT decrease the shared memory requirements
+    /// of a kernel using this function.
+    template<unsigned int ItemsPerThread, class Flag, class FlagOp>
+    ROCPRIM_DEVICE ROCPRIM_FORCE_INLINE
+    void flag_tails(Flag (&tail_flags)[ItemsPerThread],
+                    T tile_successor_item,
+                    const T (&input)[ItemsPerThread],
+                    FlagOp flag_op)
+    {
+        // The temporary storage is declared inside this function and then
+        // passed to the overload that takes an explicit storage reference.
+        ROCPRIM_SHARED_MEMORY storage_type tmp_storage;
+        this->flag_tails(tail_flags, tile_successor_item, input, flag_op, tmp_storage);
+    }
+
+    /// \brief Tags both \p head_flags and \p tail_flags that indicate discontinuities
+    /// between items partitioned across the thread block.
+    ///
+    /// \tparam ItemsPerThread - [inferred] the number of items to be processed by
+    /// each thread.
+    /// \tparam Flag - [inferred] the flag type.
+    /// \tparam FlagOp - [inferred] type of binary function used for flagging.
+    ///
+    /// \param [out] head_flags - array that contains the head flags.
+    /// \param [out] tail_flags - array that contains the tail flags.
+    /// \param [in] input - array that data is loaded from.
+    /// \param [in] flag_op - binary operation function object that will be used for flagging.
+    /// The signature of the function should be equivalent to the following:
+    /// <tt>bool f(const T &a, const T &b);</tt> or <tt>bool (const T& a, const T& b, unsigned int b_index);</tt>.
+    /// The signature does not need to have <tt>const &</tt>, but function object
+    /// must not modify the objects passed to it.
+    /// \param [in] storage - reference to a temporary storage object of type storage_type.
+    ///
+    /// \par Storage reusage
+    /// Synchronization barrier should be placed before \p storage is reused
+    /// or repurposed: \p __syncthreads() or \p rocprim::syncthreads().
+    ///
+    /// \par Example.
+    /// \code{.cpp}
+    /// __global__ void example_kernel(...)
+    /// {
+    ///     // specialize discontinuity for int and a block of 128 threads
+    ///     using block_discontinuity_int = rocprim::block_discontinuity<int, 128>;
+    ///     // allocate storage in shared memory
+    ///     __shared__ block_discontinuity_int::storage_type storage;
+    ///
+    ///     // segment of consecutive items to be used
+    ///     int input[8];
+    ///     ...
+    ///     int head_flags[8];
+    ///     int tail_flags[8];
+    ///     block_discontinuity_int b_discontinuity;
+    ///     using flag_op_type = typename rocprim::greater<int>;
+    ///     b_discontinuity.flag_heads_and_tails(head_flags, tail_flags, input,
+    ///                                          flag_op_type(), storage);
+    ///     ...
+    /// }
+    /// \endcode
+    template<unsigned int ItemsPerThread, class Flag, class FlagOp>
+    ROCPRIM_DEVICE ROCPRIM_INLINE
+    void flag_heads_and_tails(Flag (&head_flags)[ItemsPerThread],
+                              Flag (&tail_flags)[ItemsPerThread],
+                              const T (&input)[ItemsPerThread],
+                              FlagOp flag_op,
+                              storage_type& storage)
+    {
+        // Take a thread-private copy of the input first, in case head_flags
+        // is aliased with input.
+        T thread_items[ItemsPerThread];
+
+        ROCPRIM_UNROLL
+        for(unsigned int idx = 0; idx < ItemsPerThread; ++idx)
+        {
+            thread_items[idx] = input[idx];
+        }
+
+        // Head flags: left pass without a tile predecessor; thread_items[0]
+        // only fills the predecessor argument.
+        base_type::template apply_left</* as_flags */ true,
+                                       /* reversed */ false,
+                                       /* with_predecessor */ false>(
+            thread_items, head_flags, flag_op, thread_items[0] /*predecessor*/, storage.get().left);
+
+        // Tail flags: right pass without a tile successor; thread_items[0]
+        // only fills the successor argument.
+        base_type::template apply_right</* as_flags */ true,
+                                        /* reversed */ false,
+                                        /* with_successor */ false>(
+            thread_items, tail_flags, flag_op, thread_items[0] /*successor*/, storage.get().right);
+    }
+
+    /// \overload
+    /// This overload does not accept a reference to temporary storage, instead it is declared as
+    /// part of the function itself. Note that this does NOT decrease the shared memory requirements
+    /// of a kernel using this function.
+    template<unsigned int ItemsPerThread, class Flag, class FlagOp>
+    ROCPRIM_DEVICE ROCPRIM_FORCE_INLINE
+    void flag_heads_and_tails(Flag (&head_flags)[ItemsPerThread],
+                              Flag (&tail_flags)[ItemsPerThread],
+                              const T (&input)[ItemsPerThread],
+                              FlagOp flag_op)
+    {
+        // The temporary storage is declared inside this function and then
+        // passed to the overload that takes an explicit storage reference.
+        ROCPRIM_SHARED_MEMORY storage_type tmp_storage;
+        this->flag_heads_and_tails(head_flags, tail_flags, input, flag_op, tmp_storage);
+    }
+
+    /// \brief Tags both \p head_flags and \p tail_flags that indicate discontinuities
+    /// between items partitioned across the thread block, where the last item of the
+    /// last thread is compared against a \p tile_successor_item.
+    ///
+    /// \tparam ItemsPerThread - [inferred] the number of items to be processed by
+    /// each thread.
+    /// \tparam Flag - [inferred] the flag type.
+    /// \tparam FlagOp - [inferred] type of binary function used for flagging.
+    ///
+    /// \param [out] head_flags - array that contains the head flags.
+    /// \param [out] tail_flags - array that contains the tail flags.
+    /// \param [in] tile_successor_item - last tile item from thread to be compared
+    /// against.
+    /// \param [in] input - array that data is loaded from.
+    /// \param [in] flag_op - binary operation function object that will be used for flagging.
+    /// The signature of the function should be equivalent to the following:
+    /// <tt>bool f(const T &a, const T &b);</tt> or <tt>bool (const T& a, const T& b, unsigned int b_index);</tt>.
+    /// The signature does not need to have <tt>const &</tt>, but function object
+    /// must not modify the objects passed to it.
+    /// \param [in] storage - reference to a temporary storage object of type storage_type.
+    ///
+    /// \par Storage reusage
+    /// Synchronization barrier should be placed before \p storage is reused
+    /// or repurposed: \p __syncthreads() or \p rocprim::syncthreads().
+    ///
+    /// \par Example.
+    /// \code{.cpp}
+    /// __global__ void example_kernel(...)
+    /// {
+    ///     // specialize discontinuity for int and a block of 128 threads
+    ///     using block_discontinuity_int = rocprim::block_discontinuity<int, 128>;
+    ///     // allocate storage in shared memory
+    ///     __shared__ block_discontinuity_int::storage_type storage;
+    ///
+    ///     // segment of consecutive items to be used
+    ///     int input[8];
+    ///     int tile_item = 0;
+    ///     if (threadIdx.x == 127) // last thread of the 128-thread block
+    ///     {
+    ///         tile_item = ...
+    ///     }
+    ///     ...
+    ///     int head_flags[8];
+    ///     int tail_flags[8];
+    ///     block_discontinuity_int b_discontinuity;
+    ///     using flag_op_type = typename rocprim::greater<int>;
+    ///     b_discontinuity.flag_heads_and_tails(head_flags, tail_flags, tile_item,
+    ///                                          input, flag_op_type(),
+    ///                                          storage);
+    ///     ...
+    /// }
+    /// \endcode
+    template<unsigned int ItemsPerThread, class Flag, class FlagOp>
+    ROCPRIM_DEVICE ROCPRIM_INLINE
+    void flag_heads_and_tails(Flag (&head_flags)[ItemsPerThread],
+                              Flag (&tail_flags)[ItemsPerThread],
+                              T tile_successor_item,
+                              const T (&input)[ItemsPerThread],
+                              FlagOp flag_op,
+                              storage_type& storage)
+    {
+        // Take a thread-private copy of the input first, in case head_flags
+        // is aliased with input.
+        T thread_items[ItemsPerThread];
+
+        ROCPRIM_UNROLL
+        for(unsigned int idx = 0; idx < ItemsPerThread; ++idx)
+        {
+            thread_items[idx] = input[idx];
+        }
+
+        // Head flags: left pass without a tile predecessor; thread_items[0]
+        // only fills the predecessor argument.
+        base_type::template apply_left</* as_flags */ true,
+                                       /* reversed */ false,
+                                       /* with_predecessor */ false>(
+            thread_items, head_flags, flag_op, thread_items[0] /*predecessor*/, storage.get().left);
+
+        // Tail flags: right pass using the caller-supplied tile_successor_item.
+        base_type::template apply_right</* as_flags */ true,
+                                        /* reversed */ false,
+                                        /* with_successor */ true>(
+            thread_items, tail_flags, flag_op, tile_successor_item, storage.get().right);
+    }
+
+    /// \overload
+    /// This overload does not accept a reference to temporary storage, instead it is declared as
+    /// part of the function itself. Note that this does NOT decrease the shared memory requirements
+    /// of a kernel using this function.
+    template<unsigned int ItemsPerThread, class Flag, class FlagOp>
+    ROCPRIM_DEVICE ROCPRIM_FORCE_INLINE
+    void flag_heads_and_tails(Flag (&head_flags)[ItemsPerThread],
+                              Flag (&tail_flags)[ItemsPerThread],
+                              T tile_successor_item,
+                              const T (&input)[ItemsPerThread],
+                              FlagOp flag_op)
+    {
+        // The temporary storage is declared inside this function and then
+        // passed to the overload that takes an explicit storage reference.
+        ROCPRIM_SHARED_MEMORY storage_type tmp_storage;
+        this->flag_heads_and_tails(
+            head_flags, tail_flags, tile_successor_item, input, flag_op, tmp_storage);
+    }
+
+    /// \brief Tags both \p head_flags and \p tail_flags that indicate discontinuities
+    /// between items partitioned across the thread block, where the first item of the
+    /// first thread is compared against a \p tile_predecessor_item.
+    ///
+    /// \tparam ItemsPerThread - [inferred] the number of items to be processed by
+    /// each thread.
+    /// \tparam Flag - [inferred] the flag type.
+    /// \tparam FlagOp - [inferred] type of binary function used for flagging.
+    ///
+    /// \param [out] head_flags - array that contains the head flags.
+    /// \param [in] tile_predecessor_item - first tile item from thread to be compared
+    /// against.
+    /// \param [out] tail_flags - array that contains the tail flags.
+    /// \param [in] input - array that data is loaded from.
+    /// \param [in] flag_op - binary operation function object that will be used for flagging.
+    /// The signature of the function should be equivalent to the following:
+    /// <tt>bool f(const T &a, const T &b);</tt> or <tt>bool (const T& a, const T& b, unsigned int b_index);</tt>.
+    /// The signature does not need to have <tt>const &</tt>, but function object
+    /// must not modify the objects passed to it.
+    /// \param [in] storage - reference to a temporary storage object of type storage_type.
+    ///
+    /// \par Storage reusage
+    /// Synchronization barrier should be placed before \p storage is reused
+    /// or repurposed: \p __syncthreads() or \p rocprim::syncthreads().
+    ///
+    /// \par Example.
+    /// \code{.cpp}
+    /// __global__ void example_kernel(...)
+    /// {
+    ///     // specialize discontinuity for int and a block of 128 threads
+    ///     using block_discontinuity_int = rocprim::block_discontinuity<int, 128>;
+    ///     // allocate storage in shared memory
+    ///     __shared__ block_discontinuity_int::storage_type storage;
+    ///
+    ///     // segment of consecutive items to be used
+    ///     int input[8];
+    ///     int tile_item = 0;
+    ///     if (threadIdx.x == 0)
+    ///     {
+    ///         tile_item = ...
+    ///     }
+    ///     ...
+    ///     int head_flags[8];
+    ///     int tail_flags[8];
+    ///     block_discontinuity_int b_discontinuity;
+    ///     using flag_op_type = typename rocprim::greater<int>;
+    ///     b_discontinuity.flag_heads_and_tails(head_flags, tile_item, tail_flags,
+    ///                                          input, flag_op_type(),
+    ///                                          storage);
+    ///     ...
+    /// }
+    /// \endcode
+    template<unsigned int ItemsPerThread, class Flag, class FlagOp>
+    ROCPRIM_DEVICE ROCPRIM_INLINE
+    void flag_heads_and_tails(Flag (&head_flags)[ItemsPerThread],
+                              T tile_predecessor_item,
+                              Flag (&tail_flags)[ItemsPerThread],
+                              const T (&input)[ItemsPerThread],
+                              FlagOp flag_op,
+                              storage_type& storage)
+    {
+        // Take a thread-private copy of the input first, in case head_flags
+        // is aliased with input.
+        T thread_items[ItemsPerThread];
+
+        ROCPRIM_UNROLL
+        for(unsigned int idx = 0; idx < ItemsPerThread; ++idx)
+        {
+            thread_items[idx] = input[idx];
+        }
+
+        // Head flags: left pass using the caller-supplied tile_predecessor_item.
+        base_type::template apply_left</* as_flags */ true,
+                                       /* reversed */ false,
+                                       /* with_predecessor */ true>(
+            thread_items, head_flags, flag_op, tile_predecessor_item, storage.get().left);
+
+        // Tail flags: right pass without a tile successor; thread_items[0]
+        // only fills the successor argument.
+        base_type::template apply_right</* as_flags */ true,
+                                        /* reversed */ false,
+                                        /* with_successor */ false>(
+            thread_items, tail_flags, flag_op, thread_items[0] /*successor*/, storage.get().right);
+    }
+
+    /// \overload
+    /// This overload does not accept a reference to temporary storage, instead it is declared as
+    /// part of the function itself. Note that this does NOT decrease the shared memory requirements
+    /// of a kernel using this function.
+    template<unsigned int ItemsPerThread, class Flag, class FlagOp>
+    ROCPRIM_DEVICE ROCPRIM_FORCE_INLINE
+    void flag_heads_and_tails(Flag (&head_flags)[ItemsPerThread],
+                              T tile_predecessor_item,
+                              Flag (&tail_flags)[ItemsPerThread],
+                              const T (&input)[ItemsPerThread],
+                              FlagOp flag_op)
+    {
+        // The temporary storage is declared inside this function and then
+        // passed to the overload that takes an explicit storage reference.
+        ROCPRIM_SHARED_MEMORY storage_type tmp_storage;
+        this->flag_heads_and_tails(
+            head_flags, tile_predecessor_item, tail_flags, input, flag_op, tmp_storage);
+    }
+
+    /// \brief Tags both \p head_flags and \p tail_flags that indicate discontinuities
+    /// between items partitioned across the thread block, where the first item of the
+    /// first thread is compared against a \p tile_predecessor_item and the last item of
+    /// the last thread is compared against a \p tile_successor_item.
+    ///
+    /// \tparam ItemsPerThread - [inferred] the number of items to be processed by
+    /// each thread.
+    /// \tparam Flag - [inferred] the flag type.
+    /// \tparam FlagOp - [inferred] type of binary function used for flagging.
+    ///
+    /// \param [out] head_flags - array that contains the head flags.
+    /// \param [in] tile_predecessor_item - first tile item from thread to be compared
+    /// against.
+    /// \param [out] tail_flags - array that contains the tail flags.
+    /// \param [in] tile_successor_item - last tile item from thread to be compared
+    /// against.
+    /// \param [in] input - array that data is loaded from.
+    /// \param [in] flag_op - binary operation function object that will be used for flagging.
+    /// The signature of the function should be equivalent to the following:
+    /// <tt>bool f(const T &a, const T &b);</tt> or <tt>bool (const T& a, const T& b, unsigned int b_index);</tt>.
+    /// The signature does not need to have <tt>const &</tt>, but function object
+    /// must not modify the objects passed to it.
+    /// \param [in] storage - reference to a temporary storage object of type storage_type.
+    ///
+    /// \par Storage reusage
+    /// Synchronization barrier should be placed before \p storage is reused
+    /// or repurposed: \p __syncthreads() or \p rocprim::syncthreads().
+    ///
+    /// \par Example.
+    /// \code{.cpp}
+    /// __global__ void example_kernel(...)
+    /// {
+    ///     // specialize discontinuity for int and a block of 128 threads
+    ///     using block_discontinuity_int = rocprim::block_discontinuity<int, 128>;
+    ///     // allocate storage in shared memory
+    ///     __shared__ block_discontinuity_int::storage_type storage;
+    ///
+    ///     // segment of consecutive items to be used
+    ///     int input[8];
+    ///     int tile_predecessor_item = 0;
+    ///     int tile_successor_item = 0;
+    ///     if (threadIdx.x == 0)
+    ///     {
+    ///         tile_predecessor_item = ...
+    ///     }
+    ///     if (threadIdx.x == 127) // last thread of the 128-thread block
+    ///     {
+    ///         tile_successor_item = ...
+    ///     }
+    ///     ...
+    ///     int head_flags[8];
+    ///     int tail_flags[8];
+    ///     block_discontinuity_int b_discontinuity;
+    ///     using flag_op_type = typename rocprim::greater<int>;
+    ///     b_discontinuity.flag_heads_and_tails(head_flags, tile_predecessor_item,
+    ///                                          tail_flags, tile_successor_item,
+    ///                                          input, flag_op_type(),
+    ///                                          storage);
+    ///     ...
+    /// }
+    /// \endcode
+    template<unsigned int ItemsPerThread, class Flag, class FlagOp>
+    ROCPRIM_DEVICE ROCPRIM_INLINE
+    void flag_heads_and_tails(Flag (&head_flags)[ItemsPerThread],
+                              T tile_predecessor_item,
+                              Flag (&tail_flags)[ItemsPerThread],
+                              T tile_successor_item,
+                              const T (&input)[ItemsPerThread],
+                              FlagOp flag_op,
+                              storage_type& storage)
+    {
+        // Take a thread-private copy of the input first, in case head_flags
+        // is aliased with input.
+        T thread_items[ItemsPerThread];
+
+        ROCPRIM_UNROLL
+        for(unsigned int idx = 0; idx < ItemsPerThread; ++idx)
+        {
+            thread_items[idx] = input[idx];
+        }
+
+        // Head flags: left pass using the caller-supplied tile_predecessor_item.
+        base_type::template apply_left</* as_flags */ true,
+                                       /* reversed */ false,
+                                       /* with_predecessor */ true>(
+            thread_items, head_flags, flag_op, tile_predecessor_item, storage.get().left);
+
+        // Tail flags: right pass using the caller-supplied tile_successor_item.
+        base_type::template apply_right</* as_flags */ true,
+                                        /* reversed */ false,
+                                        /* with_successor */ true>(
+            thread_items, tail_flags, flag_op, tile_successor_item, storage.get().right);
+    }
+
+    /// \overload
+    /// This overload does not accept a reference to temporary storage, instead it is declared as
+    /// part of the function itself. Note that this does NOT decrease the shared memory requirements
+    /// of a kernel using this function.
+    template<unsigned int ItemsPerThread, class Flag, class FlagOp>
+    ROCPRIM_DEVICE ROCPRIM_FORCE_INLINE
+    void flag_heads_and_tails(Flag (&head_flags)[ItemsPerThread],
+                              T tile_predecessor_item,
+                              Flag (&tail_flags)[ItemsPerThread],
+                              T tile_successor_item,
+                              const T (&input)[ItemsPerThread],
+                              FlagOp flag_op)
+    {
+        // The temporary storage is declared inside this function and then
+        // passed to the overload that takes an explicit storage reference.
+        ROCPRIM_SHARED_MEMORY storage_type tmp_storage;
+        this->flag_heads_and_tails(head_flags,
+                                   tile_predecessor_item,
+                                   tail_flags,
+                                   tile_successor_item,
+                                   input,
+                                   flag_op,
+                                   tmp_storage);
+    }
+};
+
+END_ROCPRIM_NAMESPACE
+
+/// @}
+// end of group blockmodule
+
+#endif // ROCPRIM_BLOCK_BLOCK_DISCONTINUITY_HPP_
--- a/3rdparty/cub/rocprim/block/block_exchange.hpp
+++ b/3rdparty/cub/rocprim/block/block_exchange.hpp
+// Copyright (c) 2017-2021 Advanced Micro Devices, Inc. All rights reserved.
+//
+// Permission is hereby granted, free of charge, to any person obtaining a copy
+// of this software and associated documentation files (the "Software"), to deal
+// in the Software without restriction, including without limitation the rights
+// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+// copies of the Software, and to permit persons to whom the Software is
+// furnished to do so, subject to the following conditions:
+//
+// The above copyright notice and this permission notice shall be included in
+// all copies or substantial portions of the Software.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+// THE SOFTWARE.
+
+#ifndef ROCPRIM_BLOCK_BLOCK_EXCHANGE_HPP_
+#define ROCPRIM_BLOCK_BLOCK_EXCHANGE_HPP_
+
+#include "../config.hpp"
+#include "../detail/various.hpp"
+
+#include "../intrinsics.hpp"
+#include "../functional.hpp"
+#include "../types.hpp"
+
+/// \addtogroup blockmodule
+/// @{
+
+BEGIN_ROCPRIM_NAMESPACE
+
+/// \brief The \p block_exchange class is a block level parallel primitive which provides
+/// methods for rearranging items partitioned across threads in a block.
+///
+/// \tparam T - the input type.
+/// \tparam BlockSize - the number of threads in a block.
+/// \tparam ItemsPerThread - the number of items contributed by each thread.
+///
+/// \par Overview
+/// * The \p block_exchange class supports the following rearrangement methods:
+///   * Transposing a blocked arrangement to a striped arrangement.
+///   * Transposing a striped arrangement to a blocked arrangement.
+///   * Transposing a blocked arrangement to a warp-striped arrangement.
+///   * Transposing a warp-striped arrangement to a blocked arrangement.
+///   * Scattering items to a blocked arrangement.
+///   * Scattering items to a striped arrangement.
+/// * Data is automatically padded to ensure zero bank conflicts.
+///
+/// \par Examples
+/// \parblock
+/// In the examples exchange operation is performed on block of 128 threads, using type
+/// \p int with 8 items per thread.
+///
+/// \code{.cpp}
+/// __global__ void example_kernel(...)
+/// {
+///     // specialize block_exchange for int, block of 128 threads and 8 items per thread
+///     using block_exchange_int = rocprim::block_exchange<int, 128, 8>;
+///     // allocate storage in shared memory
+///     __shared__ block_exchange_int::storage_type storage;
+///
+///     int items[8];
+///     ...
+///     block_exchange_int b_exchange;
+///     b_exchange.blocked_to_striped(items, items, storage);
+///     ...
+/// }
+/// \endcode
+/// \endparblock
+template<
+    class T,
+    unsigned int BlockSizeX,
+    unsigned int ItemsPerThread,
+    unsigned int BlockSizeY = 1,
+    unsigned int BlockSizeZ = 1
+>
+class block_exchange
+{
+    static constexpr unsigned int BlockSize = BlockSizeX * BlockSizeY * BlockSizeZ; // total number of threads in the block
+    // Warp size used by this primitive: get_min_warp_size() of BlockSize and the device warp size
+    static constexpr unsigned int warp_size =
+        detail::get_min_warp_size(BlockSize, ::rocprim::device_warp_size());
+    // Number of warps in block (BlockSize / warp_size, rounded up)
+    static constexpr unsigned int warps_no = (BlockSize + warp_size - 1) / warp_size;
+
+    // Minimize LDS bank conflicts for power-of-two strides, i.e. when items are accessed
+    // using the `thread_id * ItemsPerThread` pattern where ItemsPerThread is a power of two
+    // (all exchanges from/to blocked).
+    static constexpr bool has_bank_conflicts =
+        ItemsPerThread >= 2 && ::rocprim::detail::is_power_of_two(ItemsPerThread);
+    static constexpr unsigned int banks_no = ::rocprim::detail::get_lds_banks_no();
+    static constexpr unsigned int bank_conflicts_padding =
+        has_bank_conflicts ? (BlockSize * ItemsPerThread / banks_no) : 0; // extra buffer elements; 0 when has_bank_conflicts is false
+
+    // Layout of this primitive's temporary storage; wrapped in detail::raw_storage to form the public storage_type.
+    struct storage_type_
+    {
+        T buffer[BlockSize * ItemsPerThread + bank_conflicts_padding]; // one element per item in the block, plus padding
+    };
+
+public:
+
+    /// \brief Struct used to allocate a temporary memory that is required for thread
+    /// communication during operations provided by related parallel primitive.
+    ///
+    /// Depending on the implementation the operations exposed by parallel primitive may
+    /// require a temporary storage for thread communication. The storage should be allocated
+    /// using keywords <tt>__shared__</tt>. It can be aliased to
+    /// an externally allocated memory, or be a part of a union type with other storage types
+    /// to increase shared memory reusability.
+    #ifndef DOXYGEN_SHOULD_SKIP_THIS // hides storage_type implementation for Doxygen
+    // Actual definition: storage_type_ wrapped in detail::raw_storage. The member functions
+    // of this class obtain the underlying storage_type_ through storage.get().
+    using storage_type = detail::raw_storage<storage_type_>;
+    #else
+    using storage_type = storage_type_; // only for Doxygen
+    #endif
+
+    /// \brief Transposes a blocked arrangement of items to a striped arrangement
+    /// across the thread block.
+    ///
+    /// \tparam U - [inferred] the output type.
+    ///
+    /// \param [in] input - array that data is loaded from.
+    /// \param [out] output - array that data is loaded to.
+    template<class U>
+    ROCPRIM_DEVICE ROCPRIM_FORCE_INLINE
+    void blocked_to_striped(const T (&input)[ItemsPerThread],
+                            U (&output)[ItemsPerThread])
+    {
+        // Convenience overload: declares its own scratch buffer with ROCPRIM_SHARED_MEMORY
+        // and forwards to the overload that takes the storage explicitly.
+        ROCPRIM_SHARED_MEMORY storage_type shared_storage;
+        blocked_to_striped(input, output, shared_storage);
+    }
+
+    /// \brief Transposes a blocked arrangement of items to a striped arrangement
+    /// across the thread block, using temporary storage.
+    ///
+    /// \tparam U - [inferred] the output type.
+    ///
+    /// \param [in] input - array that data is loaded from.
+    /// \param [out] output - array that data is loaded to.
+    /// \param [in] storage - reference to a temporary storage object of type storage_type.
+    ///
+    /// \par Storage reusage
+    /// Synchronization barrier should be placed before \p storage is reused
+    /// or repurposed: \p __syncthreads() or \p rocprim::syncthreads().
+    ///
+    /// \par Example.
+    /// \code{.cpp}
+    /// __global__ void example_kernel(...)
+    /// {
+    ///     // specialize block_exchange for int, block of 128 threads and 8 items per thread
+    ///     using block_exchange_int = rocprim::block_exchange<int, 128, 8>;
+    ///     // allocate storage in shared memory
+    ///     __shared__ block_exchange_int::storage_type storage;
+    ///
+    ///     int items[8];
+    ///     ...
+    ///     block_exchange_int b_exchange;
+    ///     b_exchange.blocked_to_striped(items, items, storage);
+    ///     ...
+    /// }
+    /// \endcode
+    template<class U>
+    ROCPRIM_DEVICE ROCPRIM_INLINE
+    void blocked_to_striped(const T (&input)[ItemsPerThread],
+                            U (&output)[ItemsPerThread],
+                            storage_type& storage)
+    {
+        storage_type_& smem = storage.get();
+        const unsigned int tid
+            = ::rocprim::flat_block_thread_id<BlockSizeX, BlockSizeY, BlockSizeZ>();
+        // First slot of this thread's contiguous (blocked) run of items.
+        const unsigned int blocked_base = tid * ItemsPerThread;
+
+        // Phase 1: stage the thread's items at their blocked positions.
+        for(unsigned int item = 0; item < ItemsPerThread; ++item)
+        {
+            smem.buffer[index(blocked_base + item)] = input[item];
+        }
+
+        // Barrier between the write phase and the read phase.
+        ::rocprim::syncthreads();
+
+        // Phase 2: read back with a stride of BlockSize (striped positions).
+        for(unsigned int item = 0; item < ItemsPerThread; ++item)
+        {
+            output[item] = smem.buffer[index(item * BlockSize + tid)];
+        }
+    }
+
+    /// \brief Transposes a striped arrangement of items to a blocked arrangement
+    /// across the thread block.
+    ///
+    /// \tparam U - [inferred] the output type.
+    ///
+    /// \param [in] input - array that data is loaded from.
+    /// \param [out] output - array that data is loaded to.
+    template<class U>
+    ROCPRIM_DEVICE ROCPRIM_FORCE_INLINE
+    void striped_to_blocked(const T (&input)[ItemsPerThread],
+                            U (&output)[ItemsPerThread])
+    {
+        // Convenience overload: declares its own scratch buffer with ROCPRIM_SHARED_MEMORY
+        // and forwards to the overload that takes the storage explicitly.
+        ROCPRIM_SHARED_MEMORY storage_type shared_storage;
+        striped_to_blocked(input, output, shared_storage);
+    }
+
+    /// \brief Transposes a striped arrangement of items to a blocked arrangement
+    /// across the thread block, using temporary storage.
+    ///
+    /// \tparam U - [inferred] the output type.
+    ///
+    /// \param [in] input - array that data is loaded from.
+    /// \param [out] output - array that data is loaded to.
+    /// \param [in] storage - reference to a temporary storage object of type storage_type.
+    ///
+    /// \par Storage reusage
+    /// Synchronization barrier should be placed before \p storage is reused
+    /// or repurposed: \p __syncthreads() or \p rocprim::syncthreads().
+    ///
+    /// \par Example.
+    /// \code{.cpp}
+    /// __global__ void example_kernel(...)
+    /// {
+    ///     // specialize block_exchange for int, block of 128 threads and 8 items per thread
+    ///     using block_exchange_int = rocprim::block_exchange<int, 128, 8>;
+    ///     // allocate storage in shared memory
+    ///     __shared__ block_exchange_int::storage_type storage;
+    ///
+    ///     int items[8];
+    ///     ...
+    ///     block_exchange_int b_exchange;
+    ///     b_exchange.striped_to_blocked(items, items, storage);
+    ///     ...
+    /// }
+    /// \endcode
+    template<class U>
+    ROCPRIM_DEVICE ROCPRIM_INLINE
+    void striped_to_blocked(const T (&input)[ItemsPerThread],
+                            U (&output)[ItemsPerThread],
+                            storage_type& storage)
+    {
+        storage_type_& smem = storage.get();
+        const unsigned int tid
+            = ::rocprim::flat_block_thread_id<BlockSizeX, BlockSizeY, BlockSizeZ>();
+        // First slot of this thread's contiguous (blocked) run of items.
+        const unsigned int blocked_base = tid * ItemsPerThread;
+
+        // Phase 1: stage the thread's items at their striped positions (stride BlockSize).
+        for(unsigned int item = 0; item < ItemsPerThread; ++item)
+        {
+            smem.buffer[index(item * BlockSize + tid)] = input[item];
+        }
+
+        // Barrier between the write phase and the read phase.
+        ::rocprim::syncthreads();
+
+        // Phase 2: read back the contiguous run that belongs to this thread.
+        for(unsigned int item = 0; item < ItemsPerThread; ++item)
+        {
+            output[item] = smem.buffer[index(blocked_base + item)];
+        }
+    }
+
+    /// \brief Transposes a blocked arrangement of items to a warp-striped arrangement
+    /// across the thread block.
+    ///
+    /// \tparam U - [inferred] the output type.
+    ///
+    /// \param [in] input - array that data is loaded from.
+    /// \param [out] output - array that data is loaded to.
+    template<class U>
+    ROCPRIM_DEVICE ROCPRIM_FORCE_INLINE
+    void blocked_to_warp_striped(const T (&input)[ItemsPerThread],
+                                 U (&output)[ItemsPerThread])
+    {
+        // Convenience overload: declares its own scratch buffer with ROCPRIM_SHARED_MEMORY
+        // and forwards to the overload that takes the storage explicitly.
+        ROCPRIM_SHARED_MEMORY storage_type shared_storage;
+        blocked_to_warp_striped(input, output, shared_storage);
+    }
+
+    /// \brief Transposes a blocked arrangement of items to a warp-striped arrangement
+    /// across the thread block, using temporary storage.
+    ///
+    /// \tparam U - [inferred] the output type.
+    ///
+    /// \param [in] input - array that data is loaded from.
+    /// \param [out] output - array that data is loaded to.
+    /// \param [in] storage - reference to a temporary storage object of type storage_type.
+    ///
+    /// \par Storage reusage
+    /// Synchronization barrier should be placed before \p storage is reused
+    /// or repurposed: \p __syncthreads() or \p rocprim::syncthreads().
+    ///
+    /// \par Example.
+    /// \code{.cpp}
+    /// __global__ void example_kernel(...)
+    /// {
+    ///     // specialize block_exchange for int, block of 128 threads and 8 items per thread
+    ///     using block_exchange_int = rocprim::block_exchange<int, 128, 8>;
+    ///     // allocate storage in shared memory
+    ///     __shared__ block_exchange_int::storage_type storage;
+    ///
+    ///     int items[8];
+    ///     ...
+    ///     block_exchange_int b_exchange;
+    ///     b_exchange.blocked_to_warp_striped(items, items, storage);
+    ///     ...
+    /// }
+    /// \endcode
+    template<class U>
+    ROCPRIM_DEVICE ROCPRIM_INLINE
+    void blocked_to_warp_striped(const T (&input)[ItemsPerThread],
+                                 U (&output)[ItemsPerThread],
+                                 storage_type& storage)
+    {
+        storage_type_& smem = storage.get();
+
+        const unsigned int lane = ::rocprim::lane_id();
+        const unsigned int warp = ::rocprim::warp_id<BlockSizeX, BlockSizeY, BlockSizeZ>();
+        // Stride of the striped layout inside this warp (smaller for a partial last warp).
+        const unsigned int stride = get_current_warp_size();
+
+        // Each warp owns a region sized for a full warp of items.
+        constexpr unsigned int region_size = warp_size * ItemsPerThread;
+        const unsigned int region_begin = warp * region_size;
+        // First slot of this lane's contiguous (blocked) run inside the warp's region.
+        const unsigned int blocked_base = region_begin + lane * ItemsPerThread;
+
+        // Phase 1: stage the lane's items at their blocked positions.
+        for(unsigned int item = 0; item < ItemsPerThread; ++item)
+        {
+            smem.buffer[index(blocked_base + item)] = input[item];
+        }
+
+        // Both phases only touch this warp's own region, so wave_barrier() is used here
+        // instead of syncthreads().
+        ::rocprim::wave_barrier();
+
+        // Phase 2: read back at warp-striped positions.
+        for(unsigned int item = 0; item < ItemsPerThread; ++item)
+        {
+            output[item] = smem.buffer[index(region_begin + item * stride + lane)];
+        }
+    }
+
+    /// \brief Transposes a warp-striped arrangement of items to a blocked arrangement
+    /// across the thread block.
+    ///
+    /// \tparam U - [inferred] the output type.
+    ///
+    /// \param [in] input - array that data is loaded from.
+    /// \param [out] output - array that data is loaded to.
+    template<class U>
+    ROCPRIM_DEVICE ROCPRIM_FORCE_INLINE
+    void warp_striped_to_blocked(const T (&input)[ItemsPerThread],
+                                 U (&output)[ItemsPerThread])
+    {
+        // Convenience overload: declares its own scratch buffer with ROCPRIM_SHARED_MEMORY
+        // and forwards to the overload that takes the storage explicitly.
+        ROCPRIM_SHARED_MEMORY storage_type shared_storage;
+        warp_striped_to_blocked(input, output, shared_storage);
+    }
+
+    /// \brief Transposes a warp-striped arrangement of items to a blocked arrangement
+    /// across the thread block, using temporary storage.
+    ///
+    /// \tparam U - [inferred] the output type.
+    ///
+    /// \param [in] input - array that data is loaded from.
+    /// \param [out] output - array that data is loaded to.
+    /// \param [in] storage - reference to a temporary storage object of type storage_type.
+    ///
+    /// \par Storage reusage
+    /// Synchronization barrier should be placed before \p storage is reused
+    /// or repurposed: \p __syncthreads() or \p rocprim::syncthreads().
+    ///
+    /// \par Example.
+    /// \code{.cpp}
+    /// __global__ void example_kernel(...)
+    /// {
+    ///     // specialize block_exchange for int, block of 128 threads and 8 items per thread
+    ///     using block_exchange_int = rocprim::block_exchange<int, 128, 8>;
+    ///     // allocate storage in shared memory
+    ///     __shared__ block_exchange_int::storage_type storage;
+    ///
+    ///     int items[8];
+    ///     ...
+    ///     block_exchange_int b_exchange;
+    ///     b_exchange.warp_striped_to_blocked(items, items, storage);
+    ///     ...
+    /// }
+    /// \endcode
+    template<class U>
+    ROCPRIM_DEVICE ROCPRIM_INLINE
+    void warp_striped_to_blocked(const T (&input)[ItemsPerThread],
+                                 U (&output)[ItemsPerThread],
+                                 storage_type& storage)
+    {
+        storage_type_& smem = storage.get();
+
+        const unsigned int lane = ::rocprim::lane_id();
+        const unsigned int warp = ::rocprim::warp_id<BlockSizeX, BlockSizeY, BlockSizeZ>();
+        // Stride of the striped layout inside this warp (smaller for a partial last warp).
+        const unsigned int stride = get_current_warp_size();
+
+        // Each warp owns a region sized for a full warp of items.
+        constexpr unsigned int region_size = warp_size * ItemsPerThread;
+        const unsigned int region_begin = warp * region_size;
+        // First slot of this lane's contiguous (blocked) run inside the warp's region.
+        const unsigned int blocked_base = region_begin + lane * ItemsPerThread;
+
+        // Phase 1: stage the lane's items at their warp-striped positions.
+        for(unsigned int item = 0; item < ItemsPerThread; ++item)
+        {
+            smem.buffer[index(region_begin + item * stride + lane)] = input[item];
+        }
+
+        // Both phases only touch this warp's own region, so wave_barrier() is used here
+        // instead of syncthreads().
+        ::rocprim::wave_barrier();
+
+        // Phase 2: read back the contiguous run that belongs to this lane.
+        for(unsigned int item = 0; item < ItemsPerThread; ++item)
+        {
+            output[item] = smem.buffer[index(blocked_base + item)];
+        }
+    }
+
+    /// \brief Scatters items to a blocked arrangement based on their ranks
+    /// across the thread block.
+    ///
+    /// \tparam U - [inferred] the output type.
+    /// \tparam Offset - [inferred] the rank type.
+    ///
+    /// \param [in] input - array that data is loaded from.
+    /// \param [out] output - array that data is loaded to.
+    /// \param [in] ranks - array that has rank of data.
+    template<class U, class Offset>
+    ROCPRIM_DEVICE ROCPRIM_FORCE_INLINE
+    void scatter_to_blocked(const T (&input)[ItemsPerThread],
+                            U (&output)[ItemsPerThread],
+                            const Offset (&ranks)[ItemsPerThread])
+    {
+        // Convenience overload: declares its own scratch buffer with ROCPRIM_SHARED_MEMORY
+        // and forwards to the overload that takes the storage explicitly.
+        ROCPRIM_SHARED_MEMORY storage_type shared_storage;
+        scatter_to_blocked(input, output, ranks, shared_storage);
+    }
+
+    /// \brief Gathers items from a striped arrangement based on their ranks
+    /// across the thread block.
+    ///
+    /// \tparam U - [inferred] the output type.
+    /// \tparam Offset - [inferred] the rank type.
+    ///
+    /// \param [in] input - array that data is loaded from.
+    /// \param [out] output - array that data is loaded to.
+    /// \param [in] ranks - array that has rank of data.
+    template<class U, class Offset>
+    ROCPRIM_DEVICE ROCPRIM_FORCE_INLINE
+    void gather_from_striped(const T (&input)[ItemsPerThread],
+                             U (&output)[ItemsPerThread],
+                             const Offset (&ranks)[ItemsPerThread])
+    {
+        // Convenience overload: declares its own scratch buffer with ROCPRIM_SHARED_MEMORY
+        // and forwards to the overload that takes the storage explicitly.
+        ROCPRIM_SHARED_MEMORY storage_type shared_storage;
+        gather_from_striped(input, output, ranks, shared_storage);
+    }
+
+    /// \brief Scatters items to a blocked arrangement based on their ranks
+    /// across the thread block, using temporary storage.
+    ///
+    /// \tparam U - [inferred] the output type.
+    /// \tparam Offset - [inferred] the rank type.
+    ///
+    /// \param [in] input - array that data is loaded from.
+    /// \param [out] output - array that data is loaded to.
+    /// \param [in] ranks - array that has rank of data.
+    /// \param [in] storage - reference to a temporary storage object of type storage_type.
+    ///
+    /// \par Storage reusage
+    /// Synchronization barrier should be placed before \p storage is reused
+    /// or repurposed: \p __syncthreads() or \p rocprim::syncthreads().
+    ///
+    /// \par Example.
+    /// \code{.cpp}
+    /// __global__ void example_kernel(...)
+    /// {
+    ///     // specialize block_exchange for int, block of 128 threads and 8 items per thread
+    ///     using block_exchange_int = rocprim::block_exchange<int, 128, 8>;
+    ///     // allocate storage in shared memory
+    ///     __shared__ block_exchange_int::storage_type storage;
+    ///
+    ///     int items[8];
+    ///     int ranks[8];
+    ///     ...
+    ///     block_exchange_int b_exchange;
+    ///     b_exchange.scatter_to_blocked(items, items, ranks, storage);
+    ///     ...
+    /// }
+    /// \endcode
+    template<class U, class Offset>
+    ROCPRIM_DEVICE ROCPRIM_INLINE
+    void scatter_to_blocked(const T (&input)[ItemsPerThread],
+                            U (&output)[ItemsPerThread],
+                            const Offset (&ranks)[ItemsPerThread],
+                            storage_type& storage)
+    {
+        storage_type_& smem = storage.get();
+        const unsigned int tid
+            = ::rocprim::flat_block_thread_id<BlockSizeX, BlockSizeY, BlockSizeZ>();
+        // First slot of this thread's contiguous (blocked) run of items.
+        const unsigned int blocked_base = tid * ItemsPerThread;
+
+        // Phase 1: place every item at the slot named by its rank.
+        for(unsigned int item = 0; item < ItemsPerThread; ++item)
+        {
+            const Offset destination = ranks[item];
+            smem.buffer[index(destination)] = input[item];
+        }
+
+        // Barrier between the write phase and the read phase.
+        ::rocprim::syncthreads();
+
+        // Phase 2: read back the contiguous run that belongs to this thread.
+        for(unsigned int item = 0; item < ItemsPerThread; ++item)
+        {
+            output[item] = smem.buffer[index(blocked_base + item)];
+        }
+    }
+
+    /// \brief Gathers items from a striped arrangement based on their ranks
+    /// across the thread block, using temporary storage.
+    ///
+    /// \tparam U - [inferred] the output type.
+    /// \tparam Offset - [inferred] the rank type.
+    ///
+    /// \param [in] input - array that data is loaded from.
+    /// \param [out] output - array that data is loaded to.
+    /// \param [in] ranks - array that has rank of data.
+    /// \param [in] storage - reference to a temporary storage object of type storage_type.
+    template <class U, class Offset>
+    ROCPRIM_DEVICE ROCPRIM_INLINE
+    void gather_from_striped(const T (&input)[ItemsPerThread],
+                             U (&output)[ItemsPerThread],
+                             const Offset (&ranks)[ItemsPerThread],
+                             storage_type& storage)
+    {
+        storage_type_& smem = storage.get();
+        const unsigned int tid
+            = ::rocprim::flat_block_thread_id<BlockSizeX, BlockSizeY, BlockSizeZ>();
+
+        // Phase 1: stage the thread's items at their striped positions (stride BlockSize).
+        for(unsigned int item = 0; item < ItemsPerThread; ++item)
+        {
+            smem.buffer[index(item * BlockSize + tid)] = input[item];
+        }
+
+        // Barrier between the write phase and the read phase.
+        ::rocprim::syncthreads();
+
+        // Phase 2: fetch, for every output slot, the item stored at the requested rank.
+        for(unsigned int item = 0; item < ItemsPerThread; ++item)
+        {
+            const Offset source = ranks[item];
+            output[item] = smem.buffer[index(source)];
+        }
+    }
+
+    /// \brief Scatters items to a striped arrangement based on their ranks
+    /// across the thread block.
+    ///
+    /// \tparam U - [inferred] the output type.
+    /// \tparam Offset - [inferred] the rank type.
+    ///
+    /// \param [in] input - array that data is loaded from.
+    /// \param [out] output - array that data is loaded to.
+    /// \param [in] ranks - array that has rank of data.
+    template<class U, class Offset>
+    ROCPRIM_DEVICE ROCPRIM_FORCE_INLINE
+    void scatter_to_striped(const T (&input)[ItemsPerThread],
+                            U (&output)[ItemsPerThread],
+                            const Offset (&ranks)[ItemsPerThread])
+    {
+        // Convenience overload: declares its own scratch buffer with ROCPRIM_SHARED_MEMORY
+        // and forwards to the overload that takes the storage explicitly.
+        ROCPRIM_SHARED_MEMORY storage_type shared_storage;
+        scatter_to_striped(input, output, ranks, shared_storage);
+    }
+
+    /// \brief Scatters items to a striped arrangement based on their ranks
+    /// across the thread block, using temporary storage.
+    ///
+    /// \tparam U - [inferred] the output type.
+    /// \tparam Offset - [inferred] the rank type.
+    ///
+    /// \param [in] input - array that data is loaded from.
+    /// \param [out] output - array that data is loaded to.
+    /// \param [in] ranks - array that has rank of data.
+    /// \param [in] storage - reference to a temporary storage object of type storage_type.
+    ///
+    /// \par Storage reusage
+    /// Synchronization barrier should be placed before \p storage is reused
+    /// or repurposed: \p __syncthreads() or \p rocprim::syncthreads().
+    ///
+    /// \par Example.
+    /// \code{.cpp}
+    /// __global__ void example_kernel(...)
+    /// {
+    ///     // specialize block_exchange for int, block of 128 threads and 8 items per thread
+    ///     using block_exchange_int = rocprim::block_exchange<int, 128, 8>;
+    ///     // allocate storage in shared memory
+    ///     __shared__ block_exchange_int::storage_type storage;
+    ///
+    ///     int items[8];
+    ///     int ranks[8];
+    ///     ...
+    ///     block_exchange_int b_exchange;
+    ///     b_exchange.scatter_to_striped(items, items, ranks, storage);
+    ///     ...
+    /// }
+    /// \endcode
+    template<class U, class Offset>
+    ROCPRIM_DEVICE ROCPRIM_INLINE
+    void scatter_to_striped(const T (&input)[ItemsPerThread],
+                            U (&output)[ItemsPerThread],
+                            const Offset (&ranks)[ItemsPerThread],
+                            storage_type& storage)
+    {
+        storage_type_& smem = storage.get();
+        const unsigned int tid
+            = ::rocprim::flat_block_thread_id<BlockSizeX, BlockSizeY, BlockSizeZ>();
+
+        // Phase 1: place every item at the slot named by its rank. Unlike the exchanges
+        // from/to blocked, this function addresses the buffer directly in both phases,
+        // without the index() remapping.
+        for(unsigned int item = 0; item < ItemsPerThread; ++item)
+        {
+            const Offset destination = ranks[item];
+            smem.buffer[destination] = input[item];
+        }
+
+        // Barrier between the write phase and the read phase.
+        ::rocprim::syncthreads();
+
+        // Phase 2: read back with a stride of BlockSize (striped positions).
+        for(unsigned int item = 0; item < ItemsPerThread; ++item)
+        {
+            output[item] = smem.buffer[item * BlockSize + tid];
+        }
+    }
+
+    /// \brief Scatters items to a striped arrangement based on their ranks
+    /// across the thread block, guarded by rank.
+    ///
+    /// \par Overview
+    /// * Items with rank -1 are not scattered.
+    ///
+    /// \tparam U - [inferred] the output type.
+    /// \tparam Offset - [inferred] the rank type.
+    ///
+    /// \param [in] input - array that data is loaded from.
+    /// \param [out] output - array that data is loaded to.
+    /// \param [in] ranks - array that has rank of data.
+    template<class U, class Offset>
+    ROCPRIM_DEVICE ROCPRIM_FORCE_INLINE
+    void scatter_to_striped_guarded(const T (&input)[ItemsPerThread],
+                                    U (&output)[ItemsPerThread],
+                                    const Offset (&ranks)[ItemsPerThread])
+    {
+        // Convenience overload: declares its own scratch buffer with ROCPRIM_SHARED_MEMORY
+        // and forwards to the overload that takes the storage explicitly.
+        ROCPRIM_SHARED_MEMORY storage_type shared_storage;
+        scatter_to_striped_guarded(input, output, ranks, shared_storage);
+    }
+
+    /// \brief Scatters items to a striped arrangement based on their ranks
+    /// across the thread block, guarded by rank, using temporary storage.
+    ///
+    /// \par Overview
+    /// * Items with rank -1 are not scattered.
+    ///
+    /// \tparam U - [inferred] the output type.
+    /// \tparam Offset - [inferred] the rank type.
+    ///
+    /// \param [in] input - array that data is loaded from.
+    /// \param [out] output - array that data is loaded to.
+    /// \param [in] ranks - array that has rank of data.
+    /// \param [in] storage - reference to a temporary storage object of type storage_type.
+    ///
+    /// \par Storage reusage
+    /// Synchronization barrier should be placed before \p storage is reused
+    /// or repurposed: \p __syncthreads() or \p rocprim::syncthreads().
+    ///
+    /// \par Example.
+    /// \code{.cpp}
+    /// __global__ void example_kernel(...)
+    /// {
+    ///     // specialize block_exchange for int, block of 128 threads and 8 items per thread
+    ///     using block_exchange_int = rocprim::block_exchange<int, 128, 8>;
+    ///     // allocate storage in shared memory
+    ///     __shared__ block_exchange_int::storage_type storage;
+    ///
+    ///     int items[8];
+    ///     int ranks[8];
+    ///     ...
+    ///     block_exchange_int b_exchange;
+    ///     b_exchange.scatter_to_striped_guarded(items, items, ranks, storage);
+    ///     ...
+    /// }
+    /// \endcode
+    template<class U, class Offset>
+    ROCPRIM_DEVICE ROCPRIM_INLINE
+    void scatter_to_striped_guarded(const T (&input)[ItemsPerThread],
+                                    U (&output)[ItemsPerThread],
+                                    const Offset (&ranks)[ItemsPerThread],
+                                    storage_type& storage)
+    {
+        const unsigned int flat_id = ::rocprim::flat_block_thread_id<BlockSizeX, BlockSizeY, BlockSizeZ>();
+        storage_type_& storage_ = storage.get();
+
+        // Phase 1: place every item whose rank passes the guard at the slot named by its
+        // rank. The buffer is addressed directly (no index() remapping) in both phases.
+        for(unsigned int i = 0; i < ItemsPerThread; i++)
+        {
+            const Offset rank = ranks[i];
+            // `rank >= 0` rejects every negative rank when Offset is signed. When Offset is
+            // unsigned that test is always true and the sentinel -1 arrives as the maximum
+            // value of the type, so the sentinel is also compared explicitly; otherwise it
+            // would be used as an (out-of-bounds) buffer index.
+            if(rank >= 0 && rank != static_cast<Offset>(-1))
+            {
+                storage_.buffer[rank] = input[i];
+            }
+        }
+        // Barrier between the write phase and the read phase.
+        ::rocprim::syncthreads();
+
+        // Phase 2: read back with a stride of BlockSize (striped positions). Slots that no
+        // item was scattered to are read as well and return whatever the storage held.
+        for(unsigned int i = 0; i < ItemsPerThread; i++)
+        {
+            output[i] = storage_.buffer[i * BlockSize + flat_id];
+        }
+    }
+
+    /// \brief Scatters items to a striped arrangement based on their ranks
+    /// across the thread block, with a flag to denote validity.
+    ///
+    /// \tparam U - [inferred] the output type.
+    /// \tparam Offset - [inferred] the rank type.
+    /// \tparam ValidFlag - [inferred] the validity flag type.
+    ///
+    /// \param [in] input - array that data is loaded from.
+    /// \param [out] output - array that data is loaded to.
+    /// \param [in] ranks - array that has rank of data.
+    /// \param [in] is_valid - array that has flags to denote validity.
+    template<class U, class Offset, class ValidFlag>
+    ROCPRIM_DEVICE ROCPRIM_FORCE_INLINE
+    void scatter_to_striped_flagged(const T (&input)[ItemsPerThread],
+                                    U (&output)[ItemsPerThread],
+                                    const Offset (&ranks)[ItemsPerThread],
+                                    const ValidFlag (&is_valid)[ItemsPerThread])
+    {
+        // Convenience overload: declares its own scratch buffer with ROCPRIM_SHARED_MEMORY
+        // and forwards to the overload that takes the storage explicitly.
+        ROCPRIM_SHARED_MEMORY storage_type shared_storage;
+        scatter_to_striped_flagged(input, output, ranks, is_valid, shared_storage);
+    }
+
+    /// \brief Scatters items to a striped arrangement based on their ranks
+    /// across the thread block, with a flag to denote validity, using temporary
+    /// storage.
+    ///
+    /// \tparam U - [inferred] the output type.
+    /// \tparam Offset - [inferred] the rank type.
+    /// \tparam ValidFlag - [inferred] the validity flag type.
+    ///
+    /// \param [in] input - array that data is loaded from.
+    /// \param [out] output - array that data is loaded to.
+    /// \param [in] ranks - array that has rank of data.
+    /// \param [in] is_valid - array that has flags to denote validity.
+    /// \param [in] storage - reference to a temporary storage object of type storage_type.
+    ///
+    /// \par Storage reusage
+    /// Synchronization barrier should be placed before \p storage is reused
+    /// or repurposed: \p __syncthreads() or \p rocprim::syncthreads().
+    ///
+    /// \par Example.
+    /// \code{.cpp}
+    /// __global__ void example_kernel(...)
+    /// {
+    ///     // specialize block_exchange for int, block of 128 threads and 8 items per thread
+    ///     using block_exchange_int = rocprim::block_exchange<int, 128, 8>;
+    ///     // allocate storage in shared memory
+    ///     __shared__ block_exchange_int::storage_type storage;
+    ///
+    ///     int items[8];
+    ///     int ranks[8];
+    ///     int flags[8];
+    ///     ...
+    ///     block_exchange_int b_exchange;
+    ///     b_exchange.scatter_to_striped_flagged(items, items, ranks, flags, storage);
+    ///     ...
+    /// }
+    /// \endcode
+    template<class U, class Offset, class ValidFlag>
+    ROCPRIM_DEVICE ROCPRIM_INLINE
+    void scatter_to_striped_flagged(const T (&input)[ItemsPerThread],
+                                    U (&output)[ItemsPerThread],
+                                    const Offset (&ranks)[ItemsPerThread],
+                                    const ValidFlag (&is_valid)[ItemsPerThread],
+                                    storage_type& storage)
+    {
+        storage_type_& smem = storage.get();
+        const unsigned int tid
+            = ::rocprim::flat_block_thread_id<BlockSizeX, BlockSizeY, BlockSizeZ>();
+
+        // Phase 1: place every item flagged as valid at the slot named by its rank; items
+        // whose flag is false are not written. The buffer is addressed directly (no index()
+        // remapping) in both phases.
+        for(unsigned int item = 0; item < ItemsPerThread; ++item)
+        {
+            const Offset destination = ranks[item];
+            if(is_valid[item])
+            {
+                smem.buffer[destination] = input[item];
+            }
+        }
+
+        // Barrier between the write phase and the read phase.
+        ::rocprim::syncthreads();
+
+        // Phase 2: read back with a stride of BlockSize (striped positions).
+        for(unsigned int item = 0; item < ItemsPerThread; ++item)
+        {
+            output[item] = smem.buffer[item * BlockSize + tid];
+        }
+    }
+
+private:
+
+    // Returns the number of threads in the calling thread's warp: warp_size for every warp
+    // except the last one, which holds only the remainder when BlockSize is not a multiple
+    // of warp_size.
+    ROCPRIM_DEVICE ROCPRIM_INLINE
+    unsigned int get_current_warp_size() const
+    {
+        // Size of the last warp: the remainder if there is one, otherwise a full warp.
+        constexpr unsigned int last_warp_size
+            = (BlockSize % warp_size > 0) ? (BlockSize % warp_size) : warp_size;
+
+        const bool in_last_warp
+            = ::rocprim::warp_id<BlockSizeX, BlockSizeY, BlockSizeZ>() == warps_no - 1;
+        if(in_last_warp)
+        {
+            return last_warp_size;
+        }
+        return warp_size;
+    }
+
+    // Change index to minimize LDS bank conflicts if necessary
+    ROCPRIM_DEVICE ROCPRIM_INLINE
+    unsigned int index(unsigned int n)
+    {
+        // No remapping is needed when the access pattern is not conflict-prone.
+        if(!has_bank_conflicts)
+        {
+            return n;
+        }
+        // Shift by one slot for every banks_no items that precede n, so that each
+        // banks_no-wide "row" of the buffer starts one item later than the previous one.
+        return n + n / banks_no;
+    }
+};
+
+END_ROCPRIM_NAMESPACE
+
+/// @}
+// end of group blockmodule
+
+#endif // ROCPRIM_BLOCK_BLOCK_EXCHANGE_HPP_
--- a/3rdparty/cub/rocprim/block/block_histogram.hpp
+++ b/3rdparty/cub/rocprim/block/block_histogram.hpp
+// Copyright (c) 2017-2021 Advanced Micro Devices, Inc. All rights reserved.
+//
+// Permission is hereby granted, free of charge, to any person obtaining a copy
+// of this software and associated documentation files (the "Software"), to deal
+// in the Software without restriction, including without limitation the rights
+// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+// copies of the Software, and to permit persons to whom the Software is
+// furnished to do so, subject to the following conditions:
+//
+// The above copyright notice and this permission notice shall be included in
+// all copies or substantial portions of the Software.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+// THE SOFTWARE.
+
+#ifndef ROCPRIM_BLOCK_BLOCK_HISTOGRAM_HPP_
+#define ROCPRIM_BLOCK_BLOCK_HISTOGRAM_HPP_
+
+#include <type_traits>
+
+#include "../config.hpp"
+#include "../detail/various.hpp"
+
+#include "../intrinsics.hpp"
+#include "../functional.hpp"
+
+#include "detail/block_histogram_atomic.hpp"
+#include "detail/block_histogram_sort.hpp"
+
+BEGIN_ROCPRIM_NAMESPACE
+
+/// \addtogroup blockmodule
+/// @{
+
+/// \brief Available algorithms for block_histogram primitive.
+enum class block_histogram_algorithm
+{
+    /// Atomic addition is used to update bin count directly.
+    /// \par Performance Notes:
+    /// * Performance is dependent on hardware implementation of atomic addition.
+    /// * Performance may decrease for non-uniform random input distributions
+    /// where many concurrent updates may be made to the same bin counter.
+    using_atomic,
+
+    /// A two-phase operation is used:
+    /// * Data is sorted using radix-sort.
+    /// * "Runs" of same-valued keys are detected using discontinuity; run-lengths
+    /// are bin counts.
+    /// \par Performance Notes:
+    /// * Performance is consistent regardless of sample bin distribution.
+    using_sort,
+
+    /// \brief Default block_histogram algorithm.
+    /// This is an alias of \p using_atomic, not a separate algorithm.
+    default_algorithm = using_atomic,
+};
+
+namespace detail
+{
+
+// Maps a block_histogram_algorithm enumerator to the class template that implements it.
+// The primary template is only declared; each supported enumerator has an explicit
+// specialization exposing the implementation through the nested alias template `type`.
+template<block_histogram_algorithm Algorithm>
+struct select_block_histogram_impl;
+
+// using_atomic -> block_histogram_atomic
+template<>
+struct select_block_histogram_impl<block_histogram_algorithm::using_atomic>
+{
+    template<
+        class T,
+        unsigned int BlockSizeX,
+        unsigned int BlockSizeY,
+        unsigned int BlockSizeZ,
+        unsigned int ItemsPerThread,
+        unsigned int Bins
+    >
+    using type
+        = block_histogram_atomic<T, BlockSizeX, BlockSizeY, BlockSizeZ, ItemsPerThread, Bins>;
+};
+
+// using_sort -> block_histogram_sort
+template<>
+struct select_block_histogram_impl<block_histogram_algorithm::using_sort>
+{
+    template<
+        class T,
+        unsigned int BlockSizeX,
+        unsigned int BlockSizeY,
+        unsigned int BlockSizeZ,
+        unsigned int ItemsPerThread,
+        unsigned int Bins
+    >
+    using type
+        = block_histogram_sort<T, BlockSizeX, BlockSizeY, BlockSizeZ, ItemsPerThread, Bins>;
+};
+
+} // end namespace detail
+
+/// \brief The block_histogram class is a block level parallel primitive which provides methods
+/// for constructing block-wide histograms from items partitioned across threads in a block.
+///
+/// \tparam T - the input/output type.
+/// \tparam BlockSizeX - the number of threads in a block's x dimension.
+/// \tparam ItemsPerThread - the number of items to be processed by each thread.
+/// \tparam Bins - the number of bins within the histogram.
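+/// \tparam Algorithm - selected histogram algorithm, block_histogram_algorithm::default_algorithm by default.
+/// \tparam BlockSizeY - the number of threads in a block's y dimension, defaults to 1.
+/// \tparam BlockSizeZ - the number of threads in a block's z dimension, defaults to 1.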
+///
+/// \par Overview
+/// * block_histogram has two alternative implementations: \p block_histogram_algorithm::using_atomic
+///   and block_histogram_algorithm::using_sort.
+///
+/// \par Examples
+/// \parblock
+/// In the example a histogram operation is performed on a block of 192 threads, each
+/// providing two \p int values; the bin counts are written to a separate histogram array.
+///
+/// \code{.cpp}
+/// __global__ void example_kernel(...)
+/// {
+///     // specialize block_histogram for int, logical block of 192 threads,
+///     // 2 items per thread and a bin size of 192.
+///     using block_histogram_int = rocprim::block_histogram<int, 192, 2, 192>;
+///     // allocate storage in shared memory
+///     __shared__ block_histogram_int::storage_type storage;
+///     __shared__ int hist[192];
+///
+///     int value[2];
+///     ...
+///     // execute histogram
+///     block_histogram_int().histogram(
+///         value, // input
+///         hist, // output
+///         storage
+///     );
+///     ...
+/// }
+/// \endcode
+/// \endparblock
+template<
+    class T,
+    unsigned int BlockSizeX,
+    unsigned int ItemsPerThread,
+    unsigned int Bins,
+    block_histogram_algorithm Algorithm = block_histogram_algorithm::default_algorithm,
+    unsigned int BlockSizeY = 1,
+    unsigned int BlockSizeZ = 1
+>
+class block_histogram
+#ifndef DOXYGEN_SHOULD_SKIP_THIS
+    : private detail::select_block_histogram_impl<Algorithm>
+          ::template type<T, BlockSizeX, BlockSizeY, BlockSizeZ, ItemsPerThread, Bins>
+#endif
+{
+    // Total number of threads in the block, over all three dimensions.
+    static constexpr unsigned int BlockSize = BlockSizeX * BlockSizeY * BlockSizeZ;
+    // Implementation chosen by Algorithm; composite() forwards to it.
+    using base_type = typename detail::select_block_histogram_impl<Algorithm>
+        ::template type<T, BlockSizeX, BlockSizeY, BlockSizeZ, ItemsPerThread, Bins>;
+
+public:
+    /// \brief Type of the temporary memory needed for thread communication by the
+    /// operations of this primitive.
+    ///
+    /// Whether temporary storage is actually required depends on the implementation.
+    /// Objects of this type should be declared with the <tt>__shared__</tt> keyword.
+    /// The storage may alias externally allocated memory, or be a member of a union
+    /// with other storage types, so that shared memory can be reused.
+    using storage_type = typename base_type::storage_type;
+
+    /// \brief Sets every histogram counter to a value-initialized \p Counter (zero).
+    ///
+    /// \tparam Counter - [inferred] counter type of histogram.
+    ///
+    /// \param [out] hist - histogram bin counts, \p Bins elements.
+    template<class Counter>
+    ROCPRIM_DEVICE ROCPRIM_INLINE
+    void init_histogram(Counter hist[Bins])
+    {
+        const auto thread_rank = ::rocprim::flat_block_thread_id<BlockSizeX, BlockSizeY, BlockSizeZ>();
+
+        // Each pass covers up to BlockSize consecutive bins, one bin per thread.
+        ROCPRIM_UNROLL
+        for(unsigned int first_bin = 0; first_bin < Bins; first_bin += BlockSize)
+        {
+            const unsigned int bin = first_bin + thread_rank;
+            // The last pass is partial when Bins is not a multiple of BlockSize.
+            if(bin < Bins)
+            {
+                hist[bin] = Counter();
+            }
+        }
+    }
+
+    /// \brief Adds the input elements of every thread to an existing block-wide
+    /// histogram.
+    ///
+    /// \tparam Counter - [inferred] counter type of histogram.
+    ///
+    /// \param [in] input - reference to the array holding this thread's input values.
+    /// \param [in,out] hist - histogram bin counts to update.
+    /// \param [in] storage - reference to a temporary storage object of type storage_type.
+    ///
+    /// \par Storage reusage
+    /// A synchronization barrier (\p __syncthreads() or \p rocprim::syncthreads())
+    /// should be placed before \p storage is reused or repurposed.
+    ///
+    /// \par Examples
+    /// \parblock
+    /// In the example a block of 192 threads, each providing two \p int values, first
+    /// clears the 192-bin histogram \p hist and then accumulates its values into it.
+    ///
+    /// \code{.cpp}
+    /// __global__ void example_kernel(...)
+    /// {
+    ///     // specialize block_histogram for int, logical block of 192 threads,
+    ///     // 2 items per thread and 192 bins.
+    ///     using block_histogram_int = rocprim::block_histogram<int, 192, 2, 192>;
+    ///     // allocate storage in shared memory
+    ///     __shared__ block_histogram_int::storage_type storage;
+    ///     __shared__ int hist[192];
+    ///
+    ///     int value[2];
+    ///     ...
+    ///     // initialize histogram
+    ///     block_histogram_int().init_histogram(
+    ///         hist // output
+    ///     );
+    ///
+    ///     rocprim::syncthreads();
+    ///
+    ///     // update histogram
+    ///     block_histogram_int().composite(
+    ///         value, // input
+    ///         hist, // output
+    ///         storage
+    ///     );
+    ///     ...
+    /// }
+    /// \endcode
+    /// \endparblock
+    template<class Counter>
+    ROCPRIM_DEVICE ROCPRIM_INLINE
+    void composite(T (&input)[ItemsPerThread],
+                   Counter hist[Bins],
+                   storage_type& storage)
+    {
+        // The selected implementation does the actual accumulation.
+        base_type::composite(input, hist, storage);
+    }
+
+    /// \overload
+    /// \brief Adds the input elements of every thread to an existing block-wide
+    /// histogram.
+    ///
+    /// * This overload takes no storage argument; the shared memory that is needed
+    /// is allocated by the method itself.
+    ///
+    /// \tparam Counter - [inferred] counter type of histogram.
+    ///
+    /// \param [in] input - reference to the array holding this thread's input values.
+    /// \param [in,out] hist - histogram bin counts to update.
+    template<class Counter>
+    ROCPRIM_DEVICE ROCPRIM_FORCE_INLINE
+    void composite(T (&input)[ItemsPerThread],
+                   Counter hist[Bins])
+    {
+        base_type::composite(input, hist);
+    }
+
+    /// \brief Builds a new block-wide histogram from the input elements of every thread.
+    ///
+    /// The counters are cleared with init_histogram(), the block is synchronized, and
+    /// the inputs are then accumulated with composite().
+    ///
+    /// \tparam Counter - [inferred] counter type of histogram.
+    ///
+    /// \param [in] input - reference to the array holding this thread's input values.
+    /// \param [out] hist - histogram bin counts.
+    /// \param [in] storage - reference to a temporary storage object of type storage_type.
+    ///
+    /// \par Storage reusage
+    /// A synchronization barrier (\p __syncthreads() or \p rocprim::syncthreads())
+    /// should be placed before \p storage is reused or repurposed.
+    ///
+    /// \par Examples
+    /// \parblock
+    /// In the example a block of 192 threads, each providing two \p int values, builds
+    /// a 192-bin histogram in the \p hist array.
+    ///
+    /// \code{.cpp}
+    /// __global__ void example_kernel(...)
+    /// {
+    ///     // specialize block_histogram for int, logical block of 192 threads,
+    ///     // 2 items per thread and 192 bins.
+    ///     using block_histogram_int = rocprim::block_histogram<int, 192, 2, 192>;
+    ///     // allocate storage in shared memory
+    ///     __shared__ block_histogram_int::storage_type storage;
+    ///     __shared__ int hist[192];
+    ///
+    ///     int value[2];
+    ///     ...
+    ///     // execute histogram
+    ///     block_histogram_int().histogram(
+    ///         value, // input
+    ///         hist, // output
+    ///         storage
+    ///     );
+    ///     ...
+    /// }
+    /// \endcode
+    /// \endparblock
+    template<class Counter>
+    ROCPRIM_DEVICE ROCPRIM_INLINE
+    void histogram(T (&input)[ItemsPerThread],
+                   Counter hist[Bins],
+                   storage_type& storage)
+    {
+        this->init_histogram(hist);
+        // All counters must be cleared before any thread starts accumulating.
+        ::rocprim::syncthreads();
+        this->composite(input, hist, storage);
+    }
+
+    /// \overload
+    /// \brief Builds a new block-wide histogram from the input elements of every thread.
+    ///
+    /// * This overload takes no storage argument; the shared memory that is needed
+    /// is allocated by the method itself.
+    ///
+    /// \tparam Counter - [inferred] counter type of histogram.
+    ///
+    /// \param [in] input - reference to the array holding this thread's input values.
+    /// \param [out] hist - histogram bin counts.
+    template<class Counter>
+    ROCPRIM_DEVICE ROCPRIM_FORCE_INLINE
+    void histogram(T (&input)[ItemsPerThread],
+                   Counter hist[Bins])
+    {
+        this->init_histogram(hist);
+        // All counters must be cleared before any thread starts accumulating.
+        ::rocprim::syncthreads();
+        this->composite(input, hist);
+    }
+};
+
+END_ROCPRIM_NAMESPACE
+
+/// @}
+// end of group blockmodule
+
+#endif // ROCPRIM_BLOCK_BLOCK_HISTOGRAM_HPP_
--- a/3rdparty/cub/rocprim/block/block_load.hpp
+++ b/3rdparty/cub/rocprim/block/block_load.hpp
+// Copyright (c) 2017-2021 Advanced Micro Devices, Inc. All rights reserved.
+//
+// Permission is hereby granted, free of charge, to any person obtaining a copy
+// of this software and associated documentation files (the "Software"), to deal
+// in the Software without restriction, including without limitation the rights
+// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+// copies of the Software, and to permit persons to whom the Software is
+// furnished to do so, subject to the following conditions:
+//
+// The above copyright notice and this permission notice shall be included in
+// all copies or substantial portions of the Software.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+// THE SOFTWARE.
+
+#ifndef ROCPRIM_BLOCK_BLOCK_LOAD_HPP_
+#define ROCPRIM_BLOCK_BLOCK_LOAD_HPP_
+
+#include "../config.hpp"
+#include "../detail/various.hpp"
+
+#include "../intrinsics.hpp"
+#include "../functional.hpp"
+#include "../types.hpp"
+
+#include "block_load_func.hpp"
+#include "block_exchange.hpp"
+
+/// \addtogroup blockmodule
+/// @{
+
+BEGIN_ROCPRIM_NAMESPACE
+
+/// \brief \p block_load_method enumerates the methods available to load data
+/// from continuous memory into a blocked arrangement of items across the thread block.
+///
+/// A value of this type is passed as the \p Method template argument of \p block_load.
+enum class block_load_method
+{
+    /// Data from continuous memory is loaded into a blocked arrangement of items.
+    /// \par Performance Notes:
+    /// * Performance decreases with increasing number of items per thread (stride
+    /// between reads), because of reduced memory coalescing.
+    block_load_direct,
+
+    /// A striped arrangement of data is read directly from memory.
+    block_load_striped,
+
+    /// Data from continuous memory is loaded into a blocked arrangement of items
+    /// using vectorization as an optimization.
+    /// \par Performance Notes:
+    /// * Performance remains high due to increased memory coalescing, provided that
+    /// vectorization requirements are fulfilled. Otherwise, performance will default
+    /// to \p block_load_direct.
+    /// \par Requirements:
+    /// * The input offset (\p block_input) must be quad-item aligned.
+    /// * The following conditions will prevent vectorization and switch to default
+    /// \p block_load_direct:
+    ///   * \p ItemsPerThread is odd.
+    ///   * The datatype \p T is not a primitive or a HIP vector type (e.g. int2,
+    /// int4, etc.).
+    block_load_vectorize,
+
+    /// A striped arrangement of data from continuous memory is locally transposed
+    /// into a blocked arrangement of items.
+    /// \par Performance Notes:
+    /// * Performance remains high due to increased memory coalescing, regardless of the
+    /// number of items per thread.
+    /// * Performance may be better compared to \p block_load_direct and
+    /// \p block_load_vectorize due to reordering on local memory.
+    block_load_transpose,
+
+    /// A warp-striped arrangement of data from continuous memory is locally transposed
+    /// into a blocked arrangement of items.
+    /// \par Requirements:
+    /// * The number of threads in the block must be a multiple of the size of hardware warp.
+    /// \par Performance Notes:
+    /// * Performance remains high due to increased memory coalescing, regardless of the
+    /// number of items per thread.
+    /// * Performance may be better compared to \p block_load_direct and
+    /// \p block_load_vectorize due to reordering on local memory.
+    block_load_warp_transpose,
+
+    /// Defaults to \p block_load_direct. This is an alias: both enumerators have
+    /// the same value.
+    default_method = block_load_direct
+};
+
+/// \brief Block level parallel primitive that loads data from continuous memory into
+/// a blocked arrangement of items across the thread block.
+///
+/// \tparam T - the input/output type.
+/// \tparam BlockSizeX - the number of threads in the block's x dimension.
+/// \tparam ItemsPerThread - the number of items to be processed by
+/// each thread.
+/// \tparam Method - the method to load data.
+/// \tparam BlockSizeY - the number of threads in the block's y dimension, 1 by default.
+/// \tparam BlockSizeZ - the number of threads in the block's z dimension, 1 by default.
+///
+/// \par Overview
+/// * The \p block_load class can load data with any of the following methods:
+///   * [block_load_direct](\ref ::block_load_method::block_load_direct)
+///   * [block_load_striped](\ref ::block_load_method::block_load_striped)
+///   * [block_load_vectorize](\ref ::block_load_method::block_load_vectorize)
+///   * [block_load_transpose](\ref ::block_load_method::block_load_transpose)
+///   * [block_load_warp_transpose](\ref ::block_load_method::block_load_warp_transpose)
+///
+/// \par Example:
+/// \parblock
+/// In the example a block of 128 threads loads 8 items of type \p int per thread;
+/// \p load_method stands for one of the \p block_load_method values.
+///
+/// \code{.cpp}
+/// __global__ void example_kernel(int * input, ...)
+/// {
+///     const int offset = blockIdx.x * 128 * 8;
+///     int items[8];
+///     rocprim::block_load<int, 128, 8, load_method> blockload;
+///     blockload.load(input + offset, items);
+///     ...
+/// }
+/// \endcode
+/// \endparblock
+template<
+    class T,
+    unsigned int BlockSizeX,
+    unsigned int ItemsPerThread,
+    block_load_method Method = block_load_method::block_load_direct,
+    unsigned int BlockSizeY = 1,
+    unsigned int BlockSizeZ = 1
+>
+class block_load
+{
+private:
+    using storage_type_ = typename ::rocprim::detail::empty_storage_type;
+
+public:
+    /// \brief Type of the temporary memory needed for thread communication by the
+    /// operations of this primitive.
+    ///
+    /// Whether temporary storage is actually required depends on the implementation.
+    /// Objects of this type should be declared with the \p __shared__ keyword.
+    /// The storage may alias externally allocated memory, or be a member of a union
+    /// with other storage types, so that shared memory can be reused.
+    #ifndef DOXYGEN_SHOULD_SKIP_THIS // hides storage_type implementation for Doxygen
+    using storage_type = typename ::rocprim::detail::empty_storage_type;
+    #else
+    using storage_type = storage_type_; // only for Doxygen
+    #endif
+
+    /// \brief Loads data from continuous memory into an arrangement of items across the
+    /// thread block.
+    ///
+    /// \tparam InputIterator - [inferred] an iterator type for input (can be a simple
+    /// pointer).
+    ///
+    /// \param [in] block_input - the input iterator from the thread block to load from.
+    /// \param [out] items - array that data is loaded to.
+    ///
+    /// \par Overview
+    /// * The value type of \p InputIterator must be implicitly convertible to \p T;
+    /// this is checked at compile time.
+    template<class InputIterator>
+    ROCPRIM_DEVICE ROCPRIM_INLINE
+    void load(InputIterator block_input,
+              T (&items)[ItemsPerThread])
+    {
+        using input_value_type = typename std::iterator_traits<InputIterator>::value_type;
+        static_assert(std::is_convertible<input_value_type, T>::value,
+                      "The type T must be such that an object of type InputIterator "
+                      "can be dereferenced and then implicitly converted to T.");
+        const unsigned int thread_rank
+            = ::rocprim::flat_block_thread_id<BlockSizeX, BlockSizeY, BlockSizeZ>();
+        block_load_direct_blocked(thread_rank, block_input, items);
+    }
+
+    /// \brief Loads data from continuous memory into an arrangement of items across the
+    /// thread block, which is guarded by range \p valid.
+    ///
+    /// \tparam InputIterator - [inferred] an iterator type for input (can be a simple
+    /// pointer).
+    ///
+    /// \param [in] block_input - the input iterator from the thread block to load from.
+    /// \param [out] items - array that data is loaded to.
+    /// \param [in] valid - maximum range of valid numbers to load.
+    ///
+    /// \par Overview
+    /// * The value type of \p InputIterator must be implicitly convertible to \p T;
+    /// this is checked at compile time.
+    template<class InputIterator>
+    ROCPRIM_DEVICE ROCPRIM_INLINE
+    void load(InputIterator block_input,
+              T (&items)[ItemsPerThread],
+              unsigned int valid)
+    {
+        using input_value_type = typename std::iterator_traits<InputIterator>::value_type;
+        static_assert(std::is_convertible<input_value_type, T>::value,
+                      "The type T must be such that an object of type InputIterator "
+                      "can be dereferenced and then implicitly converted to T.");
+        const unsigned int thread_rank
+            = ::rocprim::flat_block_thread_id<BlockSizeX, BlockSizeY, BlockSizeZ>();
+        block_load_direct_blocked(thread_rank, block_input, items, valid);
+    }
+
+    /// \brief Loads data from continuous memory into an arrangement of items across the
+    /// thread block, which is guarded by range with a fall-back value for out-of-bound
+    /// elements.
+    ///
+    /// \tparam InputIterator - [inferred] an iterator type for input (can be a simple
+    /// pointer).
+    /// \tparam Default - [inferred] The data type of the default value.
+    ///
+    /// \param [in] block_input - the input iterator from the thread block to load from.
+    /// \param [out] items - array that data is loaded to.
+    /// \param [in] valid - maximum range of valid numbers to load.
+    /// \param [in] out_of_bounds - default value assigned to out-of-bound items.
+    ///
+    /// \par Overview
+    /// * The value type of \p InputIterator must be implicitly convertible to \p T;
+    /// this is checked at compile time.
+    template<
+        class InputIterator,
+        class Default
+    >
+    ROCPRIM_DEVICE ROCPRIM_INLINE
+    void load(InputIterator block_input,
+              T (&items)[ItemsPerThread],
+              unsigned int valid,
+              Default out_of_bounds)
+    {
+        using input_value_type = typename std::iterator_traits<InputIterator>::value_type;
+        static_assert(std::is_convertible<input_value_type, T>::value,
+                      "The type T must be such that an object of type InputIterator "
+                      "can be dereferenced and then implicitly converted to T.");
+        const unsigned int thread_rank
+            = ::rocprim::flat_block_thread_id<BlockSizeX, BlockSizeY, BlockSizeZ>();
+        block_load_direct_blocked(thread_rank, block_input, items, valid, out_of_bounds);
+    }
+
+    /// \brief Loads data from continuous memory into an arrangement of items across the
+    /// thread block, using temporary storage.
+    ///
+    /// \tparam InputIterator - [inferred] an iterator type for input (can be a simple
+    /// pointer).
+    ///
+    /// \param [in] block_input - the input iterator from the thread block to load from.
+    /// \param [out] items - array that data is loaded to.
+    /// \param [in] storage - temporary storage for inputs.
+    ///
+    /// \par Overview
+    /// * The value type of \p InputIterator must be implicitly convertible to \p T;
+    /// this is checked at compile time.
+    ///
+    /// \par Storage reusage
+    /// A synchronization barrier (\p __syncthreads() or \p rocprim::syncthreads())
+    /// should be placed before \p storage is reused or repurposed.
+    ///
+    /// \par Example.
+    /// \code{.cpp}
+    /// __global__ void example_kernel(...)
+    /// {
+    ///     int items[8];
+    ///     using block_load_int = rocprim::block_load<int, 128, 8>;
+    ///     block_load_int bload;
+    ///     __shared__ typename block_load_int::storage_type storage;
+    ///     bload.load(..., items, storage);
+    ///     ...
+    /// }
+    /// \endcode
+    template<class InputIterator>
+    ROCPRIM_DEVICE ROCPRIM_INLINE
+    void load(InputIterator block_input,
+              T (&items)[ItemsPerThread],
+              storage_type& storage)
+    {
+        using input_value_type = typename std::iterator_traits<InputIterator>::value_type;
+        static_assert(std::is_convertible<input_value_type, T>::value,
+                      "The type T must be such that an object of type InputIterator "
+                      "can be dereferenced and then implicitly converted to T.");
+        // This implementation does not use the storage object.
+        static_cast<void>(storage);
+        this->load(block_input, items);
+    }
+
+    /// \brief Loads data from continuous memory into an arrangement of items across the
+    /// thread block, which is guarded by range \p valid, using temporary storage.
+    ///
+    /// \tparam InputIterator - [inferred] an iterator type for input (can be a simple
+    /// pointer).
+    ///
+    /// \param [in] block_input - the input iterator from the thread block to load from.
+    /// \param [out] items - array that data is loaded to.
+    /// \param [in] valid - maximum range of valid numbers to load.
+    /// \param [in] storage - temporary storage for inputs.
+    ///
+    /// \par Overview
+    /// * The value type of \p InputIterator must be implicitly convertible to \p T;
+    /// this is checked at compile time.
+    ///
+    /// \par Storage reusage
+    /// A synchronization barrier (\p __syncthreads() or \p rocprim::syncthreads())
+    /// should be placed before \p storage is reused or repurposed.
+    ///
+    /// \par Example.
+    /// \code{.cpp}
+    /// __global__ void example_kernel(...)
+    /// {
+    ///     int items[8];
+    ///     using block_load_int = rocprim::block_load<int, 128, 8>;
+    ///     block_load_int bload;
+    ///     __shared__ typename block_load_int::storage_type storage;
+    ///     bload.load(..., items, valid, storage);
+    ///     ...
+    /// }
+    /// \endcode
+    template<class InputIterator>
+    ROCPRIM_DEVICE ROCPRIM_INLINE
+    void load(InputIterator block_input,
+              T (&items)[ItemsPerThread],
+              unsigned int valid,
+              storage_type& storage)
+    {
+        using input_value_type = typename std::iterator_traits<InputIterator>::value_type;
+        static_assert(std::is_convertible<input_value_type, T>::value,
+                      "The type T must be such that an object of type InputIterator "
+                      "can be dereferenced and then implicitly converted to T.");
+        // This implementation does not use the storage object.
+        static_cast<void>(storage);
+        this->load(block_input, items, valid);
+    }
+
+    /// \brief Loads data from continuous memory into an arrangement of items across the
+    /// thread block, which is guarded by range with a fall-back value for out-of-bound
+    /// elements, using temporary storage.
+    ///
+    /// \tparam InputIterator - [inferred] an iterator type for input (can be a simple
+    /// pointer).
+    /// \tparam Default - [inferred] The data type of the default value.
+    ///
+    /// \param [in] block_input - the input iterator from the thread block to load from.
+    /// \param [out] items - array that data is loaded to.
+    /// \param [in] valid - maximum range of valid numbers to load.
+    /// \param [in] out_of_bounds - default value assigned to out-of-bound items.
+    /// \param [in] storage - temporary storage for inputs.
+    ///
+    /// \par Overview
+    /// * The value type of \p InputIterator must be implicitly convertible to \p T;
+    /// this is checked at compile time.
+    ///
+    /// \par Storage reusage
+    /// A synchronization barrier (\p __syncthreads() or \p rocprim::syncthreads())
+    /// should be placed before \p storage is reused or repurposed.
+    ///
+    /// \par Example.
+    /// \code{.cpp}
+    /// __global__ void example_kernel(...)
+    /// {
+    ///     int items[8];
+    ///     using block_load_int = rocprim::block_load<int, 128, 8>;
+    ///     block_load_int bload;
+    ///     __shared__ typename block_load_int::storage_type storage;
+    ///     bload.load(..., items, valid, out_of_bounds, storage);
+    ///     ...
+    /// }
+    /// \endcode
+    template<
+        class InputIterator,
+        class Default
+    >
+    ROCPRIM_DEVICE ROCPRIM_INLINE
+    void load(InputIterator block_input,
+              T (&items)[ItemsPerThread],
+              unsigned int valid,
+              Default out_of_bounds,
+              storage_type& storage)
+    {
+        using input_value_type = typename std::iterator_traits<InputIterator>::value_type;
+        static_assert(std::is_convertible<input_value_type, T>::value,
+                      "The type T must be such that an object of type InputIterator "
+                      "can be dereferenced and then implicitly converted to T.");
+        // This implementation does not use the storage object.
+        static_cast<void>(storage);
+        this->load(block_input, items, valid, out_of_bounds);
+    }
+};
+
+/// @}
+// end of group blockmodule
+
+#ifndef DOXYGEN_SHOULD_SKIP_THIS
+
+// Specialization of block_load for block_load_method::block_load_striped.
+// Every overload forwards to block_load_direct_striped<BlockSize>; the storage
+// object is an empty type and is never used.
+template<
+    class T,
+    unsigned int BlockSizeX,
+    unsigned int ItemsPerThread,
+    unsigned int BlockSizeY,
+    unsigned int BlockSizeZ
+    >
+class block_load<T, BlockSizeX, ItemsPerThread, block_load_method::block_load_striped, BlockSizeY, BlockSizeZ>
+{
+    // Total number of threads in the block, over all three dimensions.
+    static constexpr unsigned int BlockSize = BlockSizeX * BlockSizeY * BlockSizeZ;
+
+private:
+    using storage_type_ = typename ::rocprim::detail::empty_storage_type;
+
+public:
+    #ifndef DOXYGEN_SHOULD_SKIP_THIS // hides storage_type implementation for Doxygen
+    using storage_type = typename ::rocprim::detail::empty_storage_type;
+    #else
+    using storage_type = storage_type_; // only for Doxygen
+    #endif
+
+    // Loads ItemsPerThread items per thread from block_input into items.
+    template<class InputIterator>
+    ROCPRIM_DEVICE ROCPRIM_INLINE
+    void load(InputIterator block_input,
+              T (&items)[ItemsPerThread])
+    {
+        using value_type = typename std::iterator_traits<InputIterator>::value_type;
+        static_assert(std::is_convertible<value_type, T>::value,
+                      "The type T must be such that an object of type InputIterator "
+                      "can be dereferenced and then implicitly converted to T.");
+        const unsigned int flat_id = ::rocprim::flat_block_thread_id<BlockSizeX, BlockSizeY, BlockSizeZ>();
+        block_load_direct_striped<BlockSize>(flat_id, block_input, items);
+    }
+
+    // Same as above, with the load guarded by the range valid.
+    template<class InputIterator>
+    ROCPRIM_DEVICE ROCPRIM_INLINE
+    void load(InputIterator block_input,
+              T (&items)[ItemsPerThread],
+              unsigned int valid)
+    {
+        using value_type = typename std::iterator_traits<InputIterator>::value_type;
+        static_assert(std::is_convertible<value_type, T>::value,
+                      "The type T must be such that an object of type InputIterator "
+                      "can be dereferenced and then implicitly converted to T.");
+        const unsigned int flat_id = ::rocprim::flat_block_thread_id<BlockSizeX, BlockSizeY, BlockSizeZ>();
+        block_load_direct_striped<BlockSize>(flat_id, block_input, items, valid);
+    }
+
+    // Guarded load that additionally passes out_of_bounds as the fall-back value
+    // for out-of-bound items.
+    template<
+        class InputIterator,
+        class Default
+        >
+    ROCPRIM_DEVICE ROCPRIM_INLINE
+    void load(InputIterator block_input,
+              T (&items)[ItemsPerThread],
+              unsigned int valid,
+              Default out_of_bounds)
+    {
+        using value_type = typename std::iterator_traits<InputIterator>::value_type;
+        static_assert(std::is_convertible<value_type, T>::value,
+                      "The type T must be such that an object of type InputIterator "
+                      "can be dereferenced and then implicitly converted to T.");
+        const unsigned int flat_id = ::rocprim::flat_block_thread_id<BlockSizeX, BlockSizeY, BlockSizeZ>();
+        block_load_direct_striped<BlockSize>(flat_id, block_input, items, valid,
+                                             out_of_bounds);
+    }
+
+    // Storage-taking overloads: storage is ignored and the call is delegated to
+    // the matching overload above, as the primary block_load template does.
+    template<class InputIterator>
+    ROCPRIM_DEVICE ROCPRIM_INLINE
+    void load(InputIterator block_input,
+              T (&items)[ItemsPerThread],
+              storage_type& storage)
+    {
+        using value_type = typename std::iterator_traits<InputIterator>::value_type;
+        static_assert(std::is_convertible<value_type, T>::value,
+                      "The type T must be such that an object of type InputIterator "
+                      "can be dereferenced and then implicitly converted to T.");
+        (void) storage;
+        load(block_input, items);
+    }
+
+    template<class InputIterator>
+    ROCPRIM_DEVICE ROCPRIM_INLINE
+    void load(InputIterator block_input,
+              T (&items)[ItemsPerThread],
+              unsigned int valid,
+              storage_type& storage)
+    {
+        using value_type = typename std::iterator_traits<InputIterator>::value_type;
+        static_assert(std::is_convertible<value_type, T>::value,
+                      "The type T must be such that an object of type InputIterator "
+                      "can be dereferenced and then implicitly converted to T.");
+        (void) storage;
+        load(block_input, items, valid);
+    }
+
+    template<
+        class InputIterator,
+        class Default
+        >
+    ROCPRIM_DEVICE ROCPRIM_INLINE
+    void load(InputIterator block_input,
+              T (&items)[ItemsPerThread],
+              unsigned int valid,
+              Default out_of_bounds,
+              storage_type& storage)
+    {
+        using value_type = typename std::iterator_traits<InputIterator>::value_type;
+        static_assert(std::is_convertible<value_type, T>::value,
+                      "The type T must be such that an object of type InputIterator "
+                      "can be dereferenced and then implicitly converted to T.");
+        (void) storage;
+        load(block_input, items, valid, out_of_bounds);
+    }
+};
+
+template<
+    class T,
+    unsigned int BlockSizeX,
+    unsigned int ItemsPerThread,
+    unsigned int BlockSizeY,
+    unsigned int BlockSizeZ
+>
+class block_load<T, BlockSizeX, ItemsPerThread, block_load_method::block_load_vectorize, BlockSizeY, BlockSizeZ>
+{
+private:
+    using storage_type_ = typename ::rocprim::detail::empty_storage_type;
+
+public:
+    #ifndef DOXYGEN_SHOULD_SKIP_THIS // hides storage_type implementation for Doxygen
+    using storage_type = typename ::rocprim::detail::empty_storage_type;
+    #else
+    using storage_type = storage_type_; // only for Doxygen
+    #endif
+
+    ROCPRIM_DEVICE ROCPRIM_INLINE
+    void load(T* block_input,
+              T (&_items)[ItemsPerThread])
+    {
+        const unsigned int flat_id = ::rocprim::flat_block_thread_id<BlockSizeX, BlockSizeY, BlockSizeZ>();
+        block_load_direct_blocked_vectorized(flat_id, block_input, _items);
+    }
+
+    template<class InputIterator, class U>
+    ROCPRIM_DEVICE ROCPRIM_INLINE
+    void load(InputIterator block_input,
+              U (&items)[ItemsPerThread])
+    {
+        using value_type = typename std::iterator_traits<InputIterator>::value_type;
+        static_assert(std::is_convertible<value_type, T>::value,
+                      "The type T must be such that an object of type InputIterator "
+                      "can be dereferenced and then implicitly converted to T.");
+        const unsigned int flat_id = ::rocprim::flat_block_thread_id<BlockSizeX, BlockSizeY, BlockSizeZ>();
+        block_load_direct_blocked(flat_id, block_input, items);
+    }
+
+    template<class InputIterator>
+    ROCPRIM_DEVICE ROCPRIM_INLINE
+    void load(InputIterator block_input,
+              T (&items)[ItemsPerThread],
+              unsigned int valid)
+    {
+        using value_type = typename std::iterator_traits<InputIterator>::value_type;
+        static_assert(std::is_convertible<value_type, T>::value,
+                      "The type T must be such that an object of type InputIterator "
+                      "can be dereferenced and then implicitly converted to T.");
+        const unsigned int flat_id = ::rocprim::flat_block_thread_id<BlockSizeX, BlockSizeY, BlockSizeZ>();
+        block_load_direct_blocked(flat_id, block_input, items, valid);
+    }
+
+    /// Guarded variant with a fall-back value: forwards \p valid and
+    /// \p out_of_bounds to the matching overload of block_load_direct_blocked.
+    ///
+    /// \param block_input - iterator to the block's input
+    /// \param items - per-thread destination array
+    /// \param valid - bound passed on to the guarded direct load
+    /// \param out_of_bounds - value passed on for out-of-bound items
+    template<class InputIterator, class Default>
+    ROCPRIM_DEVICE ROCPRIM_INLINE
+    void load(InputIterator block_input,
+              T (&items)[ItemsPerThread],
+              unsigned int valid,
+              Default out_of_bounds)
+    {
+        using input_value_type = typename std::iterator_traits<InputIterator>::value_type;
+        static_assert(std::is_convertible<input_value_type, T>::value,
+                      "The type T must be such that an object of type InputIterator "
+                      "can be dereferenced and then implicitly converted to T.");
+        const unsigned int thread_flat_id
+            = ::rocprim::flat_block_thread_id<BlockSizeX, BlockSizeY, BlockSizeZ>();
+        block_load_direct_blocked(thread_flat_id, block_input, items, valid, out_of_bounds);
+    }
+
+    /// Storage-taking overload for raw pointers. \p storage is not used; the
+    /// call is forwarded to the two-argument load().
+    ROCPRIM_DEVICE ROCPRIM_INLINE
+    void load(T* block_input,
+              T (&items)[ItemsPerThread],
+              storage_type& storage)
+    {
+        // The parameter exists only for interface uniformity; silence "unused" warnings.
+        static_cast<void>(storage);
+        load(block_input, items);
+    }
+
+    /// Storage-taking overload for arbitrary iterators. \p storage is not used;
+    /// the call is forwarded to the two-argument load().
+    template<class InputIterator, class U>
+    ROCPRIM_DEVICE ROCPRIM_INLINE
+    void load(InputIterator block_input,
+              U (&items)[ItemsPerThread],
+              storage_type& storage)
+    {
+        // The parameter exists only for interface uniformity; silence "unused" warnings.
+        static_cast<void>(storage);
+
+        using input_value_type = typename std::iterator_traits<InputIterator>::value_type;
+        static_assert(std::is_convertible<input_value_type, T>::value,
+                      "The type T must be such that an object of type InputIterator "
+                      "can be dereferenced and then implicitly converted to T.");
+        load(block_input, items);
+    }
+
+    /// Storage-taking guarded overload. \p storage is not used; the call is
+    /// forwarded to load(block_input, items, valid).
+    template<class InputIterator>
+    ROCPRIM_DEVICE ROCPRIM_INLINE
+    void load(InputIterator block_input,
+              T (&items)[ItemsPerThread],
+              unsigned int valid,
+              storage_type& storage)
+    {
+        // The parameter exists only for interface uniformity; silence "unused" warnings.
+        static_cast<void>(storage);
+
+        using input_value_type = typename std::iterator_traits<InputIterator>::value_type;
+        static_assert(std::is_convertible<input_value_type, T>::value,
+                      "The type T must be such that an object of type InputIterator "
+                      "can be dereferenced and then implicitly converted to T.");
+        load(block_input, items, valid);
+    }
+
+    /// Storage-taking guarded overload with a fall-back value. \p storage is
+    /// not used; the call is forwarded to
+    /// load(block_input, items, valid, out_of_bounds).
+    template<class InputIterator, class Default>
+    ROCPRIM_DEVICE ROCPRIM_INLINE
+    void load(InputIterator block_input,
+              T (&items)[ItemsPerThread],
+              unsigned int valid,
+              Default out_of_bounds,
+              storage_type& storage)
+    {
+        // The parameter exists only for interface uniformity; silence "unused" warnings.
+        static_cast<void>(storage);
+
+        using input_value_type = typename std::iterator_traits<InputIterator>::value_type;
+        static_assert(std::is_convertible<input_value_type, T>::value,
+                      "The type T must be such that an object of type InputIterator "
+                      "can be dereferenced and then implicitly converted to T.");
+        load(block_input, items, valid, out_of_bounds);
+    }
+};
+
+/// \brief Specialization of block_load for block_load_method::block_load_transpose.
+///
+/// Every overload first performs a striped load (block_load_direct_striped)
+/// and then rearranges the per-thread items with
+/// block_exchange::striped_to_blocked. Overloads without a \p storage
+/// parameter declare the exchange storage themselves with
+/// ROCPRIM_SHARED_MEMORY; the others use the storage supplied by the caller.
+template<
+    class T,
+    unsigned int BlockSizeX,
+    unsigned int ItemsPerThread,
+    unsigned int BlockSizeY,
+    unsigned int BlockSizeZ
+>
+class block_load<T, BlockSizeX, ItemsPerThread, block_load_method::block_load_transpose, BlockSizeY, BlockSizeZ>
+{
+    // Total number of threads in the block; used as the stride of the striped load.
+    static constexpr unsigned int BlockSize = BlockSizeX * BlockSizeY * BlockSizeZ;
+
+private:
+    // Forward all three block dimensions (as the block_load_warp_transpose
+    // specialization does) so that block_exchange is instantiated for the same
+    // block shape that flat_block_thread_id<BlockSizeX, BlockSizeY, BlockSizeZ>
+    // is instantiated with below, instead of a flattened 1D block.
+    using block_exchange_type = block_exchange<T, BlockSizeX, ItemsPerThread, BlockSizeY, BlockSizeZ>;
+
+public:
+    /// Storage required by the striped-to-blocked exchange.
+    using storage_type = typename block_exchange_type::storage_type;
+
+    /// Loads the items striped and exchanges them into a blocked arrangement,
+    /// using internally declared exchange storage.
+    ///
+    /// \param block_input - iterator to the block's input
+    /// \param items - per-thread destination array
+    template<class InputIterator>
+    ROCPRIM_DEVICE ROCPRIM_FORCE_INLINE
+    void load(InputIterator block_input,
+              T (&items)[ItemsPerThread])
+    {
+        using value_type = typename std::iterator_traits<InputIterator>::value_type;
+        static_assert(std::is_convertible<value_type, T>::value,
+                      "The type T must be such that an object of type InputIterator "
+                      "can be dereferenced and then implicitly converted to T.");
+        ROCPRIM_SHARED_MEMORY storage_type storage;
+        const unsigned int flat_id = ::rocprim::flat_block_thread_id<BlockSizeX, BlockSizeY, BlockSizeZ>();
+        block_load_direct_striped<BlockSize>(flat_id, block_input, items);
+        block_exchange_type().striped_to_blocked(items, items, storage);
+    }
+
+    /// Guarded variant: \p valid is forwarded to the striped load, then the
+    /// items are exchanged using internally declared exchange storage.
+    ///
+    /// \param block_input - iterator to the block's input
+    /// \param items - per-thread destination array
+    /// \param valid - bound passed on to the guarded striped load
+    template<class InputIterator>
+    ROCPRIM_DEVICE ROCPRIM_FORCE_INLINE
+    void load(InputIterator block_input,
+              T (&items)[ItemsPerThread],
+              unsigned int valid)
+    {
+        using value_type = typename std::iterator_traits<InputIterator>::value_type;
+        static_assert(std::is_convertible<value_type, T>::value,
+                      "The type T must be such that an object of type InputIterator "
+                      "can be dereferenced and then implicitly converted to T.");
+        ROCPRIM_SHARED_MEMORY storage_type storage;
+        const unsigned int flat_id = ::rocprim::flat_block_thread_id<BlockSizeX, BlockSizeY, BlockSizeZ>();
+        block_load_direct_striped<BlockSize>(flat_id, block_input, items, valid);
+        block_exchange_type().striped_to_blocked(items, items, storage);
+    }
+
+    /// Guarded variant with a fall-back value: \p valid and \p out_of_bounds
+    /// are forwarded to the striped load, then the items are exchanged using
+    /// internally declared exchange storage.
+    ///
+    /// \param block_input - iterator to the block's input
+    /// \param items - per-thread destination array
+    /// \param valid - bound passed on to the guarded striped load
+    /// \param out_of_bounds - value passed on for out-of-bound items
+    template<
+        class InputIterator,
+        class Default
+    >
+    ROCPRIM_DEVICE ROCPRIM_FORCE_INLINE
+    void load(InputIterator block_input,
+              T (&items)[ItemsPerThread],
+              unsigned int valid,
+              Default out_of_bounds)
+    {
+        using value_type = typename std::iterator_traits<InputIterator>::value_type;
+        static_assert(std::is_convertible<value_type, T>::value,
+                      "The type T must be such that an object of type InputIterator "
+                      "can be dereferenced and then implicitly converted to T.");
+        ROCPRIM_SHARED_MEMORY storage_type storage;
+        const unsigned int flat_id = ::rocprim::flat_block_thread_id<BlockSizeX, BlockSizeY, BlockSizeZ>();
+        block_load_direct_striped<BlockSize>(flat_id, block_input, items, valid,
+                                             out_of_bounds);
+        block_exchange_type().striped_to_blocked(items, items, storage);
+    }
+
+    /// Same as load(block_input, items), but the exchange uses the
+    /// caller-provided \p storage.
+    template<class InputIterator>
+    ROCPRIM_DEVICE ROCPRIM_INLINE
+    void load(InputIterator block_input,
+              T (&items)[ItemsPerThread],
+              storage_type& storage)
+    {
+        using value_type = typename std::iterator_traits<InputIterator>::value_type;
+        static_assert(std::is_convertible<value_type, T>::value,
+                      "The type T must be such that an object of type InputIterator "
+                      "can be dereferenced and then implicitly converted to T.");
+        const unsigned int flat_id = ::rocprim::flat_block_thread_id<BlockSizeX, BlockSizeY, BlockSizeZ>();
+        block_load_direct_striped<BlockSize>(flat_id, block_input, items);
+        block_exchange_type().striped_to_blocked(items, items, storage);
+    }
+
+    /// Same as load(block_input, items, valid), but the exchange uses the
+    /// caller-provided \p storage.
+    template<class InputIterator>
+    ROCPRIM_DEVICE ROCPRIM_INLINE
+    void load(InputIterator block_input,
+              T (&items)[ItemsPerThread],
+              unsigned int valid,
+              storage_type& storage)
+    {
+        using value_type = typename std::iterator_traits<InputIterator>::value_type;
+        static_assert(std::is_convertible<value_type, T>::value,
+                      "The type T must be such that an object of type InputIterator "
+                      "can be dereferenced and then implicitly converted to T.");
+        const unsigned int flat_id = ::rocprim::flat_block_thread_id<BlockSizeX, BlockSizeY, BlockSizeZ>();
+        block_load_direct_striped<BlockSize>(flat_id, block_input, items, valid);
+        block_exchange_type().striped_to_blocked(items, items, storage);
+    }
+
+    /// Same as load(block_input, items, valid, out_of_bounds), but the
+    /// exchange uses the caller-provided \p storage.
+    template<
+        class InputIterator,
+        class Default
+    >
+    ROCPRIM_DEVICE ROCPRIM_INLINE
+    void load(InputIterator block_input,
+              T (&items)[ItemsPerThread],
+              unsigned int valid,
+              Default out_of_bounds,
+              storage_type& storage)
+    {
+        using value_type = typename std::iterator_traits<InputIterator>::value_type;
+        static_assert(std::is_convertible<value_type, T>::value,
+                      "The type T must be such that an object of type InputIterator "
+                      "can be dereferenced and then implicitly converted to T.");
+        const unsigned int flat_id = ::rocprim::flat_block_thread_id<BlockSizeX, BlockSizeY, BlockSizeZ>();
+        block_load_direct_striped<BlockSize>(flat_id, block_input, items, valid,
+                                             out_of_bounds);
+        block_exchange_type().striped_to_blocked(items, items, storage);
+    }
+};
+
+/// \brief Specialization of block_load for block_load_method::block_load_warp_transpose.
+///
+/// Every overload first performs a warp-striped load
+/// (block_load_direct_warp_striped) and then rearranges the per-thread items
+/// with block_exchange::warp_striped_to_blocked. Overloads without a
+/// \p storage parameter declare the exchange storage themselves with
+/// ROCPRIM_SHARED_MEMORY; the others use the storage supplied by the caller.
+template<
+    class T,
+    unsigned int BlockSizeX,
+    unsigned int ItemsPerThread,
+    unsigned int BlockSizeY,
+    unsigned int BlockSizeZ
+>
+class block_load<T, BlockSizeX, ItemsPerThread, block_load_method::block_load_warp_transpose, BlockSizeY, BlockSizeZ>
+{
+private:
+    // Total number of threads in the block.
+    static constexpr unsigned int BlockSize = BlockSizeX * BlockSizeY * BlockSizeZ;
+
+    using block_exchange_type = block_exchange<T, BlockSizeX, ItemsPerThread, BlockSizeY, BlockSizeZ>;
+
+public:
+    static_assert(BlockSize % ::rocprim::device_warp_size() == 0,
+                 "BlockSize must be a multiple of hardware warpsize");
+
+    /// Storage required by the warp-striped-to-blocked exchange.
+    using storage_type = typename block_exchange_type::storage_type;
+
+    /// Loads the items warp-striped and exchanges them into a blocked
+    /// arrangement, using internally declared exchange storage.
+    template<class InputIterator>
+    ROCPRIM_DEVICE ROCPRIM_FORCE_INLINE
+    void load(InputIterator block_input,
+              T (&items)[ItemsPerThread])
+    {
+        using input_value_type = typename std::iterator_traits<InputIterator>::value_type;
+        static_assert(std::is_convertible<input_value_type, T>::value,
+                      "The type T must be such that an object of type InputIterator "
+                      "can be dereferenced and then implicitly converted to T.");
+        ROCPRIM_SHARED_MEMORY storage_type storage;
+        const unsigned int thread_flat_id
+            = ::rocprim::flat_block_thread_id<BlockSizeX, BlockSizeY, BlockSizeZ>();
+        block_load_direct_warp_striped(thread_flat_id, block_input, items);
+        block_exchange_type().warp_striped_to_blocked(items, items, storage);
+    }
+
+    /// Guarded variant: \p valid is forwarded to the warp-striped load, then
+    /// the items are exchanged using internally declared exchange storage.
+    template<class InputIterator>
+    ROCPRIM_DEVICE ROCPRIM_FORCE_INLINE
+    void load(InputIterator block_input,
+              T (&items)[ItemsPerThread],
+              unsigned int valid)
+    {
+        using input_value_type = typename std::iterator_traits<InputIterator>::value_type;
+        static_assert(std::is_convertible<input_value_type, T>::value,
+                      "The type T must be such that an object of type InputIterator "
+                      "can be dereferenced and then implicitly converted to T.");
+        ROCPRIM_SHARED_MEMORY storage_type storage;
+        const unsigned int thread_flat_id
+            = ::rocprim::flat_block_thread_id<BlockSizeX, BlockSizeY, BlockSizeZ>();
+        block_load_direct_warp_striped(thread_flat_id, block_input, items, valid);
+        block_exchange_type().warp_striped_to_blocked(items, items, storage);
+    }
+
+    /// Guarded variant with a fall-back value: \p valid and \p out_of_bounds
+    /// are forwarded to the warp-striped load, then the items are exchanged
+    /// using internally declared exchange storage.
+    template<class InputIterator, class Default>
+    ROCPRIM_DEVICE ROCPRIM_FORCE_INLINE
+    void load(InputIterator block_input,
+              T (&items)[ItemsPerThread],
+              unsigned int valid,
+              Default out_of_bounds)
+    {
+        using input_value_type = typename std::iterator_traits<InputIterator>::value_type;
+        static_assert(std::is_convertible<input_value_type, T>::value,
+                      "The type T must be such that an object of type InputIterator "
+                      "can be dereferenced and then implicitly converted to T.");
+        ROCPRIM_SHARED_MEMORY storage_type storage;
+        const unsigned int thread_flat_id
+            = ::rocprim::flat_block_thread_id<BlockSizeX, BlockSizeY, BlockSizeZ>();
+        block_load_direct_warp_striped(thread_flat_id, block_input, items, valid, out_of_bounds);
+        block_exchange_type().warp_striped_to_blocked(items, items, storage);
+    }
+
+    /// Same as load(block_input, items), but the exchange uses the
+    /// caller-provided \p storage.
+    template<class InputIterator>
+    ROCPRIM_DEVICE ROCPRIM_INLINE
+    void load(InputIterator block_input,
+              T (&items)[ItemsPerThread],
+              storage_type& storage)
+    {
+        using input_value_type = typename std::iterator_traits<InputIterator>::value_type;
+        static_assert(std::is_convertible<input_value_type, T>::value,
+                      "The type T must be such that an object of type InputIterator "
+                      "can be dereferenced and then implicitly converted to T.");
+        const unsigned int thread_flat_id
+            = ::rocprim::flat_block_thread_id<BlockSizeX, BlockSizeY, BlockSizeZ>();
+        block_load_direct_warp_striped(thread_flat_id, block_input, items);
+        block_exchange_type().warp_striped_to_blocked(items, items, storage);
+    }
+
+    /// Same as load(block_input, items, valid), but the exchange uses the
+    /// caller-provided \p storage.
+    template<class InputIterator>
+    ROCPRIM_DEVICE ROCPRIM_INLINE
+    void load(InputIterator block_input,
+              T (&items)[ItemsPerThread],
+              unsigned int valid,
+              storage_type& storage)
+    {
+        using input_value_type = typename std::iterator_traits<InputIterator>::value_type;
+        static_assert(std::is_convertible<input_value_type, T>::value,
+                      "The type T must be such that an object of type InputIterator "
+                      "can be dereferenced and then implicitly converted to T.");
+        const unsigned int thread_flat_id
+            = ::rocprim::flat_block_thread_id<BlockSizeX, BlockSizeY, BlockSizeZ>();
+        block_load_direct_warp_striped(thread_flat_id, block_input, items, valid);
+        block_exchange_type().warp_striped_to_blocked(items, items, storage);
+    }
+
+    /// Same as load(block_input, items, valid, out_of_bounds), but the
+    /// exchange uses the caller-provided \p storage.
+    template<class InputIterator, class Default>
+    ROCPRIM_DEVICE ROCPRIM_INLINE
+    void load(InputIterator block_input,
+              T (&items)[ItemsPerThread],
+              unsigned int valid,
+              Default out_of_bounds,
+              storage_type& storage)
+    {
+        using input_value_type = typename std::iterator_traits<InputIterator>::value_type;
+        static_assert(std::is_convertible<input_value_type, T>::value,
+                      "The type T must be such that an object of type InputIterator "
+                      "can be dereferenced and then implicitly converted to T.");
+        const unsigned int thread_flat_id
+            = ::rocprim::flat_block_thread_id<BlockSizeX, BlockSizeY, BlockSizeZ>();
+        block_load_direct_warp_striped(thread_flat_id, block_input, items, valid, out_of_bounds);
+        block_exchange_type().warp_striped_to_blocked(items, items, storage);
+    }
+};
+
+#endif // DOXYGEN_SHOULD_SKIP_THIS
+
+END_ROCPRIM_NAMESPACE
+
+#endif // ROCPRIM_BLOCK_BLOCK_LOAD_HPP_
--- a/3rdparty/cub/rocprim/block/block_load_func.hpp
+++ b/3rdparty/cub/rocprim/block/block_load_func.hpp
+// Copyright (c) 2017-2021 Advanced Micro Devices, Inc. All rights reserved.
+//
+// Permission is hereby granted, free of charge, to any person obtaining a copy
+// of this software and associated documentation files (the "Software"), to deal
+// in the Software without restriction, including without limitation the rights
+// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+// copies of the Software, and to permit persons to whom the Software is
+// furnished to do so, subject to the following conditions:
+//
+// The above copyright notice and this permission notice shall be included in
+// all copies or substantial portions of the Software.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+// THE SOFTWARE.
+
+#ifndef ROCPRIM_BLOCK_BLOCK_LOAD_FUNC_HPP_
+#define ROCPRIM_BLOCK_BLOCK_LOAD_FUNC_HPP_
+
+#include "../config.hpp"
+#include "../detail/various.hpp"
+
+#include "../intrinsics.hpp"
+#include "../functional.hpp"
+#include "../types.hpp"
+
+BEGIN_ROCPRIM_NAMESPACE
+
+/// \addtogroup blockmodule
+/// @{
+
+/// \brief Loads data from contiguous memory into a blocked arrangement of items
+/// across the thread block.
+///
+/// The thread identified by \p flat_id reads the \p ItemsPerThread consecutive
+/// elements starting at position <tt>flat_id * ItemsPerThread</tt> of
+/// \p block_input and stores them, in order, into \p items.
+///
+/// \tparam InputIterator - [inferred] an iterator type for input (can be a simple
+/// pointer)
+/// \tparam T - [inferred] the data type
+/// \tparam ItemsPerThread - [inferred] the number of items to be processed by
+/// each thread
+///
+/// \param flat_id - a local flat 1D thread id in a block (tile) for the calling thread
+/// \param block_input - the input iterator from the thread block to load from
+/// \param items - array that data is loaded to
+template<class InputIterator, class T, unsigned int ItemsPerThread>
+ROCPRIM_DEVICE ROCPRIM_INLINE
+void block_load_direct_blocked(unsigned int flat_id,
+                               InputIterator block_input,
+                               T (&items)[ItemsPerThread])
+{
+    // Position of this thread's first element within the block's input.
+    const unsigned int first_item = flat_id * ItemsPerThread;
+    InputIterator thread_input = block_input + first_item;
+    ROCPRIM_UNROLL
+    for (unsigned int i = 0; i < ItemsPerThread; ++i)
+    {
+        items[i] = thread_input[i];
+    }
+}
+
+/// \brief Loads data from contiguous memory into a blocked arrangement of items
+/// across the thread block, guarded by the range \p valid.
+///
+/// The thread identified by \p flat_id is responsible for the
+/// \p ItemsPerThread consecutive positions starting at
+/// <tt>flat_id * ItemsPerThread</tt>. Only positions smaller than \p valid are
+/// read; the corresponding entries of \p items for all other positions are
+/// left unmodified.
+///
+/// \tparam InputIterator - [inferred] an iterator type for input (can be a simple
+/// pointer)
+/// \tparam T - [inferred] the data type
+/// \tparam ItemsPerThread - [inferred] the number of items to be processed by
+/// each thread
+///
+/// \param flat_id - a local flat 1D thread id in a block (tile) for the calling thread
+/// \param block_input - the input iterator from the thread block to load from
+/// \param items - array that data is loaded to
+/// \param valid - number of leading positions of the block's input that may be loaded
+template<class InputIterator, class T, unsigned int ItemsPerThread>
+ROCPRIM_DEVICE ROCPRIM_INLINE
+void block_load_direct_blocked(unsigned int flat_id,
+                               InputIterator block_input,
+                               T (&items)[ItemsPerThread],
+                               unsigned int valid)
+{
+    // Position of this thread's first element within the block's input.
+    const unsigned int first_item = flat_id * ItemsPerThread;
+    InputIterator thread_input = block_input + first_item;
+    ROCPRIM_UNROLL
+    for (unsigned int i = 0; i < ItemsPerThread; ++i)
+    {
+        const unsigned int position = i + first_item;
+        if (position < valid)
+        {
+            items[i] = thread_input[i];
+        }
+    }
+}
+
+/// \brief Loads data from contiguous memory into a blocked arrangement of items
+/// across the thread block, guarded by the range \p valid, with a fall-back
+/// value for out-of-bound elements.
+///
+/// Every entry of \p items is first set to \p out_of_bounds converted to \p T;
+/// afterwards the guarded overload of block_load_direct_blocked overwrites the
+/// entries whose positions are smaller than \p valid.
+///
+/// \tparam InputIterator - [inferred] an iterator type for input (can be a simple
+/// pointer)
+/// \tparam T - [inferred] the data type
+/// \tparam ItemsPerThread - [inferred] the number of items to be processed by
+/// each thread
+/// \tparam Default - [inferred] The data type of the default value
+///
+/// \param flat_id - a local flat 1D thread id in a block (tile) for the calling thread
+/// \param block_input - the input iterator from the thread block to load from
+/// \param items - array that data is loaded to
+/// \param valid - number of leading positions of the block's input that may be loaded
+/// \param out_of_bounds - default value assigned to out-of-bound items
+template<class InputIterator, class T, unsigned int ItemsPerThread, class Default>
+ROCPRIM_DEVICE ROCPRIM_INLINE
+void block_load_direct_blocked(unsigned int flat_id,
+                               InputIterator block_input,
+                               T (&items)[ItemsPerThread],
+                               unsigned int valid,
+                               Default out_of_bounds)
+{
+    // Pre-fill with the fall-back value so unloaded entries are well defined.
+    // TODO: Consider using std::fill for HIP-CPU, as uses memset() where appropriate
+    ROCPRIM_UNROLL
+    for (unsigned int i = 0; i < ItemsPerThread; ++i)
+    {
+        items[i] = static_cast<T>(out_of_bounds);
+    }
+
+    block_load_direct_blocked(flat_id, block_input, items, valid);
+}
+
+/// \brief Loads data from contiguous memory into a blocked arrangement of items
+/// across the thread block.
+///
+/// The block arrangement is assumed to be (block-threads * \p ItemsPerThread) items
+/// across a thread block. Each thread uses a \p flat_id to load a range of
+/// \p ItemsPerThread into \p items.
+///
+/// The input offset (\p block_input + offset) must be quad-item aligned.
+///
+/// The following conditions will prevent vectorization and switch to default
+/// block_load_direct_blocked:
+/// * \p ItemsPerThread is odd.
+/// * The datatype \p T is not a primitive or a HIP vector type (e.g. int2,
+/// int4, etc.)
+///
+/// \tparam T - [inferred] the input data type
+/// \tparam U - [inferred] the output data type
+/// \tparam ItemsPerThread - [inferred] the number of items to be processed by
+/// each thread
+///
+/// The type \p T must be such that it can be implicitly converted to \p U.
+///
+/// \param flat_id - a local flat 1D thread id in a block (tile) for the calling thread
+/// \param block_input - the input iterator from the thread block to load from
+/// \param items - array that data is loaded to
+template<class T, class U, unsigned int ItemsPerThread>
+ROCPRIM_DEVICE ROCPRIM_INLINE
+auto block_load_direct_blocked_vectorized(unsigned int flat_id,
+                                          T* block_input,
+                                          U (&items)[ItemsPerThread])
+    -> typename std::enable_if<detail::is_vectorizable<T, ItemsPerThread>::value>::type
+{
+    using vector_type = typename detail::match_vector_type<T, ItemsPerThread>::type;
+    constexpr unsigned int vectors_per_thread = (sizeof(T) * ItemsPerThread) / sizeof(vector_type);
+
+    // View the input as an array of vector_type and locate this thread's part.
+    const vector_type* thread_vectors
+        = reinterpret_cast<const vector_type*>(block_input) + (flat_id * vectors_per_thread);
+
+    // Stage the data with vector-sized reads.
+    vector_type staging[vectors_per_thread];
+    ROCPRIM_UNROLL
+    for (unsigned int v = 0; v < vectors_per_thread; ++v)
+    {
+        staging[v] = thread_vectors[v];
+    }
+
+    // Reinterpret the staged vectors as individual T values and copy them out.
+    T* staged_items = reinterpret_cast<T*>(staging);
+    ROCPRIM_UNROLL
+    for (unsigned int i = 0; i < ItemsPerThread; ++i)
+    {
+        items[i] = staged_items[i];
+    }
+}
+
+/// \brief Fallback overload, selected when detail::is_vectorizable is false for
+/// \p T and \p ItemsPerThread: forwards to block_load_direct_blocked.
+template<class T, class U, unsigned int ItemsPerThread>
+ROCPRIM_DEVICE ROCPRIM_INLINE
+auto block_load_direct_blocked_vectorized(unsigned int flat_id,
+                                          T* block_input,
+                                          U (&items)[ItemsPerThread])
+    -> typename std::enable_if<!detail::is_vectorizable<T, ItemsPerThread>::value>::type
+{
+    block_load_direct_blocked(flat_id, block_input, items);
+}
+
+/// \brief Loads data from contiguous memory into a striped arrangement of items
+/// across the thread block.
+///
+/// The thread identified by \p flat_id reads the elements at positions
+/// <tt>flat_id + i * BlockSize</tt> of \p block_input, for <tt>i</tt> in
+/// <tt>[0, ItemsPerThread)</tt>, and stores them into <tt>items[i]</tt>.
+///
+/// \tparam BlockSize - the number of threads in a block
+/// \tparam InputIterator - [inferred] an iterator type for input (can be a simple
+/// pointer)
+/// \tparam T - [inferred] the data type
+/// \tparam ItemsPerThread - [inferred] the number of items to be processed by
+/// each thread
+///
+/// \param flat_id - a local flat 1D thread id in a block (tile) for the calling thread
+/// \param block_input - the input iterator from the thread block to load from
+/// \param items - array that data is loaded to
+template<unsigned int BlockSize, class InputIterator, class T, unsigned int ItemsPerThread>
+ROCPRIM_DEVICE ROCPRIM_INLINE
+void block_load_direct_striped(unsigned int flat_id,
+                               InputIterator block_input,
+                               T (&items)[ItemsPerThread])
+{
+    InputIterator thread_input = block_input + flat_id;
+    ROCPRIM_UNROLL
+    for (unsigned int i = 0; i < ItemsPerThread; ++i)
+    {
+        // Consecutive items of one thread are BlockSize elements apart.
+        items[i] = thread_input[i * BlockSize];
+    }
+}
+
+/// \brief Loads data from contiguous memory into a striped arrangement of items
+/// across the thread block, guarded by the range \p valid.
+///
+/// The thread identified by \p flat_id is responsible for the positions
+/// <tt>flat_id + i * BlockSize</tt>, for <tt>i</tt> in
+/// <tt>[0, ItemsPerThread)</tt>. Only positions smaller than \p valid are
+/// read; the corresponding entries of \p items for all other positions are
+/// left unmodified.
+///
+/// \tparam BlockSize - the number of threads in a block
+/// \tparam InputIterator - [inferred] an iterator type for input (can be a simple
+/// pointer)
+/// \tparam T - [inferred] the data type
+/// \tparam ItemsPerThread - [inferred] the number of items to be processed by
+/// each thread
+///
+/// \param flat_id - a local flat 1D thread id in a block (tile) for the calling thread
+/// \param block_input - the input iterator from the thread block to load from
+/// \param items - array that data is loaded to
+/// \param valid - number of leading positions of the block's input that may be loaded
+template<unsigned int BlockSize, class InputIterator, class T, unsigned int ItemsPerThread>
+ROCPRIM_DEVICE ROCPRIM_INLINE
+void block_load_direct_striped(unsigned int flat_id,
+                               InputIterator block_input,
+                               T (&items)[ItemsPerThread],
+                               unsigned int valid)
+{
+    InputIterator thread_input = block_input + flat_id;
+    ROCPRIM_UNROLL
+    for (unsigned int i = 0; i < ItemsPerThread; ++i)
+    {
+        // Consecutive items of one thread are BlockSize elements apart.
+        const unsigned int stride_offset = i * BlockSize;
+        const unsigned int position = flat_id + stride_offset;
+        if (position < valid)
+        {
+            items[i] = thread_input[stride_offset];
+        }
+    }
+}
+
+/// \brief Loads data from contiguous memory into a striped arrangement of items
+/// across the thread block, guarded by the range \p valid, with a fall-back
+/// value for out-of-bound elements.
+///
+/// Every entry of \p items is first set to \p out_of_bounds converted to \p T;
+/// afterwards the guarded overload of block_load_direct_striped overwrites the
+/// entries whose positions are smaller than \p valid.
+///
+/// \tparam BlockSize - the number of threads in a block
+/// \tparam InputIterator - [inferred] an iterator type for input (can be a simple
+/// pointer)
+/// \tparam T - [inferred] the data type
+/// \tparam ItemsPerThread - [inferred] the number of items to be processed by
+/// each thread
+/// \tparam Default - [inferred] The data type of the default value
+///
+/// \param flat_id - a local flat 1D thread id in a block (tile) for the calling thread
+/// \param block_input - the input iterator from the thread block to load from
+/// \param items - array that data is loaded to
+/// \param valid - number of leading positions of the block's input that may be loaded
+/// \param out_of_bounds - default value assigned to out-of-bound items
+template<
+    unsigned int BlockSize,
+    class InputIterator,
+    class T,
+    unsigned int ItemsPerThread,
+    class Default
+>
+ROCPRIM_DEVICE ROCPRIM_INLINE
+void block_load_direct_striped(unsigned int flat_id,
+                               InputIterator block_input,
+                               T (&items)[ItemsPerThread],
+                               unsigned int valid,
+                               Default out_of_bounds)
+{
+    // Pre-fill with the fall-back value so unloaded entries are well defined.
+    // The explicit cast mirrors the blocked variant of this function, so both
+    // accept the same Default types.
+    ROCPRIM_UNROLL
+    for (unsigned int item = 0; item < ItemsPerThread; item++)
+    {
+        items[item] = static_cast<T>(out_of_bounds);
+    }
+
+    block_load_direct_striped<BlockSize>(flat_id, block_input, items, valid);
+}
+
+/// \brief Loads data from contiguous memory into a warp-striped arrangement of items
+/// across the thread block.
+///
+/// Each (logical) warp covers a contiguous range of
+/// (\p WarpSize * \p ItemsPerThread) elements of \p block_input. Within that
+/// range, a thread reads the elements at positions
+/// <tt>lane + i * WarpSize</tt>, for <tt>i</tt> in <tt>[0, ItemsPerThread)</tt>,
+/// where <tt>lane</tt> is obtained from detail::logical_lane_id<WarpSize>().
+///
+/// * The number of threads in the block must be a multiple of \p WarpSize.
+/// * The default \p WarpSize is a hardware warpsize and is an optimal value.
+/// * \p WarpSize must be a power of two and equal or less than the size of
+///   hardware warp.
+/// * Using \p WarpSize smaller than hardware warpsize could result in lower
+///   performance.
+///
+/// \tparam WarpSize - [optional] the number of threads in a warp
+/// \tparam InputIterator - [inferred] an iterator type for input (can be a simple
+/// pointer)
+/// \tparam T - [inferred] the data type
+/// \tparam ItemsPerThread - [inferred] the number of items to be processed by
+/// each thread
+///
+/// \param flat_id - a local flat 1D thread id in a block (tile) for the calling thread
+/// \param block_input - the input iterator from the thread block to load from
+/// \param items - array that data is loaded to
+template<
+    unsigned int WarpSize = device_warp_size(),
+    class InputIterator,
+    class T,
+    unsigned int ItemsPerThread
+>
+ROCPRIM_DEVICE ROCPRIM_INLINE
+void block_load_direct_warp_striped(unsigned int flat_id,
+                                    InputIterator block_input,
+                                    T (&items)[ItemsPerThread])
+{
+    // Note the trailing space: adjacent literals are concatenated verbatim.
+    static_assert(detail::is_power_of_two(WarpSize) && WarpSize <= device_warp_size(),
+                 "WarpSize must be a power of two and equal or less "
+                 "than the size of hardware warp.");
+    unsigned int thread_id = detail::logical_lane_id<WarpSize>();
+    unsigned int warp_id = flat_id / WarpSize;
+    // First element of the range owned by this thread's warp.
+    unsigned int warp_offset = warp_id * WarpSize * ItemsPerThread;
+
+    InputIterator thread_iter = block_input + thread_id + warp_offset;
+    ROCPRIM_UNROLL
+    for (unsigned int item = 0; item < ItemsPerThread; item++)
+    {
+        items[item] = thread_iter[item * WarpSize];
+    }
+}
+
+/// \brief Loads data from contiguous memory into a warp-striped arrangement of items
+/// across the thread block, guarded by the range \p valid.
+///
+/// Each (logical) warp covers a contiguous range of
+/// (\p WarpSize * \p ItemsPerThread) elements of \p block_input. Within that
+/// range, a thread is responsible for the positions
+/// <tt>lane + i * WarpSize</tt>, for <tt>i</tt> in <tt>[0, ItemsPerThread)</tt>,
+/// where <tt>lane</tt> is obtained from detail::logical_lane_id<WarpSize>().
+/// Only positions (counted from the start of \p block_input) smaller than
+/// \p valid are read; the corresponding entries of \p items for all other
+/// positions are left unmodified.
+///
+/// * The number of threads in the block must be a multiple of \p WarpSize.
+/// * The default \p WarpSize is a hardware warpsize and is an optimal value.
+/// * \p WarpSize must be a power of two and equal or less than the size of
+///   hardware warp.
+/// * Using \p WarpSize smaller than hardware warpsize could result in lower
+///   performance.
+///
+/// \tparam WarpSize - [optional] the number of threads in a warp
+/// \tparam InputIterator - [inferred] an iterator type for input (can be a simple
+/// pointer)
+/// \tparam T - [inferred] the data type
+/// \tparam ItemsPerThread - [inferred] the number of items to be processed by
+/// each thread
+///
+/// \param flat_id - a local flat 1D thread id in a block (tile) for the calling thread
+/// \param block_input - the input iterator from the thread block to load from
+/// \param items - array that data is loaded to
+/// \param valid - number of leading positions of the block's input that may be loaded
+template<
+    unsigned int WarpSize = device_warp_size(),
+    class InputIterator,
+    class T,
+    unsigned int ItemsPerThread
+>
+ROCPRIM_DEVICE ROCPRIM_INLINE
+void block_load_direct_warp_striped(unsigned int flat_id,
+                                    InputIterator block_input,
+                                    T (&items)[ItemsPerThread],
+                                    unsigned int valid)
+{
+    // Note the trailing space: adjacent literals are concatenated verbatim.
+    static_assert(detail::is_power_of_two(WarpSize) && WarpSize <= device_warp_size(),
+                 "WarpSize must be a power of two and equal or less "
+                 "than the size of hardware warp.");
+    unsigned int thread_id = detail::logical_lane_id<WarpSize>();
+    unsigned int warp_id = flat_id / WarpSize;
+    // First element of the range owned by this thread's warp.
+    unsigned int warp_offset = warp_id * WarpSize * ItemsPerThread;
+
+    InputIterator thread_iter = block_input + thread_id + warp_offset;
+    ROCPRIM_UNROLL
+    for (unsigned int item = 0; item < ItemsPerThread; item++)
+    {
+        unsigned int offset = item * WarpSize;
+        // Guard on the position relative to the start of the block's input.
+        if (warp_offset + thread_id + offset < valid)
+        {
+            items[item] = thread_iter[offset];
+        }
+    }
+}
+
+/// \brief Loads data from continuous memory into a warp-striped arrangement of items
+/// across the thread block, which is guarded by range with a fall-back value
+/// for out-of-bound elements.
+///
+/// The warp-striped arrangement is assumed to be (\p WarpSize * \p ItemsPerThread) items
+/// across a thread block. Each thread uses a \p flat_id to load a range of
+/// \p ItemsPerThread into \p items.
+///
+/// * The number of threads in the block must be a multiple of \p WarpSize.
+/// * The default \p WarpSize is a hardware warpsize and is an optimal value.
+/// * \p WarpSize must be a power of two and equal or less than the size of
+///   hardware warp.
+/// * Using \p WarpSize smaller than hardware warpsize could result in lower
+///   performance.
+/// * Every element of \p items is first set to \p out_of_bounds; the guarded
+///   load then overwrites only the elements whose source position is below \p valid.
+///
+/// \tparam WarpSize - [optional] the number of threads in a warp
+/// \tparam InputIterator - [inferred] an iterator type for input (can be a simple
+/// pointer)
+/// \tparam T - [inferred] the data type
+/// \tparam ItemsPerThread - [inferred] the number of items to be processed by
+/// each thread
+/// \tparam Default - [inferred] The data type of the default value
+///
+/// \param flat_id - a local flat 1D thread id in a block (tile) for the calling thread
+/// \param block_input - the input iterator from the thread block to load from
+/// \param items - array that data is loaded to
+/// \param valid - maximum range of valid numbers to load
+/// \param out_of_bounds - default value assigned to out-of-bound items
+template<
+    unsigned int WarpSize = device_warp_size(),
+    class InputIterator,
+    class T,
+    unsigned int ItemsPerThread,
+    class Default
+>
+ROCPRIM_DEVICE ROCPRIM_INLINE
+void block_load_direct_warp_striped(unsigned int flat_id,
+                                    InputIterator block_input,
+                                    T (&items)[ItemsPerThread],
+                                    unsigned int valid,
+                                    Default out_of_bounds)
+{
+    // The two literals are concatenated by the compiler, so the first one must
+    // end with a space to keep the diagnostic readable.
+    static_assert(detail::is_power_of_two(WarpSize) && WarpSize <= device_warp_size(),
+                 "WarpSize must be a power of two and equal or less "
+                 "than the size of hardware warp.");
+    // Pre-fill with the fall-back value (converted to T by the assignment).
+    ROCPRIM_UNROLL
+    for (unsigned int item = 0; item < ItemsPerThread; item++)
+    {
+        items[item] = out_of_bounds;
+    }
+
+    // Delegate to the guarded overload, which only writes in-range items.
+    block_load_direct_warp_striped<WarpSize>(flat_id, block_input, items, valid);
+}
+
+END_ROCPRIM_NAMESPACE
+
+/// @}
+// end of group blockmodule
+
+#endif // ROCPRIM_BLOCK_BLOCK_LOAD_FUNC_HPP_
--- a/3rdparty/cub/rocprim/block/block_radix_sort.hpp
+++ b/3rdparty/cub/rocprim/block/block_radix_sort.hpp
+// Copyright (c) 2017-2021 Advanced Micro Devices, Inc. All rights reserved.
+//
+// Permission is hereby granted, free of charge, to any person obtaining a copy
+// of this software and associated documentation files (the "Software"), to deal
+// in the Software without restriction, including without limitation the rights
+// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+// copies of the Software, and to permit persons to whom the Software is
+// furnished to do so, subject to the following conditions:
+//
+// The above copyright notice and this permission notice shall be included in
+// all copies or substantial portions of the Software.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+// THE SOFTWARE.
+
+#ifndef ROCPRIM_BLOCK_BLOCK_RADIX_SORT_HPP_
+#define ROCPRIM_BLOCK_BLOCK_RADIX_SORT_HPP_
+
+#include <type_traits>
+
+#include "../config.hpp"
+#include "../detail/various.hpp"
+#include "../detail/radix_sort.hpp"
+#include "../warp/detail/warp_scan_crosslane.hpp"
+
+#include "../intrinsics.hpp"
+#include "../functional.hpp"
+#include "../types.hpp"
+
+#include "block_exchange.hpp"
+
+/// \addtogroup blockmodule
+/// @{
+
+BEGIN_ROCPRIM_NAMESPACE
+
+namespace detail
+{
+
+/// Block-wide exclusive plus-scan specialized for 1-bit (bool-like) inputs; BlockSizeX/Y/Z are the block dimensions.
+/// It counts set flags per warp with ballot + bit count, scans the per-warp totals kept in the caller-provided storage,
+/// and combines them with per-lane masked bit counts. The original authors note this is several times faster
+/// than the generic scan and reduce classes because the hardware reports which lanes have a true predicate value.
+template<
+    unsigned int BlockSizeX,
+    unsigned int BlockSizeY = 1,
+    unsigned int BlockSizeZ = 1
+>
+class block_bit_plus_scan
+{
+    static constexpr unsigned int BlockSize = BlockSizeX * BlockSizeY * BlockSizeZ;
+    // Warp size used by this class, chosen by get_min_warp_size from the block size and the device warp size
+    static constexpr unsigned int warp_size =
+        detail::get_min_warp_size(BlockSize, ::rocprim::device_warp_size());
+    // Number of warps in the block (ceiling division, so a partial last warp is counted)
+    static constexpr unsigned int warps_no = (BlockSize + warp_size - 1) / warp_size;
+
+    // Warp-scan primitive used to turn the per-warp totals into prefix values
+    // (the scanned carry-outs of the warps that precede a given warp).
+    // warp_scan_crosslane is an implementation of warp_scan that does not need storage,
+    // but requires logical warp size to be a power of two, hence next_power_of_two(warps_no).
+    using warp_scan_prefix_type =
+        ::rocprim::detail::warp_scan_crosslane<unsigned int, detail::next_power_of_two(warps_no)>;
+
+public:
+
+    struct storage_type_
+    {
+        unsigned int warp_prefixes[warps_no]; // one slot per warp: first its total, later its inclusive prefix
+        // ---------- Shared memory optimisation ----------
+        // Since we use warp_scan_crosslane for warp scan, we don't need to allocate
+        // any temporary memory for it.
+    };
+
+    using storage_type = detail::raw_storage<storage_type_>;
+
+    template<unsigned int ItemsPerThread>
+    ROCPRIM_DEVICE ROCPRIM_INLINE
+    void exclusive_scan(const unsigned int (&input)[ItemsPerThread], // per-thread flags; used as ballot predicates and summed as-is below, so presumably 0 or 1
+                        unsigned int (&output)[ItemsPerThread], // receives the exclusive prefix sum of each item
+                        unsigned int& reduction, // receives the last warp's inclusive prefix, i.e. the block total
+                        storage_type& storage) // scratch for the per-warp values; both barriers below guard it
+    {
+        const unsigned int flat_id = ::rocprim::flat_block_thread_id<BlockSizeX, BlockSizeY, BlockSizeZ>();
+        const unsigned int lane_id = ::rocprim::lane_id();
+        const unsigned int warp_id = ::rocprim::warp_id(flat_id);
+        storage_type_& storage_ = storage.get();
+
+        unsigned int warp_reduction = ::rocprim::bit_count(::rocprim::ballot(input[0])); // set flags of item 0 across the warp
+        for(unsigned int i = 1; i < ItemsPerThread; i++)
+        {
+            warp_reduction += ::rocprim::bit_count(::rocprim::ballot(input[i])); // accumulate the remaining items
+        }
+        if(lane_id == 0) // one writer per warp
+        {
+            storage_.warp_prefixes[warp_id] = warp_reduction;
+        }
+        ::rocprim::syncthreads(); // make every warp's total visible before it is scanned
+
+        // Scan the per-warp totals in place; only the first warps_no threads take part
+        if(flat_id < warps_no)
+        {
+            unsigned int prefix = storage_.warp_prefixes[flat_id];
+            warp_scan_prefix_type().inclusive_scan(prefix, prefix, ::rocprim::plus<unsigned int>());
+            storage_.warp_prefixes[flat_id] = prefix; // slot now holds the inclusive prefix of warp flat_id
+        }
+#ifdef __HIP_CPU_RT__
+        else
+        {
+            // HIP-CPU has no lockstep execution, so the other threads issue a matching dummy scan to perform the same number of sync ops.
+            empty_type empty;
+            ::rocprim::detail::warp_scan_crosslane<empty_type, detail::next_power_of_two(warps_no)>().inclusive_scan(empty, empty, empty_binary_op{});
+        }
+#endif
+        ::rocprim::syncthreads(); // scanned prefixes must be visible before they are read below
+
+        // Exclusive warp scan of the bit values, chained over all items through lane_prefix
+        unsigned int lane_prefix = 0;
+        for(unsigned int i = 0; i < ItemsPerThread; i++)
+        {
+            lane_prefix = ::rocprim::masked_bit_count(::rocprim::ballot(input[i]), lane_prefix); // presumably set bits of lower lanes plus the running value - confirm in intrinsics
+        }
+
+        // First item: lane prefix plus the inclusive prefix of the preceding warp (warp 0 has none)
+        output[0] = warp_id == 0
+            ? lane_prefix
+            : lane_prefix + storage_.warp_prefixes[warp_id - 1];
+        for(unsigned int i = 1; i < ItemsPerThread; i++)
+        {
+            output[i] = output[i - 1] + input[i - 1]; // running sum over this thread's own earlier items
+        }
+
+        // The last warp's inclusive prefix is the total over the whole block
+        reduction = storage_.warp_prefixes[warps_no - 1];
+    }
+};
+
+} // end namespace detail
+
+/// \brief The block_radix_sort class is a block level parallel primitive which provides
+/// methods sorting items (keys or key-value pairs) partitioned across threads in a block
+/// using radix sort algorithm.
+///
+/// \tparam Key - the key type.
+/// \tparam BlockSize - the number of threads in a block.
+/// \tparam ItemsPerThread - the number of items contributed by each thread.
+/// \tparam Value - the value type. Default type empty_type indicates
+/// a keys-only sort.
+///
+/// \par Overview
+/// * \p Key type must be an arithmetic type (that is, an integral type or a floating-point
+/// type).
+/// * Performance depends on \p BlockSize and \p ItemsPerThread.
+///   * It is usually better if \p BlockSize is a multiple of the size of the hardware warp.
+///   * It is usually increased when \p ItemsPerThread is greater than one. However, when there
+///   are too many items per thread, each thread may need so much registers and/or shared memory
+///   that occupancy will fall too low, decreasing the performance.
+///   * If \p Key is an integer type and the range of keys is known in advance, the performance
+///   can be improved by setting \p begin_bit and \p end_bit, for example if all keys are in range
+///   [100, 10000], <tt>begin_bit = 0</tt> and <tt>end_bit = 14</tt> will cover the whole range.
+///
+/// \par Examples
+/// \parblock
+/// In the examples radix sort is performed on a block of 256 threads, each thread provides
+/// eight \p int value, results are returned using the same array as for input.
+///
+/// \code{.cpp}
+/// __global__ void example_kernel(...)
+/// {
+///     // specialize block_radix_sort for int, block of 256 threads,
+///     // and eight items per thread; key-only sort
+///     using block_rsort_int = rocprim::block_radix_sort<int, 256, 8>;
+///     // allocate storage in shared memory
+///     __shared__ block_rsort_int::storage_type storage;
+///
+///     int input[8] = ...;
+///     // execute block radix sort (ascending)
+///     block_rsort_int().sort(
+///         input,
+///         storage
+///     );
+///     ...
+/// }
+/// \endcode
+/// \endparblock
+template<
+    class Key,
+    unsigned int BlockSizeX,
+    unsigned int ItemsPerThread,
+    class Value = empty_type,
+    unsigned int BlockSizeY = 1,
+    unsigned int BlockSizeZ = 1
+>
+class block_radix_sort
+{
+    static constexpr unsigned int BlockSize = BlockSizeX * BlockSizeY * BlockSizeZ;
+    static constexpr bool with_values = !std::is_same<Value, empty_type>::value;
+
+    using bit_key_type = typename ::rocprim::detail::radix_key_codec<Key>::bit_key_type;
+    using bit_block_scan = detail::block_bit_plus_scan<BlockSizeX, BlockSizeY, BlockSizeZ>;
+
+    using bit_keys_exchange_type = ::rocprim::block_exchange<bit_key_type, BlockSizeX, ItemsPerThread, BlockSizeY, BlockSizeZ>;
+    using values_exchange_type = ::rocprim::block_exchange<Value, BlockSizeX, ItemsPerThread, BlockSizeY, BlockSizeZ>;
+
+    // Layout of this primitive's temporary storage; it is wrapped in detail::raw_storage to form the public storage_type.
+    struct storage_type_
+    {
+        union // the key and value exchange buffers overlay each other
+        {
+            typename bit_keys_exchange_type::storage_type bit_keys_exchange;
+            typename values_exchange_type::storage_type values_exchange;
+        };
+        typename block_radix_sort<Key,BlockSizeX,ItemsPerThread,Value,BlockSizeY,BlockSizeZ>::bit_block_scan::storage_type bit_block_scan; // member reuses the alias name, hence the class-qualified type
+    };
+
+public:
+
+    /// \brief Struct used to allocate a temporary memory that is required for thread
+    /// communication during operations provided by related parallel primitive.
+    ///
+    /// Depending on the implementation the operations exposed by parallel primitive may
+    /// require a temporary storage for thread communication. The storage should be allocated
+    /// using keywords <tt>__shared__</tt>. It can be aliased to
+    /// an externally allocated memory, or be a part of a union type with other storage types
+    /// to increase shared memory reusability.
+    #ifndef DOXYGEN_SHOULD_SKIP_THIS // hides storage_type implementation for Doxygen
+    using storage_type = detail::raw_storage<storage_type_>;
+    #else
+    using storage_type = storage_type_; // only for Doxygen
+    #endif
+
+    /// \brief Performs ascending radix sort over keys partitioned across threads in a block.
+    ///
+    /// \param [in, out] keys - reference to an array of keys provided by a thread.
+    /// \param [in] storage - reference to a temporary storage object of type storage_type.
+    /// \param [in] begin_bit - [optional] index of the first (least significant) bit used in
+    /// key comparison. Must be in range <tt>[0; 8 * sizeof(Key))</tt>. Default value: \p 0.
+    /// Non-default value not supported for floating-point key-types.
+    /// \param [in] end_bit - [optional] past-the-end index (most significant) bit used in
+    /// key comparison. Must be in range <tt>(begin_bit; 8 * sizeof(Key)]</tt>. Default
+    /// value: \p <tt>8 * sizeof(Key)</tt>. Non-default value not supported for floating-point key-types.
+    ///
+    /// \par Storage reusage
+    /// Synchronization barrier should be placed before \p storage is reused
+    /// or repurposed: \p __syncthreads() or \p rocprim::syncthreads().
+    ///
+    /// \par Examples
+    /// \parblock
+    /// In the examples radix sort is performed on a block of 128 threads, each thread provides
+    /// two \p float value, results are returned using the same array as for input.
+    ///
+    /// \code{.cpp}
+    /// __global__ void example_kernel(...)
+    /// {
+    ///     // specialize block_radix_sort for float, block of 128 threads,
+    ///     // and two items per thread; key-only sort
+    ///     using block_rsort_float = rocprim::block_radix_sort<float, 128, 2>;
+    ///     // allocate storage in shared memory
+    ///     __shared__ block_rsort_float::storage_type storage;
+    ///
+    ///     float input[2] = ...;
+    ///     // execute block radix sort (ascending)
+    ///     block_rsort_float().sort(
+    ///         input,
+    ///         storage
+    ///     );
+    ///     ...
+    /// }
+    /// \endcode
+    ///
+    /// If the \p input values across threads in a block are <tt>{[256, 255], ..., [4, 3], [2, 1]}</tt>,
+    /// then after sort they will be equal <tt>{[1, 2], [3, 4]  ..., [255, 256]}</tt>.
+    /// \endparblock
+    ROCPRIM_DEVICE ROCPRIM_INLINE
+    void sort(Key (&keys)[ItemsPerThread],
+              storage_type& storage,
+              unsigned int begin_bit = 0,
+              unsigned int end_bit = 8 * sizeof(Key)) // default covers every bit of Key
+    {
+        empty_type values[ItemsPerThread]; // placeholder array: this keys-only overload has no values to pass
+        sort_impl<false>(keys, values, storage, begin_bit, end_bit); // false here, true in sort_desc: presumably the descending flag
+    }
+
+    /// \overload
+    /// \brief Performs ascending radix sort over keys partitioned across threads in a block.
+    ///
+    /// * This overload does not accept storage argument. Required shared memory is
+    /// allocated by the method itself.
+    ///
+    /// \param [in, out] keys - reference to an array of keys provided by a thread.
+    /// \param [in] begin_bit - [optional] index of the first (least significant) bit used in
+    /// key comparison. Must be in range <tt>[0; 8 * sizeof(Key))</tt>. Default value: \p 0.
+    /// Non-default value not supported for floating-point key-types.
+    /// \param [in] end_bit - [optional] past-the-end index (most significant) bit used in
+    /// key comparison. Must be in range <tt>(begin_bit; 8 * sizeof(Key)]</tt>. Default
+    /// value: \p <tt>8 * sizeof(Key)</tt>. Non-default value not supported for floating-point key-types.
+    ROCPRIM_DEVICE ROCPRIM_FORCE_INLINE
+    void sort(Key (&keys)[ItemsPerThread],
+              unsigned int begin_bit = 0,
+              unsigned int end_bit = 8 * sizeof(Key)) // default covers every bit of Key
+    {
+        ROCPRIM_SHARED_MEMORY storage_type storage; // storage is declared by the method itself instead of being passed in
+        sort(keys, storage, begin_bit, end_bit); // forwards to the overload that takes explicit storage
+    }
+
+    /// \brief Performs descending radix sort over keys partitioned across threads in a block.
+    ///
+    /// \param [in, out] keys - reference to an array of keys provided by a thread.
+    /// \param [in] storage - reference to a temporary storage object of type storage_type.
+    /// \param [in] begin_bit - [optional] index of the first (least significant) bit used in
+    /// key comparison. Must be in range <tt>[0; 8 * sizeof(Key))</tt>. Default value: \p 0.
+    /// Non-default value not supported for floating-point key-types.
+    /// \param [in] end_bit - [optional] past-the-end index (most significant) bit used in
+    /// key comparison. Must be in range <tt>(begin_bit; 8 * sizeof(Key)]</tt>. Default
+    /// value: \p <tt>8 * sizeof(Key)</tt>. Non-default value not supported for floating-point key-types.
+    ///
+    /// \par Storage reusage
+    /// Synchronization barrier should be placed before \p storage is reused
+    /// or repurposed: \p __syncthreads() or \p rocprim::syncthreads().
+    ///
+    /// \par Examples
+    /// \parblock
+    /// In the examples radix sort is performed on a block of 128 threads, each thread provides
+    /// two \p float value, results are returned using the same array as for input.
+    ///
+    /// \code{.cpp}
+    /// __global__ void example_kernel(...)
+    /// {
+    ///     // specialize block_radix_sort for float, block of 128 threads,
+    ///     // and two items per thread; key-only sort
+    ///     using block_rsort_float = rocprim::block_radix_sort<float, 128, 2>;
+    ///     // allocate storage in shared memory
+    ///     __shared__ block_rsort_float::storage_type storage;
+    ///
+    ///     float input[2] = ...;
+    ///     // execute block radix sort (descending)
+    ///     block_rsort_float().sort_desc(
+    ///         input,
+    ///         storage
+    ///     );
+    ///     ...
+    /// }
+    /// \endcode
+    ///
+    /// If the \p input values across threads in a block are <tt>{[1, 2], [3, 4]  ..., [255, 256]}</tt>,
+    /// then after sort they will be equal <tt>{[256, 255], ..., [4, 3], [2, 1]}</tt>.
+    /// \endparblock
+    ROCPRIM_DEVICE ROCPRIM_INLINE
+    void sort_desc(Key (&keys)[ItemsPerThread],
+                   storage_type& storage,
+                   unsigned int begin_bit = 0,
+                   unsigned int end_bit = 8 * sizeof(Key)) // default covers every bit of Key
+    {
+        empty_type values[ItemsPerThread]; // placeholder array: this keys-only overload has no values to pass
+        sort_impl<true>(keys, values, storage, begin_bit, end_bit); // true here, false in sort: presumably the descending flag
+    }
+
+    /// \overload
+    /// \brief Performs descending radix sort over keys partitioned across threads in a block.
+    ///
+    /// * This overload does not accept storage argument. Required shared memory is
+    /// allocated by the method itself.
+    ///
+    /// \param [in, out] keys - reference to an array of keys provided by a thread.
+    /// \param [in] begin_bit - [optional] index of the first (least significant) bit used in
+    /// key comparison. Must be in range <tt>[0; 8 * sizeof(Key))</tt>. Default value: \p 0.
+    /// Non-default value not supported for floating-point key-types.
+    /// \param [in] end_bit - [optional] past-the-end index (most significant) bit used in
+    /// key comparison. Must be in range <tt>(begin_bit; 8 * sizeof(Key)]</tt>. Default
+    /// value: \p <tt>8 * sizeof(Key)</tt>. Non-default value not supported for floating-point key-types.
+    ROCPRIM_DEVICE ROCPRIM_FORCE_INLINE
+    void sort_desc(Key (&keys)[ItemsPerThread],
+                   unsigned int begin_bit = 0,
+                   unsigned int end_bit = 8 * sizeof(Key)) // default covers every bit of Key
+    {
+        ROCPRIM_SHARED_MEMORY storage_type storage; // storage is declared by the method itself instead of being passed in
+        sort_desc(keys, storage, begin_bit, end_bit); // forwards to the overload that takes explicit storage
+    }
+
+    /// \brief Performs ascending radix sort over key-value pairs partitioned across
+    /// threads in a block.
+    ///
+    /// \pre Method is enabled only if \p Value type is different than empty_type.
+    ///
+    /// \param [in, out] keys - reference to an array of keys provided by a thread.
+    /// \param [in, out] values - reference to an array of values provided by a thread.
+    /// \param [in] storage - reference to a temporary storage object of type storage_type.
+    /// \param [in] begin_bit - [optional] index of the first (least significant) bit used in
+    /// key comparison. Must be in range <tt>[0; 8 * sizeof(Key))</tt>. Default value: \p 0.
+    /// Non-default value not supported for floating-point key-types.
+    /// \param [in] end_bit - [optional] past-the-end index (most significant) bit used in
+    /// key comparison. Must be in range <tt>(begin_bit; 8 * sizeof(Key)]</tt>. Default
+    /// value: \p <tt>8 * sizeof(Key)</tt>. Non-default value not supported for floating-point key-types.
+    ///
+    /// \par Storage reusage
+    /// Synchronization barrier should be placed before \p storage is reused
+    /// or repurposed: \p __syncthreads() or \p rocprim::syncthreads().
+    ///
+    /// \par Examples
+    /// \parblock
+    /// In the examples radix sort is performed on a block of 128 threads, each thread provides
+    /// two key-value <tt>int</tt>-<tt>float</tt> pairs, results are returned using the same
+    /// arrays as for input.
+    ///
+    /// \code{.cpp}
+    /// __global__ void example_kernel(...)
+    /// {
+    ///     // specialize block_radix_sort for int-float pairs, block of 128
+    ///     // threads, and two items per thread
+    ///     using block_rsort_ii = rocprim::block_radix_sort<int, 128, 2, float>;
+    ///     // allocate storage in shared memory
+    ///     __shared__ block_rsort_ii::storage_type storage;
+    ///
+    ///     int keys[2] = ...;
+    ///     float values[2] = ...;
+    ///     // execute block radix sort-by-key (ascending)
+    ///     block_rsort_ii().sort(
+    ///         keys, values,
+    ///         storage
+    ///     );
+    ///     ...
+    /// }
+    /// \endcode
+    ///
+    /// If the \p keys across threads in a block are <tt>{[256, 255], ..., [4, 3], [2, 1]}</tt> and
+    /// the \p values are <tt>{[1, 1], [2, 2]  ..., [128, 128]}</tt>, then after sort the \p keys
+    /// will be equal <tt>{[1, 2], [3, 4]  ..., [255, 256]}</tt> and the \p values will be
+    /// equal <tt>{[128, 128], [127, 127]  ..., [2, 2], [1, 1]}</tt>.
+    /// \endparblock
+    template<bool WithValues = with_values> // defaults to whether Value differs from empty_type
+    ROCPRIM_DEVICE ROCPRIM_INLINE
+    void sort(Key (&keys)[ItemsPerThread],
+              typename std::enable_if<WithValues, Value>::type (&values)[ItemsPerThread], // type exists only when WithValues is true, which disables this overload for keys-only sorts
+              storage_type& storage,
+              unsigned int begin_bit = 0,
+              unsigned int end_bit = 8 * sizeof(Key)) // default covers every bit of Key
+    {
+        sort_impl<false>(keys, values, storage, begin_bit, end_bit); // false here, true in sort_desc: presumably the descending flag
+    }
+
+    /// \overload
+    /// \brief Performs ascending radix sort over key-value pairs partitioned across
+    /// threads in a block.
+    ///
+    /// * This overload does not accept storage argument. Required shared memory is
+    /// allocated by the method itself.
+    ///
+    /// \pre Method is enabled only if \p Value type is different than empty_type.
+    ///
+    /// \param [in, out] keys - reference to an array of keys provided by a thread.
+    /// \param [in, out] values - reference to an array of values provided by a thread.
+    /// \param [in] begin_bit - [optional] index of the first (least significant) bit used in
+    /// key comparison. Must be in range <tt>[0; 8 * sizeof(Key))</tt>. Default value: \p 0.
+    /// Non-default value not supported for floating-point key-types.
+    /// \param [in] end_bit - [optional] past-the-end index (most significant) bit used in
+    /// key comparison. Must be in range <tt>(begin_bit; 8 * sizeof(Key)]</tt>. Default
+    /// value: \p <tt>8 * sizeof(Key)</tt>. Non-default value not supported for floating-point key-types.
+    template<bool WithValues = with_values> // defaults to whether Value differs from empty_type
+    ROCPRIM_DEVICE ROCPRIM_FORCE_INLINE
+    void sort(Key (&keys)[ItemsPerThread],
+              typename std::enable_if<WithValues, Value>::type (&values)[ItemsPerThread], // type exists only when WithValues is true, which disables this overload for keys-only sorts
+              unsigned int begin_bit = 0,
+              unsigned int end_bit = 8 * sizeof(Key)) // default covers every bit of Key
+    {
+        ROCPRIM_SHARED_MEMORY storage_type storage; // storage is declared by the method itself instead of being passed in
+        sort(keys, values, storage, begin_bit, end_bit); // forwards to the overload that takes explicit storage
+    }
+
+    /// \brief Performs descending radix sort over key-value pairs partitioned across
+    /// threads in a block.
+    ///
+    /// \pre Method is enabled only if \p Value type is different than empty_type.
+    ///
+    /// \param [in, out] keys - reference to an array of keys provided by a thread.
+    /// \param [in, out] values - reference to an array of values provided by a thread.
+    /// \param [in] storage - reference to a temporary storage object of type storage_type.
+    /// \param [in] begin_bit - [optional] index of the first (least significant) bit used in
+    /// key comparison. Must be in range <tt>[0; 8 * sizeof(Key))</tt>. Default value: \p 0.
+    /// Non-default value not supported for floating-point key-types.
+    /// \param [in] end_bit - [optional] past-the-end index (most significant) bit used in
+    /// key comparison. Must be in range <tt>(begin_bit; 8 * sizeof(Key)]</tt>. Default
+    /// value: \p <tt>8 * sizeof(Key)</tt>. Non-default value not supported for floating-point key-types.
+    ///
+    /// \par Storage reusage
+    /// Synchronization barrier should be placed before \p storage is reused
+    /// or repurposed: \p __syncthreads() or \p rocprim::syncthreads().
+    ///
+    /// \par Examples
+    /// \parblock
+    /// In the examples radix sort is performed on a block of 128 threads, each thread provides
+    /// two key-value <tt>int</tt>-<tt>float</tt> pairs, results are returned using the same
+    /// arrays as for input.
+    ///
+    /// \code{.cpp}
+    /// __global__ void example_kernel(...)
+    /// {
+    ///     // specialize block_radix_sort for int-float pairs, block of 128
+    ///     // threads, and two items per thread
+    ///     using block_rsort_ii = rocprim::block_radix_sort<int, 128, 2, float>;
+    ///     // allocate storage in shared memory
+    ///     __shared__ block_rsort_ii::storage_type storage;
+    ///
+    ///     int keys[2] = ...;
+    ///     float values[2] = ...;
+    ///     // execute block radix sort-by-key (descending)
+    ///     block_rsort_ii().sort_desc(
+    ///         keys, values,
+    ///         storage
+    ///     );
+    ///     ...
+    /// }
+    /// \endcode
+    ///
+    /// If the \p keys across threads in a block are <tt>{[1, 2], [3, 4]  ..., [255, 256]}</tt> and
+    /// the \p values are <tt>{[128, 128], [127, 127]  ..., [2, 2], [1, 1]}</tt>, then after sort
+    /// the \p keys will be equal <tt>{[256, 255], ..., [4, 3], [2, 1]}</tt> and the \p values
+    /// will be equal <tt>{[1, 1], [2, 2]  ..., [128, 128]}</tt>.
+    /// \endparblock
+    template<bool WithValues = with_values> // defaults to whether Value differs from empty_type
+    ROCPRIM_DEVICE ROCPRIM_INLINE
+    void sort_desc(Key (&keys)[ItemsPerThread],
+                   typename std::enable_if<WithValues, Value>::type (&values)[ItemsPerThread], // type exists only when WithValues is true, which disables this overload for keys-only sorts
+                   storage_type& storage,
+                   unsigned int begin_bit = 0,
+                   unsigned int end_bit = 8 * sizeof(Key)) // default covers every bit of Key
+    {
+        sort_impl<true>(keys, values, storage, begin_bit, end_bit); // true here, false in sort: presumably the descending flag
+    }
+
+    /// \overload
+    /// \brief Performs descending radix sort over key-value pairs partitioned across
+    /// threads in a block.
+    ///
+    /// * This overload does not accept storage argument. Required shared memory is
+    /// allocated by the method itself.
+    ///
+    /// \pre Method is enabled only if \p Value type is different than empty_type.
+    ///
+    /// \param [in, out] keys - reference to an array of keys provided by a thread.
+    /// \param [in, out] values - reference to an array of values provided by a thread.
+    /// \param [in] begin_bit - [optional] index of the first (least significant) bit used in
+    /// key comparison. Must be in range <tt>[0; 8 * sizeof(Key))</tt>. Default value: \p 0.
+    /// Non-default value not supported for floating-point key-types.
+    /// \param [in] end_bit - [optional] past-the-end index (most significant) bit used in
+    /// key comparison. Must be in range <tt>(begin_bit; 8 * sizeof(Key)]</tt>. Default
+    /// value: \p <tt>8 * sizeof(Key)</tt>. Non-default value not supported for floating-point key-types.
+    template<bool WithValues = with_values> // defaults to whether Value differs from empty_type
+    ROCPRIM_DEVICE ROCPRIM_FORCE_INLINE
+    void sort_desc(Key (&keys)[ItemsPerThread],
+                   typename std::enable_if<WithValues, Value>::type (&values)[ItemsPerThread], // type exists only when WithValues is true, which disables this overload for keys-only sorts
+                   unsigned int begin_bit = 0,
+                   unsigned int end_bit = 8 * sizeof(Key)) // default covers every bit of Key
+    {
+        ROCPRIM_SHARED_MEMORY storage_type storage; // storage is declared by the method itself instead of being passed in
+        sort_desc(keys, values, storage, begin_bit, end_bit); // forwards to the overload that takes explicit storage
+    }
+
+    /// \brief Performs ascending radix sort over keys partitioned across threads in a block,
+    /// results are saved in a striped arrangement.
+    ///
+    /// \param [in, out] keys - reference to an array of keys provided by a thread.
+    /// \param [in] storage - reference to a temporary storage object of type storage_type.
+    /// \param [in] begin_bit - [optional] index of the first (least significant) bit used in
+    /// key comparison. Must be in range <tt>[0; 8 * sizeof(Key))</tt>. Default value: \p 0.
+    /// Non-default value not supported for floating-point key-types.
+    /// \param [in] end_bit - [optional] past-the-end index (most significant) bit used in
+    /// key comparison. Must be in range <tt>(begin_bit; 8 * sizeof(Key)]</tt>. Default
+    /// value: \p <tt>8 * sizeof(Key)</tt>. Non-default value not supported for floating-point key-types.
+    ///
+    /// \par Storage reusage
+    /// Synchronization barrier should be placed before \p storage is reused
+    /// or repurposed: \p __syncthreads() or \p rocprim::syncthreads().
+    ///
+    /// \par Examples
+    /// \parblock
+    /// In the examples radix sort is performed on a block of 128 threads, each thread provides
+    /// two \p float value, results are returned using the same array as for input.
+    ///
+    /// \code{.cpp}
+    /// __global__ void example_kernel(...)
+    /// {
+    ///     // specialize block_radix_sort for float, block of 128 threads,
+    ///     // and two items per thread; key-only sort
+    ///     using block_rsort_float = rocprim::block_radix_sort<float, 128, 2>;
+    ///     // allocate storage in shared memory
+    ///     __shared__ block_rsort_float::storage_type storage;
+    ///
+    ///     float keys[2] = ...;
+    ///     // execute block radix sort (ascending)
+    ///     block_rsort_float().sort_to_striped(
+    ///         keys,
+    ///         storage
+    ///     );
+    ///     ...
+    /// }
+    /// \endcode
+    ///
+    /// If the \p input values across threads in a block are <tt>{[256, 255], ..., [4, 3], [2, 1]}</tt>,
+    /// then after sort they will be equal <tt>{[1, 129], [2, 130]  ..., [128, 256]}</tt>.
+    /// \endparblock
+    ROCPRIM_DEVICE ROCPRIM_INLINE
+    void sort_to_striped(Key (&keys)[ItemsPerThread],
+                         storage_type& storage,
+                         unsigned int begin_bit = 0,
+                         unsigned int end_bit = 8 * sizeof(Key))
+    {
+        // Key-only ascending sort with striped output. sort_impl always takes a
+        // values array, so pass a placeholder array of empty_type; the empty_type
+        // overloads of the value-exchange helpers do nothing with it.
+        constexpr bool descending_order = false;
+        constexpr bool striped_output   = true;
+
+        empty_type no_values[ItemsPerThread];
+        sort_impl<descending_order, striped_output>(keys, no_values, storage, begin_bit, end_bit);
+    }
+
+    /// \overload
+    /// \brief Performs ascending radix sort over keys partitioned across threads in a block,
+    /// results are saved in a striped arrangement.
+    ///
+    /// * This overload does not accept storage argument. Required shared memory is
+    /// allocated by the method itself.
+    ///
+    /// \param [in, out] keys - reference to an array of keys provided by a thread.
+    /// \param [in] begin_bit - [optional] index of the first (least significant) bit used in
+    /// key comparison. Must be in range <tt>[0; 8 * sizeof(Key))</tt>. Default value: \p 0.
+    /// Non-default value not supported for floating-point key-types.
+    /// \param [in] end_bit - [optional] past-the-end index (most significant) bit used in
+    /// key comparison. Must be in range <tt>(begin_bit; 8 * sizeof(Key)]</tt>. Default
+    /// value: \p <tt>8 * sizeof(Key)</tt>. Non-default value not supported for floating-point key-types.
+    ROCPRIM_DEVICE ROCPRIM_FORCE_INLINE
+    void sort_to_striped(Key (&keys)[ItemsPerThread],
+                         unsigned int begin_bit = 0,
+                         unsigned int end_bit = 8 * sizeof(Key))
+    {
+        // Convenience overload: the temporary storage is declared here with
+        // ROCPRIM_SHARED_MEMORY and handed to the overload that takes it explicitly.
+        ROCPRIM_SHARED_MEMORY storage_type block_storage;
+        sort_to_striped(keys, block_storage, begin_bit, end_bit);
+    }
+
+    /// \brief Performs descending radix sort over keys partitioned across threads in a block,
+    /// results are saved in a striped arrangement.
+    ///
+    /// \param [in, out] keys - reference to an array of keys provided by a thread.
+    /// \param [in] storage - reference to a temporary storage object of type storage_type.
+    /// \param [in] begin_bit - [optional] index of the first (least significant) bit used in
+    /// key comparison. Must be in range <tt>[0; 8 * sizeof(Key))</tt>. Default value: \p 0.
+    /// Non-default value not supported for floating-point key-types.
+    /// \param [in] end_bit - [optional] past-the-end index (most significant) bit used in
+    /// key comparison. Must be in range <tt>(begin_bit; 8 * sizeof(Key)]</tt>. Default
+    /// value: \p <tt>8 * sizeof(Key)</tt>. Non-default value not supported for floating-point key-types.
+    ///
+    /// \par Storage reusage
+    /// Synchronization barrier should be placed before \p storage is reused
+    /// or repurposed: \p __syncthreads() or \p rocprim::syncthreads().
+    ///
+    /// \par Examples
+    /// \parblock
+    /// In the examples radix sort is performed on a block of 128 threads, each thread provides
+    /// two \p float value, results are returned using the same array as for input.
+    ///
+    /// \code{.cpp}
+    /// __global__ void example_kernel(...)
+    /// {
+    ///     // specialize block_radix_sort for float, block of 128 threads,
+    ///     // and two items per thread; key-only sort
+    ///     using block_rsort_float = rocprim::block_radix_sort<float, 128, 2>;
+    ///     // allocate storage in shared memory
+    ///     __shared__ block_rsort_float::storage_type storage;
+    ///
+    ///     float input[2] = ...;
+    ///     // execute block radix sort (descending)
+    ///     block_rsort_float().sort_desc_to_striped(
+    ///         input,
+    ///         storage
+    ///     );
+    ///     ...
+    /// }
+    /// \endcode
+    ///
+    /// If the \p input values across threads in a block are <tt>{[1, 2], [3, 4]  ..., [255, 256]}</tt>,
+    /// then after sort they will be equal <tt>{[256, 128], ..., [130, 2], [129, 1]}</tt>.
+    /// \endparblock
+    ROCPRIM_DEVICE ROCPRIM_INLINE
+    void sort_desc_to_striped(Key (&keys)[ItemsPerThread],
+                              storage_type& storage,
+                              unsigned int begin_bit = 0,
+                              unsigned int end_bit = 8 * sizeof(Key))
+    {
+        // Key-only descending sort with striped output. A placeholder array of
+        // empty_type stands in for the values that sort_impl expects.
+        constexpr bool descending_order = true;
+        constexpr bool striped_output   = true;
+
+        empty_type no_values[ItemsPerThread];
+        sort_impl<descending_order, striped_output>(keys, no_values, storage, begin_bit, end_bit);
+    }
+
+    /// \overload
+    /// \brief Performs descending radix sort over keys partitioned across threads in a block,
+    /// results are saved in a striped arrangement.
+    ///
+    /// * This overload does not accept storage argument. Required shared memory is
+    /// allocated by the method itself.
+    ///
+    /// \param [in, out] keys - reference to an array of keys provided by a thread.
+    /// \param [in] begin_bit - [optional] index of the first (least significant) bit used in
+    /// key comparison. Must be in range <tt>[0; 8 * sizeof(Key))</tt>. Default value: \p 0.
+    /// Non-default value not supported for floating-point key-types.
+    /// \param [in] end_bit - [optional] past-the-end index (most significant) bit used in
+    /// key comparison. Must be in range <tt>(begin_bit; 8 * sizeof(Key)]</tt>. Default
+    /// value: \p <tt>8 * sizeof(Key)</tt>. Non-default value not supported for floating-point key-types.
+    ROCPRIM_DEVICE ROCPRIM_FORCE_INLINE
+    void sort_desc_to_striped(Key (&keys)[ItemsPerThread],
+                              unsigned int begin_bit = 0,
+                              unsigned int end_bit = 8 * sizeof(Key))
+    {
+        // Convenience overload: the temporary storage is declared here with
+        // ROCPRIM_SHARED_MEMORY and handed to the overload that takes it explicitly.
+        ROCPRIM_SHARED_MEMORY storage_type block_storage;
+        sort_desc_to_striped(keys, block_storage, begin_bit, end_bit);
+    }
+
+    /// \brief Performs ascending radix sort over key-value pairs partitioned across
+    /// threads in a block, results are saved in a striped arrangement.
+    ///
+    /// \pre Method is enabled only if \p Value type is different than empty_type.
+    ///
+    /// \param [in, out] keys - reference to an array of keys provided by a thread.
+    /// \param [in, out] values - reference to an array of values provided by a thread.
+    /// \param [in] storage - reference to a temporary storage object of type storage_type.
+    /// \param [in] begin_bit - [optional] index of the first (least significant) bit used in
+    /// key comparison. Must be in range <tt>[0; 8 * sizeof(Key))</tt>. Default value: \p 0.
+    /// Non-default value not supported for floating-point key-types.
+    /// \param [in] end_bit - [optional] past-the-end index (most significant) bit used in
+    /// key comparison. Must be in range <tt>(begin_bit; 8 * sizeof(Key)]</tt>. Default
+    /// value: \p <tt>8 * sizeof(Key)</tt>. Non-default value not supported for floating-point key-types.
+    ///
+    /// \par Storage reusage
+    /// Synchronization barrier should be placed before \p storage is reused
+    /// or repurposed: \p __syncthreads() or \p rocprim::syncthreads().
+    ///
+    /// \par Examples
+    /// \parblock
+    /// In the examples radix sort is performed on a block of 4 threads, each thread provides
+    /// two key-value <tt>int</tt>-<tt>float</tt> pairs, results are returned using the same
+    /// arrays as for input.
+    ///
+    /// \code{.cpp}
+    /// __global__ void example_kernel(...)
+    /// {
+    ///     // specialize block_radix_sort for int-float pairs, block of 4
+    ///     // threads, and two items per thread
+    ///     using block_rsort_ii = rocprim::block_radix_sort<int, 4, 2, float>;
+    ///     // allocate storage in shared memory
+    ///     __shared__ block_rsort_ii::storage_type storage;
+    ///
+    ///     int keys[2] = ...;
+    ///     float values[2] = ...;
+    ///     // execute block radix sort-by-key (ascending)
+    ///     block_rsort_ii().sort_to_striped(
+    ///         keys, values,
+    ///         storage
+    ///     );
+    ///     ...
+    /// }
+    /// \endcode
+    ///
+    /// If the \p keys across threads in a block are <tt>{[8, 7], [6, 5], [4, 3], [2, 1]}</tt> and
+    /// the \p values are <tt>{[-1, -2], [-3, -4], [-5, -6], [-7, -8]}</tt>, then after sort the
+    /// \p keys will be equal <tt>{[1, 5], [2, 6], [3, 7], [4, 8]}</tt> and the \p values will be
+    /// equal <tt>{[-8, -4], [-7, -3], [-6, -2], [-5, -1]}</tt>.
+    /// \endparblock
+    template<bool WithValues = with_values>
+    ROCPRIM_DEVICE ROCPRIM_INLINE
+    void sort_to_striped(Key (&keys)[ItemsPerThread],
+                         typename std::enable_if<WithValues, Value>::type (&values)[ItemsPerThread],
+                         storage_type& storage,
+                         unsigned int begin_bit = 0,
+                         unsigned int end_bit = 8 * sizeof(Key))
+    {
+        // Key-value ascending sort with striped output; values are reordered
+        // together with their keys by sort_impl.
+        constexpr bool descending_order = false;
+        constexpr bool striped_output   = true;
+
+        sort_impl<descending_order, striped_output>(keys, values, storage, begin_bit, end_bit);
+    }
+
+    /// \overload
+    /// \brief Performs ascending radix sort over key-value pairs partitioned across
+    /// threads in a block, results are saved in a striped arrangement.
+    ///
+    /// * This overload does not accept storage argument. Required shared memory is
+    /// allocated by the method itself.
+    ///
+    /// \param [in, out] keys - reference to an array of keys provided by a thread.
+    /// \param [in, out] values - reference to an array of values provided by a thread.
+    /// \param [in] begin_bit - [optional] index of the first (least significant) bit used in
+    /// key comparison. Must be in range <tt>[0; 8 * sizeof(Key))</tt>. Default value: \p 0.
+    /// Non-default value not supported for floating-point key-types.
+    /// \param [in] end_bit - [optional] past-the-end index (most significant) bit used in
+    /// key comparison. Must be in range <tt>(begin_bit; 8 * sizeof(Key)]</tt>. Default
+    /// value: \p <tt>8 * sizeof(Key)</tt>. Non-default value not supported for floating-point key-types.
+    template<bool WithValues = with_values>
+    ROCPRIM_DEVICE ROCPRIM_FORCE_INLINE
+    void sort_to_striped(Key (&keys)[ItemsPerThread],
+                         typename std::enable_if<WithValues, Value>::type (&values)[ItemsPerThread],
+                         unsigned int begin_bit = 0,
+                         unsigned int end_bit = 8 * sizeof(Key))
+    {
+        // Convenience overload: the temporary storage is declared here with
+        // ROCPRIM_SHARED_MEMORY and handed to the overload that takes it explicitly.
+        ROCPRIM_SHARED_MEMORY storage_type block_storage;
+        sort_to_striped(keys, values, block_storage, begin_bit, end_bit);
+    }
+
+    /// \brief Performs descending radix sort over key-value pairs partitioned across
+    /// threads in a block, results are saved in a striped arrangement.
+    ///
+    /// \pre Method is enabled only if \p Value type is different than empty_type.
+    ///
+    /// \param [in, out] keys - reference to an array of keys provided by a thread.
+    /// \param [in, out] values - reference to an array of values provided by a thread.
+    /// \param [in] storage - reference to a temporary storage object of type storage_type.
+    /// \param [in] begin_bit - [optional] index of the first (least significant) bit used in
+    /// key comparison. Must be in range <tt>[0; 8 * sizeof(Key))</tt>. Default value: \p 0.
+    /// Non-default value not supported for floating-point key-types.
+    /// \param [in] end_bit - [optional] past-the-end index (most significant) bit used in
+    /// key comparison. Must be in range <tt>(begin_bit; 8 * sizeof(Key)]</tt>. Default
+    /// value: \p <tt>8 * sizeof(Key)</tt>. Non-default value not supported for floating-point key-types.
+    ///
+    /// \par Storage reusage
+    /// Synchronization barrier should be placed before \p storage is reused
+    /// or repurposed: \p __syncthreads() or \p rocprim::syncthreads().
+    ///
+    /// \par Examples
+    /// \parblock
+    /// In the examples radix sort is performed on a block of 4 threads, each thread provides
+    /// two key-value <tt>int</tt>-<tt>float</tt> pairs, results are returned using the same
+    /// arrays as for input.
+    ///
+    /// \code{.cpp}
+    /// __global__ void example_kernel(...)
+    /// {
+    ///     // specialize block_radix_sort for int-float pairs, block of 4
+    ///     // threads, and two items per thread
+    ///     using block_rsort_ii = rocprim::block_radix_sort<int, 4, 2, float>;
+    ///     // allocate storage in shared memory
+    ///     __shared__ block_rsort_ii::storage_type storage;
+    ///
+    ///     int keys[2] = ...;
+    ///     float values[2] = ...;
+    ///     // execute block radix sort-by-key (descending)
+    ///     block_rsort_ii().sort_desc_to_striped(
+    ///         keys, values,
+    ///         storage
+    ///     );
+    ///     ...
+    /// }
+    /// \endcode
+    ///
+    /// If the \p keys across threads in a block are <tt>{[1, 2], [3, 4], [5, 6], [7, 8]}</tt> and
+    /// the \p values are <tt>{[80, 70], [60, 50], [40, 30], [20, 10]}</tt>, then after sort the
+    /// \p keys will be equal <tt>{[8, 4], [7, 3], [6, 2], [5, 1]}</tt> and the \p values will be
+    /// equal <tt>{[10, 50], [20, 60], [30, 70], [40, 80]}</tt>.
+    /// \endparblock
+    template<bool WithValues = with_values>
+    ROCPRIM_DEVICE ROCPRIM_INLINE
+    void sort_desc_to_striped(Key (&keys)[ItemsPerThread],
+                              typename std::enable_if<WithValues, Value>::type (&values)[ItemsPerThread],
+                              storage_type& storage,
+                              unsigned int begin_bit = 0,
+                              unsigned int end_bit = 8 * sizeof(Key))
+    {
+        // Key-value descending sort with striped output; values are reordered
+        // together with their keys by sort_impl.
+        constexpr bool descending_order = true;
+        constexpr bool striped_output   = true;
+
+        sort_impl<descending_order, striped_output>(keys, values, storage, begin_bit, end_bit);
+    }
+
+    /// \overload
+    /// \brief Performs descending radix sort over key-value pairs partitioned across
+    /// threads in a block, results are saved in a striped arrangement.
+    ///
+    /// * This overload does not accept storage argument. Required shared memory is
+    /// allocated by the method itself.
+    ///
+    /// \param [in, out] keys - reference to an array of keys provided by a thread.
+    /// \param [in, out] values - reference to an array of values provided by a thread.
+    /// \param [in] begin_bit - [optional] index of the first (least significant) bit used in
+    /// key comparison. Must be in range <tt>[0; 8 * sizeof(Key))</tt>. Default value: \p 0.
+    /// Non-default value not supported for floating-point key-types.
+    /// \param [in] end_bit - [optional] past-the-end index (most significant) bit used in
+    /// key comparison. Must be in range <tt>(begin_bit; 8 * sizeof(Key)]</tt>. Default
+    /// value: \p <tt>8 * sizeof(Key)</tt>. Non-default value not supported for floating-point key-types.
+    template<bool WithValues = with_values>
+    ROCPRIM_DEVICE ROCPRIM_FORCE_INLINE
+    void sort_desc_to_striped(Key (&keys)[ItemsPerThread],
+                              typename std::enable_if<WithValues, Value>::type (&values)[ItemsPerThread],
+                              unsigned int begin_bit = 0,
+                              unsigned int end_bit = 8 * sizeof(Key))
+    {
+        // Convenience overload: the temporary storage is declared here with
+        // ROCPRIM_SHARED_MEMORY and handed to the overload that takes it explicitly.
+        ROCPRIM_SHARED_MEMORY storage_type block_storage;
+        sort_desc_to_striped(keys, values, block_storage, begin_bit, end_bit);
+    }
+
+private:
+
+    // Shared implementation behind the sort entry points of this class.
+    //
+    // Keys are encoded with radix_key_codec<Key, Descending>, then sorted one bit
+    // at a time over [begin_bit, end_bit): every pass ranks the items by the current
+    // bit and scatters keys (and values) to the ranked positions. If ToStriped is
+    // true, one extra blocked_to_striped exchange is applied after the last pass.
+    // Finally the encoded keys are decoded back into \p keys.
+    //
+    // Descending  - forwarded to the key codec.
+    // ToStriped   - when true, keys and values are converted to a striped arrangement.
+    // SortedValue - element type of \p values; for empty_type the non-template
+    //               overloads of exchange_values / to_striped_values are no-ops.
+    //
+    // If end_bit <= begin_bit the pass loop does not execute.
+    template<bool Descending, bool ToStriped = false, class SortedValue>
+    ROCPRIM_DEVICE ROCPRIM_INLINE
+    void sort_impl(Key (&keys)[ItemsPerThread],
+                   SortedValue (&values)[ItemsPerThread],
+                   storage_type& storage,
+                   unsigned int begin_bit,
+                   unsigned int end_bit)
+    {
+        using key_codec = ::rocprim::detail::radix_key_codec<Key, Descending>;
+        storage_type_& storage_ = storage.get();
+
+        const unsigned int flat_id = ::rocprim::flat_block_thread_id<BlockSizeX, BlockSizeY, BlockSizeZ>();
+
+        // Work on the codec's bit representation of the keys for the whole sort
+        bit_key_type bit_keys[ItemsPerThread];
+        for(unsigned int i = 0; i < ItemsPerThread; i++)
+        {
+            bit_keys[i] = key_codec::encode(keys[i]);
+        }
+
+        // Use binary digits (i.e. digits can be 0 or 1)
+        for(unsigned int bit = begin_bit; bit < end_bit; bit++)
+        {
+            // Extract the single-bit digit at position `bit` from every item
+            unsigned int bits[ItemsPerThread];
+            for(unsigned int i = 0; i < ItemsPerThread; i++)
+            {
+                bits[i] = key_codec::extract_digit(bit_keys[i], bit, 1);
+            }
+
+            unsigned int ranks[ItemsPerThread];
+#ifdef __HIP_CPU_RT__
+            // TODO: Check if really necessary
+            // Initialize contents, as non-hipcc compilers don't unconditionally zero out allocated memory
+            std::memset(ranks, 0, ItemsPerThread * sizeof(decltype(ranks[0])));
+#endif
+            // NOTE(review): presumably ranks[i] receives the number of 1-digits that precede
+            // item i in blocked order and count the block-wide total of 1-digits - confirm
+            // against bit_block_scan::exclusive_scan.
+            unsigned int count;
+            bit_block_scan().exclusive_scan(bits, ranks, count, storage_.bit_block_scan);
+
+            // Scatter keys to computed positions considering starting positions of their digit values
+            // Items with digit 1 occupy the last `count` slots, starting at `start`
+            const unsigned int start = BlockSize * ItemsPerThread - count;
+            for(unsigned int i = 0; i < ItemsPerThread; i++)
+            {
+                // Calculate position for the first digit (0) value based on positions of the second (1)
+                // (blocked index of the item minus its rank among the 1-digits)
+                ranks[i] = bits[i] != 0
+                    ? (start + ranks[i])
+                    : (flat_id * ItemsPerThread + i - ranks[i]);
+            }
+            // Keys go first; exchange_values issues its own barrier before touching storage
+            exchange_keys(storage, bit_keys, ranks);
+            exchange_values(storage, values, ranks);
+        }
+
+        // ToStriped is a template parameter, so this condition is a compile-time constant
+        if(ToStriped)
+        {
+            to_striped_keys(storage, bit_keys);
+            to_striped_values(storage, values);
+        }
+
+        // Convert the bit representation back to Key and write the result to the caller's array
+        for(unsigned int i = 0; i < ItemsPerThread; i++)
+        {
+            keys[i] = key_codec::decode(bit_keys[i]);
+        }
+    }
+
+    // Scatters the encoded keys to the positions given by \p ranks; the result is
+    // written back into \p bit_keys.
+    ROCPRIM_DEVICE ROCPRIM_INLINE
+    void exchange_keys(storage_type& storage,
+                       bit_key_type (&bit_keys)[ItemsPerThread],
+                       const unsigned int (&ranks)[ItemsPerThread])
+    {
+        auto& key_exchange_storage = storage.get().bit_keys_exchange;
+        // No barrier is issued at this point: bit_block_scan, which runs right
+        // before this call, already synchronizes.
+        bit_keys_exchange_type().scatter_to_blocked(bit_keys, bit_keys, ranks, key_exchange_storage);
+    }
+
+    // Scatters the values to the positions given by \p ranks; the result is
+    // written back into \p values.
+    template<class SortedValue>
+    ROCPRIM_DEVICE ROCPRIM_INLINE
+    void exchange_values(storage_type& storage,
+                         SortedValue (&values)[ItemsPerThread],
+                         const unsigned int (&ranks)[ItemsPerThread])
+    {
+        auto& value_exchange_storage = storage.get().values_exchange;
+        // The storage is a union that was just used for another exchange, so a
+        // barrier is required before it is reused here.
+        ::rocprim::syncthreads();
+        values_exchange_type().scatter_to_blocked(values, values, ranks, value_exchange_storage);
+    }
+
+    // Key-only overload: there are no values to move, so nothing is done.
+    ROCPRIM_DEVICE ROCPRIM_INLINE
+    void exchange_values(storage_type& storage,
+                         empty_type (&values)[ItemsPerThread],
+                         const unsigned int (&ranks)[ItemsPerThread])
+    {
+        // Mark every parameter as intentionally unused
+        static_cast<void>(storage);
+        static_cast<void>(values);
+        static_cast<void>(ranks);
+    }
+
+    // Converts the encoded keys from a blocked to a striped arrangement in place.
+    ROCPRIM_DEVICE ROCPRIM_INLINE
+    void to_striped_keys(storage_type& storage,
+                         bit_key_type (&bit_keys)[ItemsPerThread])
+    {
+        auto& key_exchange_storage = storage.get().bit_keys_exchange;
+        // Barrier before the exchange storage is used again
+        ::rocprim::syncthreads();
+        bit_keys_exchange_type().blocked_to_striped(bit_keys, bit_keys, key_exchange_storage);
+    }
+
+    // Converts the values from a blocked to a striped arrangement in place.
+    template<class SortedValue>
+    ROCPRIM_DEVICE ROCPRIM_INLINE
+    void to_striped_values(storage_type& storage,
+                           SortedValue (&values)[ItemsPerThread])
+    {
+        auto& value_exchange_storage = storage.get().values_exchange;
+        // The storage is a union that was just used for another exchange, so a
+        // barrier is required before it is reused here.
+        ::rocprim::syncthreads();
+        values_exchange_type().blocked_to_striped(values, values, value_exchange_storage);
+    }
+
+    // Key-only overload: there are no values to rearrange, so nothing is done.
+    ROCPRIM_DEVICE ROCPRIM_INLINE
+    void to_striped_values(storage_type& storage,
+                           empty_type * values)
+    {
+        // Mark both parameters as intentionally unused
+        static_cast<void>(storage);
+        static_cast<void>(values);
+    }
+};
+
+END_ROCPRIM_NAMESPACE
+
+/// @}
+// end of group blockmodule
+
+#endif // ROCPRIM_BLOCK_BLOCK_RADIX_SORT_HPP_
--- a/3rdparty/cub/rocprim/block/block_reduce.hpp
+++ b/3rdparty/cub/rocprim/block/block_reduce.hpp
+// Copyright (c) 2017-2021 Advanced Micro Devices, Inc. All rights reserved.
+//
+// Permission is hereby granted, free of charge, to any person obtaining a copy
+// of this software and associated documentation files (the "Software"), to deal
+// in the Software without restriction, including without limitation the rights
+// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+// copies of the Software, and to permit persons to whom the Software is
+// furnished to do so, subject to the following conditions:
+//
+// The above copyright notice and this permission notice shall be included in
+// all copies or substantial portions of the Software.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+// THE SOFTWARE.
+
+#ifndef ROCPRIM_BLOCK_BLOCK_REDUCE_HPP_
+#define ROCPRIM_BLOCK_BLOCK_REDUCE_HPP_
+
+#include <type_traits>
+
+#include "../config.hpp"
+#include "../detail/various.hpp"
+
+#include "../intrinsics.hpp"
+#include "../functional.hpp"
+
+#include "detail/block_reduce_warp_reduce.hpp"
+#include "detail/block_reduce_raking_reduce.hpp"
+
+
+/// \addtogroup blockmodule
+/// @{
+
+BEGIN_ROCPRIM_NAMESPACE
+
+/// \brief Available algorithms for block_reduce primitive.
+///
+/// An enumerator is passed as the \p Algorithm template argument of block_reduce
+/// and is mapped to a concrete implementation type by detail::select_block_reduce_impl.
+enum class block_reduce_algorithm
+{
+    /// \brief A warp_reduce based algorithm.
+    using_warp_reduce,
+    /// \brief An algorithm which limits calculations to a single hardware warp.
+    raking_reduce,
+    /// \brief Raking reduce variant that supports only commutative operators.
+    raking_reduce_commutative_only,
+    /// \brief Default block_reduce algorithm (alias of \p using_warp_reduce).
+    default_algorithm = using_warp_reduce,
+};
+
+namespace detail
+{
+
+// Selector for block_reduce algorithm which gives block reduce implementation
+// type based on passed block_reduce_algorithm enum.
+// Only the explicit specializations below are defined; each exposes an alias
+// template `type<T, BlockSizeX, BlockSizeY, BlockSizeZ>` naming the implementation class.
+template<block_reduce_algorithm Algorithm>
+struct select_block_reduce_impl;
+
+// using_warp_reduce -> block_reduce_warp_reduce
+template<>
+struct select_block_reduce_impl<block_reduce_algorithm::using_warp_reduce>
+{
+    template<class T, unsigned int BlockSizeX, unsigned int BlockSizeY, unsigned int BlockSizeZ>
+    using type = block_reduce_warp_reduce<T, BlockSizeX, BlockSizeY, BlockSizeZ>;
+};
+
+// raking_reduce -> block_reduce_raking_reduce (trailing template argument left at its default)
+template<>
+struct select_block_reduce_impl<block_reduce_algorithm::raking_reduce>
+{
+    template<class T, unsigned int BlockSizeX, unsigned int BlockSizeY, unsigned int BlockSizeZ>
+    using type = block_reduce_raking_reduce<T, BlockSizeX, BlockSizeY, BlockSizeZ>;
+};
+
+// raking_reduce_commutative_only -> block_reduce_raking_reduce with an extra `true` argument
+// NOTE(review): presumably that argument is the commutative-only switch - confirm in
+// detail/block_reduce_raking_reduce.hpp.
+template<>
+struct select_block_reduce_impl<block_reduce_algorithm::raking_reduce_commutative_only>
+{
+    template<class T, unsigned int BlockSizeX, unsigned int BlockSizeY, unsigned int BlockSizeZ>
+    using type = block_reduce_raking_reduce<T, BlockSizeX, BlockSizeY, BlockSizeZ, true>;
+};
+
+
+} // end namespace detail
+
+/// \brief The block_reduce class is a block level parallel primitive which provides methods
+/// for performing reductions operations on items partitioned across threads in a block.
+///
+/// \tparam T - the input/output type.
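+/// \tparam BlockSizeX - the number of threads in a block's x dimension.
+/// \tparam Algorithm - selected reduce algorithm, block_reduce_algorithm::default_algorithm by default.
+/// \tparam BlockSizeY - the number of threads in a block's y dimension, defaults to 1.
+/// \tparam BlockSizeZ - the number of threads in a block's z dimension, defaults to 1.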
+///
+/// \par Overview
+/// * Supports non-commutative reduce operators. However, a reduce operator should be
+/// associative. When used with non-associative functions the results may be non-deterministic
+/// and/or vary in precision.
+/// * Computation can be more efficient when:
+///   * \p ItemsPerThread is greater than one,
+///   * \p T is an arithmetic type,
+///   * reduce operation is simple addition operator, and
+///   * the number of threads in the block is a multiple of the hardware warp size (see rocprim::device_warp_size()).
+/// * block_reduce has three alternative implementations: \p block_reduce_algorithm::using_warp_reduce,
+///   \p block_reduce_algorithm::raking_reduce and \p block_reduce_algorithm::raking_reduce_commutative_only.
+/// * If the block size is less than 64, only one warp reduction is used. The block reduction algorithm
+///   stores the result only in the first thread (lane_id = 0, warp_id = 0) when the block size is
+///   larger than the warp size.
+///
+/// \par Examples
+/// \parblock
+/// In the examples reduce operation is performed on block of 192 threads, each provides
+/// one \p int value, result is returned using the same variable as for input.
+///
+/// \code{.cpp}
+/// __global__ void example_kernel(...)
+/// {
+///     // specialize block_reduce for int and block of 192 threads
+///     using block_reduce_int = rocprim::block_reduce<int, 192>;
+///     // allocate storage in shared memory
+///     __shared__ block_reduce_int::storage_type storage;
+///
+///     int value = ...;
+///     // execute reduce
+///     block_reduce_int().reduce(
+///         value, // input
+///         value, // output
+///         storage
+///     );
+///     ...
+/// }
+/// \endcode
+/// \endparblock
+template<
+    class T,
+    unsigned int BlockSizeX,
+    block_reduce_algorithm Algorithm = block_reduce_algorithm::default_algorithm,
+    unsigned int BlockSizeY = 1,
+    unsigned int BlockSizeZ = 1
+>
+class block_reduce
+#ifndef DOXYGEN_SHOULD_SKIP_THIS
+    : private detail::select_block_reduce_impl<Algorithm>::template type<T, BlockSizeX, BlockSizeY, BlockSizeZ>
+#endif
+{
+    using base_type = typename detail::select_block_reduce_impl<Algorithm>::template type<T, BlockSizeX, BlockSizeY, BlockSizeZ>;
+public:
+    /// \brief Struct used to allocate a temporary memory that is required for thread
+    /// communication during operations provided by related parallel primitive.
+    ///
+    /// Depending on the implementation the operations exposed by parallel primitive may
+    /// require a temporary storage for thread communication. The storage should be allocated
+    /// using keywords <tt>__shared__</tt>. It can be aliased to
+    /// an externally allocated memory, or be a part of a union type with other storage types
+    /// to increase shared memory reusability.
+    using storage_type = typename base_type::storage_type;
+
+    /// \brief Performs reduction across threads in a block.
+    ///
+    /// \tparam BinaryFunction - type of binary function used for reduce. Default type
+    /// is rocprim::plus<T>.
+    ///
+    /// \param [in] input - thread input value.
+    /// \param [out] output - reference to a thread output value. May be aliased with \p input.
+    /// \param [in] storage - reference to a temporary storage object of type storage_type.
+    /// \param [in] reduce_op - binary operation function object that will be used for reduce.
+    /// The signature of the function should be equivalent to the following:
+    /// <tt>T f(const T &a, const T &b);</tt>. The signature does not need to have
+    /// <tt>const &</tt>, but function object must not modify the objects passed to it.
+    ///
+    /// \par Storage reusage
+    /// Synchronization barrier should be placed before \p storage is reused
+    /// or repurposed: \p __syncthreads() or \p rocprim::syncthreads().
+    ///
+    /// \par Examples
+    /// \parblock
+    /// The examples present min reduce operations performed on a block of 256 threads,
+    /// each provides one \p float value.
+    ///
+    /// \code{.cpp}
+    /// __global__ void example_kernel(...) // blockDim.x = 256
+    /// {
+    ///     // specialize block_reduce for float and block of 256 threads
+    ///     using block_reduce_float = rocprim::block_reduce<float, 256>;
+    ///     // allocate storage in shared memory for the block
+    ///     __shared__ block_reduce_float::storage_type storage;
+    ///
+    ///     float input = ...;
+    ///     float output;
+    ///     // execute min reduce
+    ///     block_reduce_float().reduce(
+    ///         input,
+    ///         output,
+    ///         storage,
+    ///         rocprim::minimum<float>()
+    ///     );
+    ///     ...
+    /// }
+    /// \endcode
+    ///
+    /// If the \p input values across threads in a block are <tt>{1, -2, 3, -4, ..., 255, -256}</tt>, then
+    /// \p output value will be <tt>{-256}</tt>.
+    /// \endparblock
+    template<class BinaryFunction = ::rocprim::plus<T>>
+    ROCPRIM_DEVICE ROCPRIM_INLINE
+    void reduce(T input,
+                T& output,
+                storage_type& storage,
+                BinaryFunction reduce_op = BinaryFunction())
+    {
+        // Pure forwarding: the implementation class selected through Algorithm
+        // (base_type) performs the reduction using the caller-provided storage.
+        base_type::reduce(input, output, storage, reduce_op);
+    }
+
+    /// \overload
+    /// \brief Performs reduction across threads in a block.
+    ///
+    /// * This overload does not accept storage argument. Required shared memory is
+    /// allocated by the method itself.
+    ///
+    /// \tparam BinaryFunction - type of binary function used for reduce. Default type
+    /// is rocprim::plus<T>.
+    ///
+    /// \param [in] input - thread input value.
+    /// \param [out] output - reference to a thread output value. May be aliased with \p input.
+    /// \param [in] reduce_op - binary operation function object that will be used for reduce.
+    /// The signature of the function should be equivalent to the following:
+    /// <tt>T f(const T &a, const T &b);</tt>. The signature does not need to have
+    /// <tt>const &</tt>, but function object must not modify the objects passed to it.
+    template<class BinaryFunction = ::rocprim::plus<T>>
+    ROCPRIM_DEVICE ROCPRIM_FORCE_INLINE
+    void reduce(T input,
+                T& output,
+                BinaryFunction reduce_op = BinaryFunction())
+    {
+        // Pure forwarding to the storage-less overload of the selected implementation.
+        // NOTE(review): the temporary storage is presumably declared inside
+        // base_type::reduce, which is not visible here - confirm in the detail headers.
+        base_type::reduce(input, output, reduce_op);
+    }
+
+    /// \brief Performs reduction across threads in a block.
+    ///
+    /// \tparam ItemsPerThread - number of items in the \p input array.
+    /// \tparam BinaryFunction - type of binary function used for reduce. Default type
+    /// is rocprim::plus<T>.
+    ///
+    /// \param [in] input - reference to an array containing thread input values.
+    /// \param [out] output - reference to a thread output value.
+    /// \param [in] storage - reference to a temporary storage object of type storage_type.
+    /// \param [in] reduce_op - binary operation function object that will be used for reduce.
+    /// The signature of the function should be equivalent to the following:
+    /// <tt>T f(const T &a, const T &b);</tt>. The signature does not need to have
+    /// <tt>const &</tt>, but function object must not modify the objects passed to it.
+    ///
+    /// \par Storage reusage
+    /// Synchronization barrier should be placed before \p storage is reused
+    /// or repurposed: \p __syncthreads() or \p rocprim::syncthreads().
+    ///
+    /// \par Examples
+    /// \parblock
+    /// The examples present maximum reduce operations performed on a block of 128 threads,
+    /// each provides two \p long value.
+    ///
+    /// \code{.cpp}
+    /// __global__ void example_kernel(...) // blockDim.x = 128
+    /// {
+    ///     // specialize block_reduce for long and block of 128 threads
+    ///     using block_reduce_long = rocprim::block_reduce<long, 128>;
+    ///     // allocate storage in shared memory for the block
+    ///     __shared__ block_reduce_long::storage_type storage;
+    ///
+    ///     long input[2] = ...;
+    ///     long output[2];
+    ///     // execute max reduce
+    ///     block_reduce_long().reduce(
+    ///         input,
+    ///         output,
+    ///         storage,
+    ///         rocprim::maximum<long>()
+    ///     );
+    ///     ...
+    /// }
+    /// \endcode
+    ///
+    /// If the \p input values across threads in a block are <tt>{-1, 2, -3, 4, ..., -255, 256}</tt>, then
+    /// \p output value will be <tt>{256}</tt>.
+    /// \endparblock
+    template<
+        unsigned int ItemsPerThread,
+        class BinaryFunction = ::rocprim::plus<T>
+    >
+    ROCPRIM_DEVICE ROCPRIM_INLINE
+    void reduce(T (&input)[ItemsPerThread],
+                T& output, // a single value, not an array
+                storage_type& storage,
+                BinaryFunction reduce_op = BinaryFunction())
+    {
+        base_type::reduce(input, output, storage, reduce_op); // delegates to base_type, passing the caller-provided storage through
+    }
+
+    /// \overload
+    /// \brief Performs reduction across threads in a block.
+    ///
+    /// * This overload does not accept storage argument. Required shared memory is
+    /// allocated by the method itself.
+    ///
+    /// \tparam ItemsPerThread - number of items in the \p input array.
+    /// \tparam BinaryFunction - type of binary function used for reduce. Default type
+    /// is rocprim::plus<T>.
+    ///
+    /// \param [in] input - reference to an array containing thread input values.
+    /// \param [out] output - reference to a thread output value. May be aliased with \p input.
+    /// \param [in] reduce_op - binary operation function object that will be used for reduce.
+    /// The signature of the function should be equivalent to the following:
+    /// <tt>T f(const T &a, const T &b);</tt>. The signature does not need to have
+    /// <tt>const &</tt>, but function object must not modify the objects passed to it.
+    template<
+        unsigned int ItemsPerThread,
+        class BinaryFunction = ::rocprim::plus<T>
+    >
+    ROCPRIM_DEVICE ROCPRIM_FORCE_INLINE
+    void reduce(T (&input)[ItemsPerThread],
+                T& output, // a single value, not an array
+                BinaryFunction reduce_op = BinaryFunction())
+    {
+        base_type::reduce(input, output, reduce_op); // delegates to the base_type array overload that takes no storage argument
+    }
+
+    /// \brief Performs reduction across threads in a block.
+    ///
+    /// \tparam BinaryFunction - type of binary function used for reduce. Default type
+    /// is rocprim::plus<T>.
+    ///
+    /// \param [in] input - thread input value.
+    /// \param [out] output - reference to a thread output value. May be aliased with \p input.
+    /// \param [in] valid_items - number of items that will be reduced in the block.
+    /// \param [in] storage - reference to a temporary storage object of type storage_type.
+    /// \param [in] reduce_op - binary operation function object that will be used for reduce.
+    /// The signature of the function should be equivalent to the following:
+    /// <tt>T f(const T &a, const T &b);</tt>. The signature does not need to have
+    /// <tt>const &</tt>, but function object must not modify the objects passed to it.
+    ///
+    /// \par Storage reusage
+    /// Synchronization barrier should be placed before \p storage is reused
+    /// or repurposed: \p __syncthreads() or \p rocprim::syncthreads().
+    ///
+    /// \par Examples
+    /// \parblock
+    /// The examples present min reduce operations performed on a block of 256 threads,
+    /// each provides one \p float value.
+    ///
+    /// \code{.cpp}
+    /// __global__ void example_kernel(...) // blockDim.x = 256
+    /// {
+    ///     // specialize block_reduce for float and block of 256 threads
+    ///     using block_reduce_float = rocprim::block_reduce<float, 256>;
+    ///     // allocate storage in shared memory for the block
+    ///     __shared__ block_reduce_float::storage_type storage;
+    ///
+    ///     float input = ...;
+    ///     unsigned int valid_items = 250;
+    ///     float output;
+    ///     // execute min reduce
+    ///     block_reduce_float().reduce(
+    ///         input,
+    ///         output,
+    ///         valid_items,
+    ///         storage,
+    ///         rocprim::minimum<float>()
+    ///     );
+    ///     ...
+    /// }
+    /// \endcode
+    /// \endparblock
+    template<class BinaryFunction = ::rocprim::plus<T>>
+    ROCPRIM_DEVICE ROCPRIM_INLINE
+    void reduce(T input,
+                T& output,
+                unsigned int valid_items,
+                storage_type& storage,
+                BinaryFunction reduce_op = BinaryFunction())
+    {
+        base_type::reduce(input, output, valid_items, storage, reduce_op); // delegates to base_type; valid_items and storage are passed through unchanged
+    }
+
+    /// \overload
+    /// \brief Performs reduction across threads in a block.
+    ///
+    /// * This overload does not accept storage argument. Required shared memory is
+    /// allocated by the method itself.
+    ///
+    /// \tparam BinaryFunction - type of binary function used for reduce. Default type
+    /// is rocprim::plus<T>.
+    ///
+    /// \param [in] input - thread input value.
+    /// \param [out] output - reference to a thread output value. May be aliased with \p input.
+    /// \param [in] valid_items - number of items that will be reduced in the block.
+    /// \param [in] reduce_op - binary operation function object that will be used for reduce.
+    /// The signature of the function should be equivalent to the following:
+    /// <tt>T f(const T &a, const T &b);</tt>. The signature does not need to have
+    /// <tt>const &</tt>, but function object must not modify the objects passed to it.
+    template<class BinaryFunction = ::rocprim::plus<T>>
+    ROCPRIM_DEVICE ROCPRIM_FORCE_INLINE
+    void reduce(T input, // a single value; this overload has no ItemsPerThread parameter
+                T& output,
+                unsigned int valid_items,
+                BinaryFunction reduce_op = BinaryFunction())
+    {
+        base_type::reduce(input, output, valid_items, reduce_op); // delegates to the base_type overload that takes no storage argument
+    }
+};
+
+END_ROCPRIM_NAMESPACE
+
+/// @}
+// end of group blockmodule
+
+#endif // ROCPRIM_BLOCK_BLOCK_REDUCE_HPP_
--- a/3rdparty/cub/rocprim/block/block_scan.hpp
+++ b/3rdparty/cub/rocprim/block/block_scan.hpp
+// Copyright (c) 2017-2021 Advanced Micro Devices, Inc. All rights reserved.
+//
+// Permission is hereby granted, free of charge, to any person obtaining a copy
+// of this software and associated documentation files (the "Software"), to deal
+// in the Software without restriction, including without limitation the rights
+// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+// copies of the Software, and to permit persons to whom the Software is
+// furnished to do so, subject to the following conditions:
+//
+// The above copyright notice and this permission notice shall be included in
+// all copies or substantial portions of the Software.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+// THE SOFTWARE.
+
+#ifndef ROCPRIM_BLOCK_BLOCK_SCAN_HPP_
+#define ROCPRIM_BLOCK_BLOCK_SCAN_HPP_
+
+#include <type_traits>
+
+#include "../config.hpp"
+#include "../detail/various.hpp"
+
+#include "../intrinsics.hpp"
+#include "../functional.hpp"
+
+#include "detail/block_scan_warp_scan.hpp"
+#include "detail/block_scan_reduce_then_scan.hpp"
+
+/// \addtogroup blockmodule
+/// @{
+
+BEGIN_ROCPRIM_NAMESPACE
+
+/// \brief Algorithms that can be selected for the block_scan primitive.
+enum class block_scan_algorithm
+{
+    /// \brief A warp_scan based algorithm.
+    using_warp_scan,
+    /// \brief An algorithm which limits calculations to a single hardware warp.
+    reduce_then_scan,
+    /// \brief Default block_scan algorithm; has the same value as \p using_warp_scan.
+    default_algorithm = using_warp_scan,
+};
+
+namespace detail
+{
+
+// Maps a block_scan_algorithm enumerator to the class template implementing
+// block_scan for it. The primary template is only declared here; each
+// supported enumerator has an explicit specialization below.
+template<block_scan_algorithm Algorithm>
+struct select_block_scan_impl;
+
+// using_warp_scan: always resolves to block_scan_warp_scan.
+template<>
+struct select_block_scan_impl<block_scan_algorithm::using_warp_scan>
+{
+    template<
+        class T,
+        unsigned int BlockSizeX,
+        unsigned int BlockSizeY,
+        unsigned int BlockSizeZ
+    >
+    using type = block_scan_warp_scan<T, BlockSizeX, BlockSizeY, BlockSizeZ>;
+};
+
+// reduce_then_scan: resolves to block_scan_reduce_then_scan only when the block
+// has more threads than the hardware warp size. When the block size is less than
+// or equal to the hardware warp size, block_scan_warp_scan performs better than
+// block_scan_reduce_then_scan by specializing for warps, so it is selected instead.
+template<>
+struct select_block_scan_impl<block_scan_algorithm::reduce_then_scan>
+{
+    template<
+        class T,
+        unsigned int BlockSizeX,
+        unsigned int BlockSizeY,
+        unsigned int BlockSizeZ
+    >
+    using type = typename std::conditional<
+                    (BlockSizeX * BlockSizeY * BlockSizeZ > ::rocprim::device_warp_size()),
+                    block_scan_reduce_then_scan<T, BlockSizeX, BlockSizeY, BlockSizeZ>,
+                    block_scan_warp_scan<T, BlockSizeX, BlockSizeY, BlockSizeZ>
+                 >::type;
+};
+
+} // end namespace detail
+
+/// \brief The block_scan class is a block level parallel primitive which provides methods
+/// for performing inclusive and exclusive scan operations of items partitioned across
+/// threads in a block.
+///
+/// \tparam T - the input/output type.
+/// \tparam BlockSizeX - the number of threads in a block's x dimension.
+/// \tparam Algorithm - selected scan algorithm, block_scan_algorithm::default_algorithm by default.
+/// \tparam BlockSizeY - the number of threads in a block's y dimension, defaults to 1.
+/// \tparam BlockSizeZ - the number of threads in a block's z dimension, defaults to 1.
+///
+/// \par Overview
+/// * Supports non-commutative scan operators. However, a scan operator should be
+/// associative. When used with non-associative functions the results may be non-deterministic
+/// and/or vary in precision.
+/// * Computation can be more efficient when:
+///   * \p ItemsPerThread is greater than one,
+///   * \p T is an arithmetic type,
+///   * scan operation is simple addition operator, and
+///   * the number of threads in the block is a multiple of the hardware warp size (see rocprim::device_warp_size()).
+/// * block_scan has two alternative implementations: \p block_scan_algorithm::using_warp_scan
+///   and block_scan_algorithm::reduce_then_scan.
+///
+/// \par Examples
+/// \parblock
+/// In the examples scan operation is performed on block of 192 threads, each provides
+/// one \p int value, result is returned using the same variable as for input.
+///
+/// \code{.cpp}
+/// __global__ void example_kernel(...)
+/// {
+///     // specialize block_scan for int and block of 192 threads
+///     using block_scan_int = rocprim::block_scan<int, 192>;
+///     // allocate storage in shared memory
+///     __shared__ block_scan_int::storage_type storage;
+///
+///     int value = ...;
+///     // execute inclusive scan
+///     block_scan_int().inclusive_scan(
+///         value, // input
+///         value, // output
+///         storage
+///     );
+///     ...
+/// }
+/// \endcode
+/// \endparblock
+template<
+    class T,
+    unsigned int BlockSizeX,
+    block_scan_algorithm Algorithm = block_scan_algorithm::default_algorithm,
+    unsigned int BlockSizeY = 1,
+    unsigned int BlockSizeZ = 1
+>
+class block_scan
+#ifndef DOXYGEN_SHOULD_SKIP_THIS
+    : private detail::select_block_scan_impl<Algorithm>::template type<T, BlockSizeX, BlockSizeY, BlockSizeZ>
+#endif
+{
+    using base_type = typename detail::select_block_scan_impl<Algorithm>::template type<T, BlockSizeX, BlockSizeY, BlockSizeZ>;
+public:
+    /// \brief Struct used to allocate a temporary memory that is required for thread
+    /// communication during operations provided by related parallel primitive.
+    ///
+    /// Depending on the implementation the operations exposed by parallel primitive may
+    /// require a temporary storage for thread communication. The storage should be allocated
+    /// using the <tt>__shared__</tt> keyword. It can be aliased to
+    /// an externally allocated memory, or be a part of a union type with other storage types
+    /// to increase shared memory reusability.
+    using storage_type = typename base_type::storage_type; // taken from the implementation selected as base_type
+
+    /// \brief Performs inclusive scan across threads in a block.
+    ///
+    /// \tparam BinaryFunction - type of binary function used for scan. Default type
+    /// is rocprim::plus<T>.
+    ///
+    /// \param [in] input - thread input value.
+    /// \param [out] output - reference to a thread output value. May be aliased with \p input.
+    /// \param [in] storage - reference to a temporary storage object of type storage_type.
+    /// \param [in] scan_op - binary operation function object that will be used for scan.
+    /// The signature of the function should be equivalent to the following:
+    /// <tt>T f(const T &a, const T &b);</tt>. The signature does not need to have
+    /// <tt>const &</tt>, but function object must not modify the objects passed to it.
+    ///
+    /// \par Storage reusage
+    /// Synchronization barrier should be placed before \p storage is reused
+    /// or repurposed: \p __syncthreads() or \p rocprim::syncthreads().
+    ///
+    /// \par Examples
+    /// \parblock
+    /// The examples present inclusive min scan operations performed on a block of 256 threads,
+    /// each provides one \p float value.
+    ///
+    /// \code{.cpp}
+    /// __global__ void example_kernel(...) // blockDim.x = 256
+    /// {
+    ///     // specialize block_scan for float and block of 256 threads
+    ///     using block_scan_float = rocprim::block_scan<float, 256>;
+    ///     // allocate storage in shared memory for the block
+    ///     __shared__ block_scan_float::storage_type storage;
+    ///
+    ///     float input = ...;
+    ///     float output;
+    ///     // execute inclusive min scan
+    ///     block_scan_float().inclusive_scan(
+    ///         input,
+    ///         output,
+    ///         storage,
+    ///         rocprim::minimum<float>()
+    ///     );
+    ///     ...
+    /// }
+    /// \endcode
+    ///
+    /// If the \p input values across threads in a block are <tt>{1, -2, 3, -4, ..., 255, -256}</tt>, then
+    /// \p output values in will be <tt>{1, -2, -2, -4, ..., -254, -256}</tt>.
+    /// \endparblock
+    template<class BinaryFunction = ::rocprim::plus<T>>
+    ROCPRIM_DEVICE ROCPRIM_INLINE
+    void inclusive_scan(T input,
+                        T& output,
+                        storage_type& storage,
+                        BinaryFunction scan_op = BinaryFunction())
+    {
+        base_type::inclusive_scan(input, output, storage, scan_op); // delegates to the implementation selected by Algorithm (base_type)
+    }
+
+    /// \overload
+    /// \brief Performs inclusive scan across threads in a block.
+    ///
+    /// * This overload does not accept storage argument. Required shared memory is
+    /// allocated by the method itself.
+    ///
+    /// \tparam BinaryFunction - type of binary function used for scan. Default type
+    /// is rocprim::plus<T>.
+    ///
+    /// \param [in] input - thread input value.
+    /// \param [out] output - reference to a thread output value. May be aliased with \p input.
+    /// \param [in] scan_op - binary operation function object that will be used for scan.
+    /// The signature of the function should be equivalent to the following:
+    /// <tt>T f(const T &a, const T &b);</tt>. The signature does not need to have
+    /// <tt>const &</tt>, but function object must not modify the objects passed to it.
+    template<class BinaryFunction = ::rocprim::plus<T>>
+    ROCPRIM_DEVICE ROCPRIM_FORCE_INLINE
+    void inclusive_scan(T input,
+                        T& output,
+                        BinaryFunction scan_op = BinaryFunction())
+    {
+        base_type::inclusive_scan(input, output, scan_op); // delegates to the base_type overload that takes no storage argument
+    }
+
+    /// \brief Performs inclusive scan and reduction across threads in a block.
+    ///
+    /// \tparam BinaryFunction - type of binary function used for scan. Default type
+    /// is rocprim::plus<T>.
+    ///
+    /// \param [in] input - thread input value.
+    /// \param [out] output - reference to a thread output value. May be aliased with \p input.
+    /// \param [out] reduction - result of reducing of all \p input values in a block.
+    /// \param [in] storage - reference to a temporary storage object of type storage_type.
+    /// \param [in] scan_op - binary operation function object that will be used for scan.
+    /// The signature of the function should be equivalent to the following:
+    /// <tt>T f(const T &a, const T &b);</tt>. The signature does not need to have
+    /// <tt>const &</tt>, but function object must not modify the objects passed to it.
+    ///
+    /// \par Storage reusage
+    /// Synchronization barrier should be placed before \p storage is reused
+    /// or repurposed: \p __syncthreads() or \p rocprim::syncthreads().
+    ///
+    /// \par Examples
+    /// \parblock
+    /// The examples present inclusive min scan operations performed on a block of 256 threads,
+    /// each provides one \p float value.
+    ///
+    /// \code{.cpp}
+    /// __global__ void example_kernel(...) // blockDim.x = 256
+    /// {
+    ///     // specialize block_scan for float and block of 256 threads
+    ///     using block_scan_float = rocprim::block_scan<float, 256>;
+    ///     // allocate storage in shared memory for the block
+    ///     __shared__ block_scan_float::storage_type storage;
+    ///
+    ///     float input = ...;
+    ///     float output;
+    ///     float reduction;
+    ///     // execute inclusive min scan
+    ///     block_scan_float().inclusive_scan(
+    ///         input,
+    ///         output,
+    ///         reduction,
+    ///         storage,
+    ///         rocprim::minimum<float>()
+    ///     );
+    ///     ...
+    /// }
+    /// \endcode
+    ///
+    /// If the \p input values across threads in a block are <tt>{1, -2, 3, -4, ..., 255, -256}</tt>, then
+    /// \p output values in will be <tt>{1, -2, -2, -4, ..., -254, -256}</tt>, and the \p reduction will
+    /// be <tt>-256</tt>.
+    /// \endparblock
+    template<class BinaryFunction = ::rocprim::plus<T>>
+    ROCPRIM_DEVICE ROCPRIM_INLINE
+    void inclusive_scan(T input,
+                        T& output,
+                        T& reduction,
+                        storage_type& storage,
+                        BinaryFunction scan_op = BinaryFunction())
+    {
+        base_type::inclusive_scan(input, output, reduction, storage, scan_op); // delegates to base_type; reduction is filled in by that call
+    }
+
+    /// \overload
+    /// \brief Performs inclusive scan and reduction across threads in a block.
+    ///
+    /// * This overload does not accept storage argument. Required shared memory is
+    /// allocated by the method itself.
+    ///
+    /// \tparam BinaryFunction - type of binary function used for scan. Default type
+    /// is rocprim::plus<T>.
+    ///
+    /// \param [in] input - thread input value.
+    /// \param [out] output - reference to a thread output value. May be aliased with \p input.
+    /// \param [out] reduction - result of reducing of all \p input values in a block.
+    /// \param [in] scan_op - binary operation function object that will be used for scan.
+    /// The signature of the function should be equivalent to the following:
+    /// <tt>T f(const T &a, const T &b);</tt>. The signature does not need to have
+    /// <tt>const &</tt>, but function object must not modify the objects passed to it.
+    template<class BinaryFunction = ::rocprim::plus<T>>
+    ROCPRIM_DEVICE ROCPRIM_FORCE_INLINE
+    void inclusive_scan(T input,
+                        T& output,
+                        T& reduction,
+                        BinaryFunction scan_op = BinaryFunction())
+    {
+        base_type::inclusive_scan(input, output, reduction, scan_op); // delegates to the base_type overload that takes no storage argument
+    }
+
+    /// \brief Performs inclusive scan across threads in a block, and uses
+    /// \p prefix_callback_op to generate prefix value for the whole block.
+    ///
+    /// \tparam PrefixCallback - type of the unary function object used for generating
+    /// block-wide prefix value for the scan operation.
+    /// \tparam BinaryFunction - type of binary function used for scan. Default type
+    /// is rocprim::plus<T>.
+    ///
+    /// \param [in] input - thread input value.
+    /// \param [out] output - reference to a thread output value. May be aliased with \p input.
+    /// \param [in] storage - reference to a temporary storage object of type storage_type.
+    /// \param [in,out] prefix_callback_op - function object for generating block prefix value.
+    /// The signature of the \p prefix_callback_op should be equivalent to the following:
+    /// <tt>T f(const T &block_reduction);</tt>. The signature does not need to have
+    /// <tt>const &</tt>, but function object must not modify the objects passed to it.
+    /// The object will be called by the first warp of the block with block reduction of
+    /// \p input values as input argument. The result of the first thread will be used as the
+    /// block-wide prefix.
+    /// \param [in] scan_op - binary operation function object that will be used for scan.
+    /// The signature of the function should be equivalent to the following:
+    /// <tt>T f(const T &a, const T &b);</tt>. The signature does not need to have
+    /// <tt>const &</tt>, but function object must not modify the objects passed to it.
+    ///
+    /// \par Storage reusage
+    /// Synchronization barrier should be placed before \p storage is reused
+    /// or repurposed: \p __syncthreads() or \p rocprim::syncthreads().
+    ///
+    /// \par Examples
+    /// \parblock
+    /// The examples present inclusive prefix sum operations performed on a block of 256 threads,
+    /// each thread provides one \p int value.
+    ///
+    /// \code{.cpp}
+    ///
+    /// struct my_block_prefix
+    /// {
+    ///     int prefix;
+    ///
+    ///     __device__ my_block_prefix(int prefix) : prefix(prefix) {}
+    ///
+    ///     __device__ int operator()(int block_reduction)
+    ///     {
+    ///         int old_prefix = prefix;
+    ///         prefix = prefix + block_reduction;
+    ///         return old_prefix;
+    ///     }
+    /// };
+    ///
+    /// __global__ void example_kernel(...) // blockDim.x = 256
+    /// {
+    ///     // specialize block_scan for int and block of 256 threads
+    ///     using block_scan_int = rocprim::block_scan<int, 256>;
+    ///     // allocate storage in shared memory for the block
+    ///     __shared__ block_scan_int::storage_type storage;
+    ///
+    ///     // init prefix functor
+    ///     my_block_prefix prefix_callback(10);
+    ///
+    ///     int input;
+    ///     int output;
+    ///     // execute inclusive prefix sum
+    ///     block_scan_int().inclusive_scan(
+    ///         input,
+    ///         output,
+    ///         storage,
+    ///         prefix_callback,
+    ///         rocprim::plus<int>()
+    ///     );
+    ///     ...
+    /// }
+    /// \endcode
+    ///
+    /// If the \p input values across threads in a block are <tt>{1, 1, 1, ..., 1}</tt>, then
+    /// \p output values in will be <tt>{11, 12, 13, ..., 266}</tt>, and the \p prefix will
+    /// be <tt>266</tt>.
+    /// \endparblock
+    template<
+        class PrefixCallback,
+        class BinaryFunction = ::rocprim::plus<T>
+    >
+    ROCPRIM_DEVICE ROCPRIM_INLINE
+    void inclusive_scan(T input,
+                        T& output,
+                        storage_type& storage,
+                        PrefixCallback& prefix_callback_op, // taken by non-const reference, so the caller's functor object is the one invoked
+                        BinaryFunction scan_op) // no default argument in this overload: scan_op must be passed explicitly
+    {
+        base_type::inclusive_scan(input, output, storage, prefix_callback_op, scan_op); // delegates to the implementation selected by Algorithm (base_type)
+    }
+
+    /// \brief Performs inclusive scan across threads in a block.
+    ///
+    /// \tparam ItemsPerThread - number of items in the \p input array.
+    /// \tparam BinaryFunction - type of binary function used for scan. Default type
+    /// is rocprim::plus<T>.
+    ///
+    /// \param [in] input - reference to an array containing thread input values.
+    /// \param [out] output - reference to a thread output array. May be aliased with \p input.
+    /// \param [in] storage - reference to a temporary storage object of type storage_type.
+    /// \param [in] scan_op - binary operation function object that will be used for scan.
+    /// The signature of the function should be equivalent to the following:
+    /// <tt>T f(const T &a, const T &b);</tt>. The signature does not need to have
+    /// <tt>const &</tt>, but function object must not modify the objects passed to it.
+    ///
+    /// \par Storage reusage
+    /// Synchronization barrier should be placed before \p storage is reused
+    /// or repurposed: \p __syncthreads() or \p rocprim::syncthreads().
+    ///
+    /// \par Examples
+    /// \parblock
+    /// The examples present inclusive maximum scan operations performed on a block of 128 threads,
+    /// each provides two \p long values.
+    ///
+    /// \code{.cpp}
+    /// __global__ void example_kernel(...) // blockDim.x = 128
+    /// {
+    ///     // specialize block_scan for long and block of 128 threads
+    ///     using block_scan_long = rocprim::block_scan<long, 128>;
+    ///     // allocate storage in shared memory for the block
+    ///     __shared__ block_scan_long::storage_type storage;
+    ///
+    ///     long input[2] = ...;
+    ///     long output[2];
+    ///     // execute inclusive max scan
+    ///     block_scan_long().inclusive_scan(
+    ///         input,
+    ///         output,
+    ///         storage,
+    ///         rocprim::maximum<long>()
+    ///     );
+    ///     ...
+    /// }
+    /// \endcode
+    ///
+    /// If the \p input values across threads in a block are <tt>{-1, 2, -3, 4, ..., -255, 256}</tt>, then
+    /// \p output values in will be <tt>{-1, 2, 2, 4, ..., 254, 256}</tt>.
+    /// \endparblock
+    template<
+        unsigned int ItemsPerThread,
+        class BinaryFunction = ::rocprim::plus<T>
+    >
+    ROCPRIM_DEVICE ROCPRIM_INLINE
+    void inclusive_scan(T (&input)[ItemsPerThread],
+                        T (&output)[ItemsPerThread],
+                        storage_type& storage,
+                        BinaryFunction scan_op = BinaryFunction())
+    {
+        // Arrays with more than one item per thread go to the array overload of base_type.
+        if(ItemsPerThread != 1)
+        {
+            base_type::inclusive_scan(input, output, storage, scan_op);
+            return;
+        }
+        // A one-element array is forwarded as a plain value to the single-value overload.
+        base_type::inclusive_scan(input[0], output[0], storage, scan_op);
+    }
+
+    /// \overload
+    /// \brief Performs inclusive scan across threads in a block.
+    ///
+    /// * This overload does not accept storage argument. Required shared memory is
+    /// allocated by the method itself.
+    ///
+    /// \tparam ItemsPerThread - number of items in the \p input array.
+    /// \tparam BinaryFunction - type of binary function used for scan. Default type
+    /// is rocprim::plus<T>.
+    ///
+    /// \param [in] input - reference to an array containing thread input values.
+    /// \param [out] output - reference to a thread output array. May be aliased with \p input.
+    /// \param [in] scan_op - binary operation function object that will be used for scan.
+    /// The signature of the function should be equivalent to the following:
+    /// <tt>T f(const T &a, const T &b);</tt>. The signature does not need to have
+    /// <tt>const &</tt>, but function object must not modify the objects passed to it.
+    template<
+        unsigned int ItemsPerThread,
+        class BinaryFunction = ::rocprim::plus<T>
+    >
+    ROCPRIM_DEVICE ROCPRIM_FORCE_INLINE
+    void inclusive_scan(T (&input)[ItemsPerThread],
+                        T (&output)[ItemsPerThread],
+                        BinaryFunction scan_op = BinaryFunction())
+    {
+        // Arrays with more than one item per thread go to the array overload of base_type.
+        if(ItemsPerThread != 1)
+        {
+            base_type::inclusive_scan(input, output, scan_op);
+            return;
+        }
+        // A one-element array is forwarded as a plain value to the single-value overload.
+        base_type::inclusive_scan(input[0], output[0], scan_op);
+    }
+
+    /// \brief Performs inclusive scan and reduction across threads in a block.
+    ///
+    /// \tparam ItemsPerThread - number of items in the \p input array.
+    /// \tparam BinaryFunction - type of binary function used for scan. Default type
+    /// is rocprim::plus<T>.
+    ///
+    /// \param [in] input - reference to an array containing thread input values.
+    /// \param [out] output - reference to a thread output array. May be aliased with \p input.
+    /// \param [out] reduction - result of reducing of all \p input values in a block.
+    /// \param [in] storage - reference to a temporary storage object of type storage_type.
+    /// \param [in] scan_op - binary operation function object that will be used for scan.
+    /// The signature of the function should be equivalent to the following:
+    /// <tt>T f(const T &a, const T &b);</tt>. The signature does not need to have
+    /// <tt>const &</tt>, but function object must not modify the objects passed to it.
+    ///
+    /// \par Storage reusage
+    /// Synchronization barrier should be placed before \p storage is reused
+    /// or repurposed: \p __syncthreads() or \p rocprim::syncthreads().
+    ///
+    /// \par Examples
+    /// \parblock
+    /// The examples present inclusive maximum scan operations performed on a block of 128 threads,
+    /// each provides two \p long values.
+    ///
+    /// \code{.cpp}
+    /// __global__ void example_kernel(...) // blockDim.x = 128
+    /// {
+    ///     // specialize block_scan for long and block of 128 threads
+    ///     using block_scan_long = rocprim::block_scan<long, 128>;
+    ///     // allocate storage in shared memory for the block
+    ///     __shared__ block_scan_long::storage_type storage;
+    ///
+    ///     long input[2] = ...;
+    ///     long output[2];
+    ///     long reduction;
+    ///     // execute inclusive max scan
+    ///     block_scan_long().inclusive_scan(
+    ///         input,
+    ///         output,
+    ///         reduction,
+    ///         storage,
+    ///         rocprim::maximum<long>()
+    ///     );
+    ///     ...
+    /// }
+    /// \endcode
+    ///
+    /// If the \p input values across threads in a block are <tt>{-1, 2, -3, 4, ..., -255, 256}</tt>, then
+    /// \p output values in will be <tt>{-1, 2, 2, 4, ..., 254, 256}</tt> and the \p reduction will be \p 256.
+    /// \endparblock
+    template<
+        unsigned int ItemsPerThread,
+        class BinaryFunction = ::rocprim::plus<T>
+    >
+    ROCPRIM_DEVICE ROCPRIM_INLINE
+    void inclusive_scan(T (&input)[ItemsPerThread],
+                        T (&output)[ItemsPerThread],
+                        T& reduction,
+                        storage_type& storage,
+                        BinaryFunction scan_op = BinaryFunction())
+    {
+        // Arrays with more than one item per thread go to the array overload of base_type.
+        if(ItemsPerThread != 1)
+        {
+            base_type::inclusive_scan(input, output, reduction, storage, scan_op);
+            return;
+        }
+        // A one-element array is forwarded as a plain value to the single-value overload.
+        base_type::inclusive_scan(input[0], output[0], reduction, storage, scan_op);
+    }
+
+    /// \overload
+    /// \brief Performs inclusive scan and reduction across threads in a block.
+    ///
+    /// * This overload does not accept storage argument. Required shared memory is
+    /// allocated by the method itself.
+    ///
+    /// \tparam ItemsPerThread - number of items in the \p input array.
+    /// \tparam BinaryFunction - type of binary function used for scan. Default type
+    /// is rocprim::plus<T>.
+    ///
+    /// \param [in] input - reference to an array containing thread input values.
+    /// \param [out] output - reference to a thread output array. May be aliased with \p input.
+    /// \param [out] reduction - result of reducing of all \p input values in a block.
+    /// \param [in] scan_op - binary operation function object that will be used for scan.
+    /// The signature of the function should be equivalent to the following:
+    /// <tt>T f(const T &a, const T &b);</tt>. The signature does not need to have
+    /// <tt>const &</tt>, but function object must not modify the objects passed to it.
+    template<unsigned int ItemsPerThread, class BinaryFunction = ::rocprim::plus<T>>
+    ROCPRIM_DEVICE ROCPRIM_FORCE_INLINE
+    void inclusive_scan(T (&input)[ItemsPerThread],
+                        T (&output)[ItemsPerThread],
+                        T& reduction,
+                        BinaryFunction scan_op = BinaryFunction())
+    {
+        // Any item count other than one: pass both arrays straight to base_type.
+        if(ItemsPerThread != 1)
+        {
+            base_type::inclusive_scan(input, output, reduction, scan_op);
+            return;
+        }
+        // Exactly one item per thread: forward the lone element as a scalar.
+        base_type::inclusive_scan(input[0], output[0], reduction, scan_op);
+    }
+
+    /// \brief Performs inclusive scan across threads in a block, and uses
+    /// \p prefix_callback_op to generate prefix value for the whole block.
+    ///
+    /// \tparam ItemsPerThread - number of items in the \p input array.
+    /// \tparam PrefixCallback - type of the unary function object used for generating
+    /// block-wide prefix value for the scan operation.
+    /// \tparam BinaryFunction - type of binary function used for scan. Default type
+    /// is rocprim::plus<T>.
+    ///
+    /// \param [in] input - reference to an array containing thread input values.
+    /// \param [out] output - reference to a thread output array. May be aliased with \p input.
+    /// \param [in] storage - reference to a temporary storage object of type storage_type.
+    /// \param [in,out] prefix_callback_op - function object for generating block prefix value.
+    /// The signature of the \p prefix_callback_op should be equivalent to the following:
+    /// <tt>T f(const T &block_reduction);</tt>. The signature does not need to have
+    /// <tt>const &</tt>, but function object must not modify the objects passed to it.
+    /// The object will be called by the first warp of the block with block reduction of
+    /// \p input values as input argument. The result of the first thread will be used as the
+    /// block-wide prefix.
+    /// \param [in] scan_op - binary operation function object that will be used for scan.
+    /// The signature of the function should be equivalent to the following:
+    /// <tt>T f(const T &a, const T &b);</tt>. The signature does not need to have
+    /// <tt>const &</tt>, but function object must not modify the objects passed to it.
+    ///
+    /// \par Storage reusage
+    /// Synchronization barrier should be placed before \p storage is reused
+    /// or repurposed: \p __syncthreads() or \p rocprim::syncthreads().
+    ///
+    /// \par Examples
+    /// \parblock
+    /// The examples present inclusive prefix sum operations performed on a block of 128 threads,
+    /// each thread provides two \p int values.
+    ///
+    /// \code{.cpp}
+    ///
+    /// struct my_block_prefix
+    /// {
+    ///     int prefix;
+    ///
+    ///     __device__ my_block_prefix(int prefix) : prefix(prefix) {}
+    ///
+    ///     __device__ int operator()(int block_reduction)
+    ///     {
+    ///         int old_prefix = prefix;
+    ///         prefix = prefix + block_reduction;
+    ///         return old_prefix;
+    ///     }
+    /// };
+    ///
+    /// __global__ void example_kernel(...) // blockDim.x = 128
+    /// {
+    ///     // specialize block_scan for int and block of 128 threads
+    ///     using block_scan_int = rocprim::block_scan<int, 128>;
+    ///     // allocate storage in shared memory for the block
+    ///     __shared__ block_scan_int::storage_type storage;
+    ///
+    ///     // init prefix functor
+    ///     my_block_prefix prefix_callback(10);
+    ///
+    ///     int input[2] = ...;
+    ///     int output[2];
+    ///     // execute inclusive prefix sum
+    ///     block_scan_int().inclusive_scan(
+    ///         input,
+    ///         output,
+    ///         storage,
+    ///         prefix_callback,
+    ///         rocprim::plus<int>()
+    ///     );
+    ///     ...
+    /// }
+    /// \endcode
+    ///
+    /// If the \p input values across threads in a block are <tt>{1, 1, 1, ..., 1}</tt>, then
+    /// \p output values will be <tt>{11, 12, 13, ..., 266}</tt>, and the \p prefix will
+    /// be <tt>266</tt>.
+    /// \endparblock
+    template<
+        unsigned int ItemsPerThread,
+        class PrefixCallback,
+        // Defaulted so the declaration agrees with its documentation and with the
+        // scalar prefix-callback overload of exclusive_scan. The type deduced from
+        // scan_op still takes precedence, so existing callers are unaffected.
+        class BinaryFunction = ::rocprim::plus<T>
+    >
+    ROCPRIM_DEVICE ROCPRIM_INLINE
+    void inclusive_scan(T (&input)[ItemsPerThread],
+                        T (&output)[ItemsPerThread],
+                        storage_type& storage,
+                        PrefixCallback& prefix_callback_op,
+                        BinaryFunction scan_op)
+    {
+        // A single item per thread is forwarded as a scalar; any other item
+        // count forwards the whole arrays. Both paths go through base_type.
+        if(ItemsPerThread == 1)
+        {
+            base_type::inclusive_scan(input[0], output[0], storage, prefix_callback_op, scan_op);
+        }
+        else
+        {
+            base_type::inclusive_scan(input, output, storage, prefix_callback_op, scan_op);
+        }
+    }
+
+    /// \brief Performs exclusive scan across threads in a block.
+    ///
+    /// \tparam BinaryFunction - type of binary function used for scan. Default type
+    /// is rocprim::plus<T>.
+    ///
+    /// \param [in] input - thread input value.
+    /// \param [out] output - reference to a thread output value. May be aliased with \p input.
+    /// \param [in] storage - reference to a temporary storage object of type storage_type.
+    /// \param [in] init - initial value used to start the exclusive scan. Should be the same
+    /// for all threads in a block.
+    /// \param [in] scan_op - binary operation function object that will be used for scan.
+    /// The signature of the function should be equivalent to the following:
+    /// <tt>T f(const T &a, const T &b);</tt>. The signature does not need to have
+    /// <tt>const &</tt>, but function object must not modify the objects passed to it.
+    ///
+    /// \par Storage reusage
+    /// Synchronization barrier should be placed before \p storage is reused
+    /// or repurposed: \p __syncthreads() or \p rocprim::syncthreads().
+    ///
+    /// \par Examples
+    /// \parblock
+    /// The examples present exclusive min scan operations performed on a block of 256 threads,
+    /// each provides one \p float value.
+    ///
+    /// \code{.cpp}
+    /// __global__ void example_kernel(...) // blockDim.x = 256
+    /// {
+    ///     // specialize block_scan for float and block of 256 threads
+    ///     using block_scan_float = rocprim::block_scan<float, 256>;
+    ///     // allocate storage in shared memory for the block
+    ///     __shared__ block_scan_float::storage_type storage;
+    ///
+    ///     float init = ...;
+    ///     float input = ...;
+    ///     float output;
+    ///     // execute exclusive min scan
+    ///     block_scan_float().exclusive_scan(
+    ///         input,
+    ///         output,
+    ///         init,
+    ///         storage,
+    ///         rocprim::minimum<float>()
+    ///     );
+    ///     ...
+    /// }
+    /// \endcode
+    ///
+    /// If the \p input values across threads in a block are <tt>{1, -2, 3, -4, ..., 255, -256}</tt>
+    /// and \p init is \p 0, then \p output values will be <tt>{0, 0, -2, -2, -4, ..., -254, -254}</tt>.
+    /// \endparblock
+    template<class BinaryFunction = ::rocprim::plus<T>>
+    ROCPRIM_DEVICE ROCPRIM_INLINE
+    void exclusive_scan(T input, T& output, T init, storage_type& storage,
+                        BinaryFunction scan_op = BinaryFunction())
+    {
+        // Thin forwarder: every argument is passed to base_type in the same order.
+        base_type::exclusive_scan(input, output, init, storage, scan_op);
+    }
+
+    /// \overload
+    /// \brief Performs exclusive scan across threads in a block.
+    ///
+    /// * This overload does not accept storage argument. Required shared memory is
+    /// allocated by the method itself.
+    ///
+    /// \tparam BinaryFunction - type of binary function used for scan. Default type
+    /// is rocprim::plus<T>.
+    ///
+    /// \param [in] input - thread input value.
+    /// \param [out] output - reference to a thread output value. May be aliased with \p input.
+    /// \param [in] init - initial value used to start the exclusive scan. Should be the same
+    /// for all threads in a block.
+    /// \param [in] scan_op - binary operation function object that will be used for scan.
+    /// The signature of the function should be equivalent to the following:
+    /// <tt>T f(const T &a, const T &b);</tt>. The signature does not need to have
+    /// <tt>const &</tt>, but function object must not modify the objects passed to it.
+    template<class BinaryFunction = ::rocprim::plus<T>>
+    ROCPRIM_DEVICE ROCPRIM_FORCE_INLINE
+    void exclusive_scan(T input, T& output, T init,
+                        BinaryFunction scan_op = BinaryFunction())
+    {
+        // Thin forwarder: every argument is passed to base_type in the same order.
+        base_type::exclusive_scan(input, output, init, scan_op);
+    }
+
+    /// \brief Performs exclusive scan and reduction across threads in a block.
+    ///
+    /// \tparam BinaryFunction - type of binary function used for scan. Default type
+    /// is rocprim::plus<T>.
+    ///
+    /// \param [in] input - thread input value.
+    /// \param [out] output - reference to a thread output value. May be aliased with \p input.
+    /// \param [in] init - initial value used to start the exclusive scan. Should be the same
+    /// for all threads in a block.
+    /// \param [out] reduction - result of reducing of all \p input values in a block.
+    /// \param [in] storage - reference to a temporary storage object of type storage_type.
+    /// \param [in] scan_op - binary operation function object that will be used for scan.
+    /// The signature of the function should be equivalent to the following:
+    /// <tt>T f(const T &a, const T &b);</tt>. The signature does not need to have
+    /// <tt>const &</tt>, but function object must not modify the objects passed to it.
+    ///
+    /// \par Storage reusage
+    /// Synchronization barrier should be placed before \p storage is reused
+    /// or repurposed: \p __syncthreads() or \p rocprim::syncthreads().
+    ///
+    /// \par Examples
+    /// \parblock
+    /// The examples present exclusive min scan operations performed on a block of 256 threads,
+    /// each provides one \p float value.
+    ///
+    /// \code{.cpp}
+    /// __global__ void example_kernel(...) // blockDim.x = 256
+    /// {
+    ///     // specialize block_scan for float and block of 256 threads
+    ///     using block_scan_float = rocprim::block_scan<float, 256>;
+    ///     // allocate storage in shared memory for the block
+    ///     __shared__ block_scan_float::storage_type storage;
+    ///
+    ///     float init = 0;
+    ///     float input = ...;
+    ///     float output;
+    ///     float reduction;
+    ///     // execute exclusive min scan
+    ///     block_scan_float().exclusive_scan(
+    ///         input,
+    ///         output,
+    ///         init,
+    ///         reduction,
+    ///         storage,
+    ///         rocprim::minimum<float>()
+    ///     );
+    ///     ...
+    /// }
+    /// \endcode
+    ///
+    /// If the \p input values across threads in a block are <tt>{1, -2, 3, -4, ..., 255, -256}</tt>
+    /// and \p init is \p 0, then \p output values will be <tt>{0, 0, -2, -2, -4, ..., -254, -254}</tt>
+    /// and the \p reduction will be \p -256.
+    /// \endparblock
+    template<class BinaryFunction = ::rocprim::plus<T>>
+    ROCPRIM_DEVICE ROCPRIM_INLINE
+    void exclusive_scan(T input, T& output, T init, T& reduction, storage_type& storage,
+                        BinaryFunction scan_op = BinaryFunction())
+    {
+        // Thin forwarder: every argument is passed to base_type in the same order.
+        base_type::exclusive_scan(input, output, init, reduction, storage, scan_op);
+    }
+
+    /// \overload
+    /// \brief Performs exclusive scan and reduction across threads in a block.
+    ///
+    /// * This overload does not accept storage argument. Required shared memory is
+    /// allocated by the method itself.
+    ///
+    /// \tparam BinaryFunction - type of binary function used for scan. Default type
+    /// is rocprim::plus<T>.
+    ///
+    /// \param [in] input - thread input value.
+    /// \param [out] output - reference to a thread output value. May be aliased with \p input.
+    /// \param [in] init - initial value used to start the exclusive scan. Should be the same
+    /// for all threads in a block.
+    /// \param [out] reduction - result of reducing of all \p input values in a block.
+    /// \param [in] scan_op - binary operation function object that will be used for scan.
+    /// The signature of the function should be equivalent to the following:
+    /// <tt>T f(const T &a, const T &b);</tt>. The signature does not need to have
+    /// <tt>const &</tt>, but function object must not modify the objects passed to it.
+    template<class BinaryFunction = ::rocprim::plus<T>>
+    ROCPRIM_DEVICE ROCPRIM_FORCE_INLINE
+    void exclusive_scan(T input, T& output, T init, T& reduction,
+                        BinaryFunction scan_op = BinaryFunction())
+    {
+        // Thin forwarder: every argument is passed to base_type in the same order.
+        base_type::exclusive_scan(input, output, init, reduction, scan_op);
+    }
+
+    /// \brief Performs exclusive scan across threads in a block, and uses
+    /// \p prefix_callback_op to generate prefix value for the whole block.
+    ///
+    /// \tparam PrefixCallback - type of the unary function object used for generating
+    /// block-wide prefix value for the scan operation.
+    /// \tparam BinaryFunction - type of binary function used for scan. Default type
+    /// is rocprim::plus<T>.
+    ///
+    /// \param [in] input - thread input value.
+    /// \param [out] output - reference to a thread output value. May be aliased with \p input.
+    /// \param [in] storage - reference to a temporary storage object of type storage_type.
+    /// \param [in,out] prefix_callback_op - function object for generating block prefix value.
+    /// The signature of the \p prefix_callback_op should be equivalent to the following:
+    /// <tt>T f(const T &block_reduction);</tt>. The signature does not need to have
+    /// <tt>const &</tt>, but function object must not modify the objects passed to it.
+    /// The object will be called by the first warp of the block with block reduction of
+    /// \p input values as input argument. The result of the first thread will be used as the
+    /// block-wide prefix.
+    /// \param [in] scan_op - binary operation function object that will be used for scan.
+    /// The signature of the function should be equivalent to the following:
+    /// <tt>T f(const T &a, const T &b);</tt>. The signature does not need to have
+    /// <tt>const &</tt>, but function object must not modify the objects passed to it.
+    ///
+    /// \par Storage reusage
+    /// Synchronization barrier should be placed before \p storage is reused
+    /// or repurposed: \p __syncthreads() or \p rocprim::syncthreads().
+    ///
+    /// \par Examples
+    /// \parblock
+    /// The examples present exclusive prefix sum operations performed on a block of 256 threads,
+    /// each thread provides one \p int value.
+    ///
+    /// \code{.cpp}
+    ///
+    /// struct my_block_prefix
+    /// {
+    ///     int prefix;
+    ///
+    ///     __device__ my_block_prefix(int prefix) : prefix(prefix) {}
+    ///
+    ///     __device__ int operator()(int block_reduction)
+    ///     {
+    ///         int old_prefix = prefix;
+    ///         prefix = prefix + block_reduction;
+    ///         return old_prefix;
+    ///     }
+    /// };
+    ///
+    /// __global__ void example_kernel(...) // blockDim.x = 256
+    /// {
+    ///     // specialize block_scan for int and block of 256 threads
+    ///     using block_scan_int = rocprim::block_scan<int, 256>;
+    ///     // allocate storage in shared memory for the block
+    ///     __shared__ block_scan_int::storage_type storage;
+    ///
+    ///     // init prefix functor
+    ///     my_block_prefix prefix_callback(10);
+    ///
+    ///     int input;
+    ///     int output;
+    ///     // execute exclusive prefix sum
+    ///     block_scan_int().exclusive_scan(
+    ///         input,
+    ///         output,
+    ///         storage,
+    ///         prefix_callback,
+    ///         rocprim::plus<int>()
+    ///     );
+    ///     ...
+    /// }
+    /// \endcode
+    ///
+    /// If the \p input values across threads in a block are <tt>{1, 1, 1, ..., 1}</tt>, then
+    /// \p output values will be <tt>{10, 11, 12, 13, ..., 265}</tt>, and the \p prefix will
+    /// be <tt>266</tt>.
+    /// \endparblock
+    template<class PrefixCallback, class BinaryFunction = ::rocprim::plus<T>>
+    ROCPRIM_DEVICE ROCPRIM_INLINE
+    void exclusive_scan(T input, T& output, storage_type& storage,
+                        PrefixCallback& prefix_callback_op,
+                        BinaryFunction scan_op)
+    {
+        // Thin forwarder: every argument is passed to base_type in the same order.
+        base_type::exclusive_scan(input, output, storage, prefix_callback_op, scan_op);
+    }
+
+    /// \brief Performs exclusive scan across threads in a block.
+    ///
+    /// \tparam ItemsPerThread - number of items in the \p input array.
+    /// \tparam BinaryFunction - type of binary function used for scan. Default type
+    /// is rocprim::plus<T>.
+    ///
+    /// \param [in] input - reference to an array containing thread input values.
+    /// \param [out] output - reference to a thread output array. May be aliased with \p input.
+    /// \param [in] init - initial value used to start the exclusive scan. Should be the same
+    /// for all threads in a block.
+    /// \param [in] storage - reference to a temporary storage object of type storage_type.
+    /// \param [in] scan_op - binary operation function object that will be used for scan.
+    /// The signature of the function should be equivalent to the following:
+    /// <tt>T f(const T &a, const T &b);</tt>. The signature does not need to have
+    /// <tt>const &</tt>, but function object must not modify the objects passed to it.
+    ///
+    /// \par Storage reusage
+    /// Synchronization barrier should be placed before \p storage is reused
+    /// or repurposed: \p __syncthreads() or \p rocprim::syncthreads().
+    ///
+    /// \par Examples
+    /// \parblock
+    /// The examples present exclusive maximum scan operations performed on a block of 128 threads,
+    /// each provides two \p long values.
+    ///
+    /// \code{.cpp}
+    /// __global__ void example_kernel(...) // blockDim.x = 128
+    /// {
+    ///     // specialize block_scan for long and block of 128 threads
+    ///     using block_scan_long = rocprim::block_scan<long, 128>;
+    ///     // allocate storage in shared memory for the block
+    ///     __shared__ block_scan_long::storage_type storage;
+    ///
+    ///     long init = ...;
+    ///     long input[2] = ...;
+    ///     long output[2];
+    ///     // execute exclusive max scan
+    ///     block_scan_long().exclusive_scan(
+    ///         input,
+    ///         output,
+    ///         init,
+    ///         storage,
+    ///         rocprim::maximum<long>()
+    ///     );
+    ///     ...
+    /// }
+    /// \endcode
+    ///
+    /// If the \p input values across threads in a block are <tt>{-1, 2, -3, 4, ..., -255, 256}</tt>
+    /// and \p init is 0, then \p output values will be <tt>{0, 0, 2, 2, 4, ..., 254, 254}</tt>.
+    /// \endparblock
+    template<unsigned int ItemsPerThread, class BinaryFunction = ::rocprim::plus<T>>
+    ROCPRIM_DEVICE ROCPRIM_INLINE
+    void exclusive_scan(T (&input)[ItemsPerThread],
+                        T (&output)[ItemsPerThread],
+                        T init,
+                        storage_type& storage,
+                        BinaryFunction scan_op = BinaryFunction())
+    {
+        // Any item count other than one: pass both arrays straight to base_type.
+        if(ItemsPerThread != 1)
+        {
+            base_type::exclusive_scan(input, output, init, storage, scan_op);
+            return;
+        }
+        // Exactly one item per thread: forward the lone element as a scalar.
+        base_type::exclusive_scan(input[0], output[0], init, storage, scan_op);
+    }
+
+    /// \overload
+    /// \brief Performs exclusive scan across threads in a block.
+    ///
+    /// * This overload does not accept storage argument. Required shared memory is
+    /// allocated by the method itself.
+    ///
+    /// \tparam ItemsPerThread - number of items in the \p input array.
+    /// \tparam BinaryFunction - type of binary function used for scan. Default type
+    /// is rocprim::plus<T>.
+    ///
+    /// \param [in] input - reference to an array containing thread input values.
+    /// \param [out] output - reference to a thread output array. May be aliased with \p input.
+    /// \param [in] init - initial value used to start the exclusive scan. Should be the same
+    /// for all threads in a block.
+    /// \param [in] scan_op - binary operation function object that will be used for scan.
+    /// The signature of the function should be equivalent to the following:
+    /// <tt>T f(const T &a, const T &b);</tt>. The signature does not need to have
+    /// <tt>const &</tt>, but function object must not modify the objects passed to it.
+    template<unsigned int ItemsPerThread, class BinaryFunction = ::rocprim::plus<T>>
+    ROCPRIM_DEVICE ROCPRIM_FORCE_INLINE
+    void exclusive_scan(T (&input)[ItemsPerThread],
+                        T (&output)[ItemsPerThread],
+                        T init,
+                        BinaryFunction scan_op = BinaryFunction())
+    {
+        // Any item count other than one: pass both arrays straight to base_type.
+        if(ItemsPerThread != 1)
+        {
+            base_type::exclusive_scan(input, output, init, scan_op);
+            return;
+        }
+        // Exactly one item per thread: forward the lone element as a scalar.
+        base_type::exclusive_scan(input[0], output[0], init, scan_op);
+    }
+
+    /// \brief Performs exclusive scan and reduction across threads in a block.
+    ///
+    /// \tparam ItemsPerThread - number of items in the \p input array.
+    /// \tparam BinaryFunction - type of binary function used for scan. Default type
+    /// is rocprim::plus<T>.
+    ///
+    /// \param [in] input - reference to an array containing thread input values.
+    /// \param [out] output - reference to a thread output array. May be aliased with \p input.
+    /// \param [in] init - initial value used to start the exclusive scan. Should be the same
+    /// for all threads in a block.
+    /// \param [out] reduction - result of reducing of all \p input values in a block.
+    /// \param [in] storage - reference to a temporary storage object of type storage_type.
+    /// \param [in] scan_op - binary operation function object that will be used for scan.
+    /// The signature of the function should be equivalent to the following:
+    /// <tt>T f(const T &a, const T &b);</tt>. The signature does not need to have
+    /// <tt>const &</tt>, but function object must not modify the objects passed to it.
+    ///
+    /// \par Storage reusage
+    /// Synchronization barrier should be placed before \p storage is reused
+    /// or repurposed: \p __syncthreads() or \p rocprim::syncthreads().
+    ///
+    /// \par Examples
+    /// \parblock
+    /// The examples present exclusive maximum scan operations performed on a block of 128 threads,
+    /// each provides two \p long values.
+    ///
+    /// \code{.cpp}
+    /// __global__ void example_kernel(...) // blockDim.x = 128
+    /// {
+    ///     // specialize block_scan for long and block of 128 threads
+    ///     using block_scan_long = rocprim::block_scan<long, 128>;
+    ///     // allocate storage in shared memory for the block
+    ///     __shared__ block_scan_long::storage_type storage;
+    ///
+    ///     long init = ...;
+    ///     long input[2] = ...;
+    ///     long output[2];
+    ///     long reduction;
+    ///     // execute exclusive max scan
+    ///     block_scan_long().exclusive_scan(
+    ///         input,
+    ///         output,
+    ///         init,
+    ///         reduction,
+    ///         storage,
+    ///         rocprim::maximum<long>()
+    ///     );
+    ///     ...
+    /// }
+    /// \endcode
+    ///
+    /// If the \p input values across threads in a block are <tt>{-1, 2, -3, 4, ..., -255, 256}</tt>
+    /// and \p init is 0, then \p output values will be <tt>{0, 0, 2, 2, 4, ..., 254, 254}</tt>
+    /// and the \p reduction will be \p 256.
+    /// \endparblock
+    template<unsigned int ItemsPerThread, class BinaryFunction = ::rocprim::plus<T>>
+    ROCPRIM_DEVICE ROCPRIM_INLINE
+    void exclusive_scan(T (&input)[ItemsPerThread],
+                        T (&output)[ItemsPerThread],
+                        T init,
+                        T& reduction,
+                        storage_type& storage,
+                        BinaryFunction scan_op = BinaryFunction())
+    {
+        // Any item count other than one: pass both arrays straight to base_type.
+        if(ItemsPerThread != 1)
+        {
+            base_type::exclusive_scan(input, output, init, reduction, storage, scan_op);
+            return;
+        }
+        // Exactly one item per thread: forward the lone element as a scalar.
+        base_type::exclusive_scan(input[0], output[0], init, reduction, storage, scan_op);
+    }
+
+    /// \overload
+    /// \brief Performs exclusive scan and reduction across threads in a block.
+    ///
+    /// * This overload does not accept storage argument. Required shared memory is
+    /// allocated by the method itself.
+    ///
+    /// \tparam ItemsPerThread - number of items in the \p input array.
+    /// \tparam BinaryFunction - type of binary function used for scan. Default type
+    /// is rocprim::plus<T>.
+    ///
+    /// \param [in] input - reference to an array containing thread input values.
+    /// \param [out] output - reference to a thread output array. May be aliased with \p input.
+    /// \param [in] init - initial value used to start the exclusive scan. Should be the same
+    /// for all threads in a block.
+    /// \param [out] reduction - result of reducing of all \p input values in a block.
+    /// \param [in] scan_op - binary operation function object that will be used for scan.
+    /// The signature of the function should be equivalent to the following:
+    /// <tt>T f(const T &a, const T &b);</tt>. The signature does not need to have
+    /// <tt>const &</tt>, but function object must not modify the objects passed to it.
+    template<unsigned int ItemsPerThread, class BinaryFunction = ::rocprim::plus<T>>
+    ROCPRIM_DEVICE ROCPRIM_FORCE_INLINE
+    void exclusive_scan(T (&input)[ItemsPerThread],
+                        T (&output)[ItemsPerThread],
+                        T init,
+                        T& reduction,
+                        BinaryFunction scan_op = BinaryFunction())
+    {
+        // Any item count other than one: pass both arrays straight to base_type.
+        if(ItemsPerThread != 1)
+        {
+            base_type::exclusive_scan(input, output, init, reduction, scan_op);
+            return;
+        }
+        // Exactly one item per thread: forward the lone element as a scalar.
+        base_type::exclusive_scan(input[0], output[0], init, reduction, scan_op);
+    }
+
+    /// \brief Performs exclusive scan across threads in a block, and uses
+    /// \p prefix_callback_op to generate prefix value for the whole block.
+    ///
+    /// \tparam ItemsPerThread - number of items in the \p input array.
+    /// \tparam PrefixCallback - type of the unary function object used for generating
+    /// block-wide prefix value for the scan operation.
+    /// \tparam BinaryFunction - type of binary function used for scan. Default type
+    /// is rocprim::plus<T>.
+    ///
+    /// \param [in] input - reference to an array containing thread input values.
+    /// \param [out] output - reference to a thread output array. May be aliased with \p input.
+    /// \param [in] storage - reference to a temporary storage object of type storage_type.
+    /// \param [in,out] prefix_callback_op - function object for generating block prefix value.
+    /// The signature of the \p prefix_callback_op should be equivalent to the following:
+    /// <tt>T f(const T &block_reduction);</tt>. The signature does not need to have
+    /// <tt>const &</tt>, but function object must not modify the objects passed to it.
+    /// The object will be called by the first warp of the block with block reduction of
+    /// \p input values as input argument. The result of the first thread will be used as the
+    /// block-wide prefix.
+    /// \param [in] scan_op - binary operation function object that will be used for scan.
+    /// The signature of the function should be equivalent to the following:
+    /// <tt>T f(const T &a, const T &b);</tt>. The signature does not need to have
+    /// <tt>const &</tt>, but function object must not modify the objects passed to it.
+    ///
+    /// \par Storage reusage
+    /// Synchronization barrier should be placed before \p storage is reused
+    /// or repurposed: \p __syncthreads() or \p rocprim::syncthreads().
+    ///
+    /// \par Examples
+    /// \parblock
+    /// The examples present exclusive prefix sum operations performed on a block of 128 threads,
+    /// each thread provides two \p int values.
+    ///
+    /// \code{.cpp}
+    ///
+    /// struct my_block_prefix
+    /// {
+    ///     int prefix;
+    ///
+    ///     __device__ my_block_prefix(int prefix) : prefix(prefix) {}
+    ///
+    ///     __device__ int operator()(int block_reduction)
+    ///     {
+    ///         int old_prefix = prefix;
+    ///         prefix = prefix + block_reduction;
+    ///         return old_prefix;
+    ///     }
+    /// };
+    ///
+    /// __global__ void example_kernel(...) // blockDim.x = 128
+    /// {
+    ///     // specialize block_scan for int and block of 128 threads
+    ///     using block_scan_int = rocprim::block_scan<int, 128>;
+    ///     // allocate storage in shared memory for the block
+    ///     __shared__ block_scan_int::storage_type storage;
+    ///
+    ///     // init prefix functor
+    ///     my_block_prefix prefix_callback(10);
+    ///
+    ///     int input[2] = ...;
+    ///     int output[2];
+    ///     // execute exclusive prefix sum
+    ///     block_scan_int().exclusive_scan(
+    ///         input,
+    ///         output,
+    ///         storage,
+    ///         prefix_callback,
+    ///         rocprim::plus<int>()
+    ///     );
+    ///     ...
+    /// }
+    /// \endcode
+    ///
+    /// If the \p input values across threads in a block are <tt>{1, 1, 1, ..., 1}</tt>, then
+    /// \p output values will be <tt>{10, 11, 12, 13, ..., 265}</tt>, and the \p prefix will
+    /// be <tt>266</tt>.
+    /// \endparblock
+    template<
+        unsigned int ItemsPerThread,
+        class PrefixCallback,
+        // Defaulted so the declaration agrees with its documentation and with the
+        // scalar prefix-callback overload of exclusive_scan. The type deduced from
+        // scan_op still takes precedence, so existing callers are unaffected.
+        class BinaryFunction = ::rocprim::plus<T>
+    >
+    ROCPRIM_DEVICE ROCPRIM_INLINE
+    void exclusive_scan(T (&input)[ItemsPerThread],
+                        T (&output)[ItemsPerThread],
+                        storage_type& storage,
+                        PrefixCallback& prefix_callback_op,
+                        BinaryFunction scan_op)
+    {
+        // A single item per thread is forwarded as a scalar; any other item
+        // count forwards the whole arrays. Both paths go through base_type.
+        if(ItemsPerThread == 1)
+        {
+            base_type::exclusive_scan(input[0], output[0], storage, prefix_callback_op, scan_op);
+        }
+        else
+        {
+            base_type::exclusive_scan(input, output, storage, prefix_callback_op, scan_op);
+        }
+    }
+};
+
+END_ROCPRIM_NAMESPACE
+
+/// @}
+// end of group blockmodule
+
+#endif // ROCPRIM_BLOCK_BLOCK_SCAN_HPP_
--- a/3rdparty/cub/rocprim/block/block_shuffle.hpp
+++ b/3rdparty/cub/rocprim/block/block_shuffle.hpp
+/******************************************************************************
+ * Copyright (c) 2011, Duane Merrill.  All rights reserved.
+ * Copyright (c) 2011-2018, NVIDIA CORPORATION.  All rights reserved.
+ * Modifications Copyright (c) 2021, Advanced Micro Devices, Inc.  All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of the NVIDIA CORPORATION nor the
+ *       names of its contributors may be used to endorse or promote products
+ *       derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ******************************************************************************/
+
+#ifndef ROCPRIM_BLOCK_BLOCK_SHUFFLE_HPP_
+#define ROCPRIM_BLOCK_BLOCK_SHUFFLE_HPP_
+
+#include <type_traits>
+
+#include "../config.hpp"
+#include "../detail/various.hpp"
+
+#include "../intrinsics.hpp"
+#include "../functional.hpp"
+
+#include "detail/block_reduce_warp_reduce.hpp"
+#include "detail/block_reduce_raking_reduce.hpp"
+
+/// \addtogroup blockmodule
+/// @{
+
+BEGIN_ROCPRIM_NAMESPACE
+
+/// \brief The block_shuffle class is a block level parallel primitive which provides methods
+/// for shuffling data partitioned across a block
+///
+/// \tparam T - the input/output type.
+/// \tparam BlockSizeX - the number of threads in a block's x dimension, it has no defaults value.
+/// \tparam BlockSizeY - the number of threads in a block's y dimension, defaults to 1.
+/// \tparam BlockSizeZ - the number of threads in a block's z dimension, defaults to 1.
+///
+/// \par Overview
+/// It is commonplace for blocks of threads to rearrange data items between
+/// threads.  The BlockShuffle abstraction allows threads to efficiently shift items
+/// either (a) up to their successor or (b) down to their predecessor.
+/// * Computation can be more efficient when:
+///   * \p ItemsPerThread is greater than one,
+///   * \p T is an arithmetic type,
+///   * the number of threads in the block is a multiple of the hardware warp size (see rocprim::warp_size()).
+///
+/// \par Examples
+/// \parblock
+/// In the examples shuffle operation is performed on block of 192 threads, each provides
+/// one \p int value, result is returned using the same variable as for input.
+///
+/// \code{.cpp}
+/// __global__ void example_kernel(...)
+/// {
+///     // specialize block__shuffle_int for int and a block of 192 threads
+///     using block__shuffle_int = rocprim::block_shuffle<int, 192>;
+///     // allocate storage in shared memory
+///     __shared__ block__shuffle_int::storage_type storage;
+///
+///     int value = ...;
+///     // execute block shuffle
+///     block__shuffle_int().offset(
+///         rocprim::flat_block_thread_id<192, 1, 1>(), // flat thread id
+///         value, // input
+///         value, // output
+///         1,     // distance
+///         storage
+///     );
+///     ...
+/// }
+/// \endcode
+/// \endparblock
+template<
+    class T,
+    unsigned int BlockSizeX,
+    unsigned int BlockSizeY = 1,
+    unsigned int BlockSizeZ = 1>
+class block_shuffle
+{
+    // Total number of threads in the block (product of the three dimensions).
+    static constexpr unsigned int BlockSize = BlockSizeX * BlockSizeY * BlockSizeZ;
+
+    // Struct used for creating a raw_storage object for this primitive's temporary storage.
+    //  * prev - exchange buffer written by offset(), rotate() and up()
+    //  * next - exchange buffer written by down()
+    struct storage_type_
+    {
+        T prev[BlockSize];
+        T next[BlockSize];
+    };
+
+public:
+
+    /// \brief Struct used to allocate a temporary memory that is required for thread
+    /// communication during operations provided by related parallel primitive.
+    ///
+    /// Depending on the implementation the operations exposed by parallel primitive may
+    /// require a temporary storage for thread communication. The storage should be allocated
+    /// using keywords <tt>__shared__</tt>. It can be aliased to
+    /// an externally allocated memory, or be a part of a union type with other storage types
+    /// to increase shared memory reusability.
+    #ifndef DOXYGEN_SHOULD_SKIP_THIS // hides storage_type implementation for Doxygen
+        using storage_type = detail::raw_storage<storage_type_>;
+    #else
+        using storage_type = storage_type_; // only for Doxygen
+    #endif
+
+    /// \brief Shuffles data across threads in a block, offset by the distance value.
+    ///
+    /// \par A thread with flat id \p i receives the \p input of the thread with flat id
+    /// <tt>i + distance</tt>, where \p distance may be a negative value. Temporary storage is
+    /// allocated by the method itself.
+    /// \par If <tt>i + distance</tt> is not a valid thread id (i.e. it is < 0 or >= BlockSize),
+    /// \p output is left unmodified for that thread.
+    ///
+    /// \param [in] input - input data to be shuffled to another thread.
+    /// \param [out] output - reference to a output value, that receives data from another thread
+    /// \param [in] distance - receiving threadId + distance = source threadId.
+    ///
+    /// \par Example.
+    /// \code{.cpp}
+    /// __global__ void example_kernel(...)
+    /// {
+    ///     // specialize block__shuffle_int for int and a block of 192 threads
+    ///     using block__shuffle_int = rocprim::block_shuffle<int, 192>;
+    ///
+    ///     int value = ...;
+    ///     // execute block shuffle
+    ///     block__shuffle_int().offset(
+    ///         value, // input
+    ///         value  // output
+    ///     );
+    ///     ...
+    /// }
+    /// \endcode
+    ROCPRIM_DEVICE ROCPRIM_FORCE_INLINE
+    void offset(T input,
+                T& output,
+                int distance = 1)
+    {
+        offset(
+            ::rocprim::flat_block_thread_id<BlockSizeX, BlockSizeY, BlockSizeZ>(),
+            input, output, distance
+        );
+    }
+
+    // Same as offset(input, output, distance), but the caller supplies the flat thread id.
+    // Temporary storage is declared by the method itself.
+    ROCPRIM_DEVICE ROCPRIM_FORCE_INLINE
+    void offset(const size_t& flat_id,
+                T input,
+                T& output,
+                int distance)
+    {
+        ROCPRIM_SHARED_MEMORY storage_type storage;
+        offset(flat_id, input, output, distance, storage);
+    }
+
+    // Same as offset(input, output, distance), but the caller supplies both the flat
+    // thread id and the temporary storage.
+    ROCPRIM_DEVICE ROCPRIM_INLINE
+    void offset(const size_t& flat_id,
+                T input,
+                T& output,
+                int distance,
+                storage_type& storage)
+    {
+        storage_type_& storage_ = storage.get();
+        storage_.prev[flat_id] = input;
+
+        // Every slot must be written before any thread reads a neighbour's slot.
+        ::rocprim::syncthreads();
+
+        // Signed arithmetic: distance may be negative.
+        const int offset_tid = static_cast<int>(flat_id) + distance;
+        if ((offset_tid >= 0) && (offset_tid < (int)BlockSize))
+        {
+            output = storage_.prev[static_cast<size_t>(offset_tid)];
+        }
+    }
+
+    /// \brief Shuffles data across threads in a block, rotated by the distance value.
+    ///
+    /// \par A thread with flat id \p i receives the \p input of the thread with flat id
+    /// <tt>(i + distance) % BlockSize</tt>. Temporary storage is allocated by the method itself.
+    /// \par Data is rotated around the block, using (threadId + distance) modulo BlockSize
+    /// to ensure valid threadIds.
+    ///
+    /// \param [in] input - input data to be shuffled to another thread.
+    /// \param [out] output - reference to a output value, that receives data from another thread
+    /// \param [in] distance - (receiving threadId + distance) modulo BlockSize = source threadId.
+    ///
+    /// \par Example.
+    /// \code{.cpp}
+    /// __global__ void example_kernel(...)
+    /// {
+    ///     // specialize block__shuffle_int for int and a block of 192 threads
+    ///     using block__shuffle_int = rocprim::block_shuffle<int, 192>;
+    ///
+    ///     int value = ...;
+    ///     // execute block shuffle
+    ///     block__shuffle_int().rotate(
+    ///         value, // input
+    ///         value  // output
+    ///     );
+    ///     ...
+    /// }
+    /// \endcode
+    ROCPRIM_DEVICE ROCPRIM_FORCE_INLINE
+    void rotate(T input,
+                T& output,
+                unsigned int distance = 1)
+    {
+        rotate(
+            ::rocprim::flat_block_thread_id<BlockSizeX, BlockSizeY, BlockSizeZ>(),
+            input, output, distance
+        );
+    }
+
+    // Same as rotate(input, output, distance), but the caller supplies the flat thread id.
+    // Temporary storage is declared by the method itself.
+    ROCPRIM_DEVICE ROCPRIM_FORCE_INLINE
+    void rotate(const size_t& flat_id,
+                T input,
+                T& output,
+                unsigned int distance)
+    {
+        ROCPRIM_SHARED_MEMORY storage_type storage;
+        rotate(flat_id, input, output, distance, storage);
+    }
+
+    // Same as rotate(input, output, distance), but the caller supplies both the flat
+    // thread id and the temporary storage.
+    ROCPRIM_DEVICE ROCPRIM_INLINE
+    void rotate(const size_t& flat_id,
+                T input,
+                T& output,
+                unsigned int distance,
+                storage_type& storage)
+    {
+        storage_type_& storage_ = storage.get();
+        storage_.prev[flat_id] = input;
+
+        // Every slot must be written before any thread reads a neighbour's slot.
+        ::rocprim::syncthreads();
+
+        // Index with the flat thread id (the same id used for the write above) so that
+        // multi-dimensional blocks rotate correctly, and reduce the distance first so
+        // that a distance >= BlockSize still maps to a valid slot.
+        unsigned int offset = static_cast<unsigned int>(flat_id) + (distance % BlockSize);
+        if (offset >= BlockSize)
+            offset -= BlockSize;
+
+        output = storage_.prev[offset];
+    }
+
+
+    /// \brief The thread block rotates a blocked arrange of input items,
+    /// shifting it up by one item
+    ///
+    /// \param [in]  input -  The calling thread's input items
+    /// \param [out] prev  -  The corresponding predecessor items (may be aliased to \p input).
+    /// The item \p prev[0] is not updated for <em>thread</em><sub>0</sub>.
+    ///
+    /// \par Example.
+    /// \code{.cpp}
+    /// __global__ void example_kernel(...)
+    /// {
+    ///     // specialize block__shuffle_int for int and a block of 192 threads
+    ///     using block__shuffle_int = rocprim::block_shuffle<int, 192>;
+    ///
+    ///     int items[2] = ...;
+    ///     // execute block shuffle
+    ///     block__shuffle_int().up(
+    ///         items, // input
+    ///         items  // output
+    ///     );
+    ///     ...
+    /// }
+    /// \endcode
+    template <unsigned int ItemsPerThread>
+    ROCPRIM_DEVICE ROCPRIM_FORCE_INLINE
+    void up(T (&input)[ItemsPerThread],
+            T (&prev)[ItemsPerThread])
+    {
+        this->up(
+            ::rocprim::flat_block_thread_id<BlockSizeX, BlockSizeY, BlockSizeZ>(),
+            input, prev
+        );
+    }
+
+    // Same as up(input, prev), but the caller supplies the flat thread id.
+    // Temporary storage is declared by the method itself.
+    template <unsigned int ItemsPerThread>
+    ROCPRIM_DEVICE ROCPRIM_FORCE_INLINE
+    void up(const size_t& flat_id,
+            T (&input)[ItemsPerThread],
+            T (&prev)[ItemsPerThread])
+    {
+        ROCPRIM_SHARED_MEMORY storage_type storage;
+        this->up(flat_id, input, prev, storage);
+    }
+
+
+    // Same as up(input, prev), but the caller supplies both the flat thread id and the
+    // temporary storage.
+    template <unsigned int ItemsPerThread>
+    ROCPRIM_DEVICE ROCPRIM_INLINE
+    void up(const size_t& flat_id,
+            T (&input)[ItemsPerThread],
+            T (&prev)[ItemsPerThread],
+            storage_type& storage)
+    {
+        // Publish this thread's last item; it becomes the first item of the next thread.
+        storage_type_& storage_ = storage.get();
+        storage_.prev[flat_id] = input[ItemsPerThread -1];
+
+        ::rocprim::syncthreads();
+
+        // Walk from the back so that prev may alias input.
+        ROCPRIM_UNROLL
+        for (unsigned int i = ItemsPerThread - 1; i > 0; --i)
+        {
+            prev[i] = input[i - 1];
+        }
+
+        // Thread 0 has no predecessor, so its prev[0] is left untouched.
+        if (flat_id > 0)
+        {
+            prev[0] = storage_.prev[flat_id - 1];
+        }
+    }
+
+
+
+    /// \brief The thread block rotates a blocked arrange of input items,
+    /// shifting it up by one item
+    ///
+    /// \param [in]  input - The calling thread's input items
+    /// \param [out] prev  - The corresponding predecessor items (may be aliased to \p input).
+    /// The item \p prev[0] is not updated for <em>thread</em><sub>0</sub>.
+    /// \param [out] block_suffix - The item \p input[ItemsPerThread-1] from
+    /// <em>thread</em><sub><tt>BlockSize-1</tt></sub>, provided to all threads
+    template <unsigned int ItemsPerThread>
+    ROCPRIM_DEVICE ROCPRIM_FORCE_INLINE
+    void up(T (&input)[ItemsPerThread],
+            T (&prev)[ItemsPerThread],
+            T &block_suffix)
+    {
+        this->up(
+            ::rocprim::flat_block_thread_id<BlockSizeX, BlockSizeY, BlockSizeZ>(),
+            input, prev, block_suffix
+        );
+    }
+
+    // Same as up(input, prev, block_suffix), but the caller supplies the flat thread id.
+    // Temporary storage is declared by the method itself.
+    template <unsigned int ItemsPerThread>
+    ROCPRIM_DEVICE ROCPRIM_FORCE_INLINE
+    void up(const size_t& flat_id,
+            T (&input)[ItemsPerThread],
+            T (&prev)[ItemsPerThread],
+            T &block_suffix)
+    {
+        ROCPRIM_SHARED_MEMORY storage_type storage;
+        this->up(flat_id, input, prev, block_suffix, storage);
+    }
+
+    // Same as up(input, prev, block_suffix), but the caller supplies both the flat
+    // thread id and the temporary storage.
+    template <unsigned int ItemsPerThread>
+    ROCPRIM_DEVICE ROCPRIM_INLINE
+    void up(const size_t& flat_id,
+            T (&input)[ItemsPerThread],
+            T (&prev)[ItemsPerThread],
+            T &block_suffix,
+            storage_type& storage)
+    {
+        this->up(flat_id, input, prev, storage);
+
+        // Update block suffix. The call above has already written every slot and passed
+        // a barrier, so the last thread's slot is safe to read here. The buffer is reached
+        // through get(), as in every other method of this class.
+        block_suffix = storage.get().prev[BlockSize - 1];
+    }
+
+    /// \brief The thread block rotates a blocked arrange of input items,
+    /// shifting it down by one item
+    ///
+    /// \param [in]  input -  The calling thread's input items
+    /// \param [out] next  -  The corresponding successor items (may be aliased to \p input).
+    /// The item \p next[ItemsPerThread-1] is not updated for
+    /// <em>thread</em><sub>BlockSize - 1</sub>.
+    ///
+    /// \par Example.
+    /// \code{.cpp}
+    /// __global__ void example_kernel(...)
+    /// {
+    ///     // specialize block__shuffle_int for int and a block of 192 threads
+    ///     using block__shuffle_int = rocprim::block_shuffle<int, 192>;
+    ///
+    ///     int items[2] = ...;
+    ///     // execute block shuffle
+    ///     block__shuffle_int().down(
+    ///         items, // input
+    ///         items  // output
+    ///     );
+    ///     ...
+    /// }
+    /// \endcode
+    template <unsigned int ItemsPerThread>
+    ROCPRIM_DEVICE ROCPRIM_FORCE_INLINE
+    void down(T (&input)[ItemsPerThread],
+              T (&next)[ItemsPerThread])
+    {
+        this->down(
+            ::rocprim::flat_block_thread_id<BlockSizeX, BlockSizeY, BlockSizeZ>(),
+            input, next
+        );
+    }
+
+    // Same as down(input, next), but the caller supplies the flat thread id.
+    // Temporary storage is declared by the method itself.
+    template <unsigned int ItemsPerThread>
+    ROCPRIM_DEVICE ROCPRIM_FORCE_INLINE
+    void down(const size_t& flat_id,
+              T (&input)[ItemsPerThread],
+              T (&next)[ItemsPerThread])
+    {
+        ROCPRIM_SHARED_MEMORY storage_type storage;
+        this->down(flat_id, input, next, storage);
+    }
+
+    // Same as down(input, next), but the caller supplies both the flat thread id and
+    // the temporary storage.
+    template <unsigned int ItemsPerThread>
+    ROCPRIM_DEVICE ROCPRIM_INLINE
+    void down(const size_t& flat_id,
+              T (&input)[ItemsPerThread],
+              T (&next)[ItemsPerThread],
+              storage_type& storage)
+    {
+        // Publish this thread's first item; it becomes the last item of the previous thread.
+        storage_type_& storage_ = storage.get();
+        storage_.next[flat_id] = input[0];
+
+        ::rocprim::syncthreads();
+
+        // Walk from the front so that next may alias input.
+        ROCPRIM_UNROLL
+        for (unsigned int i = 0; i < (ItemsPerThread - 1); ++i)
+        {
+          next[i] = input[i + 1];
+        }
+
+        // The last thread has no successor, so its final item is left untouched.
+        if (flat_id <(BlockSize -1))
+        {
+          next[ItemsPerThread -1] = storage_.next[flat_id + 1];
+        }
+    }
+
+    /// \brief The thread block rotates a blocked arrange of input items,
+    /// shifting it down by one item
+    ///
+    /// \param [in]  input -  The calling thread's input items
+    /// \param [out] next  -  The corresponding successor items (may be aliased to \p input).
+    /// The item \p next[ItemsPerThread-1] is not updated for
+    /// <em>thread</em><sub>BlockSize - 1</sub>.
+    /// \param [out] block_prefix -  The item \p input[0] from <em>thread</em><sub><tt>0</tt></sub>, provided to all threads
+    template <unsigned int ItemsPerThread>
+    ROCPRIM_DEVICE ROCPRIM_FORCE_INLINE
+    void down(T (&input)[ItemsPerThread],
+              T (&next)[ItemsPerThread],
+              T &block_prefix)
+    {
+        this->down(
+            ::rocprim::flat_block_thread_id<BlockSizeX, BlockSizeY, BlockSizeZ>(),
+            input, next, block_prefix
+        );
+    }
+
+    // Same as down(input, next, block_prefix), but the caller supplies the flat thread id.
+    // Temporary storage is declared by the method itself.
+    template <unsigned int ItemsPerThread>
+    ROCPRIM_DEVICE ROCPRIM_FORCE_INLINE
+    void down(const size_t& flat_id,
+              T (&input)[ItemsPerThread],
+              T (&next)[ItemsPerThread],
+              T &block_prefix)
+    {
+        ROCPRIM_SHARED_MEMORY storage_type storage;
+        this->down(flat_id, input, next, block_prefix, storage);
+    }
+
+    // Same as down(input, next, block_prefix), but the caller supplies both the flat
+    // thread id and the temporary storage.
+    template <unsigned int ItemsPerThread>
+    ROCPRIM_DEVICE ROCPRIM_INLINE
+    void down(const size_t& flat_id,
+              T (&input)[ItemsPerThread],
+              T (&next)[ItemsPerThread],
+              T &block_prefix,
+              storage_type& storage)
+    {
+        this->down(flat_id, input, next, storage);
+
+        // Update block prefix. The call above has already written every slot and passed
+        // a barrier, so thread 0's slot is safe to read here. The buffer is reached
+        // through get(), as in every other method of this class.
+        block_prefix = storage.get().next[0];
+    }
+};
+
+
+END_ROCPRIM_NAMESPACE
+
+/// @}
+// end of group blockmodule
+
+#endif // ROCPRIM_BLOCK_BLOCK_SHUFFLE_HPP_
--- a/3rdparty/cub/rocprim/block/block_sort.hpp
+++ b/3rdparty/cub/rocprim/block/block_sort.hpp
+// Copyright (c) 2017-2020 Advanced Micro Devices, Inc. All rights reserved.
+//
+// Permission is hereby granted, free of charge, to any person obtaining a copy
+// of this software and associated documentation files (the "Software"), to deal
+// in the Software without restriction, including without limitation the rights
+// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+// copies of the Software, and to permit persons to whom the Software is
+// furnished to do so, subject to the following conditions:
+//
+// The above copyright notice and this permission notice shall be included in
+// all copies or substantial portions of the Software.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+// THE SOFTWARE.
+
+#ifndef ROCPRIM_BLOCK_BLOCK_SORT_HPP_
+#define ROCPRIM_BLOCK_BLOCK_SORT_HPP_
+
+#include <type_traits>
+
+#include "../config.hpp"
+#include "../detail/various.hpp"
+
+#include "../intrinsics.hpp"
+#include "../functional.hpp"
+
+#include "detail/block_sort_bitonic.hpp"
+
+/// \addtogroup blockmodule
+/// @{
+
+BEGIN_ROCPRIM_NAMESPACE
+
+/// \brief Available algorithms for block_sort primitive.
+enum class block_sort_algorithm
+{
+    /// \brief A bitonic sort based algorithm.
+    bitonic_sort,
+    /// \brief Default block_sort algorithm (an alias of \p bitonic_sort).
+    default_algorithm = bitonic_sort,
+};
+
+namespace detail
+{
+
+// Maps a block_sort_algorithm enumerator to the class template implementing it.
+// Only the primary template is declared here; each supported algorithm supplies
+// a specialization that exposes a nested alias template named `type`.
+template<block_sort_algorithm Algorithm>
+struct select_block_sort_impl;
+
+// block_sort_algorithm::bitonic_sort is implemented by block_sort_bitonic.
+template<>
+struct select_block_sort_impl<block_sort_algorithm::bitonic_sort>
+{
+    template<
+        class Key,
+        unsigned int BlockSizeX,
+        unsigned int BlockSizeY,
+        unsigned int BlockSizeZ,
+        unsigned int ItemsPerThread,
+        class Value
+    >
+    using type = block_sort_bitonic<
+        Key,
+        BlockSizeX,
+        BlockSizeY,
+        BlockSizeZ,
+        ItemsPerThread,
+        Value
+    >;
+};
+
+} // end namespace detail
+
+/// \brief The block_sort class is a block level parallel primitive which provides
+/// methods sorting items (keys or key-value pairs) partitioned across threads in a block
+/// using comparison-based sort algorithm.
+///
+/// \tparam Key - the key type.
+/// \tparam BlockSizeX - the number of threads in a block's x dimension.
+/// \tparam ItemsPerThread - number of items processed by each thread.
+/// The total range will be BlockSize * ItemsPerThread long
+/// \tparam Value - the value type. Default type empty_type indicates
+/// a keys-only sort.
+/// \tparam Algorithm - selected sort algorithm, block_sort_algorithm::default_algorithm by default.
+/// \tparam BlockSizeY - the number of threads in a block's y dimension, defaults to 1.
+/// \tparam BlockSizeZ - the number of threads in a block's z dimension, defaults to 1.
+///
+/// \par Overview
+/// * Accepts custom compare_functions for sorting across a block.
+/// * Performance depends on \p BlockSize.
+///   * It is better if \p BlockSize is a power of two.
+///   * If \p BlockSize is not a power of two, or when function with \p size overload is used
+///     odd-even sort is used instead of bitonic sort, leading to decreased performance.
+///
+/// \par Examples
+/// \parblock
+/// In the examples sort is performed on a block of 256 threads, each thread provides
+/// one \p int value, results are returned using the same variable as for input.
+///
+/// \code{.cpp}
+/// __global__ void example_kernel(...)
+/// {
+///     // specialize block_sort for int, block of 256 threads,
+///     // key-only sort
+///     using block_sort_int = rocprim::block_sort<int, 256>;
+///     // allocate storage in shared memory
+///     __shared__ block_sort_int::storage_type storage;
+///
+///     int input = ...;
+///     // execute block sort (ascending)
+///     block_sort_int().sort(
+///         input,
+///         storage
+///     );
+///     ...
+/// }
+/// \endcode
+/// \endparblock
+template<
+    class Key,
+    unsigned int BlockSizeX,
+    unsigned int ItemsPerThread = 1,
+    class Value = empty_type,
+    block_sort_algorithm Algorithm = block_sort_algorithm::default_algorithm,
+    unsigned int BlockSizeY = 1,
+    unsigned int BlockSizeZ = 1
+>
+class block_sort
+#ifndef DOXYGEN_SHOULD_SKIP_THIS
+    : private detail::select_block_sort_impl<Algorithm>::template type<Key, BlockSizeX, BlockSizeY, BlockSizeZ, ItemsPerThread, Value>
+#endif
+{
+    using base_type = typename detail::select_block_sort_impl<Algorithm>::template type<Key, BlockSizeX, BlockSizeY, BlockSizeZ, ItemsPerThread, Value>;
+public:
+    /// \brief Struct used to allocate a temporary memory that is required for thread
+    /// communication during operations provided by related parallel primitive.
+    ///
+    /// Depending on the implemention the operations exposed by parallel primitive may
+    /// require a temporary storage for thread communication. The storage should be allocated
+    /// using keywords <tt>__shared__</tt>. It can be aliased to
+    /// an externally allocated memory, or be a part of a union type with other storage types
+    /// to increase shared memory reusability.
+    using storage_type = typename base_type::storage_type;
+
+    /// \brief Block sort for any data type.
+    ///
+    /// \tparam BinaryFunction - type of binary function used for sort. Default type
+    /// is rocprim::less<T>.
+    ///
+    /// \param [in, out] thread_key - reference to a key provided by a thread.
+    /// \param [in] compare_function - comparison function object which returns true if the
+    /// first argument is ordered before the second.
+    /// The signature of the function should be equivalent to the following:
+    /// <tt>bool f(const T &a, const T &b);</tt>. The signature does not need to have
+    /// <tt>const &</tt>, but function object must not modify the objects passed to it.
+    template<class BinaryFunction = ::rocprim::less<Key>>
+    ROCPRIM_DEVICE ROCPRIM_FORCE_INLINE
+    void sort(Key& thread_key,
+              BinaryFunction compare_function = BinaryFunction())
+    {
+        // Forward to the implementation selected by Algorithm (base_type). No storage
+        // argument is passed, so any temporary storage is left to that implementation.
+        base_type::sort(thread_key, compare_function);
+    }
+
+    /// \brief Block sort for any data type, taking an array of \p ItemsPerThread keys per thread.
+    ///
+    /// \tparam BinaryFunction - type of binary function used for sort. Default type
+    /// is rocprim::less<T>.
+    ///
+    /// \param [in, out] thread_keys - reference to an array of keys provided by a thread.
+    /// \param [in] compare_function - comparison function object which returns true if the
+    /// first argument is ordered before the second.
+    template <class BinaryFunction = ::rocprim::less<Key>>
+    ROCPRIM_DEVICE ROCPRIM_FORCE_INLINE
+    void sort(Key (&thread_keys)[ItemsPerThread],
+              BinaryFunction compare_function = BinaryFunction())
+    {
+        // Forward to the implementation selected by Algorithm (base_type).
+        base_type::sort(thread_keys, compare_function);
+    }
+
+    /// \brief Block sort for any data type.
+    ///
+    /// \tparam BinaryFunction - type of binary function used for sort. Default type
+    /// is rocprim::less<T>.
+    ///
+    /// \param [in, out] thread_key - reference to a key provided by a thread.
+    /// \param [in] storage - reference to a temporary storage object of type storage_type.
+    /// \param [in] compare_function - comparison function object which returns true if the
+    /// first argument is ordered before the second.
+    /// The signature of the function should be equivalent to the following:
+    /// <tt>bool f(const T &a, const T &b);</tt>. The signature does not need to have
+    /// <tt>const &</tt>, but function object must not modify the objects passed to it.
+    ///
+    /// \par Storage reusage
+    /// Synchronization barrier should be placed before \p storage is reused
+    /// or repurposed: \p __syncthreads() or \p rocprim::syncthreads().
+    ///
+    /// \par Examples
+    /// \parblock
+    /// In the examples sort is performed on a block of 256 threads, each thread provides
+    /// one \p int value, results are returned using the same variable as for input.
+    ///
+    /// \code{.cpp}
+    /// __global__ void example_kernel(...)
+    /// {
+    ///     // specialize block_sort for int, block of 256 threads,
+    ///     // key-only sort
+    ///     using block_sort_int = rocprim::block_sort<int, 256>;
+    ///     // allocate storage in shared memory
+    ///     __shared__ block_sort_int::storage_type storage;
+    ///
+    ///     int input = ...;
+    ///     // execute block sort (ascending)
+    ///     block_sort_int().sort(
+    ///         input,
+    ///         storage
+    ///     );
+    ///     ...
+    /// }
+    /// \endcode
+    /// \endparblock
+    template<class BinaryFunction = ::rocprim::less<Key>>
+    ROCPRIM_DEVICE ROCPRIM_INLINE
+    void sort(Key& thread_key,
+              storage_type& storage,
+              BinaryFunction compare_function = BinaryFunction())
+    {
+        // Forward the key, the caller-provided storage and the comparator to the
+        // implementation selected by Algorithm (base_type).
+        base_type::sort(thread_key, storage, compare_function);
+    }
+
+    /// \brief Block sort for any data type, taking an array of \p ItemsPerThread keys per thread.
+    ///
+    /// \tparam BinaryFunction - type of binary function used for sort. Default type
+    /// is rocprim::less<T>.
+    ///
+    /// \param [in, out] thread_keys - reference to an array of keys provided by a thread.
+    /// \param [in] storage - reference to a temporary storage object of type storage_type.
+    /// \param [in] compare_function - comparison function object which returns true if the
+    /// first argument is ordered before the second.
+    template <class BinaryFunction = ::rocprim::less<Key>>
+    ROCPRIM_DEVICE ROCPRIM_INLINE 
+    void sort(Key (&thread_keys)[ItemsPerThread],
+              storage_type&  storage,
+              BinaryFunction compare_function = BinaryFunction())
+    {
+        // Forward to the implementation selected by Algorithm (base_type).
+        base_type::sort(thread_keys, storage, compare_function);
+    }
+
+    /// \brief Block sort by key for any data type.
+    ///
+    /// \tparam BinaryFunction - type of binary function used for sort. Default type
+    /// is rocprim::less<T>.
+    ///
+    /// \param [in, out] thread_key - reference to a key provided by a thread.
+    /// \param [in, out] thread_value - reference to a value provided by a thread.
+    /// \param [in] compare_function - comparison function object which returns true if the
+    /// first argument is ordered before the second.
+    /// The signature of the function should be equivalent to the following:
+    /// <tt>bool f(const T &a, const T &b);</tt>. The signature does not need to have
+    /// <tt>const &</tt>, but function object must not modify the objects passed to it.
+    template<class BinaryFunction = ::rocprim::less<Key>>
+    ROCPRIM_DEVICE ROCPRIM_FORCE_INLINE
+    void sort(Key& thread_key,
+              Value& thread_value,
+              BinaryFunction compare_function = BinaryFunction())
+    {
+        // Forward the key/value pair and the comparator to the implementation selected by
+        // Algorithm (base_type). No storage argument is passed, so any temporary storage
+        // is left to that implementation.
+        base_type::sort(thread_key, thread_value, compare_function);
+    }
+
+    /// \brief Block sort by key for any data type, taking arrays of \p ItemsPerThread
+    /// keys and values per thread.
+    ///
+    /// \tparam BinaryFunction - type of binary function used for sort. Default type
+    /// is rocprim::less<T>.
+    ///
+    /// \param [in, out] thread_keys - reference to an array of keys provided by a thread.
+    /// \param [in, out] thread_values - reference to an array of values provided by a thread.
+    /// \param [in] compare_function - comparison function object which returns true if the
+    /// first argument is ordered before the second.
+    template<class BinaryFunction = ::rocprim::less<Key>>
+    ROCPRIM_DEVICE ROCPRIM_FORCE_INLINE
+    void sort(Key (&thread_keys)[ItemsPerThread],
+              Value (&thread_values)[ItemsPerThread],
+              BinaryFunction compare_function = BinaryFunction())
+    {
+        // Forward to the implementation selected by Algorithm (base_type).
+        base_type::sort(thread_keys, thread_values, compare_function);
+    }
+
+    /// \brief Block sort by key for any data type.
+    ///
+    /// \tparam BinaryFunction - type of binary function used for sort. Default type
+    /// is rocprim::less<T>.
+    ///
+    /// \param [in, out] thread_key - reference to a key provided by a thread.
+    /// \param [in, out] thread_value - reference to a value provided by a thread.
+    /// \param [in] storage - reference to a temporary storage object of type storage_type.
+    /// \param [in] compare_function - comparison function object which returns true if the
+    /// first argument is ordered before the second.
+    /// The signature of the function should be equivalent to the following:
+    /// <tt>bool f(const T &a, const T &b);</tt>. The signature does not need to have
+    /// <tt>const &</tt>, but function object must not modify the objects passed to it.
+    ///
+    /// \par Storage reusage
+    /// Synchronization barrier should be placed before \p storage is reused
+    /// or repurposed: \p __syncthreads() or \p rocprim::syncthreads().
+    ///
+    /// \parblock
+    /// In the examples sort is performed on a block of 256 threads, each thread provides
+    /// one \p int key and one \p int value, results are returned using the same variable as for input.
+    ///
+    /// \code{.cpp}
+    /// __global__ void example_kernel(...)
+    /// {
+    ///     // specialize block_sort for int, block of 256 threads,
+    ///     using block_sort_int = rocprim::block_sort<int, 256, 1, int>;
+    ///     // allocate storage in shared memory
+    ///     __shared__ block_sort_int::storage_type storage;
+    ///
+    ///     int key = ...;
+    ///     int value = ...;
+    ///     // execute block sort (ascending)
+    ///     block_sort_int().sort(
+    ///         key,
+    ///         value,
+    ///         storage
+    ///     );
+    ///     ...
+    /// }
+    /// \endcode
+    /// \endparblock
+    template<class BinaryFunction = ::rocprim::less<Key>>
+    ROCPRIM_DEVICE ROCPRIM_INLINE
+    void sort(Key& thread_key,
+              Value& thread_value,
+              storage_type& storage,
+              BinaryFunction compare_function = BinaryFunction())
+    {
+        // Forward the key/value pair, the caller-provided storage and the comparator to
+        // the implementation selected by Algorithm (base_type).
+        base_type::sort(thread_key, thread_value, storage, compare_function);
+    }
+
+    /// \brief Block sort by key for any data type, taking arrays of \p ItemsPerThread
+    /// keys and values per thread.
+    ///
+    /// \tparam BinaryFunction - type of binary function used for sort. Default type
+    /// is rocprim::less<T>.
+    ///
+    /// \param [in, out] thread_keys - reference to an array of keys provided by a thread.
+    /// \param [in, out] thread_values - reference to an array of values provided by a thread.
+    /// \param [in] storage - reference to a temporary storage object of type storage_type.
+    /// \param [in] compare_function - comparison function object which returns true if the
+    /// first argument is ordered before the second.
+    template<class BinaryFunction = ::rocprim::less<Key>>
+    ROCPRIM_DEVICE ROCPRIM_INLINE
+    void sort(Key (&thread_keys)[ItemsPerThread],
+              Value (&thread_values)[ItemsPerThread],
+              storage_type& storage,
+              BinaryFunction compare_function = BinaryFunction())
+    {
+        // Forward to the implementation selected by Algorithm (base_type).
+        base_type::sort(thread_keys, thread_values, storage, compare_function);
+    }
+
+    /// \brief Block sort by key for any data type. If \p size is
+    /// greater than \p BlockSize, this function does nothing.
+    ///
+    /// \tparam BinaryFunction - type of binary function used for sort. Default type
+    /// is rocprim::less<Key>.
+    ///
+    /// \param [in, out] thread_key - reference to a key provided by a thread.
+    /// \param [in] storage - reference to a temporary storage object of type storage_type.
+    /// \param [in] size - custom size of block to be sorted.
+    /// \param [in] compare_function - comparison function object which returns true if the
+    /// first argument is ordered before the second.
+    /// The signature of the function should be equivalent to the following:
+    /// <tt>bool f(const T &a, const T &b);</tt>. The signature does not need to have
+    /// <tt>const &</tt>, but the function object must not modify the objects passed to it.
+    template<class BinaryFunction = ::rocprim::less<Key>>
+    ROCPRIM_DEVICE ROCPRIM_FORCE_INLINE
+    void sort(Key& thread_key,
+              storage_type& storage,
+              const unsigned int size,
+              BinaryFunction compare_function = BinaryFunction())
+    {
+        // Thin forwarder to the base_type implementation.
+        base_type::sort(thread_key, storage, size, compare_function);
+    }
+
+    /// \brief Block sort by key for any data type, carrying one value along with each key.
+    /// If \p size is greater than \p BlockSize, this function does nothing.
+    ///
+    /// \tparam BinaryFunction - type of binary function used for sort. Default type
+    /// is rocprim::less<Key>.
+    ///
+    /// \param [in, out] thread_key - reference to a key provided by a thread.
+    /// \param [in, out] thread_value - reference to a value provided by a thread.
+    /// \param [in] storage - reference to a temporary storage object of type storage_type.
+    /// \param [in] size - custom size of block to be sorted.
+    /// \param [in] compare_function - comparison function object which returns true if the
+    /// first argument is ordered before the second.
+    /// The signature of the function should be equivalent to the following:
+    /// <tt>bool f(const T &a, const T &b);</tt>. The signature does not need to have
+    /// <tt>const &</tt>, but the function object must not modify the objects passed to it.
+    template<class BinaryFunction = ::rocprim::less<Key>>
+    ROCPRIM_DEVICE ROCPRIM_INLINE
+    void sort(Key& thread_key,
+              Value& thread_value,
+              storage_type& storage,
+              const unsigned int size,
+              BinaryFunction compare_function = BinaryFunction())
+    {
+        // Thin forwarder to the base_type implementation.
+        base_type::sort(thread_key, thread_value, storage, size, compare_function);
+    }
+};
+
+END_ROCPRIM_NAMESPACE
+
+/// @}
+// end of group blockmodule
+
+#endif // ROCPRIM_BLOCK_BLOCK_SORT_HPP_
--- a/3rdparty/cub/rocprim/block/block_store.hpp
+++ b/3rdparty/cub/rocprim/block/block_store.hpp
+// Copyright (c) 2017-2021 Advanced Micro Devices, Inc. All rights reserved.
+//
+// Permission is hereby granted, free of charge, to any person obtaining a copy
+// of this software and associated documentation files (the "Software"), to deal
+// in the Software without restriction, including without limitation the rights
+// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+// copies of the Software, and to permit persons to whom the Software is
+// furnished to do so, subject to the following conditions:
+//
+// The above copyright notice and this permission notice shall be included in
+// all copies or substantial portions of the Software.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+// THE SOFTWARE.
+
+#ifndef ROCPRIM_BLOCK_BLOCK_STORE_HPP_
+#define ROCPRIM_BLOCK_BLOCK_STORE_HPP_
+
+#include "../config.hpp"
+#include "../detail/various.hpp"
+
+#include "../intrinsics.hpp"
+#include "../functional.hpp"
+#include "../types.hpp"
+
+#include "block_store_func.hpp"
+#include "block_exchange.hpp"
+
+/// \addtogroup blockmodule
+/// @{
+
+BEGIN_ROCPRIM_NAMESPACE
+
+/// \brief \p block_store_method enumerates the methods available to store a blocked/striped
+/// arrangement of items into a blocked/striped arrangement on continuous memory
+enum class block_store_method
+{
+    /// A blocked arrangement of items is stored into a blocked arrangement on continuous
+    /// memory.
+    /// \par Performance Notes:
+    /// * Performance decreases with increasing number of items per thread (stride
+    /// between reads), because of reduced memory coalescing.
+    block_store_direct,
+
+    /// A striped arrangement of items is stored into a blocked arrangement on continuous
+    /// memory.
+    block_store_striped,
+
+    /// A blocked arrangement of items is stored into a blocked arrangement on continuous
+    /// memory using vectorization as an optimization.
+    /// \par Performance Notes:
+    /// * Performance remains high due to increased memory coalescing, provided that
+    /// vectorization requirements are fulfilled. Otherwise, performance will default
+    /// to \p block_store_direct.
+    /// \par Requirements:
+    /// * The output offset (\p block_output) must be quad-item aligned.
+    /// * The following conditions will prevent vectorization and switch to default
+    /// \p block_store_direct:
+    ///   * \p ItemsPerThread is odd.
+    ///   * The datatype \p T is not a primitive or a HIP vector type (e.g. int2,
+    /// int4, etc.).
+    block_store_vectorize,
+
+    /// A blocked arrangement of items is locally transposed and stored as a striped
+    /// arrangement of data on continuous memory.
+    /// \par Performance Notes:
+    /// * Performance remains high due to increased memory coalescing, regardless of the
+    /// number of items per thread.
+    /// * Performance may be better compared to \p block_store_direct and
+    /// \p block_store_vectorize due to reordering on local memory.
+    block_store_transpose,
+
+    /// A blocked arrangement of items is locally transposed and stored as a warp-striped
+    /// arrangement of data on continuous memory.
+    /// \par Requirements:
+    /// * The number of threads in the block must be a multiple of the size of hardware warp.
+    /// \par Performance Notes:
+    /// * Performance remains high due to increased memory coalescing, regardless of the
+    /// number of items per thread.
+    /// * Performance may be better compared to \p block_store_direct and
+    /// \p block_store_vectorize due to reordering on local memory.
+    block_store_warp_transpose,
+
+    /// Alias of \p block_store_direct; shares its enumerator value.
+    default_method = block_store_direct
+};
+
+/// \brief The \p block_store class is a block level parallel primitive which provides methods
+/// for storing an arrangement of items into a blocked/striped arrangement on continuous memory.
+///
+/// \tparam T - the type of the items to store.
+/// \tparam BlockSizeX - the number of threads in a block's x dimension.
+/// \tparam ItemsPerThread - the number of items to be processed by
+/// each thread.
+/// \tparam Method - the method to store data.
+/// \tparam BlockSizeY - the number of threads in a block's y dimension, defaults to 1.
+/// \tparam BlockSizeZ - the number of threads in a block's z dimension, defaults to 1.
+///
+/// \par Overview
+/// * The \p block_store class has a number of different methods to store data:
+///   * [block_store_direct](\ref ::block_store_method::block_store_direct)
+///   * [block_store_striped](\ref ::block_store_method::block_store_striped)
+///   * [block_store_vectorize](\ref ::block_store_method::block_store_vectorize)
+///   * [block_store_transpose](\ref ::block_store_method::block_store_transpose)
+///   * [block_store_warp_transpose](\ref ::block_store_method::block_store_warp_transpose)
+///
+/// \par Example:
+/// \parblock
+/// In the examples store operation is performed on block of 128 threads, using type
+/// \p int and 8 items per thread.
+///
+/// \code{.cpp}
+/// __global__ void kernel(int * output)
+/// {
+///     const int offset = blockIdx.x * 128 * 8;
+///     int items[8];
+///     rocprim::block_store<int, 128, 8, store_method> blockstore;
+///     blockstore.store(output + offset, items);
+///     ...
+/// }
+/// \endcode
+/// \endparblock
+template<
+    class T,
+    unsigned int BlockSizeX,
+    unsigned int ItemsPerThread,
+    block_store_method Method = block_store_method::block_store_direct,
+    unsigned int BlockSizeY = 1,
+    unsigned int BlockSizeZ = 1
+>
+class block_store
+{
+    // Primary template: every store() overload below writes through
+    // block_store_direct_blocked and never touches the storage object.
+private:
+    using storage_type_ = typename ::rocprim::detail::empty_storage_type;
+
+public:
+    /// \brief Struct used to allocate a temporary memory that is required for thread
+    /// communication during operations provided by related parallel primitive.
+    ///
+    /// Depending on the implementation the operations exposed by parallel primitive may
+    /// require a temporary storage for thread communication. The storage should be allocated
+    /// using keywords \p __shared__. It can be aliased to
+    /// an externally allocated memory, or be a part of a union with other storage types
+    /// to increase shared memory reusability.
+    #ifndef DOXYGEN_SHOULD_SKIP_THIS // hides storage_type implementation for Doxygen
+    using storage_type = typename ::rocprim::detail::empty_storage_type;
+    #else
+    using storage_type = storage_type_; // only for Doxygen
+    #endif
+
+    /// \brief Stores an arrangement of items from across the thread block into an
+    /// arrangement on continuous memory.
+    ///
+    /// \tparam OutputIterator - [inferred] an iterator type for output (can be a simple
+    /// pointer).
+    ///
+    /// \param [out] block_output - the output iterator from the thread block to store to.
+    /// \param [in] items - array that data is read from.
+    ///
+    /// \par Overview
+    /// * The type \p T must be such that an object of type \p OutputIterator
+    /// can be dereferenced and assigned a value of type \p T.
+    template<class OutputIterator>
+    ROCPRIM_DEVICE ROCPRIM_INLINE
+    void store(OutputIterator block_output,
+               T (&items)[ItemsPerThread])
+    {
+        const unsigned int flat_id = ::rocprim::flat_block_thread_id<BlockSizeX, BlockSizeY, BlockSizeZ>();
+        block_store_direct_blocked(flat_id, block_output, items);
+    }
+
+    /// \brief Stores an arrangement of items from across the thread block into an
+    /// arrangement on continuous memory, which is guarded by range \p valid.
+    ///
+    /// \tparam OutputIterator - [inferred] an iterator type for output (can be a simple
+    /// pointer).
+    ///
+    /// \param [out] block_output - the output iterator from the thread block to store to.
+    /// \param [in] items - array that data is read from.
+    /// \param [in] valid - maximum range of valid items to store.
+    ///
+    /// \par Overview
+    /// * The type \p T must be such that an object of type \p OutputIterator
+    /// can be dereferenced and assigned a value of type \p T.
+    template<class OutputIterator>
+    ROCPRIM_DEVICE ROCPRIM_INLINE
+    void store(OutputIterator block_output,
+               T (&items)[ItemsPerThread],
+               unsigned int valid)
+    {
+        const unsigned int flat_id = ::rocprim::flat_block_thread_id<BlockSizeX, BlockSizeY, BlockSizeZ>();
+        block_store_direct_blocked(flat_id, block_output, items, valid);
+    }
+
+    /// \brief Stores an arrangement of items from across the thread block into an
+    /// arrangement on continuous memory, using temporary storage.
+    ///
+    /// \tparam OutputIterator - [inferred] an iterator type for output (can be a simple
+    /// pointer).
+    ///
+    /// \param [out] block_output - the output iterator from the thread block to store to.
+    /// \param [in] items - array that data is read from.
+    /// \param [in] storage - temporary storage for outputs.
+    ///
+    /// \par Overview
+    /// * The type \p T must be such that an object of type \p OutputIterator
+    /// can be dereferenced and assigned a value of type \p T.
+    ///
+    /// \par Storage reusage
+    /// Synchronization barrier should be placed before \p storage is reused
+    /// or repurposed: \p __syncthreads() or \p rocprim::syncthreads().
+    ///
+    /// \par Example.
+    /// \code{.cpp}
+    /// __global__ void kernel(...)
+    /// {
+    ///     int items[8];
+    ///     using block_store_int = rocprim::block_store<int, 128, 8>;
+    ///     block_store_int bstore;
+    ///     __shared__ typename block_store_int::storage_type storage;
+    ///     bstore.store(..., items, storage);
+    ///     ...
+    /// }
+    /// \endcode
+    template<class OutputIterator>
+    ROCPRIM_DEVICE ROCPRIM_INLINE
+    void store(OutputIterator block_output,
+               T (&items)[ItemsPerThread],
+               storage_type& storage)
+    {
+        // This method needs no scratch memory; the parameter only keeps the
+        // interface uniform across all store methods.
+        (void) storage;
+        store(block_output, items);
+    }
+
+    /// \brief Stores an arrangement of items from across the thread block into an
+    /// arrangement on continuous memory, which is guarded by range \p valid,
+    /// using temporary storage
+    ///
+    /// \tparam OutputIterator - [inferred] an iterator type for output (can be a simple
+    /// pointer).
+    ///
+    /// \param [out] block_output - the output iterator from the thread block to store to.
+    /// \param [in] items - array that data is read from.
+    /// \param [in] valid - maximum range of valid items to store.
+    /// \param [in] storage - temporary storage for outputs.
+    ///
+    /// \par Overview
+    /// * The type \p T must be such that an object of type \p OutputIterator
+    /// can be dereferenced and assigned a value of type \p T.
+    ///
+    /// \par Storage reusage
+    /// Synchronization barrier should be placed before \p storage is reused
+    /// or repurposed: \p __syncthreads() or \p rocprim::syncthreads().
+    ///
+    /// \par Example.
+    /// \code{.cpp}
+    /// __global__ void kernel(...)
+    /// {
+    ///     int items[8];
+    ///     using block_store_int = rocprim::block_store<int, 128, 8>;
+    ///     block_store_int bstore;
+    ///     __shared__ typename block_store_int::storage_type storage;
+    ///     bstore.store(..., items, valid, storage);
+    ///     ...
+    /// }
+    /// \endcode
+    template<class OutputIterator>
+    ROCPRIM_DEVICE ROCPRIM_INLINE
+    void store(OutputIterator block_output,
+               T (&items)[ItemsPerThread],
+               unsigned int valid,
+               storage_type& storage)
+    {
+        // Storage is unused here as well; see the unguarded overload.
+        (void) storage;
+        store(block_output, items, valid);
+    }
+};
+
+/// @}
+// end of group blockmodule
+
+#ifndef DOXYGEN_SHOULD_SKIP_THIS
+
+// Specialization for block_store_method::block_store_striped: every overload
+// writes through block_store_direct_striped and needs no scratch storage.
+template<class T,
+         unsigned int BlockSizeX,
+         unsigned int ItemsPerThread,
+         unsigned int BlockSizeY,
+         unsigned int BlockSizeZ>
+class block_store<T, BlockSizeX, ItemsPerThread, block_store_method::block_store_striped, BlockSizeY, BlockSizeZ>
+{
+    // Total thread count of the block; forwarded as the BlockSize template
+    // argument of block_store_direct_striped.
+    static constexpr unsigned int BlockSize = BlockSizeX * BlockSizeY * BlockSizeZ;
+private:
+    using storage_type_ = typename ::rocprim::detail::empty_storage_type;
+
+public:
+    #ifndef DOXYGEN_SHOULD_SKIP_THIS // hides storage_type implementation for Doxygen
+    using storage_type = typename ::rocprim::detail::empty_storage_type;
+    #else
+    using storage_type = storage_type_; // only for Doxygen
+    #endif
+
+    // Unguarded store of all ItemsPerThread items of the calling thread.
+    template<class OutputIterator>
+    ROCPRIM_DEVICE inline
+    void store(OutputIterator block_output,
+               T (&items)[ItemsPerThread])
+    {
+        const unsigned int thread_id
+            = ::rocprim::flat_block_thread_id<BlockSizeX, BlockSizeY, BlockSizeZ>();
+        block_store_direct_striped<BlockSize>(thread_id, block_output, items);
+    }
+
+    // Store guarded by valid, which is passed on to block_store_direct_striped.
+    template<class OutputIterator>
+    ROCPRIM_DEVICE inline
+    void store(OutputIterator block_output,
+               T (&items)[ItemsPerThread],
+               unsigned int valid)
+    {
+        const unsigned int thread_id
+            = ::rocprim::flat_block_thread_id<BlockSizeX, BlockSizeY, BlockSizeZ>();
+        block_store_direct_striped<BlockSize>(thread_id, block_output, items, valid);
+    }
+
+    // Storage-taking variant: storage is ignored, the work is done by the
+    // two-argument overload.
+    template<class OutputIterator>
+    ROCPRIM_DEVICE inline
+    void store(OutputIterator block_output,
+               T (&items)[ItemsPerThread],
+               storage_type& storage)
+    {
+        (void) storage;
+        store(block_output, items);
+    }
+
+    // Storage-taking guarded variant: storage is ignored, the work is done by
+    // the guarded overload without storage.
+    template<class OutputIterator>
+    ROCPRIM_DEVICE inline
+    void store(OutputIterator block_output,
+               T (&items)[ItemsPerThread],
+               unsigned int valid,
+               storage_type& storage)
+    {
+        (void) storage;
+        store(block_output, items, valid);
+    }
+};
+
+// Specialization for block_store_method::block_store_vectorize.
+// Only the non-template overloads taking a raw T* destination go through
+// block_store_direct_blocked_vectorized; every other overload (generic output
+// iterator, differing item type U, or a valid guard) uses block_store_direct_blocked.
+template<
+    class T,
+    unsigned int BlockSizeX,
+    unsigned int ItemsPerThread,
+    unsigned int BlockSizeY,
+    unsigned int BlockSizeZ
+>
+class block_store<T, BlockSizeX, ItemsPerThread, block_store_method::block_store_vectorize, BlockSizeY, BlockSizeZ>
+{
+private:
+    using storage_type_ = typename ::rocprim::detail::empty_storage_type;
+
+public:
+    #ifndef DOXYGEN_SHOULD_SKIP_THIS // hides storage_type implementation for Doxygen
+    using storage_type = typename ::rocprim::detail::empty_storage_type;
+    #else
+    using storage_type = storage_type_; // only for Doxygen
+    #endif
+
+    // Raw-pointer destination with items of type T: store through the
+    // vectorized helper.
+    ROCPRIM_DEVICE ROCPRIM_INLINE
+    void store(T* block_output,
+               T (&_items)[ItemsPerThread])
+    {
+        const unsigned int flat_id = ::rocprim::flat_block_thread_id<BlockSizeX, BlockSizeY, BlockSizeZ>();
+        block_store_direct_blocked_vectorized(flat_id, block_output, _items);
+    }
+
+    // Generic destination iterator and/or item type U: plain blocked store.
+    template<class OutputIterator, class U>
+    ROCPRIM_DEVICE ROCPRIM_INLINE
+    void store(OutputIterator block_output,
+               U (&items)[ItemsPerThread])
+    {
+        const unsigned int flat_id = ::rocprim::flat_block_thread_id<BlockSizeX, BlockSizeY, BlockSizeZ>();
+        block_store_direct_blocked(flat_id, block_output, items);
+    }
+
+    // Store guarded by valid: always the plain blocked store, even for a raw
+    // pointer destination.
+    template<class OutputIterator>
+    ROCPRIM_DEVICE ROCPRIM_INLINE
+    void store(OutputIterator block_output,
+               T (&items)[ItemsPerThread],
+               unsigned int valid)
+    {
+        const unsigned int flat_id = ::rocprim::flat_block_thread_id<BlockSizeX, BlockSizeY, BlockSizeZ>();
+        block_store_direct_blocked(flat_id, block_output, items, valid);
+    }
+
+    // Storage-taking variants: storage is unused, each one delegates to the
+    // matching overload above with the storage argument dropped.
+    ROCPRIM_DEVICE ROCPRIM_INLINE
+    void store(T* block_output,
+               T (&items)[ItemsPerThread],
+               storage_type& storage)
+    {
+        (void) storage;
+        store(block_output, items);
+    }
+
+    template<class OutputIterator, class U>
+    ROCPRIM_DEVICE ROCPRIM_INLINE
+    void store(OutputIterator block_output,
+               U (&items)[ItemsPerThread],
+               storage_type& storage)
+    {
+        (void) storage;
+        store(block_output, items);
+    }
+
+    template<class OutputIterator>
+    ROCPRIM_DEVICE ROCPRIM_INLINE
+    void store(OutputIterator block_output,
+               T (&items)[ItemsPerThread],
+               unsigned int valid,
+               storage_type& storage)
+    {
+        (void) storage;
+        store(block_output, items, valid);
+    }
+};
+
+// Specialization for block_store_method::block_store_transpose.
+// Each overload first runs block_exchange::blocked_to_striped on items (used as
+// both source and destination, so the caller's array is rearranged in place)
+// and then writes the result with block_store_direct_striped.
+template<class T,
+         unsigned int BlockSizeX,
+         unsigned int ItemsPerThread,
+         unsigned int BlockSizeY,
+         unsigned int BlockSizeZ>
+class block_store<T, BlockSizeX, ItemsPerThread, block_store_method::block_store_transpose, BlockSizeY, BlockSizeZ>
+{
+    // Total thread count of the block.
+    static constexpr unsigned int BlockSize = BlockSizeX * BlockSizeY * BlockSizeZ;
+private:
+    using block_exchange_type = block_exchange<T, BlockSize, ItemsPerThread>;
+
+public:
+    // Scratch storage is exactly what the underlying block_exchange requires.
+    using storage_type = typename block_exchange_type::storage_type;
+
+    // Unguarded store; the scratch storage is declared locally with
+    // ROCPRIM_SHARED_MEMORY.
+    template<class OutputIterator>
+    ROCPRIM_DEVICE ROCPRIM_FORCE_INLINE
+    void store(OutputIterator block_output,
+               T (&items)[ItemsPerThread])
+    {
+        ROCPRIM_SHARED_MEMORY storage_type storage;
+        block_exchange_type().blocked_to_striped(items, items, storage);
+        const unsigned int thread_id
+            = ::rocprim::flat_block_thread_id<BlockSizeX, BlockSizeY, BlockSizeZ>();
+        block_store_direct_striped<BlockSize>(thread_id, block_output, items);
+    }
+
+    // Store guarded by valid; the scratch storage is declared locally with
+    // ROCPRIM_SHARED_MEMORY.
+    template<class OutputIterator>
+    ROCPRIM_DEVICE ROCPRIM_FORCE_INLINE
+    void store(OutputIterator block_output,
+               T (&items)[ItemsPerThread],
+               unsigned int valid)
+    {
+        ROCPRIM_SHARED_MEMORY storage_type storage;
+        block_exchange_type().blocked_to_striped(items, items, storage);
+        const unsigned int thread_id
+            = ::rocprim::flat_block_thread_id<BlockSizeX, BlockSizeY, BlockSizeZ>();
+        block_store_direct_striped<BlockSize>(thread_id, block_output, items, valid);
+    }
+
+    // Unguarded store using caller-provided scratch storage.
+    template<class OutputIterator>
+    ROCPRIM_DEVICE ROCPRIM_INLINE
+    void store(OutputIterator block_output,
+               T (&items)[ItemsPerThread],
+               storage_type& storage)
+    {
+        block_exchange_type().blocked_to_striped(items, items, storage);
+        const unsigned int thread_id
+            = ::rocprim::flat_block_thread_id<BlockSizeX, BlockSizeY, BlockSizeZ>();
+        block_store_direct_striped<BlockSize>(thread_id, block_output, items);
+    }
+
+    // Store guarded by valid, using caller-provided scratch storage.
+    template<class OutputIterator>
+    ROCPRIM_DEVICE ROCPRIM_INLINE
+    void store(OutputIterator block_output,
+               T (&items)[ItemsPerThread],
+               unsigned int valid,
+               storage_type& storage)
+    {
+        block_exchange_type().blocked_to_striped(items, items, storage);
+        const unsigned int thread_id
+            = ::rocprim::flat_block_thread_id<BlockSizeX, BlockSizeY, BlockSizeZ>();
+        block_store_direct_striped<BlockSize>(thread_id, block_output, items, valid);
+    }
+};
+
+// Specialization for block_store_method::block_store_warp_transpose.
+// Each overload first runs block_exchange::blocked_to_warp_striped on items
+// (used as both source and destination, so the caller's array is rearranged in
+// place) and then writes the result with block_store_direct_warp_striped.
+template<class T,
+         unsigned int BlockSizeX,
+         unsigned int ItemsPerThread,
+         unsigned int BlockSizeY,
+         unsigned int BlockSizeZ>
+class block_store<T, BlockSizeX, ItemsPerThread, block_store_method::block_store_warp_transpose, BlockSizeY, BlockSizeZ>
+{
+    // Total thread count of the block.
+    static constexpr unsigned int BlockSize = BlockSizeX * BlockSizeY * BlockSizeZ;
+private:
+    using block_exchange_type = block_exchange<T, BlockSize, ItemsPerThread>;
+
+public:
+    // Compile-time precondition of this method: whole warps only.
+    static_assert(BlockSize % ::rocprim::device_warp_size() == 0,
+                 "BlockSize must be a multiple of hardware warpsize");
+
+    // Scratch storage is exactly what the underlying block_exchange requires.
+    using storage_type = typename block_exchange_type::storage_type;
+
+    // Unguarded store; the scratch storage is declared locally with
+    // ROCPRIM_SHARED_MEMORY.
+    template<class OutputIterator>
+    ROCPRIM_DEVICE ROCPRIM_FORCE_INLINE
+    void store(OutputIterator block_output,
+               T (&items)[ItemsPerThread])
+    {
+        ROCPRIM_SHARED_MEMORY storage_type storage;
+        block_exchange_type().blocked_to_warp_striped(items, items, storage);
+        const unsigned int thread_id
+            = ::rocprim::flat_block_thread_id<BlockSizeX, BlockSizeY, BlockSizeZ>();
+        block_store_direct_warp_striped(thread_id, block_output, items);
+    }
+
+    // Store guarded by valid; the scratch storage is declared locally with
+    // ROCPRIM_SHARED_MEMORY.
+    template<class OutputIterator>
+    ROCPRIM_DEVICE ROCPRIM_FORCE_INLINE
+    void store(OutputIterator block_output,
+               T (&items)[ItemsPerThread],
+               unsigned int valid)
+    {
+        ROCPRIM_SHARED_MEMORY storage_type storage;
+        block_exchange_type().blocked_to_warp_striped(items, items, storage);
+        const unsigned int thread_id
+            = ::rocprim::flat_block_thread_id<BlockSizeX, BlockSizeY, BlockSizeZ>();
+        block_store_direct_warp_striped(thread_id, block_output, items, valid);
+    }
+
+    // Unguarded store using caller-provided scratch storage.
+    template<class OutputIterator>
+    ROCPRIM_DEVICE ROCPRIM_INLINE
+    void store(OutputIterator block_output,
+               T (&items)[ItemsPerThread],
+               storage_type& storage)
+    {
+        block_exchange_type().blocked_to_warp_striped(items, items, storage);
+        const unsigned int thread_id
+            = ::rocprim::flat_block_thread_id<BlockSizeX, BlockSizeY, BlockSizeZ>();
+        block_store_direct_warp_striped(thread_id, block_output, items);
+    }
+
+    // Store guarded by valid, using caller-provided scratch storage.
+    template<class OutputIterator>
+    ROCPRIM_DEVICE ROCPRIM_INLINE
+    void store(OutputIterator block_output,
+               T (&items)[ItemsPerThread],
+               unsigned int valid,
+               storage_type& storage)
+    {
+        block_exchange_type().blocked_to_warp_striped(items, items, storage);
+        const unsigned int thread_id
+            = ::rocprim::flat_block_thread_id<BlockSizeX, BlockSizeY, BlockSizeZ>();
+        block_store_direct_warp_striped(thread_id, block_output, items, valid);
+    }
+};
+
+#endif // DOXYGEN_SHOULD_SKIP_THIS
+
+END_ROCPRIM_NAMESPACE
+
+#endif // ROCPRIM_BLOCK_BLOCK_STORE_HPP_
--- a/3rdparty/cub/rocprim/block/block_store_func.hpp
+++ b/3rdparty/cub/rocprim/block/block_store_func.hpp
+// Copyright (c) 2017-2019 Advanced Micro Devices, Inc. All rights reserved.
+//
+// Permission is hereby granted, free of charge, to any person obtaining a copy
+// of this software and associated documentation files (the "Software"), to deal
+// in the Software without restriction, including without limitation the rights
+// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+// copies of the Software, and to permit persons to whom the Software is
+// furnished to do so, subject to the following conditions:
+//
+// The above copyright notice and this permission notice shall be included in
+// all copies or substantial portions of the Software.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+// THE SOFTWARE.
+
+#ifndef ROCPRIM_BLOCK_BLOCK_STORE_FUNC_HPP_
+#define ROCPRIM_BLOCK_BLOCK_STORE_FUNC_HPP_
+
+#include "../config.hpp"
+#include "../detail/various.hpp"
+
+#include "../intrinsics.hpp"
+#include "../functional.hpp"
+#include "../types.hpp"
+
+BEGIN_ROCPRIM_NAMESPACE
+
+/// \addtogroup blockmodule
+/// @{
+
+/// \brief Writes each thread's items to its own contiguous slice of the output
+/// (blocked arrangement in registers, blocked arrangement on continuous memory).
+///
+/// The thread identified by \p flat_id stores its \p ItemsPerThread items to the
+/// consecutive positions starting at <tt>block_output + flat_id * ItemsPerThread</tt>.
+///
+/// \tparam OutputIterator - [inferred] an iterator type for output (can be a simple
+/// pointer)
+/// \tparam T - [inferred] the data type
+/// \tparam ItemsPerThread - [inferred] the number of items to be processed by
+/// each thread
+///
+/// \param flat_id - a local flat 1D thread id in a block (tile) for the calling thread
+/// \param block_output - the output iterator of the thread block to store to
+/// \param items - array holding the items of the calling thread
+template<class OutputIterator, class T, unsigned int ItemsPerThread>
+ROCPRIM_DEVICE ROCPRIM_INLINE
+void block_store_direct_blocked(unsigned int flat_id,
+                                OutputIterator block_output,
+                                T (&items)[ItemsPerThread])
+{
+    static_assert(std::is_assignable<decltype(block_output[0]), T>::value,
+                  "The type T must be such that an object of type OutputIterator "
+                  "can be dereferenced and assigned a value of type T.");
+
+    // First output position owned by the calling thread.
+    unsigned int thread_offset = flat_id * ItemsPerThread;
+    OutputIterator dst = block_output + thread_offset;
+    ROCPRIM_UNROLL
+    for (unsigned int i = 0; i < ItemsPerThread; ++i)
+    {
+        dst[i] = items[i];
+    }
+}
+
+/// \brief Guarded variant of the blocked store: writes each thread's items to its
+/// own contiguous slice of the output, skipping positions at or beyond \p valid.
+///
+/// The thread identified by \p flat_id owns the positions starting at
+/// <tt>flat_id * ItemsPerThread</tt>; an item is written only when its position
+/// within the block is smaller than \p valid.
+///
+/// \tparam OutputIterator - [inferred] an iterator type for output (can be a simple
+/// pointer)
+/// \tparam T - [inferred] the data type
+/// \tparam ItemsPerThread - [inferred] the number of items to be processed by
+/// each thread
+///
+/// \param flat_id - a local flat 1D thread id in a block (tile) for the calling thread
+/// \param block_output - the output iterator of the thread block to store to
+/// \param items - array holding the items of the calling thread
+/// \param valid - maximum range of valid numbers to store
+template<class OutputIterator, class T, unsigned int ItemsPerThread>
+ROCPRIM_DEVICE ROCPRIM_INLINE
+void block_store_direct_blocked(unsigned int flat_id,
+                                OutputIterator block_output,
+                                T (&items)[ItemsPerThread],
+                                unsigned int valid)
+{
+    static_assert(std::is_assignable<decltype(block_output[0]), T>::value,
+                  "The type T must be such that an object of type OutputIterator "
+                  "can be dereferenced and assigned a value of type T.");
+
+    // First output position owned by the calling thread.
+    unsigned int thread_offset = flat_id * ItemsPerThread;
+    OutputIterator dst = block_output + thread_offset;
+    ROCPRIM_UNROLL
+    for (unsigned int i = 0; i < ItemsPerThread; ++i)
+    {
+        // Block-wide position of this item; only positions below valid are written.
+        if (i + thread_offset < valid)
+        {
+            dst[i] = items[i];
+        }
+    }
+}
+
+/// \brief Stores a blocked arrangement of items from across the thread block
+/// into a blocked arrangement on continuous memory, using vector-typed writes.
+///
+/// The block arrangement is assumed to be (block-threads * \p ItemsPerThread) items
+/// across a thread block. Each thread uses a \p flat_id to store a range of
+/// \p ItemsPerThread \p items to the thread block.
+///
+/// The output offset (\p block_output + offset) must be quad-item aligned.
+///
+/// The following conditions will prevent vectorization and switch to default
+/// block_store_direct_blocked:
+/// * \p ItemsPerThread is odd.
+/// * The datatype \p T is not a primitive or a HIP vector type (e.g. int2,
+/// int4, etc.).
+///
+/// This overload participates in overload resolution only when
+/// <tt>detail::is_vectorizable<T, ItemsPerThread>::value</tt> is true.
+///
+/// \tparam T - [inferred] the output data type
+/// \tparam U - [inferred] the input data type
+/// \tparam ItemsPerThread - [inferred] the number of items to be processed by
+/// each thread
+///
+/// The type \p U must be such that it can be implicitly converted to \p T.
+///
+/// \param flat_id - a local flat 1D thread id in a block (tile) for the calling thread
+/// \param block_output - the output pointer of the thread block to store to
+/// \param items - array that data is stored from
+template<
+    class T,
+    class U,
+    unsigned int ItemsPerThread
+>
+ROCPRIM_DEVICE ROCPRIM_INLINE
+auto
+block_store_direct_blocked_vectorized(unsigned int flat_id,
+                                      T* block_output,
+                                      U (&items)[ItemsPerThread]) -> typename std::enable_if<detail::is_vectorizable<T, ItemsPerThread>::value>::type
+{
+    static_assert(std::is_convertible<U, T>::value,
+                  "The type U must be such that it can be implicitly converted to T.");
+
+    // Vector type selected for (T, ItemsPerThread) and the number of such
+    // vectors needed to hold one thread's items.
+    using vector_type = typename detail::match_vector_type<T, ItemsPerThread>::type;
+    constexpr unsigned int vectors_per_thread = (sizeof(T) * ItemsPerThread) / sizeof(vector_type);
+
+    // View the destination as an array of vectors; block_output is already a T*,
+    // so no const_cast is needed.
+    vector_type *vectors_ptr = reinterpret_cast<vector_type*>(block_output);
+
+    // Thread-local staging buffer, typed as vectors but filled element-wise as T.
+    vector_type raw_vector_items[vectors_per_thread];
+    T *raw_items = reinterpret_cast<T*>(raw_vector_items);
+
+    // Convert each item from U to T while copying into the staging buffer.
+    ROCPRIM_UNROLL
+    for (unsigned int item = 0; item < ItemsPerThread; item++)
+    {
+        raw_items[item] = items[item];
+    }
+
+    // Blocked store of the staged vectors (vectors_per_thread per thread).
+    block_store_direct_blocked(flat_id, vectors_ptr, raw_vector_items);
+}
+
+/// Fallback overload of block_store_direct_blocked_vectorized, selected when
+/// detail::is_vectorizable<T, ItemsPerThread> is false. It performs no
+/// vectorization and simply forwards to block_store_direct_blocked.
+///
+/// \param flat_id - a local flat 1D thread id in a block (tile) for the calling thread
+/// \param block_output - the output pointer of the thread block to store to
+/// \param items - array of items that is stored to the thread block
+template<
+    class T,
+    class U,
+    unsigned int ItemsPerThread
+>
+ROCPRIM_DEVICE ROCPRIM_INLINE
+auto
+block_store_direct_blocked_vectorized(unsigned int flat_id,
+                                      T* block_output,
+                                      U (&items)[ItemsPerThread]) -> typename std::enable_if<!detail::is_vectorizable<T, ItemsPerThread>::value>::type
+{
+    // Non-vectorizable case: plain element-wise blocked store.
+    block_store_direct_blocked(flat_id, block_output, items);
+}
+
+/// \brief Stores the items held by each thread of the block to memory using
+/// a striped layout.
+///
+/// The thread with id \p flat_id writes its i-th item to
+/// <tt>block_output[flat_id + i * BlockSize]</tt>, so a whole block covers
+/// (\p BlockSize * \p ItemsPerThread) consecutive positions.
+///
+/// \tparam BlockSize - the number of threads in a block
+/// \tparam OutputIterator - [inferred] an iterator type for output (can be a simple
+/// pointer)
+/// \tparam T - [inferred] the data type
+/// \tparam ItemsPerThread - [inferred] the number of items to be processed by
+/// each thread
+///
+/// \param flat_id - a local flat 1D thread id in a block (tile) for the calling thread
+/// \param block_output - the output iterator of the thread block to store to
+/// \param items - array of items that is stored to the thread block
+template<
+    unsigned int BlockSize,
+    class OutputIterator,
+    class T,
+    unsigned int ItemsPerThread
+>
+ROCPRIM_DEVICE ROCPRIM_INLINE
+void block_store_direct_striped(unsigned int flat_id,
+                                OutputIterator block_output,
+                                T (&items)[ItemsPerThread])
+{
+    static_assert(std::is_assignable<decltype(block_output[0]), T>::value,
+                  "The type T must be such that an object of type OutputIterator "
+                  "can be dereferenced and assigned a value of type T.");
+
+    // Position of this thread's first element; later elements are BlockSize apart.
+    OutputIterator out = block_output + flat_id;
+
+    ROCPRIM_UNROLL
+    for (unsigned int i = 0; i < ItemsPerThread; ++i)
+    {
+        const unsigned int stride_offset = i * BlockSize;
+        out[stride_offset] = items[i];
+    }
+}
+
+/// \brief Stores the items held by each thread of the block to memory using
+/// a striped layout, skipping every position that is not below \p valid.
+///
+/// The thread with id \p flat_id writes its i-th item to
+/// <tt>block_output[flat_id + i * BlockSize]</tt>, but only when
+/// <tt>flat_id + i * BlockSize < valid</tt>.
+///
+/// \tparam BlockSize - the number of threads in a block
+/// \tparam OutputIterator - [inferred] an iterator type for output (can be a simple
+/// pointer)
+/// \tparam T - [inferred] the data type
+/// \tparam ItemsPerThread - [inferred] the number of items to be processed by
+/// each thread
+///
+/// \param flat_id - a local flat 1D thread id in a block (tile) for the calling thread
+/// \param block_output - the output iterator of the thread block to store to
+/// \param items - array of items that is stored to the thread block
+/// \param valid - number of leading output positions that may be written
+template<
+    unsigned int BlockSize,
+    class OutputIterator,
+    class T,
+    unsigned int ItemsPerThread
+>
+ROCPRIM_DEVICE ROCPRIM_INLINE
+void block_store_direct_striped(unsigned int flat_id,
+                                OutputIterator block_output,
+                                T (&items)[ItemsPerThread],
+                                unsigned int valid)
+{
+    static_assert(std::is_assignable<decltype(block_output[0]), T>::value,
+                  "The type T must be such that an object of type OutputIterator "
+                  "can be dereferenced and assigned a value of type T.");
+
+    // Position of this thread's first element; later elements are BlockSize apart.
+    OutputIterator out = block_output + flat_id;
+
+    ROCPRIM_UNROLL
+    for (unsigned int i = 0; i < ItemsPerThread; ++i)
+    {
+        const unsigned int stride_offset = i * BlockSize;
+        const bool in_range = (flat_id + stride_offset) < valid;
+        if (!in_range)
+        {
+            // Position lies outside the guarded range: leave memory untouched.
+            continue;
+        }
+        out[stride_offset] = items[i];
+    }
+}
+
+/// \brief Stores the items held by each thread of the block to memory using
+/// a warp-striped layout.
+///
+/// Every logical warp of \p WarpSize threads owns a contiguous range of
+/// (\p WarpSize * \p ItemsPerThread) positions starting at
+/// <tt>(flat_id / WarpSize) * WarpSize * ItemsPerThread</tt>. Inside that range
+/// the lane returned by <tt>detail::logical_lane_id<WarpSize>()</tt> writes its
+/// i-th item at offset <tt>lane + i * WarpSize</tt>.
+///
+/// * The number of threads in the block must be a multiple of \p WarpSize.
+/// * The default \p WarpSize is a hardware warpsize and is an optimal value.
+/// * \p WarpSize must be a power of two and equal or less than the size of
+///   hardware warp.
+/// * Using \p WarpSize smaller than hardware warpsize could result in lower
+///   performance.
+///
+/// \tparam WarpSize - [optional] the number of threads in a warp
+/// \tparam OutputIterator - [inferred] an iterator type for output (can be a simple
+/// pointer)
+/// \tparam T - [inferred] the data type
+/// \tparam ItemsPerThread - [inferred] the number of items to be processed by
+/// each thread
+///
+/// \param flat_id - a local flat 1D thread id in a block (tile) for the calling thread
+/// \param block_output - the output iterator of the thread block to store to
+/// \param items - array of items that is stored to the thread block
+template<
+    unsigned int WarpSize = device_warp_size(),
+    class OutputIterator,
+    class T,
+    unsigned int ItemsPerThread
+>
+ROCPRIM_DEVICE ROCPRIM_INLINE
+void block_store_direct_warp_striped(unsigned int flat_id,
+                                     OutputIterator block_output,
+                                     T (&items)[ItemsPerThread])
+{
+    static_assert(std::is_assignable<decltype(block_output[0]), T>::value,
+                  "The type T must be such that an object of type OutputIterator "
+                  "can be dereferenced and assigned a value of type T.");
+
+    // Adjacent string literals are concatenated verbatim, so the first part
+    // must end with a space to keep the diagnostic readable.
+    static_assert(detail::is_power_of_two(WarpSize) && WarpSize <= device_warp_size(),
+                 "WarpSize must be a power of two and equal or less "
+                 "than the size of hardware warp.");
+    // Lane of the calling thread inside its logical warp.
+    unsigned int thread_id = detail::logical_lane_id<WarpSize>();
+    // Index of the logical warp and the first output position it owns.
+    unsigned int warp_id = flat_id / WarpSize;
+    unsigned int warp_offset = warp_id * WarpSize * ItemsPerThread;
+
+    OutputIterator thread_iter = block_output + thread_id + warp_offset;
+    ROCPRIM_UNROLL
+    for (unsigned int item = 0; item < ItemsPerThread; item++)
+    {
+        // Consecutive items of one thread are WarpSize positions apart.
+        thread_iter[item * WarpSize] = items[item];
+    }
+}
+
+/// \brief Stores the items held by each thread of the block to memory using
+/// a warp-striped layout, skipping every position that is not below \p valid.
+///
+/// Every logical warp of \p WarpSize threads owns a contiguous range of
+/// (\p WarpSize * \p ItemsPerThread) positions starting at
+/// <tt>(flat_id / WarpSize) * WarpSize * ItemsPerThread</tt>. Inside that range
+/// the lane returned by <tt>detail::logical_lane_id<WarpSize>()</tt> writes its
+/// i-th item at offset <tt>lane + i * WarpSize</tt>, but only when the resulting
+/// absolute position is less than \p valid.
+///
+/// * The number of threads in the block must be a multiple of \p WarpSize.
+/// * The default \p WarpSize is a hardware warpsize and is an optimal value.
+/// * \p WarpSize must be a power of two and equal or less than the size of
+///   hardware warp.
+/// * Using \p WarpSize smaller than hardware warpsize could result in lower
+///   performance.
+///
+/// \tparam WarpSize - [optional] the number of threads in a warp
+/// \tparam OutputIterator - [inferred] an iterator type for output (can be a simple
+/// pointer)
+/// \tparam T - [inferred] the data type
+/// \tparam ItemsPerThread - [inferred] the number of items to be processed by
+/// each thread
+///
+/// \param flat_id - a local flat 1D thread id in a block (tile) for the calling thread
+/// \param block_output - the output iterator of the thread block to store to
+/// \param items - array of items that is stored to the thread block
+/// \param valid - number of leading output positions that may be written
+template<
+    unsigned int WarpSize = device_warp_size(),
+    class OutputIterator,
+    class T,
+    unsigned int ItemsPerThread
+>
+ROCPRIM_DEVICE ROCPRIM_INLINE
+void block_store_direct_warp_striped(unsigned int flat_id,
+                                     OutputIterator block_output,
+                                     T (&items)[ItemsPerThread],
+                                     unsigned int valid)
+{
+    static_assert(std::is_assignable<decltype(block_output[0]), T>::value,
+                  "The type T must be such that an object of type OutputIterator "
+                  "can be dereferenced and assigned a value of type T.");
+
+    // Adjacent string literals are concatenated verbatim, so the first part
+    // must end with a space to keep the diagnostic readable.
+    static_assert(detail::is_power_of_two(WarpSize) && WarpSize <= device_warp_size(),
+                 "WarpSize must be a power of two and equal or less "
+                 "than the size of hardware warp.");
+    // Lane of the calling thread inside its logical warp.
+    unsigned int thread_id = detail::logical_lane_id<WarpSize>();
+    // Index of the logical warp and the first output position it owns.
+    unsigned int warp_id = flat_id / WarpSize;
+    unsigned int warp_offset = warp_id * WarpSize * ItemsPerThread;
+
+    OutputIterator thread_iter = block_output + thread_id + warp_offset;
+    ROCPRIM_UNROLL
+    for (unsigned int item = 0; item < ItemsPerThread; item++)
+    {
+        // Consecutive items of one thread are WarpSize positions apart.
+        unsigned int offset = item * WarpSize;
+        // Guard on the absolute output position, not on the per-thread offset.
+        if (warp_offset + thread_id + offset < valid)
+        {
+            thread_iter[offset] = items[item];
+        }
+    }
+}
+
+END_ROCPRIM_NAMESPACE
+
+/// @}
+// end of group blockmodule
+
+#endif // ROCPRIM_BLOCK_BLOCK_STORE_FUNC_HPP_
--- a/3rdparty/cub/rocprim/block/detail/block_adjacent_difference_impl.hpp
+++ b/3rdparty/cub/rocprim/block/detail/block_adjacent_difference_impl.hpp
+// Copyright (c) 2017-2022 Advanced Micro Devices, Inc. All rights reserved.
+//
+// Permission is hereby granted, free of charge, to any person obtaining a copy
+// of this software and associated documentation files (the "Software"), to deal
+// in the Software without restriction, including without limitation the rights
+// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+// copies of the Software, and to permit persons to whom the Software is
+// furnished to do so, subject to the following conditions:
+//
+// The above copyright notice and this permission notice shall be included in
+// all copies or substantial portions of the Software.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+// THE SOFTWARE.
+
+#ifndef ROCPRIM_BLOCK_DETAIL_BLOCK_ADJACENT_DIFFERENCE_IMPL_HPP_
+#define ROCPRIM_BLOCK_DETAIL_BLOCK_ADJACENT_DIFFERENCE_IMPL_HPP_
+
+#include "../../config.hpp"
+#include "../../detail/various.hpp"
+#include "../../intrinsics/thread.hpp"
+
+#include <type_traits>
+
+#include <cassert>
+
+BEGIN_ROCPRIM_NAMESPACE
+
+namespace detail
+{
+
+// Wrapper that lets callers invoke a BinaryFunction with either of these signatures:
+// with an index, (a, b, b_index), or without it, (a, b).
+// The index-taking form is only available when as_flags is true (the overloads that
+// forward the index require bool_constant<true> for that parameter).
+// The "reversed" tag selects the order in which the two items are handed to the
+// operator: (a, b) when false, (b, a) when true. Per the original note, this is the
+// only way block_discontinuity and block_adjacent_difference differ in their
+// implementations.
+//
+// This overload: operator takes an index, arguments passed in (a, b) order.
+// The trailing return type is spelled with (b, a); since a and b are both const T&,
+// it names the same type as the (a, b) call made in the body.
+template <class T, class BinaryFunction>
+ROCPRIM_DEVICE ROCPRIM_INLINE auto apply(BinaryFunction op,
+                                         const T&       a,
+                                         const T&       b,
+                                         unsigned int   index,
+                                         bool_constant<true> /*as_flags*/,
+                                         bool_constant<false> /*reversed*/) -> decltype(op(b, a, index))
+{
+    return op(a, b, index);
+}
+
+// Overload for an operator that accepts an index (as_flags == true) with
+// reversed == true: the two items are passed to the operator swapped, as (b, a, index).
+// Participates in overload resolution only if op(b, a, index) is well-formed.
+template <class T, class BinaryFunction>
+ROCPRIM_DEVICE ROCPRIM_INLINE auto apply(BinaryFunction op,
+                                         const T&       a,
+                                         const T&       b,
+                                         unsigned int   index,
+                                         bool_constant<true> /*as_flags*/,
+                                         bool_constant<true> /*reversed*/)
+    -> decltype(op(b, a, index))
+{
+    return op(b, a, index);
+}
+
+// Overload for an operator that takes only the two items, for any AsFlags value,
+// with reversed == false: the index argument is ignored and op is called as (a, b).
+// Participates in overload resolution only if op(b, a) is well-formed; a and b are
+// both const T&, so that expression has the same type as the (a, b) call in the body.
+template <typename T, typename BinaryFunction, bool AsFlags>
+ROCPRIM_DEVICE ROCPRIM_INLINE auto apply(BinaryFunction op,
+                                         const T&       a,
+                                         const T&       b,
+                                         unsigned int,
+                                         bool_constant<AsFlags> /*as_flags*/,
+                                         bool_constant<false> /*reversed*/) -> decltype(op(b, a))
+{
+    return op(a, b);
+}
+
+// Overload for an operator that takes only the two items, for any AsFlags value,
+// with reversed == true: the index argument is ignored and op is called with the
+// items swapped, as (b, a).
+// Participates in overload resolution only if op(b, a) is well-formed.
+template <typename T, typename BinaryFunction, bool AsFlags>
+ROCPRIM_DEVICE ROCPRIM_INLINE auto apply(BinaryFunction op,
+                                         const T&       a,
+                                         const T&       b,
+                                         unsigned int,
+                                         bool_constant<AsFlags> /*as_flags*/,
+                                         bool_constant<true> /*reversed*/) -> decltype(op(b, a))
+{
+    return op(b, a);
+}
+
+// Implementation helper that applies a binary operator to every pair of adjacent
+// items held by the threads of a block. Each thread holds ItemsPerThread items;
+// the item of the neighbouring thread is exchanged through a storage_type object.
+//
+// Template flags shared by the member functions:
+//  * AsFlags  - forwarded to detail::apply; also selects the default output value
+//               (1 when true, the input item itself when false, see get_default_item).
+//  * Reversed - forwarded to detail::apply; selects the operator argument order.
+//
+// NOTE(review): none of the member functions synchronizes after its last read of
+// storage, so callers presumably have to synchronize before reusing the same
+// storage object - confirm against the call sites.
+template <typename T,
+          unsigned int BlockSizeX,
+          unsigned int BlockSizeY = 1,
+          unsigned int BlockSizeZ = 1>
+class block_adjacent_difference_impl
+{
+public:
+    // Total number of threads over the three block dimensions.
+    static constexpr unsigned int BlockSize = BlockSizeX * BlockSizeY * BlockSizeZ;
+    // Exchange buffer: one slot per thread, indexed by the flat thread id.
+    struct storage_type
+    {
+        T items[BlockSize];
+    };
+
+    // Computes output[i] from (input[i - 1], input[i]) for every item of the thread.
+    // For i == 0 the left neighbour is the last item of thread flat_id - 1, read
+    // from storage. For thread 0:
+    //  * WithTilePredecessor == true:  tile_predecessor_item is used as the neighbour;
+    //  * WithTilePredecessor == false: output[0] is the default item.
+    // The index handed to the operator is flat_id * ItemsPerThread + i, i.e. the
+    // position of the right-hand item of the pair.
+    template <bool         AsFlags,
+              bool         Reversed,
+              bool         WithTilePredecessor,
+              unsigned int ItemsPerThread,
+              typename Output,
+              typename BinaryFunction>
+    ROCPRIM_DEVICE void apply_left(const T (&input)[ItemsPerThread],
+                                   Output (&output)[ItemsPerThread],
+                                   BinaryFunction op,
+                                   const T        tile_predecessor_item,
+                                   storage_type&  storage)
+    {
+        static constexpr auto as_flags = bool_constant<AsFlags> {};
+        static constexpr auto reversed = bool_constant<Reversed> {};
+
+        const unsigned int flat_id
+            = ::rocprim::flat_block_thread_id<BlockSizeX, BlockSizeY, BlockSizeZ>();
+
+        // Save the last item of each thread
+        storage.items[flat_id] = input[ItemsPerThread - 1];
+
+        // Pairs that live entirely inside this thread; walks from the last item
+        // down to item 1 (item 0 needs the neighbouring thread and is handled below).
+        ROCPRIM_UNROLL
+        for(unsigned int i = ItemsPerThread - 1; i > 0; --i)
+        {
+            output[i] = detail::apply(
+                op, input[i - 1], input[i], flat_id * ItemsPerThread + i, as_flags, reversed);
+        }
+        // Barrier between the storage writes above and the neighbour reads below.
+        ::rocprim::syncthreads();
+
+        if ROCPRIM_IF_CONSTEXPR (WithTilePredecessor)
+        {
+            // Thread 0 has no left neighbour in the block: use the tile predecessor.
+            T predecessor_item = tile_predecessor_item;
+            if(flat_id != 0) {
+                predecessor_item = storage.items[flat_id - 1];
+            }
+
+            output[0] = detail::apply(
+                op, predecessor_item, input[0], flat_id * ItemsPerThread, as_flags, reversed);
+        }
+        else
+        {
+            // Without a tile predecessor, thread 0 keeps the default for its first item.
+            output[0] = get_default_item(input, 0, as_flags);
+            if(flat_id != 0) {
+                output[0] = detail::apply(op,
+                                          storage.items[flat_id - 1],
+                                          input[0],
+                                          flat_id * ItemsPerThread,
+                                          as_flags,
+                                          reversed);
+            }
+        }
+    }
+
+    // Same as apply_left, but only positions with index < valid_items are computed
+    // with the operator; every other output keeps the default item.
+    // valid_items must not exceed BlockSize * ItemsPerThread (checked with assert).
+    template <bool         AsFlags,
+              bool         Reversed,
+              bool         WithTilePredecessor,
+              unsigned int ItemsPerThread,
+              typename Output,
+              typename BinaryFunction>
+    ROCPRIM_DEVICE void apply_left_partial(const T (&input)[ItemsPerThread],
+                                           Output (&output)[ItemsPerThread],
+                                           BinaryFunction     op,
+                                           const T            tile_predecessor_item,
+                                           const unsigned int valid_items,
+                                           storage_type&      storage)
+    {
+        static constexpr auto as_flags = bool_constant<AsFlags> {};
+        static constexpr auto reversed = bool_constant<Reversed> {};
+
+        assert(valid_items <= BlockSize * ItemsPerThread);
+
+        const unsigned int flat_id
+            = ::rocprim::flat_block_thread_id<BlockSizeX, BlockSizeY, BlockSizeZ>();
+
+        // Save the last item of each thread
+        storage.items[flat_id] = input[ItemsPerThread - 1];
+
+        // Pairs inside this thread: start from the default, overwrite when in range.
+        ROCPRIM_UNROLL
+        for(unsigned int i = ItemsPerThread - 1; i > 0; --i)
+        {
+            const unsigned int index = flat_id * ItemsPerThread + i;
+            output[i] = get_default_item(input, i, as_flags);
+            if(index < valid_items) {
+                output[i] = detail::apply(op, input[i - 1], input[i], index, as_flags, reversed);
+            }
+        }
+        // Barrier between the storage writes above and the neighbour reads below.
+        ::rocprim::syncthreads();
+
+        // Position of this thread's first item.
+        const unsigned int index = flat_id * ItemsPerThread;
+
+        if ROCPRIM_IF_CONSTEXPR (WithTilePredecessor)
+        {
+            // Thread 0 has no left neighbour in the block: use the tile predecessor.
+            T predecessor_item = tile_predecessor_item;
+            if(flat_id != 0) {
+                predecessor_item = storage.items[flat_id - 1];
+            }
+
+            output[0] = get_default_item(input, 0, as_flags);
+            if(index < valid_items)
+            {
+                output[0]
+                    = detail::apply(op, predecessor_item, input[0], index, as_flags, reversed);
+            }
+        }
+        else
+        {
+            // Thread 0 and out-of-range threads keep the default for their first item.
+            output[0] = get_default_item(input, 0, as_flags);
+            if(flat_id != 0 && index < valid_items)
+            {
+                output[0] = detail::apply(op,
+                                          storage.items[flat_id - 1],
+                                          input[0],
+                                          flat_id * ItemsPerThread,
+                                          as_flags,
+                                          reversed);
+            }
+        }
+    }
+
+    // Computes output[i] from (input[i], input[i + 1]) for every item of the thread.
+    // For the last item the right neighbour is the first item of thread flat_id + 1,
+    // read from storage. For the last thread (flat_id == BlockSize - 1):
+    //  * WithTileSuccessor == true:  tile_successor_item is used as the neighbour;
+    //  * WithTileSuccessor == false: the last output is the default item.
+    // The index handed to the operator is the position of the right-hand item of
+    // the pair (flat_id * ItemsPerThread + i + 1).
+    template <bool         AsFlags,
+              bool         Reversed,
+              bool         WithTileSuccessor,
+              unsigned int ItemsPerThread,
+              typename Output,
+              typename BinaryFunction>
+    ROCPRIM_DEVICE void apply_right(const T (&input)[ItemsPerThread],
+                                    Output (&output)[ItemsPerThread],
+                                    BinaryFunction op,
+                                    const T        tile_successor_item,
+                                    storage_type&  storage)
+    {
+        static constexpr auto as_flags = bool_constant<AsFlags> {};
+        static constexpr auto reversed = bool_constant<Reversed> {};
+
+        const unsigned int flat_id
+            = ::rocprim::flat_block_thread_id<BlockSizeX, BlockSizeY, BlockSizeZ>();
+
+        // Save the first item of each thread
+        storage.items[flat_id] = input[0];
+
+        // Pairs that live entirely inside this thread (the last item needs the
+        // neighbouring thread and is handled below).
+        ROCPRIM_UNROLL
+        for(unsigned int i = 0; i < ItemsPerThread - 1; ++i)
+        {
+            output[i] = detail::apply(
+                op, input[i], input[i + 1], flat_id * ItemsPerThread + i + 1, as_flags, reversed);
+        }
+        // Barrier between the storage writes above and the neighbour reads below.
+        ::rocprim::syncthreads();
+
+        if ROCPRIM_IF_CONSTEXPR (WithTileSuccessor)
+        {
+            // The last thread has no right neighbour in the block: use the tile successor.
+            T successor_item = tile_successor_item;
+            if(flat_id != BlockSize - 1) {
+                successor_item = storage.items[flat_id + 1];
+            }
+
+            output[ItemsPerThread - 1] = detail::apply(op,
+                                                       input[ItemsPerThread - 1],
+                                                       successor_item,
+                                                       flat_id * ItemsPerThread + ItemsPerThread,
+                                                       as_flags,
+                                                       reversed);
+        }
+        else
+        {
+            // Without a tile successor, the last thread keeps the default for its last item.
+            output[ItemsPerThread - 1] = get_default_item(input, ItemsPerThread - 1, as_flags);
+            if(flat_id != BlockSize - 1) {
+                output[ItemsPerThread - 1]
+                    = detail::apply(op,
+                                    input[ItemsPerThread - 1],
+                                    storage.items[flat_id + 1],
+                                    flat_id * ItemsPerThread + ItemsPerThread,
+                                    as_flags,
+                                    reversed);
+            }
+        }
+    }
+    // Same as apply_right without a tile successor, but a pair is only evaluated
+    // when the position of its right-hand item is < valid_items; every other output
+    // keeps the default item.
+    // valid_items must not exceed BlockSize * ItemsPerThread (checked with assert).
+    template <bool         AsFlags,
+              bool         Reversed,
+              unsigned int ItemsPerThread,
+              typename Output,
+              typename BinaryFunction>
+    ROCPRIM_DEVICE void apply_right_partial(const T (&input)[ItemsPerThread],
+                                            Output (&output)[ItemsPerThread],
+                                            BinaryFunction     op,
+                                            const unsigned int valid_items,
+                                            storage_type&      storage)
+    {
+        static constexpr auto as_flags = bool_constant<AsFlags> {};
+        static constexpr auto reversed = bool_constant<Reversed> {};
+
+        assert(valid_items <= BlockSize * ItemsPerThread);
+
+        const unsigned int flat_id
+            = ::rocprim::flat_block_thread_id<BlockSizeX, BlockSizeY, BlockSizeZ>();
+
+        // Save the first item of each thread
+        storage.items[flat_id] = input[0];
+
+        // Pairs inside this thread: start from the default, overwrite when the
+        // right-hand item is in range.
+        ROCPRIM_UNROLL
+        for(unsigned int i = 0; i < ItemsPerThread - 1; ++i)
+        {
+            const unsigned int index = flat_id * ItemsPerThread + i + 1;
+            output[i] = get_default_item(input, i, as_flags);
+            if(index < valid_items)
+            {
+                output[i] = detail::apply(op, input[i], input[i + 1], index, as_flags, reversed);
+            }
+        }
+        // Barrier between the storage writes above and the neighbour read below.
+        ::rocprim::syncthreads();
+
+        output[ItemsPerThread - 1] = get_default_item(input, ItemsPerThread - 1, as_flags);
+
+        // Position of the first item of thread flat_id + 1. When it is below
+        // valid_items (itself at most BlockSize * ItemsPerThread), flat_id + 1 is
+        // below BlockSize, so the storage read stays inside the array.
+        const unsigned int next_thread_index = flat_id * ItemsPerThread + ItemsPerThread;
+        if(next_thread_index < valid_items)
+        {
+            output[ItemsPerThread - 1] = detail::apply(op,
+                                                       input[ItemsPerThread - 1],
+                                                       storage.items[flat_id + 1],
+                                                       next_thread_index,
+                                                       as_flags,
+                                                       reversed);
+        }
+    }
+
+private:
+    // Default output when results are flags (as_flags == true): always 1.
+    template <unsigned int ItemsPerThread>
+    ROCPRIM_DEVICE int get_default_item(const T (&)[ItemsPerThread],
+                                        unsigned int /*index*/,
+                                        bool_constant<true> /*as_flags*/)
+    {
+        return 1;
+    }
+
+    // Default output when results are not flags (as_flags == false): the input
+    // item at the same position is passed through unchanged.
+    template <unsigned int ItemsPerThread>
+    ROCPRIM_DEVICE T get_default_item(const T (&input)[ItemsPerThread],
+                                      const unsigned int index,
+                                      bool_constant<false> /*as_flags*/)
+    {
+        return input[index];
+    }
+};
+
+} // namespace detail
+
+END_ROCPRIM_NAMESPACE
+
+#endif // ROCPRIM_BLOCK_DETAIL_BLOCK_ADJACENT_DIFFERENCE_IMPL_HPP_