Commit f8a481f8 authored by zhouxiang's avatar zhouxiang
Browse files

添加dtk中的cub头文件

parent 7b7c64c5
/******************************************************************************
* Copyright (c) 2010-2011, Duane Merrill. All rights reserved.
* Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved.
* Modifications Copyright (c) 2017-2020, Advanced Micro Devices, Inc. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of the NVIDIA CORPORATION nor the
* names of its contributors may be used to endorse or promote products
* derived from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
* DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
******************************************************************************/
#ifndef HIPCUB_ROCPRIM_ITERATOR_CONSTANT_INPUT_ITERATOR_HPP_
#define HIPCUB_ROCPRIM_ITERATOR_CONSTANT_INPUT_ITERATOR_HPP_
#include <iterator>
#include <iostream>
#include "../config.hpp"
#include <cub/rocprim/iterator/constant_iterator.hpp>
#if (THRUST_VERSION >= 100700)
// This iterator is compatible with Thrust API 1.7 and newer
#include <thrust/iterator/iterator_facade.h>
#include <thrust/iterator/iterator_traits.h>
#endif // THRUST_VERSION
BEGIN_HIPCUB_NAMESPACE
#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document
// hipCUB-style name for rocPRIM's constant iterator:
// ConstantInputIterator<V, O> is exactly ::rocprim::constant_iterator<V, O>.
// OffsetT defaults to std::ptrdiff_t.
template<typename ValueType, typename OffsetT = std::ptrdiff_t>
using ConstantInputIterator
    = ::rocprim::constant_iterator<ValueType, OffsetT>;
#endif
END_HIPCUB_NAMESPACE
#endif // HIPCUB_ROCPRIM_ITERATOR_CONSTANT_INPUT_ITERATOR_HPP_
/******************************************************************************
* Copyright (c) 2010-2011, Duane Merrill. All rights reserved.
* Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved.
* Modifications Copyright (c) 2017-2020, Advanced Micro Devices, Inc. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of the NVIDIA CORPORATION nor the
* names of its contributors may be used to endorse or promote products
* derived from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
* DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
******************************************************************************/
#ifndef HIPCUB_ROCPRIM_ITERATOR_COUNTING_INPUT_ITERATOR_HPP_
#define HIPCUB_ROCPRIM_ITERATOR_COUNTING_INPUT_ITERATOR_HPP_
#include <iterator>
#include <iostream>
#include "../config.hpp"
#include <cub/rocprim/iterator/counting_iterator.hpp>
#if (THRUST_VERSION >= 100700)
// This iterator is compatible with Thrust API 1.7 and newer
#include <thrust/iterator/iterator_facade.h>
#include <thrust/iterator/iterator_traits.h>
#endif // THRUST_VERSION
BEGIN_HIPCUB_NAMESPACE
#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document
// hipCUB-style name for rocPRIM's counting iterator:
// CountingInputIterator<V, O> is exactly ::rocprim::counting_iterator<V, O>.
// OffsetT defaults to std::ptrdiff_t.
template<typename ValueType, typename OffsetT = std::ptrdiff_t>
using CountingInputIterator
    = ::rocprim::counting_iterator<ValueType, OffsetT>;
#endif
END_HIPCUB_NAMESPACE
#endif // HIPCUB_ROCPRIM_ITERATOR_COUNTING_INPUT_ITERATOR_HPP_
/******************************************************************************
* Copyright (c) 2010-2011, Duane Merrill. All rights reserved.
* Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved.
* Modifications Copyright (c) 2020, Advanced Micro Devices, Inc. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of the NVIDIA CORPORATION nor the
* names of its contributors may be used to endorse or promote products
* derived from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
* DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
******************************************************************************/
#ifndef HIPCUB_ROCPRIM_ITERATOR_DISCARD_OUTPUT_ITERATOR_HPP_
#define HIPCUB_ROCPRIM_ITERATOR_DISCARD_OUTPUT_ITERATOR_HPP_
#include <iterator>
#include <iostream>
#include "../config.hpp"
// Headers must be included at global scope, i.e. before the hipCUB namespace
// is opened; otherwise their declarations would be nested inside it.
#if (THRUST_VERSION >= 100700)
// This iterator is compatible with Thrust API 1.7 and newer
#include <thrust/iterator/iterator_facade.h>
#include <thrust/iterator/iterator_traits.h>
#endif // THRUST_VERSION

BEGIN_HIPCUB_NAMESPACE
/**
 * \addtogroup UtilIterator
 * @{
 */

/**
 * \brief A discard iterator
 *
 * A random-access output iterator that only tracks an offset. Dereferencing
 * (or subscripting) yields the iterator itself, and assigning any value to it
 * is a no-op, so everything "written" through it is thrown away.
 *
 * \tparam OffsetT The difference type of this iterator (defaults to \p std::ptrdiff_t)
 */
template <typename OffsetT = std::ptrdiff_t>
class DiscardOutputIterator
{
public:
    // Required iterator traits
    typedef DiscardOutputIterator self_type;   ///< My own type
    typedef OffsetT difference_type;           ///< Type to express the result of subtracting one iterator from another
    typedef void value_type;                   ///< The type of the element the iterator can point to
    typedef void pointer;                      ///< The type of a pointer to an element the iterator can point to
    typedef void reference;                    ///< The type of a reference to an element the iterator can point to

#if (THRUST_VERSION >= 100700)
    // Use Thrust's iterator categories so we can use these iterators in Thrust 1.7 (or newer) methods
    typedef typename thrust::detail::iterator_facade_category<
        thrust::any_system_tag,
        thrust::random_access_traversal_tag,
        value_type,
        reference
    >::type iterator_category;                 ///< The iterator category
#else
    typedef std::random_access_iterator_tag iterator_category; ///< The iterator category
#endif // THRUST_VERSION

private:
    OffsetT offset; // Current position; the only state carried by the iterator

public:
    /// Constructor
    __host__ __device__ __forceinline__ DiscardOutputIterator(
        OffsetT offset = 0) ///< Base offset
    :
        offset(offset)
    {}

    /// \brief Postfix increment: advances by one and returns the previous position.
    __host__ __device__ __forceinline__ self_type operator++(int)
    {
        self_type retval = *this;
        offset++;
        return retval;
    }

    /// \brief Prefix increment: advances by one and returns the advanced iterator (by value).
    __host__ __device__ __forceinline__ self_type operator++()
    {
        offset++;
        return *this;
    }

    /// \brief Indirection
    __host__ __device__ __forceinline__ self_type& operator*()
    {
        // return self reference, which can be assigned to anything
        return *this;
    }

    /// \brief Addition: returns an iterator positioned \p n past this one.
    template <typename Distance>
    __host__ __device__ __forceinline__ self_type operator+(Distance n) const
    {
        self_type retval(offset + n);
        return retval;
    }

    /// \brief Addition assignment
    template <typename Distance>
    __host__ __device__ __forceinline__ self_type& operator+=(Distance n)
    {
        offset += n;
        return *this;
    }

    /// \brief Subtraction: returns an iterator positioned \p n before this one.
    template <typename Distance>
    __host__ __device__ __forceinline__ self_type operator-(Distance n) const
    {
        self_type retval(offset - n);
        return retval;
    }

    /// \brief Subtraction assignment
    template <typename Distance>
    __host__ __device__ __forceinline__ self_type& operator-=(Distance n)
    {
        offset -= n;
        return *this;
    }

    /// \brief Distance: difference between the offsets of two iterators.
    __host__ __device__ __forceinline__ difference_type operator-(self_type other) const
    {
        return offset - other.offset;
    }

    /// \brief Array subscript (the index is ignored)
    template <typename Distance>
    __host__ __device__ __forceinline__ self_type& operator[](Distance)
    {
        // return self reference, which can be assigned to anything
        return *this;
    }

    /// Structure dereference (\p pointer is \p void, so there is nothing to return)
    __host__ __device__ __forceinline__ pointer operator->()
    {
        return;
    }

    /// Assignment to anything else (no-op)
    template<typename T>
    __host__ __device__ __forceinline__ void operator=(T const&)
    {}

    /// Cast to void* operator (always yields a null pointer)
    __host__ __device__ __forceinline__ operator void*() const { return nullptr; }

    /// \brief Equal to: true when both iterators hold the same offset.
    __host__ __device__ __forceinline__ bool operator==(const self_type& rhs) const
    {
        return (offset == rhs.offset);
    }

    /// \brief Not equal to: true when the iterators hold different offsets.
    __host__ __device__ __forceinline__ bool operator!=(const self_type& rhs) const
    {
        return (offset != rhs.offset);
    }

    /// \brief ostream operator: prints the offset as "[offset]".
    friend std::ostream& operator<<(std::ostream& os, const self_type& itr)
    {
        os << "[" << itr.offset << "]";
        return os;
    }
};

/** @} */ // end group UtilIterator
END_HIPCUB_NAMESPACE
#endif // HIPCUB_ROCPRIM_ITERATOR_DISCARD_OUTPUT_ITERATOR_HPP_
/******************************************************************************
* Copyright (c) 2010-2011, Duane Merrill. All rights reserved.
* Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved.
* Modifications Copyright (c) 2017-2021, Advanced Micro Devices, Inc. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of the NVIDIA CORPORATION nor the
* names of its contributors may be used to endorse or promote products
* derived from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
* DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
******************************************************************************/
#ifndef HIPCUB_ROCPRIM_ITERATOR_TEX_OBJ_INPUT_ITERATOR_HPP_
#define HIPCUB_ROCPRIM_ITERATOR_TEX_OBJ_INPUT_ITERATOR_HPP_
#include <iterator>
#include <iostream>
#include "../config.hpp"
#if (THRUST_VERSION >= 100700)
// This iterator is compatible with Thrust API 1.7 and newer
#include <thrust/iterator/iterator_facade.h>
#include <thrust/iterator/iterator_traits.h>
#endif // THRUST_VERSION
#include <cub/rocprim/iterator/texture_cache_iterator.hpp>
BEGIN_HIPCUB_NAMESPACE
/// \brief hipCUB-named wrapper around ::rocprim::texture_cache_iterator.
///
/// Adds CUB-style \p BindTexture / \p UnbindTexture members that forward to the
/// base iterator's \p bind_texture / \p unbind_texture and report the result as
/// \p cudaError_t. All other iterator behavior is inherited from the base class.
///
/// \tparam T - type forwarded to the base iterator as its first argument.
/// \tparam OffsetT - type forwarded to the base iterator as its second argument.
template<
    typename T,
    typename OffsetT = std::ptrdiff_t
>
class TexObjInputIterator : public ::rocprim::texture_cache_iterator<T, OffsetT>
{
public:
    /// \brief Forwards \p ptr, \p bytes and \p texture_offset to the base
    /// iterator's \p bind_texture and returns its status as \p cudaError_t.
    template<class Qualified>
    inline
    cudaError_t BindTexture(Qualified* ptr,
                            size_t bytes = size_t(-1),
                            size_t texture_offset = 0)
    {
        return static_cast<cudaError_t>(
            ::rocprim::texture_cache_iterator<T, OffsetT>::bind_texture(ptr, bytes, texture_offset));
    }

    /// \brief Forwards to the base iterator's \p unbind_texture and returns
    /// its status as \p cudaError_t.
    inline cudaError_t UnbindTexture()
    {
        return static_cast<cudaError_t>(
            ::rocprim::texture_cache_iterator<T, OffsetT>::unbind_texture());
    }

    HIPCUB_HOST_DEVICE inline
    ~TexObjInputIterator() = default;

    /// \brief Default-constructs the base iterator.
    HIPCUB_HOST_DEVICE inline
    TexObjInputIterator() : ::rocprim::texture_cache_iterator<T, OffsetT>()
    {
    }

    /// \brief Converting constructor from the base iterator type.
    /// Taken by const reference so the argument is copied once (into the
    /// base subobject) instead of twice.
    HIPCUB_HOST_DEVICE inline
    TexObjInputIterator(const ::rocprim::texture_cache_iterator<T, OffsetT>& other)
        : ::rocprim::texture_cache_iterator<T, OffsetT>(other)
    {
    }
};
END_HIPCUB_NAMESPACE
#endif // HIPCUB_ROCPRIM_ITERATOR_TEX_OBJ_INPUT_ITERATOR_HPP_
/******************************************************************************
* Copyright (c) 2011, Duane Merrill. All rights reserved.
* Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved.
* Modifications Copyright (c) 2017-2021, Advanced Micro Devices, Inc. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of the NVIDIA CORPORATION nor the
* names of its contributors may be used to endorse or promote products
* derived from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
* DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
******************************************************************************/
#ifndef HIPCUB_ROCPRIM_ITERATOR_TEX_REF_INPUT_ITERATOR_HPP_
#define HIPCUB_ROCPRIM_ITERATOR_TEX_REF_INPUT_ITERATOR_HPP_
#include <iterator>
#include <iostream>
#include "../config.hpp"
#if (THRUST_VERSION >= 100700) // This iterator is compatible with Thrust API 1.7 and newer
#include <thrust/iterator/iterator_facade.h>
#include <thrust/iterator/iterator_traits.h>
#endif // THRUST_VERSION
#include <rocprim/iterator/texture_cache_iterator.hpp>
BEGIN_HIPCUB_NAMESPACE
/// \brief hipCUB-named wrapper around ::rocprim::texture_cache_iterator.
///
/// Adds CUB-style \p BindTexture / \p UnbindTexture members that forward to the
/// base iterator's \p bind_texture / \p unbind_texture and report the result as
/// \p cudaError_t. All other iterator behavior is inherited from the base class.
///
/// \tparam T - type forwarded to the base iterator as its first argument.
/// \tparam UNIQUE_ID - unused; kept for compatibility with the original definition in cub.
/// \tparam OffsetT - type forwarded to the base iterator as its second argument.
template<
    typename T,
    int UNIQUE_ID, // Unused parameter for compatibility with original definition in cub
    typename OffsetT = std::ptrdiff_t
>
class TexRefInputIterator : public ::rocprim::texture_cache_iterator<T, OffsetT>
{
public:
    /// \brief Forwards \p ptr, \p bytes and \p texture_offset to the base
    /// iterator's \p bind_texture and returns its status as \p cudaError_t.
    template<class Qualified>
    inline
    cudaError_t BindTexture(Qualified* ptr,
                            size_t bytes = size_t(-1),
                            size_t texture_offset = 0)
    {
        return static_cast<cudaError_t>(
            ::rocprim::texture_cache_iterator<T, OffsetT>::bind_texture(ptr, bytes, texture_offset));
    }

    /// \brief Forwards to the base iterator's \p unbind_texture and returns
    /// its status as \p cudaError_t.
    inline cudaError_t UnbindTexture()
    {
        return static_cast<cudaError_t>(
            ::rocprim::texture_cache_iterator<T, OffsetT>::unbind_texture());
    }

    HIPCUB_HOST_DEVICE inline
    ~TexRefInputIterator() = default;

    /// \brief Default-constructs the base iterator.
    HIPCUB_HOST_DEVICE inline
    TexRefInputIterator() : ::rocprim::texture_cache_iterator<T, OffsetT>()
    {
    }

    /// \brief Converting constructor from the base iterator type.
    /// Taken by const reference so the argument is copied once (into the
    /// base subobject) instead of twice.
    HIPCUB_HOST_DEVICE inline
    TexRefInputIterator(const ::rocprim::texture_cache_iterator<T, OffsetT>& other)
        : ::rocprim::texture_cache_iterator<T, OffsetT>(other)
    {
    }
};
END_HIPCUB_NAMESPACE
#endif // HIPCUB_ROCPRIM_ITERATOR_TEX_REF_INPUT_ITERATOR_HPP_
/******************************************************************************
* Copyright (c) 2010-2011, Duane Merrill. All rights reserved.
* Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved.
* Modifications Copyright (c) 2017-2020, Advanced Micro Devices, Inc. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of the NVIDIA CORPORATION nor the
* names of its contributors may be used to endorse or promote products
* derived from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
* DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
******************************************************************************/
#ifndef HIPCUB_ROCPRIM_ITERATOR_TRANSFORM_INPUT_ITERATOR_HPP_
#define HIPCUB_ROCPRIM_ITERATOR_TRANSFORM_INPUT_ITERATOR_HPP_
#include <iterator>
#include <iostream>
#include "../config.hpp"
#include <cub/rocprim/iterator/transform_iterator.hpp>
#if (THRUST_VERSION >= 100700)
// This iterator is compatible with Thrust API 1.7 and newer
#include <thrust/iterator/iterator_facade.h>
#include <thrust/iterator/iterator_traits.h>
#endif // THRUST_VERSION
BEGIN_HIPCUB_NAMESPACE
#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document
// hipCUB-style name for rocPRIM's transform iterator. The template arguments
// are passed on in a different order:
//   TransformInputIterator<V, Op, It, Off> is
//   ::rocprim::transform_iterator<It, Op, V>.
// OffsetT is accepted for signature compatibility only and does not take part
// in the aliased type.
template<typename ValueType,
         typename ConversionOp,
         typename InputIteratorT,
         typename OffsetT = std::ptrdiff_t> // ignored
using TransformInputIterator
    = ::rocprim::transform_iterator<InputIteratorT, ConversionOp, ValueType>;
#endif
END_HIPCUB_NAMESPACE
#endif // HIPCUB_ROCPRIM_ITERATOR_TRANSFORM_INPUT_ITERATOR_HPP_
/******************************************************************************
* Copyright (c) 2011, Duane Merrill. All rights reserved.
* Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved.
* Modifications Copyright (c) 2022, Advanced Micro Devices, Inc. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of the NVIDIA CORPORATION nor the
* names of its contributors may be used to endorse or promote products
* derived from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
* DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
******************************************************************************/
#ifndef ROCPRIM_BLOCK_BLOCK_ADJACENT_DIFFERENCE_HPP_
#define ROCPRIM_BLOCK_BLOCK_ADJACENT_DIFFERENCE_HPP_
#include "detail/block_adjacent_difference_impl.hpp"
#include "../config.hpp"
#include "../detail/various.hpp"
/// \addtogroup blockmodule
/// @{
BEGIN_ROCPRIM_NAMESPACE
/// \brief The \p block_adjacent_difference class is a block level parallel primitive which provides
/// methods for applying binary functions for pairs of consecutive items partitioned across a thread
/// block.
///
/// \tparam T - the input type.
/// \tparam BlockSize - the number of threads in a block.
///
/// \par Overview
/// * There are two types of flags:
/// * Head flags.
/// * Tail flags.
/// * The above flags are used to differentiate items from their predecessors or successors.
/// * E.g. Head flags are convenient for differentiating disjoint data segments as part of a
/// segmented reduction/scan.
///
/// \par Examples
/// \parblock
/// In the examples discontinuity operation is performed on block of 128 threads, using type
/// \p int.
///
/// \code{.cpp}
/// __global__ void example_kernel(...)
/// {
/// // specialize discontinuity for int and a block of 128 threads
/// using block_adjacent_difference_int = rocprim::block_adjacent_difference<int, 128>;
/// // allocate storage in shared memory
/// __shared__ block_adjacent_difference_int::storage_type storage;
///
/// // segment of consecutive items to be used
/// int input[8];
/// ...
/// int head_flags[8];
/// block_adjacent_difference_int b_discontinuity;
/// using flag_op_type = typename rocprim::greater<int>;
/// b_discontinuity.flag_heads(head_flags, input, flag_op_type(), storage);
/// ...
/// }
/// \endcode
/// \endparblock
template<
class T,
unsigned int BlockSizeX,
unsigned int BlockSizeY = 1,
unsigned int BlockSizeZ = 1
>
class block_adjacent_difference
#ifndef DOXYGEN_SHOULD_SKIP_THIS // hide implementation detail from documentation
: private detail::block_adjacent_difference_impl<T, BlockSizeX, BlockSizeY, BlockSizeZ>
#endif // DOXYGEN_SHOULD_SKIP_THIS
{
// Implementation class that the member functions of this primitive forward to.
using base_type = detail::block_adjacent_difference_impl<T, BlockSizeX, BlockSizeY, BlockSizeZ>;

// Block size as reported by the implementation class.
static constexpr unsigned int BlockSize = base_type::BlockSize;

// Layout of this primitive's temporary storage; wrapped in a raw_storage
// object to form the public storage_type.
struct storage_type_
{
    // Storage handed to the apply_left-based operations (e.g. flag_heads).
    typename base_type::storage_type left;
    // Second, separate implementation storage.
    // NOTE(review): presumably used by the right/tail operations - confirm.
    typename base_type::storage_type right;
};
public:
/// \brief Struct used to allocate a temporary memory that is required for thread
/// communication during operations provided by related parallel primitive.
///
/// Depending on the implementation the operations exposed by parallel primitive may
/// require a temporary storage for thread communication. The storage should be allocated
/// using keywords <tt>__shared__</tt>. It can be aliased to
/// an externally allocated memory, or be a part of a union type with other storage types
/// to increase shared memory reusability.
#ifndef DOXYGEN_SHOULD_SKIP_THIS // hides storage_type implementation for Doxygen
using storage_type = detail::raw_storage<storage_type_>;
#else
using storage_type = storage_type_;
#endif
/// \brief Tags \p head_flags that indicate discontinuities between items partitioned
/// across the thread block, where the first item has no reference and is always
/// flagged.
/// \deprecated The flags API of block_adjacent_difference is deprecated,
/// use subtract_left() or block_discontinuity::flag_heads() instead.
/// \tparam ItemsPerThread - [inferred] the number of items to be processed by
/// each thread.
/// \tparam Flag - [inferred] the flag type.
/// \tparam FlagOp - [inferred] type of binary function used for flagging.
///
/// \param [out] head_flags - array that contains the head flags.
/// \param [in] input - array that data is loaded from.
/// \param [in] flag_op - binary operation function object that will be used for flagging.
/// The signature of the function should be equivalent to the following:
/// <tt>bool f(const T &a, const T &b);</tt> or <tt>bool (const T& a, const T& b, unsigned int b_index);</tt>.
/// The signature does not need to have <tt>const &</tt>, but function object
/// must not modify the objects passed to it.
/// \param [in] storage - reference to a temporary storage object of type storage_type.
///
/// \par Storage reuse
/// Synchronization barrier should be placed before \p storage is reused
/// or repurposed: \p __syncthreads() or \p rocprim::syncthreads().
///
/// \par Example.
/// \code{.cpp}
/// __global__ void example_kernel(...)
/// {
/// // specialize discontinuity for int and a block of 128 threads
/// using block_adjacent_difference_int = rocprim::block_adjacent_difference<int, 128>;
/// // allocate storage in shared memory
/// __shared__ block_adjacent_difference_int::storage_type storage;
///
/// // segment of consecutive items to be used
/// int input[8];
/// ...
/// int head_flags[8];
/// block_adjacent_difference_int b_discontinuity;
/// using flag_op_type = typename rocprim::greater<int>;
/// b_discontinuity.flag_heads(head_flags, input, flag_op_type(), storage);
/// ...
/// }
/// \endcode
template<unsigned int ItemsPerThread, class Flag, class FlagOp>
// The two literals are concatenated by the compiler, so the first one must
// end with a space to keep the sentences apart in the diagnostic.
[[deprecated("The flags API of block_adjacent_difference is deprecated. "
             "Use subtract_left or block_discontinuity.flag_heads instead.")]]
ROCPRIM_DEVICE ROCPRIM_INLINE
void flag_heads(Flag (&head_flags)[ItemsPerThread],
                const T (&input)[ItemsPerThread],
                FlagOp flag_op,
                storage_type& storage)
{
    // Compile-time switches selecting the apply_left variant of the implementation.
    static constexpr auto as_flags = true;
    static constexpr auto reversed = true;
    static constexpr auto with_predecessor = false;
    // This overload has no tile predecessor; input[0] only fills the
    // predecessor parameter slot (with_predecessor is false).
    // NOTE(review): presumably that argument is ignored by apply_left in this
    // mode - confirm in block_adjacent_difference_impl.
    base_type::template apply_left<as_flags, reversed, with_predecessor>(
        input, head_flags, flag_op, input[0] /* predecessor */, storage.get().left);
}
/// \overload
/// \deprecated The flags API of block_adjacent_difference is deprecated,
/// use subtract_left() or block_discontinuity::flag_heads() instead.
/// This overload does not take a reference to temporary storage, instead it is declared as
/// part of the function itself. Note that this does NOT decrease the shared memory requirements
/// of a kernel using this function.
template<unsigned int ItemsPerThread, class Flag, class FlagOp>
// The two literals are concatenated by the compiler, so the first one must
// end with a space to keep the sentences apart in the diagnostic.
[[deprecated("The flags API of block_adjacent_difference is deprecated. "
             "Use subtract_left or block_discontinuity.flag_heads instead.")]]
ROCPRIM_DEVICE ROCPRIM_FORCE_INLINE
void flag_heads(Flag (&head_flags)[ItemsPerThread],
                const T (&input)[ItemsPerThread],
                FlagOp flag_op)
{
    // Temporary storage is declared here instead of being supplied by the
    // caller, then the storage-taking overload does the work.
    ROCPRIM_SHARED_MEMORY storage_type storage;
    flag_heads(head_flags, input, flag_op, storage);
}
/// \brief Tags \p head_flags that indicate discontinuities between items partitioned
/// across the thread block, where the first item of the first thread is compared against
/// a \p tile_predecessor_item.
/// \deprecated The flags API of block_adjacent_difference is deprecated,
/// use subtract_left() or block_discontinuity::flag_heads() instead.
///
/// \tparam ItemsPerThread - [inferred] the number of items to be processed by
/// each thread.
/// \tparam Flag - [inferred] the flag type.
/// \tparam FlagOp - [inferred] type of binary function used for flagging.
///
/// \param [out] head_flags - array that contains the head flags.
/// \param [in] tile_predecessor_item - first tile item from thread to be compared
/// against.
/// \param [in] input - array that data is loaded from.
/// \param [in] flag_op - binary operation function object that will be used for flagging.
/// The signature of the function should be equivalent to the following:
/// <tt>bool f(const T &a, const T &b);</tt> or <tt>bool (const T& a, const T& b, unsigned int b_index);</tt>.
/// The signature does not need to have <tt>const &</tt>, but function object
/// must not modify the objects passed to it.
/// \param [in] storage - reference to a temporary storage object of type storage_type.
///
/// \par Storage reuse
/// Synchronization barrier should be placed before \p storage is reused
/// or repurposed: \p __syncthreads() or \p rocprim::syncthreads().
///
/// \par Example.
/// \code{.cpp}
/// __global__ void example_kernel(...)
/// {
/// // specialize discontinuity for int and a block of 128 threads
/// using block_adjacent_difference_int = rocprim::block_adjacent_difference<int, 128>;
/// // allocate storage in shared memory
/// __shared__ block_adjacent_difference_int::storage_type storage;
///
/// // segment of consecutive items to be used
/// int input[8];
/// int tile_item = 0;
/// if (threadIdx.x == 0)
/// {
/// tile_item = ...
/// }
/// ...
/// int head_flags[8];
/// block_adjacent_difference_int b_discontinuity;
/// using flag_op_type = typename rocprim::greater<int>;
/// b_discontinuity.flag_heads(head_flags, tile_item, input, flag_op_type(),
/// storage);
/// ...
/// }
/// \endcode
template<unsigned int ItemsPerThread, class Flag, class FlagOp>
// The two literals are concatenated by the compiler, so the first one must
// end with a space to keep the sentences apart in the diagnostic.
[[deprecated("The flags API of block_adjacent_difference is deprecated. "
             "Use subtract_left or block_discontinuity.flag_heads instead.")]]
ROCPRIM_DEVICE ROCPRIM_INLINE
void flag_heads(Flag (&head_flags)[ItemsPerThread],
                T tile_predecessor_item,
                const T (&input)[ItemsPerThread],
                FlagOp flag_op,
                storage_type& storage)
{
    // Compile-time switches selecting the apply_left variant of the implementation;
    // with_predecessor is true because the caller supplies tile_predecessor_item.
    static constexpr auto as_flags = true;
    static constexpr auto reversed = true;
    static constexpr auto with_predecessor = true;
    base_type::template apply_left<as_flags, reversed, with_predecessor>(
        input, head_flags, flag_op, tile_predecessor_item, storage.get().left);
}
/// \overload
/// \deprecated The flags API of block_adjacent_difference is deprecated,
/// use subtract_left() or block_discontinuity::flag_heads() instead.
///
/// This overload does not accept a reference to temporary storage, instead it is declared as
/// part of the function itself. Note that this does NOT decrease the shared memory requirements
/// of a kernel using this function.
template<unsigned int ItemsPerThread, class Flag, class FlagOp>
// The two adjacent literals are concatenated into a single message, so the
// first one ends with a space to keep the sentences separated.
[[deprecated("The flags API of block_adjacent_difference is deprecated. "
             "Use subtract_left or block_discontinuity.flag_heads instead.")]]
ROCPRIM_DEVICE ROCPRIM_FORCE_INLINE
void flag_heads(Flag (&head_flags)[ItemsPerThread],
                T tile_predecessor_item,
                const T (&input)[ItemsPerThread],
                FlagOp flag_op)
{
    // Declare the temporary storage locally (ROCPRIM_SHARED_MEMORY) and forward
    // to the overload that takes the storage explicitly.
    ROCPRIM_SHARED_MEMORY storage_type storage;
    flag_heads(head_flags, tile_predecessor_item, input, flag_op, storage);
}
/// \brief Tags \p tail_flags that indicate discontinuities between items partitioned
/// across the thread block, where the last item has no reference and is always
/// flagged.
/// \deprecated The flags API of block_adjacent_difference is deprecated,
/// use subtract_right() or block_discontinuity::flag_tails() instead.
///
/// \tparam ItemsPerThread - [inferred] the number of items to be processed by
/// each thread.
/// \tparam Flag - [inferred] the flag type.
/// \tparam FlagOp - [inferred] type of binary function used for flagging.
///
/// \param [out] tail_flags - array that contains the tail flags.
/// \param [in] input - array that data is loaded from.
/// \param [in] flag_op - binary operation function object that will be used for flagging.
/// The signature of the function should be equivalent to the following:
/// <tt>bool f(const T &a, const T &b);</tt> or <tt>bool f(const T& a, const T& b, unsigned int b_index);</tt>.
/// The signature does not need to have <tt>const &</tt>, but function object
/// must not modify the objects passed to it.
/// \param [in] storage - reference to a temporary storage object of type storage_type.
///
/// \par Storage reuse
/// Synchronization barrier should be placed before \p storage is reused
/// or repurposed: \p __syncthreads() or \p rocprim::syncthreads().
///
/// \par Example.
/// \code{.cpp}
/// __global__ void example_kernel(...)
/// {
/// // specialize discontinuity for int and a block of 128 threads
/// using block_adjacent_difference_int = rocprim::block_adjacent_difference<int, 128>;
/// // allocate storage in shared memory
/// __shared__ block_adjacent_difference_int::storage_type storage;
///
/// // segment of consecutive items to be used
/// int input[8];
/// ...
/// int tail_flags[8];
/// block_adjacent_difference_int b_discontinuity;
/// using flag_op_type = typename rocprim::greater<int>;
/// b_discontinuity.flag_tails(tail_flags, input, flag_op_type(), storage);
/// ...
/// }
/// \endcode
template<unsigned int ItemsPerThread, class Flag, class FlagOp>
// The two adjacent literals are concatenated into a single message, so the
// first one ends with a space to keep the sentences separated.
[[deprecated("The flags API of block_adjacent_difference is deprecated. "
             "Use subtract_right or block_discontinuity.flag_tails instead.")]]
ROCPRIM_DEVICE ROCPRIM_INLINE
void flag_tails(Flag (&tail_flags)[ItemsPerThread],
                const T (&input)[ItemsPerThread],
                FlagOp flag_op,
                storage_type& storage)
{
    // Compile-time switches forwarded to base_type::apply_right.
    // with_successor is disabled because this overload has no tile successor.
    static constexpr auto as_flags       = true;
    static constexpr auto reversed       = true;
    static constexpr auto with_successor = false;

    // input[0] only fills the successor parameter slot of apply_right
    // (presumably unused while with_successor is false - apply_right is defined elsewhere).
    // The tail pass works on the `right` member of the temporary storage.
    base_type::template apply_right<as_flags, reversed, with_successor>(
        input, tail_flags, flag_op, input[0] /* successor */, storage.get().right);
}
/// \overload
/// \deprecated The flags API of block_adjacent_difference is deprecated,
/// use subtract_right() or block_discontinuity::flag_tails() instead.
///
/// This overload does not accept a reference to temporary storage, instead it is declared as
/// part of the function itself. Note that this does NOT decrease the shared memory requirements
/// of a kernel using this function.
template<unsigned int ItemsPerThread, class Flag, class FlagOp>
// The two adjacent literals are concatenated into a single message, so the
// first one ends with a space to keep the sentences separated.
[[deprecated("The flags API of block_adjacent_difference is deprecated. "
             "Use subtract_right or block_discontinuity.flag_tails instead.")]]
ROCPRIM_DEVICE ROCPRIM_FORCE_INLINE
void flag_tails(Flag (&tail_flags)[ItemsPerThread],
                const T (&input)[ItemsPerThread],
                FlagOp flag_op)
{
    // Declare the temporary storage locally (ROCPRIM_SHARED_MEMORY) and forward
    // to the overload that takes the storage explicitly.
    ROCPRIM_SHARED_MEMORY storage_type storage;
    flag_tails(tail_flags, input, flag_op, storage);
}
/// \brief Tags \p tail_flags that indicate discontinuities between items partitioned
/// across the thread block, where the last item of the last thread is compared against
/// a \p tile_successor_item.
/// \deprecated The flags API of block_adjacent_difference is deprecated,
/// use subtract_right() or block_discontinuity::flag_tails() instead.
///
/// \tparam ItemsPerThread - [inferred] the number of items to be processed by
/// each thread.
/// \tparam Flag - [inferred] the flag type.
/// \tparam FlagOp - [inferred] type of binary function used for flagging.
///
/// \param [out] tail_flags - array that contains the tail flags.
/// \param [in] tile_successor_item - last tile item from thread to be compared
/// against.
/// \param [in] input - array that data is loaded from.
/// \param [in] flag_op - binary operation function object that will be used for flagging.
/// The signature of the function should be equivalent to the following:
/// <tt>bool f(const T &a, const T &b);</tt> or <tt>bool f(const T& a, const T& b, unsigned int b_index);</tt>.
/// The signature does not need to have <tt>const &</tt>, but function object
/// must not modify the objects passed to it.
/// \param [in] storage - reference to a temporary storage object of type storage_type.
///
/// \par Storage reuse
/// Synchronization barrier should be placed before \p storage is reused
/// or repurposed: \p __syncthreads() or \p rocprim::syncthreads().
///
/// \par Example.
/// \code{.cpp}
/// __global__ void example_kernel(...)
/// {
/// // specialize discontinuity for int and a block of 128 threads
/// using block_adjacent_difference_int = rocprim::block_adjacent_difference<int, 128>;
/// // allocate storage in shared memory
/// __shared__ block_adjacent_difference_int::storage_type storage;
///
/// // segment of consecutive items to be used
/// int input[8];
/// int tile_item = 0;
/// if (threadIdx.x == 0)
/// {
/// tile_item = ...
/// }
/// ...
/// int tail_flags[8];
/// block_adjacent_difference_int b_discontinuity;
/// using flag_op_type = typename rocprim::greater<int>;
/// b_discontinuity.flag_tails(tail_flags, tile_item, input, flag_op_type(),
/// storage);
/// ...
/// }
/// \endcode
template<unsigned int ItemsPerThread, class Flag, class FlagOp>
// The two adjacent literals are concatenated into a single message, so the
// first one ends with a space to keep the sentences separated.
[[deprecated("The flags API of block_adjacent_difference is deprecated. "
             "Use subtract_right or block_discontinuity.flag_tails instead.")]]
ROCPRIM_DEVICE ROCPRIM_INLINE
void flag_tails(Flag (&tail_flags)[ItemsPerThread],
                T tile_successor_item,
                const T (&input)[ItemsPerThread],
                FlagOp flag_op,
                storage_type& storage)
{
    // Compile-time switches forwarded to base_type::apply_right.
    // with_successor is enabled because the caller supplies tile_successor_item.
    static constexpr auto as_flags       = true;
    static constexpr auto reversed       = true;
    static constexpr auto with_successor = true;

    // The tail pass works on the `right` member of the temporary storage.
    base_type::template apply_right<as_flags, reversed, with_successor>(
        input, tail_flags, flag_op, tile_successor_item, storage.get().right);
}
/// \overload
/// \deprecated The flags API of block_adjacent_difference is deprecated,
/// use subtract_right() or block_discontinuity::flag_tails() instead.
///
/// This overload does not accept a reference to temporary storage, instead it is declared as
/// part of the function itself. Note that this does NOT decrease the shared memory requirements
/// of a kernel using this function.
template<unsigned int ItemsPerThread, class Flag, class FlagOp>
// The two adjacent literals are concatenated into a single message, so the
// first one ends with a space to keep the sentences separated.
[[deprecated("The flags API of block_adjacent_difference is deprecated. "
             "Use subtract_right or block_discontinuity.flag_tails instead.")]]
ROCPRIM_DEVICE ROCPRIM_FORCE_INLINE
void flag_tails(Flag (&tail_flags)[ItemsPerThread],
                T tile_successor_item,
                const T (&input)[ItemsPerThread],
                FlagOp flag_op)
{
    // Declare the temporary storage locally (ROCPRIM_SHARED_MEMORY) and forward
    // to the overload that takes the storage explicitly.
    ROCPRIM_SHARED_MEMORY storage_type storage;
    flag_tails(tail_flags, tile_successor_item, input, flag_op, storage);
}
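/// \brief Tags both \p head_flags and \p tail_flags that indicate discontinuities
/// between items partitioned across the thread block.
/// \deprecated The flags API of block_adjacent_difference is deprecated,
/// use block_discontinuity::flag_heads_and_tails() instead.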
///
/// \tparam ItemsPerThread - [inferred] the number of items to be processed by
/// each thread.
/// \tparam Flag - [inferred] the flag type.
/// \tparam FlagOp - [inferred] type of binary function used for flagging.
///
/// \param [out] head_flags - array that contains the head flags.
/// \param [out] tail_flags - array that contains the tail flags.
/// \param [in] input - array that data is loaded from.
/// \param [in] flag_op - binary operation function object that will be used for flagging.
/// The signature of the function should be equivalent to the following:
/// <tt>bool f(const T &a, const T &b);</tt> or <tt>bool f(const T& a, const T& b, unsigned int b_index);</tt>.
/// The signature does not need to have <tt>const &</tt>, but function object
/// must not modify the objects passed to it.
/// \param [in] storage - reference to a temporary storage object of type storage_type.
///
/// \par Storage reuse
/// Synchronization barrier should be placed before \p storage is reused
/// or repurposed: \p __syncthreads() or \p rocprim::syncthreads().
///
/// \par Example.
/// \code{.cpp}
/// __global__ void example_kernel(...)
/// {
/// // specialize discontinuity for int and a block of 128 threads
/// using block_adjacent_difference_int = rocprim::block_adjacent_difference<int, 128>;
/// // allocate storage in shared memory
/// __shared__ block_adjacent_difference_int::storage_type storage;
///
/// // segment of consecutive items to be used
/// int input[8];
/// ...
/// int head_flags[8];
/// int tail_flags[8];
/// block_adjacent_difference_int b_discontinuity;
/// using flag_op_type = typename rocprim::greater<int>;
/// b_discontinuity.flag_heads_and_tails(head_flags, tail_flags, input,
/// flag_op_type(), storage);
/// ...
/// }
/// \endcode
template<unsigned int ItemsPerThread, class Flag, class FlagOp>
// The two adjacent literals are concatenated into a single message, so the
// first one ends with a space to keep the sentences separated.
[[deprecated("The flags API of block_adjacent_difference is deprecated. "
             "Use block_discontinuity.flag_heads_and_tails instead.")]]
ROCPRIM_DEVICE ROCPRIM_INLINE
void flag_heads_and_tails(Flag (&head_flags)[ItemsPerThread],
                          Flag (&tail_flags)[ItemsPerThread],
                          const T (&input)[ItemsPerThread],
                          FlagOp flag_op,
                          storage_type& storage)
{
    // Compile-time switches forwarded to base_type::apply_left / apply_right.
    // Neither a tile predecessor nor a tile successor is supplied to this overload.
    static constexpr auto as_flags         = true;
    static constexpr auto reversed         = true;
    static constexpr auto with_predecessor = false;
    static constexpr auto with_successor   = false;

    // Copy items in case head_flags is aliased with input
    T items[ItemsPerThread];
    ROCPRIM_UNROLL
    for(unsigned int i = 0; i < ItemsPerThread; ++i)
    {
        items[i] = input[i];
    }

    // items[0] only fills the predecessor/successor parameter slots; both
    // with_predecessor and with_successor are false here.
    // The head pass uses the `left` storage member, the tail pass the `right` one.
    base_type::template apply_left<as_flags, reversed, with_predecessor>(
        items, head_flags, flag_op, items[0] /*predecessor*/, storage.get().left);
    base_type::template apply_right<as_flags, reversed, with_successor>(
        items, tail_flags, flag_op, items[0] /*successor*/, storage.get().right);
}
/// \overload
/// \deprecated The flags API of block_adjacent_difference is deprecated,
/// use block_discontinuity::flag_heads_and_tails() instead.
///
/// This overload does not accept a reference to temporary storage, instead it is declared as
/// part of the function itself. Note that this does NOT decrease the shared memory requirements
/// of a kernel using this function.
template<unsigned int ItemsPerThread, class Flag, class FlagOp>
// The two adjacent literals are concatenated into a single message, so the
// first one ends with a space to keep the sentences separated.
[[deprecated("The flags API of block_adjacent_difference is deprecated. "
             "Use block_discontinuity.flag_heads_and_tails instead.")]]
ROCPRIM_DEVICE ROCPRIM_FORCE_INLINE
void flag_heads_and_tails(Flag (&head_flags)[ItemsPerThread],
                          Flag (&tail_flags)[ItemsPerThread],
                          const T (&input)[ItemsPerThread],
                          FlagOp flag_op)
{
    // Declare the temporary storage locally (ROCPRIM_SHARED_MEMORY) and forward
    // to the overload that takes the storage explicitly.
    ROCPRIM_SHARED_MEMORY storage_type storage;
    flag_heads_and_tails(head_flags, tail_flags, input, flag_op, storage);
}
/// \brief Tags both \p head_flags and \p tail_flags that indicate discontinuities
/// between items partitioned across the thread block, where the last item of the
/// last thread is compared against a \p tile_successor_item.
/// \deprecated The flags API of block_adjacent_difference is deprecated,
/// use block_discontinuity::flag_heads_and_tails() instead.
///
/// \tparam ItemsPerThread - [inferred] the number of items to be processed by
/// each thread.
/// \tparam Flag - [inferred] the flag type.
/// \tparam FlagOp - [inferred] type of binary function used for flagging.
///
/// \param [out] head_flags - array that contains the head flags.
/// \param [out] tail_flags - array that contains the tail flags.
/// \param [in] tile_successor_item - last tile item from thread to be compared
/// against.
/// \param [in] input - array that data is loaded from.
/// \param [in] flag_op - binary operation function object that will be used for flagging.
/// The signature of the function should be equivalent to the following:
/// <tt>bool f(const T &a, const T &b);</tt> or <tt>bool f(const T& a, const T& b, unsigned int b_index);</tt>.
/// The signature does not need to have <tt>const &</tt>, but function object
/// must not modify the objects passed to it.
/// \param [in] storage - reference to a temporary storage object of type storage_type.
///
/// \par Storage reuse
/// Synchronization barrier should be placed before \p storage is reused
/// or repurposed: \p __syncthreads() or \p rocprim::syncthreads().
///
/// \par Example.
/// \code{.cpp}
/// __global__ void example_kernel(...)
/// {
/// // specialize discontinuity for int and a block of 128 threads
/// using block_adjacent_difference_int = rocprim::block_adjacent_difference<int, 128>;
/// // allocate storage in shared memory
/// __shared__ block_adjacent_difference_int::storage_type storage;
///
/// // segment of consecutive items to be used
/// int input[8];
/// int tile_item = 0;
/// if (threadIdx.x == 0)
/// {
/// tile_item = ...
/// }
/// ...
/// int head_flags[8];
/// int tail_flags[8];
/// block_adjacent_difference_int b_discontinuity;
/// using flag_op_type = typename rocprim::greater<int>;
/// b_discontinuity.flag_heads_and_tails(head_flags, tail_flags, tile_item,
/// input, flag_op_type(),
/// storage);
/// ...
/// }
/// \endcode
template<unsigned int ItemsPerThread, class Flag, class FlagOp>
// The two adjacent literals are concatenated into a single message, so the
// first one ends with a space to keep the sentences separated.
[[deprecated("The flags API of block_adjacent_difference is deprecated. "
             "Use block_discontinuity.flag_heads_and_tails instead.")]]
ROCPRIM_DEVICE ROCPRIM_INLINE
void flag_heads_and_tails(Flag (&head_flags)[ItemsPerThread],
                          Flag (&tail_flags)[ItemsPerThread],
                          T tile_successor_item,
                          const T (&input)[ItemsPerThread],
                          FlagOp flag_op,
                          storage_type& storage)
{
    // Compile-time switches forwarded to base_type::apply_left / apply_right.
    // Only the tile successor is supplied to this overload.
    static constexpr auto as_flags         = true;
    static constexpr auto reversed         = true;
    static constexpr auto with_predecessor = false;
    static constexpr auto with_successor   = true;

    // Copy items in case head_flags is aliased with input
    T items[ItemsPerThread];
    ROCPRIM_UNROLL
    for(unsigned int i = 0; i < ItemsPerThread; ++i)
    {
        items[i] = input[i];
    }

    // items[0] only fills the predecessor parameter slot (with_predecessor is false).
    // The head pass uses the `left` storage member, the tail pass the `right` one.
    base_type::template apply_left<as_flags, reversed, with_predecessor>(
        items, head_flags, flag_op, items[0] /*predecessor*/, storage.get().left);
    base_type::template apply_right<as_flags, reversed, with_successor>(
        items, tail_flags, flag_op, tile_successor_item, storage.get().right);
}
/// \overload
/// \deprecated The flags API of block_adjacent_difference is deprecated,
/// use block_discontinuity::flag_heads_and_tails() instead.
///
/// This overload does not accept a reference to temporary storage, instead it is declared as
/// part of the function itself. Note that this does NOT decrease the shared memory requirements
/// of a kernel using this function.
template<unsigned int ItemsPerThread, class Flag, class FlagOp>
// The two adjacent literals are concatenated into a single message, so the
// first one ends with a space to keep the sentences separated.
[[deprecated("The flags API of block_adjacent_difference is deprecated. "
             "Use block_discontinuity.flag_heads_and_tails instead.")]]
ROCPRIM_DEVICE ROCPRIM_FORCE_INLINE
void flag_heads_and_tails(Flag (&head_flags)[ItemsPerThread],
                          Flag (&tail_flags)[ItemsPerThread],
                          T tile_successor_item,
                          const T (&input)[ItemsPerThread],
                          FlagOp flag_op)
{
    // Declare the temporary storage locally (ROCPRIM_SHARED_MEMORY) and forward
    // to the overload that takes the storage explicitly.
    ROCPRIM_SHARED_MEMORY storage_type storage;
    flag_heads_and_tails(head_flags, tail_flags, tile_successor_item, input, flag_op, storage);
}
/// \brief Tags both \p head_flags and \p tail_flags that indicate discontinuities
/// between items partitioned across the thread block, where the first item of the
/// first thread is compared against a \p tile_predecessor_item.
/// \deprecated The flags API of block_adjacent_difference is deprecated,
/// use block_discontinuity::flag_heads_and_tails() instead.
///
/// \tparam ItemsPerThread - [inferred] the number of items to be processed by
/// each thread.
/// \tparam Flag - [inferred] the flag type.
/// \tparam FlagOp - [inferred] type of binary function used for flagging.
///
/// \param [out] head_flags - array that contains the head flags.
/// \param [in] tile_predecessor_item - first tile item from thread to be compared
/// against.
/// \param [out] tail_flags - array that contains the tail flags.
/// \param [in] input - array that data is loaded from.
/// \param [in] flag_op - binary operation function object that will be used for flagging.
/// The signature of the function should be equivalent to the following:
/// <tt>bool f(const T &a, const T &b);</tt> or <tt>bool f(const T& a, const T& b, unsigned int b_index);</tt>.
/// The signature does not need to have <tt>const &</tt>, but function object
/// must not modify the objects passed to it.
/// \param [in] storage - reference to a temporary storage object of type storage_type.
///
/// \par Storage reuse
/// Synchronization barrier should be placed before \p storage is reused
/// or repurposed: \p __syncthreads() or \p rocprim::syncthreads().
///
/// \par Example.
/// \code{.cpp}
/// __global__ void example_kernel(...)
/// {
/// // specialize discontinuity for int and a block of 128 threads
/// using block_adjacent_difference_int = rocprim::block_adjacent_difference<int, 128>;
/// // allocate storage in shared memory
/// __shared__ block_adjacent_difference_int::storage_type storage;
///
/// // segment of consecutive items to be used
/// int input[8];
/// int tile_item = 0;
/// if (threadIdx.x == 0)
/// {
/// tile_item = ...
/// }
/// ...
/// int head_flags[8];
/// int tail_flags[8];
/// block_adjacent_difference_int b_discontinuity;
/// using flag_op_type = typename rocprim::greater<int>;
/// b_discontinuity.flag_heads_and_tails(head_flags, tile_item, tail_flags,
/// input, flag_op_type(),
/// storage);
/// ...
/// }
/// \endcode
template<unsigned int ItemsPerThread, class Flag, class FlagOp>
// The two adjacent literals are concatenated into a single message, so the
// first one ends with a space to keep the sentences separated.
[[deprecated("The flags API of block_adjacent_difference is deprecated. "
             "Use block_discontinuity.flag_heads_and_tails instead.")]]
ROCPRIM_DEVICE ROCPRIM_INLINE
void flag_heads_and_tails(Flag (&head_flags)[ItemsPerThread],
                          T tile_predecessor_item,
                          Flag (&tail_flags)[ItemsPerThread],
                          const T (&input)[ItemsPerThread],
                          FlagOp flag_op,
                          storage_type& storage)
{
    // Compile-time switches forwarded to base_type::apply_left / apply_right.
    // Only the tile predecessor is supplied to this overload.
    static constexpr auto as_flags         = true;
    static constexpr auto reversed         = true;
    static constexpr auto with_predecessor = true;
    static constexpr auto with_successor   = false;

    // Copy items in case head_flags is aliased with input
    T items[ItemsPerThread];
    ROCPRIM_UNROLL
    for(unsigned int i = 0; i < ItemsPerThread; ++i)
    {
        items[i] = input[i];
    }

    // items[0] only fills the successor parameter slot (with_successor is false).
    // The head pass uses the `left` storage member, the tail pass the `right` one.
    base_type::template apply_left<as_flags, reversed, with_predecessor>(
        items, head_flags, flag_op, tile_predecessor_item, storage.get().left);
    base_type::template apply_right<as_flags, reversed, with_successor>(
        items, tail_flags, flag_op, items[0] /*successor*/, storage.get().right);
}
/// \overload
/// \deprecated The flags API of block_adjacent_difference is deprecated,
/// use block_discontinuity::flag_heads_and_tails() instead.
///
/// This overload does not accept a reference to temporary storage, instead it is declared as
/// part of the function itself. Note that this does NOT decrease the shared memory requirements
/// of a kernel using this function.
template<unsigned int ItemsPerThread, class Flag, class FlagOp>
// The two adjacent literals are concatenated into a single message, so the
// first one ends with a space to keep the sentences separated.
[[deprecated("The flags API of block_adjacent_difference is deprecated. "
             "Use block_discontinuity.flag_heads_and_tails instead.")]]
ROCPRIM_DEVICE ROCPRIM_FORCE_INLINE
void flag_heads_and_tails(Flag (&head_flags)[ItemsPerThread],
                          T tile_predecessor_item,
                          Flag (&tail_flags)[ItemsPerThread],
                          const T (&input)[ItemsPerThread],
                          FlagOp flag_op)
{
    // Declare the temporary storage locally (ROCPRIM_SHARED_MEMORY) and forward
    // to the overload that takes the storage explicitly.
    ROCPRIM_SHARED_MEMORY storage_type storage;
    flag_heads_and_tails(head_flags, tile_predecessor_item, tail_flags, input, flag_op, storage);
}
/// \brief Tags both \p head_flags and \p tail_flags that indicate discontinuities
/// between items partitioned across the thread block, where the first item of the
/// first thread is compared against a \p tile_predecessor_item and the last item of
/// the last thread is compared against a \p tile_successor_item.
/// \deprecated The flags API of block_adjacent_difference is deprecated,
/// use block_discontinuity::flag_heads_and_tails() instead.
///
/// \tparam ItemsPerThread - [inferred] the number of items to be processed by
/// each thread.
/// \tparam Flag - [inferred] the flag type.
/// \tparam FlagOp - [inferred] type of binary function used for flagging.
///
/// \param [out] head_flags - array that contains the head flags.
/// \param [in] tile_predecessor_item - first tile item from thread to be compared
/// against.
/// \param [out] tail_flags - array that contains the tail flags.
/// \param [in] tile_successor_item - last tile item from thread to be compared
/// against.
/// \param [in] input - array that data is loaded from.
/// \param [in] flag_op - binary operation function object that will be used for flagging.
/// The signature of the function should be equivalent to the following:
/// <tt>bool f(const T &a, const T &b);</tt> or <tt>bool f(const T& a, const T& b, unsigned int b_index);</tt>.
/// The signature does not need to have <tt>const &</tt>, but function object
/// must not modify the objects passed to it.
/// \param [in] storage - reference to a temporary storage object of type storage_type.
///
/// \par Storage reuse
/// Synchronization barrier should be placed before \p storage is reused
/// or repurposed: \p __syncthreads() or \p rocprim::syncthreads().
///
/// \par Example.
/// \code{.cpp}
/// __global__ void example_kernel(...)
/// {
/// // specialize discontinuity for int and a block of 128 threads
/// using block_adjacent_difference_int = rocprim::block_adjacent_difference<int, 128>;
/// // allocate storage in shared memory
/// __shared__ block_adjacent_difference_int::storage_type storage;
///
/// // segment of consecutive items to be used
/// int input[8];
/// int tile_predecessor_item = 0;
/// int tile_successor_item = 0;
/// if (threadIdx.x == 0)
/// {
/// tile_predecessor_item = ...
/// tile_successor_item = ...
/// }
/// ...
/// int head_flags[8];
/// int tail_flags[8];
/// block_adjacent_difference_int b_discontinuity;
/// using flag_op_type = typename rocprim::greater<int>;
/// b_discontinuity.flag_heads_and_tails(head_flags, tile_predecessor_item,
/// tail_flags, tile_successor_item,
/// input, flag_op_type(),
/// storage);
/// ...
/// }
/// \endcode
template<unsigned int ItemsPerThread, class Flag, class FlagOp>
// The two adjacent literals are concatenated into a single message, so the
// first one ends with a space to keep the sentences separated.
[[deprecated("The flags API of block_adjacent_difference is deprecated. "
             "Use block_discontinuity.flag_heads_and_tails instead.")]]
ROCPRIM_DEVICE ROCPRIM_INLINE
void flag_heads_and_tails(Flag (&head_flags)[ItemsPerThread],
                          T tile_predecessor_item,
                          Flag (&tail_flags)[ItemsPerThread],
                          T tile_successor_item,
                          const T (&input)[ItemsPerThread],
                          FlagOp flag_op,
                          storage_type& storage)
{
    // Compile-time switches forwarded to base_type::apply_left / apply_right.
    // Both the tile predecessor and the tile successor are supplied by the caller.
    static constexpr auto as_flags         = true;
    static constexpr auto reversed         = true;
    static constexpr auto with_predecessor = true;
    static constexpr auto with_successor   = true;

    // Copy items in case head_flags is aliased with input
    T items[ItemsPerThread];
    ROCPRIM_UNROLL
    for(unsigned int i = 0; i < ItemsPerThread; ++i)
    {
        items[i] = input[i];
    }

    // The head pass uses the `left` storage member, the tail pass the `right` one.
    base_type::template apply_left<as_flags, reversed, with_predecessor>(
        items, head_flags, flag_op, tile_predecessor_item, storage.get().left);
    base_type::template apply_right<as_flags, reversed, with_successor>(
        items, tail_flags, flag_op, tile_successor_item, storage.get().right);
}
/// \overload
/// \deprecated The flags API of block_adjacent_difference is deprecated,
/// use block_discontinuity::flag_heads_and_tails() instead.
///
/// This overload does not accept a reference to temporary storage, instead it is declared as
/// part of the function itself. Note that this does NOT decrease the shared memory requirements
/// of a kernel using this function.
template<unsigned int ItemsPerThread, class Flag, class FlagOp>
// The two adjacent literals are concatenated into a single message, so the
// first one ends with a space to keep the sentences separated.
[[deprecated("The flags API of block_adjacent_difference is deprecated. "
             "Use block_discontinuity.flag_heads_and_tails instead.")]]
ROCPRIM_DEVICE ROCPRIM_FORCE_INLINE
void flag_heads_and_tails(Flag (&head_flags)[ItemsPerThread],
                          T tile_predecessor_item,
                          Flag (&tail_flags)[ItemsPerThread],
                          T tile_successor_item,
                          const T (&input)[ItemsPerThread],
                          FlagOp flag_op)
{
    // Declare the temporary storage locally (ROCPRIM_SHARED_MEMORY) and forward
    // to the overload that takes the storage explicitly.
    ROCPRIM_SHARED_MEMORY storage_type storage;
    flag_heads_and_tails(
        head_flags, tile_predecessor_item, tail_flags, tile_successor_item,
        input, flag_op, storage
    );
}
/// \brief Apply a function to each consecutive pair of elements partitioned across threads in
/// the block and write the output to the position of the left item.
///
/// The first item in the first thread is copied from the input then for the rest the following
/// code applies.
/// \code
/// // For each i in [1, block_size * ItemsPerThread) across threads in a block
/// output[i] = op(input[i], input[i-1]);
/// \endcode
///
/// \tparam Output - [inferred] the type of output, must be assignable from the result of `op`
/// \tparam ItemsPerThread - [inferred] the number of items processed by each thread
/// \tparam BinaryFunction - [inferred] the type of the function to apply
/// \param [in] input - array that data is loaded from partitioned across the threads in the block
/// \param [out] output - array where the result of function application will be written to
/// \param [in] op - binary function applied to the items.
/// The signature of the function should be equivalent to the following:
/// `R f(const T &a, const T &b)`, where `R` is assignable to `Output`. The signature does not need to have
/// `const &` but the function object must not modify the objects passed to it.
/// \param storage reference to a temporary storage object of type #storage_type
/// \par Storage reuse
/// Synchronization barrier should be placed before `storage` is reused
/// or repurposed: `__syncthreads()` or \link syncthreads() rocprim::syncthreads() \endlink.
template <typename Output, unsigned int ItemsPerThread, typename BinaryFunction>
ROCPRIM_DEVICE ROCPRIM_INLINE void subtract_left(const T (&input)[ItemsPerThread],
                                                 Output (&output)[ItemsPerThread],
                                                 const BinaryFunction op,
                                                 storage_type& storage)
{
    // Template switches for base_type::apply_left: results of `op` are written
    // out (no flag mode) and no tile predecessor is supplied.
    constexpr bool flag_mode       = false;
    constexpr bool is_reversed     = true;
    constexpr bool has_predecessor = false;

    // input[0] only fills the predecessor parameter slot of apply_left.
    base_type::template apply_left<flag_mode, is_reversed, has_predecessor>(
        input,
        output,
        op,
        input[0],
        storage.get().left);
}
/// \brief Apply a function to each consecutive pair of elements partitioned across threads in
/// the block and write the output to the position of the left item, with an explicit item before
/// the tile.
///
/// \code
/// // For the first item on the first thread use the tile predecessor
/// output[0] = op(input[0], tile_predecessor)
/// // For other items, i in [1, block_size * ItemsPerThread) across threads in a block
/// output[i] = op(input[i], input[i-1]);
/// \endcode
///
/// \tparam Output - [inferred] the type of output, must be assignable from the result of `op`
/// \tparam ItemsPerThread - [inferred] the number of items processed by each thread
/// \tparam BinaryFunction - [inferred] the type of the function to apply
/// \param [in] input - array that data is loaded from partitioned across the threads in the block
/// \param [out] output - array where the result of function application will be written to
/// \param [in] op - binary function applied to the items.
/// The signature of the function should be equivalent to the following:
/// `R f(const T &a, const T &b)`, where `R` is assignable to `Output`. The signature does not need to have
/// `const &` but the function object must not modify the objects passed to it.
/// \param [in] tile_predecessor - the item before the tile, will be used as the input
/// of the first application of `op`
/// \param storage - reference to a temporary storage object of type #storage_type
/// \par Storage reuse
/// Synchronization barrier should be placed before `storage` is reused
/// or repurposed: `__syncthreads()` or \link syncthreads() rocprim::syncthreads() \endlink.
template <typename Output, unsigned int ItemsPerThread, typename BinaryFunction>
ROCPRIM_DEVICE ROCPRIM_INLINE void subtract_left(const T (&input)[ItemsPerThread],
                                                 Output (&output)[ItemsPerThread],
                                                 const BinaryFunction op,
                                                 const T tile_predecessor,
                                                 storage_type& storage)
{
    // Template switches for base_type::apply_left: results of `op` are written
    // out (no flag mode) and the caller-provided tile predecessor is enabled.
    constexpr bool flag_mode       = false;
    constexpr bool is_reversed     = true;
    constexpr bool has_predecessor = true;

    base_type::template apply_left<flag_mode, is_reversed, has_predecessor>(
        input,
        output,
        op,
        tile_predecessor,
        storage.get().left);
}
/// \brief Apply a function to each consecutive pair of elements partitioned across threads in
/// the block and write the output to the position of the left item, in a partial tile.
///
/// \code
/// output[0] = input[0]
/// // For each item i in [1, valid_items) across threads in a block
/// output[i] = op(input[i], input[i-1]);
/// // Just copy "invalid" items in [valid_items, block_size * ItemsPerThread)
/// output[i] = input[i]
/// \endcode
///
/// \tparam Output - [inferred] the type of output, must be assignable from the result of `op`
/// \tparam ItemsPerThread - [inferred] the number of items processed by each thread
/// \tparam BinaryFunction - [inferred] the type of the function to apply
/// \param [in] input - array that data is loaded from partitioned across the threads in the block
/// \param [out] output - array where the result of function application will be written to
/// \param [in] op - binary function applied to the items.
/// The signature of the function should be equivalent to the following:
/// `R f(const T &a, const T &b)`, where `R` is assignable to `Output`. The signature does not need to have
/// `const &` but the function object must not modify the objects passed to it.
/// \param [in] valid_items - number of items in the block which are considered "valid" and will
/// be used. Must be less than or equal to `BlockSize` * `ItemsPerThread`
/// \param storage - reference to a temporary storage object of type #storage_type
/// \par Storage reuse
/// Synchronization barrier should be placed before `storage` is reused
/// or repurposed: `__syncthreads()` or \link syncthreads() rocprim::syncthreads() \endlink.
template <typename Output, unsigned int ItemsPerThread, typename BinaryFunction>
ROCPRIM_DEVICE ROCPRIM_INLINE void subtract_left_partial(const T (&input)[ItemsPerThread],
                                                         Output (&output)[ItemsPerThread],
                                                         const BinaryFunction op,
                                                         const unsigned int valid_items,
                                                         storage_type& storage)
{
    // Template switches for base_type::apply_left_partial: results of `op` are
    // written out (no flag mode) and no tile predecessor is supplied.
    constexpr bool flag_mode       = false;
    constexpr bool is_reversed     = true;
    constexpr bool has_predecessor = false;

    // input[0] only fills the predecessor parameter slot of apply_left_partial.
    base_type::template apply_left_partial<flag_mode, is_reversed, has_predecessor>(
        input,
        output,
        op,
        input[0],
        valid_items,
        storage.get().left);
}
/// \brief Apply a function to each consecutive pair of elements partitioned across threads in
/// the block and write the output to the position of the left item, in a partial tile with a
/// predecessor.
///
/// This combines subtract_left_partial() with a tile predecessor.
/// \tparam Output - [inferred] the type of output, must be assignable from the result of `op`
/// \tparam ItemsPerThread - [inferred] the number of items processed by each thread
/// \tparam BinaryFunction - [inferred] the type of the function to apply
/// \param [in] input - array that data is loaded from partitioned across the threads in the block
/// \param [out] output - array where the result of function application will be written to
/// \param [in] op - binary function applied to the items.
/// The signature of the function should be equivalent to the following:
/// `bool f(const T &a, const T &b)` The signature does not need to have
/// `const &` but the function object must not modify the objects passed to it.
/// \param [in] tile_predecessor - the item before the tile, will be used as the input
/// of the first application of `op`
/// \param [in] valid_items - number of items in the block which are considered "valid" and will
/// be used. Must be less or equal to `BlockSize` * `ItemsPerThread`
/// \param storage - reference to a temporary storage object of type #storage_type
/// \par Storage reuse
/// Synchronization barrier should be placed before `storage` is reused
/// or repurposed: `__syncthreads()` or \link syncthreads() rocprim::syncthreads() \endlink.
template <typename Output, unsigned int ItemsPerThread, typename BinaryFunction>
ROCPRIM_DEVICE ROCPRIM_INLINE void subtract_left_partial(const T (&input)[ItemsPerThread],
                                                         Output (&output)[ItemsPerThread],
                                                         const BinaryFunction op,
                                                         const T              tile_predecessor,
                                                         const unsigned int   valid_items,
                                                         storage_type&        storage)
{
    // Partial-tile variant with an explicit tile predecessor. The base implementation is
    // instantiated with as_flags = false, reversed = true and with_predecessor = true, and
    // the caller's tile_predecessor is forwarded unchanged.
    base_type::template apply_left_partial</* as_flags */ false,
                                           /* reversed */ true,
                                           /* with_predecessor */ true>(
        input, output, op, tile_predecessor, valid_items, storage.get().left);
}
/// \brief Apply a function to each consecutive pair of elements partitioned across threads in
/// the block and write the output to the position of the right item.
///
/// The last item in the last thread is copied from the input then for the rest the following
/// code applies.
/// \code
/// // For each i in [0, block_size * ItemsPerThread - 1) across threads in a block
/// output[i] = op(input[i], input[i+1]);
/// \endcode
///
/// \tparam Output - [inferred] the type of output, must be assignable from the result of `op`
/// \tparam ItemsPerThread - [inferred] the number of items processed by each thread
/// \tparam BinaryFunction - [inferred] the type of the function to apply
/// \param [in] input - array that data is loaded from partitioned across the threads in the block
/// \param [out] output - array where the result of function application will be written to
/// \param [in] op - binary function applied to the items.
/// The signature of the function should be equivalent to the following:
/// `bool f(const T &a, const T &b)` The signature does not need to have
/// `const &` but the function object must not modify the objects passed to it.
/// \param storage - reference to a temporary storage object of type #storage_type
/// \par Storage reuse
/// Synchronization barrier should be placed before `storage` is reused
/// or repurposed: `__syncthreads()` or \link syncthreads() rocprim::syncthreads() \endlink.
template <typename Output, unsigned int ItemsPerThread, typename BinaryFunction>
ROCPRIM_DEVICE ROCPRIM_INLINE void subtract_right(const T (&input)[ItemsPerThread],
                                                  Output (&output)[ItemsPerThread],
                                                  const BinaryFunction op,
                                                  storage_type&        storage)
{
    // Full-tile variant without a caller-supplied successor. The base implementation is
    // instantiated with as_flags = false, reversed = false and with_successor = false;
    // input[0] only fills the successor argument slot.
    base_type::template apply_right</* as_flags */ false,
                                    /* reversed */ false,
                                    /* with_successor */ false>(
        input, output, op, input[0], storage.get().right);
}
/// \brief Apply a function to each consecutive pair of elements partitioned across threads in
/// the block and write the output to the position of the right item, with an explicit item after
/// the tile.
///
/// \code
/// // For each items i in [0, block_size * ItemsPerThread - 1) across threads in a block
/// output[i] = op(input[i], input[i+1]);
/// // For the last item on the last thread use the tile successor
/// output[block_size * ItemsPerThread - 1] =
/// op(input[block_size * ItemsPerThread - 1], tile_successor)
/// \endcode
///
/// \tparam Output - [inferred] the type of output, must be assignable from the result of `op`
/// \tparam ItemsPerThread - [inferred] the number of items processed by each thread
/// \tparam BinaryFunction - [inferred] the type of the function to apply
/// \param [in] input - array that data is loaded from partitioned across the threads in the block
/// \param [out] output - array where the result of function application will be written to
/// \param [in] op - binary function applied to the items.
/// The signature of the function should be equivalent to the following:
/// `bool f(const T &a, const T &b)` The signature does not need to have
/// `const &` but the function object must not modify the objects passed to it.
/// \param [in] tile_successor - the item after the tile, will be used as the input
/// of the last application of `op`
/// \param storage - reference to a temporary storage object of type #storage_type
/// \par Storage reuse
/// Synchronization barrier should be placed before `storage` is reused
/// or repurposed: `__syncthreads()` or \link syncthreads() rocprim::syncthreads() \endlink.
template <typename Output, unsigned int ItemsPerThread, typename BinaryFunction>
ROCPRIM_DEVICE ROCPRIM_INLINE void subtract_right(const T (&input)[ItemsPerThread],
                                                  Output (&output)[ItemsPerThread],
                                                  const BinaryFunction op,
                                                  const T              tile_successor,
                                                  storage_type&        storage)
{
    // Full-tile variant with an explicit tile successor. The base implementation is
    // instantiated with as_flags = false, reversed = false and with_successor = true, and
    // the caller's tile_successor is forwarded unchanged.
    base_type::template apply_right</* as_flags */ false,
                                    /* reversed */ false,
                                    /* with_successor */ true>(
        input, output, op, tile_successor, storage.get().right);
}
/// \brief Apply a function to each consecutive pair of elements partitioned across threads in
/// the block and write the output to the position of the right item, in a partial tile.
///
/// \code
/// // For each item i in [0, valid_items) across threads in a block
/// output[i] = op(input[i], input[i + 1]);
/// // Just copy "invalid" items in [valid_items, block_size * ItemsPerThread)
/// output[i] = input[i]
/// \endcode
///
/// \tparam Output - [inferred] the type of output, must be assignable from the result of `op`
/// \tparam ItemsPerThread - [inferred] the number of items processed by each thread
/// \tparam BinaryFunction - [inferred] the type of the function to apply
/// \param [in] input - array that data is loaded from partitioned across the threads in the block
/// \param [out] output - array where the result of function application will be written to
/// \param [in] op - binary function applied to the items.
/// The signature of the function should be equivalent to the following:
/// `bool f(const T &a, const T &b)` The signature does not need to have
/// `const &` but the function object must not modify the objects passed to it.
/// \param [in] valid_items - number of items in the block which are considered "valid" and will
/// be used. Must be less or equal to `BlockSize` * `ItemsPerThread`
/// \param storage - reference to a temporary storage object of type #storage_type
/// \par Storage reuse
/// Synchronization barrier should be placed before `storage` is reused
/// or repurposed: `__syncthreads()` or \link syncthreads() rocprim::syncthreads() \endlink.
template <typename Output, unsigned int ItemsPerThread, typename BinaryFunction>
ROCPRIM_DEVICE ROCPRIM_INLINE void subtract_right_partial(const T (&input)[ItemsPerThread],
                                                          Output (&output)[ItemsPerThread],
                                                          const BinaryFunction op,
                                                          const unsigned int   valid_items,
                                                          storage_type&        storage)
{
    // Partial-tile variant: the base implementation is instantiated with as_flags = false
    // and reversed = false, and receives the valid item count together with the "right"
    // half of the temporary storage.
    base_type::template apply_right_partial</* as_flags */ false, /* reversed */ false>(
        input, output, op, valid_items, storage.get().right);
}
};
END_ROCPRIM_NAMESPACE
/// @}
// end of group blockmodule
#endif // ROCPRIM_BLOCK_BLOCK_ADJACENT_DIFFERENCE_HPP_
// Copyright (c) 2017-2022 Advanced Micro Devices, Inc. All rights reserved.
//
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to deal
// in the Software without restriction, including without limitation the rights
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
// copies of the Software, and to permit persons to whom the Software is
// furnished to do so, subject to the following conditions:
//
// The above copyright notice and this permission notice shall be included in
// all copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
// THE SOFTWARE.
#ifndef ROCPRIM_BLOCK_BLOCK_DISCONTINUITY_HPP_
#define ROCPRIM_BLOCK_BLOCK_DISCONTINUITY_HPP_
#include "detail/block_adjacent_difference_impl.hpp"
#include "../config.hpp"
#include "../detail/various.hpp"
/// \addtogroup blockmodule
/// @{
BEGIN_ROCPRIM_NAMESPACE
/// \brief The \p block_discontinuity class is a block level parallel primitive which provides
/// methods for flagging discontinuities within an ordered set of items partitioned across
/// threads in a block.
///
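/// \tparam T - the input type.
/// \tparam BlockSizeX - the number of threads in a block's x dimension.
/// \tparam BlockSizeY - the number of threads in a block's y dimension, defaults to 1.
/// \tparam BlockSizeZ - the number of threads in a block's z dimension, defaults to 1.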
///
/// \par Overview
/// * There are two types of flags:
/// * Head flags.
/// * Tail flags.
/// * The above flags are used to differentiate items from their predecessors or successors.
/// * E.g. Head flags are convenient for differentiating disjoint data segments as part of a
/// segmented reduction/scan.
///
/// \par Examples
/// \parblock
/// In the examples discontinuity operation is performed on block of 128 threads, using type
/// \p int.
///
/// \code{.cpp}
/// __global__ void example_kernel(...)
/// {
/// // specialize discontinuity for int and a block of 128 threads
/// using block_discontinuity_int = rocprim::block_discontinuity<int, 128>;
/// // allocate storage in shared memory
/// __shared__ block_discontinuity_int::storage_type storage;
///
/// // segment of consecutive items to be used
/// int input[8];
/// ...
/// int head_flags[8];
/// block_discontinuity_int b_discontinuity;
/// using flag_op_type = typename rocprim::greater<int>;
/// b_discontinuity.flag_heads(head_flags, input, flag_op_type(), storage);
/// ...
/// }
/// \endcode
/// \endparblock
template<
class T,
unsigned int BlockSizeX,
unsigned int BlockSizeY = 1,
unsigned int BlockSizeZ = 1
>
class block_discontinuity
#ifndef DOXYGEN_SHOULD_SKIP_THIS // hide implementation detail from documentation
: private detail::block_adjacent_difference_impl<T, BlockSizeX, BlockSizeY, BlockSizeZ>
#endif // DOXYGEN_SHOULD_SKIP_THIS
{
using base_type = detail::block_adjacent_difference_impl<T, BlockSizeX, BlockSizeY, BlockSizeZ>;
static constexpr unsigned BlockSize = base_type::BlockSize;
// Struct used for creating a raw_storage object for this primitive's temporary storage.
struct storage_type_
{
// Passed to base_type::apply_left by the head-flag operations.
typename base_type::storage_type left;
// Passed to base_type::apply_right by the tail-flag operations. flag_heads_and_tails
// uses `left` and then `right` back to back within a single call.
typename base_type::storage_type right;
};
public:
/// \brief Struct used to allocate a temporary memory that is required for thread
/// communication during operations provided by related parallel primitive.
///
/// Depending on the implementation, the operations exposed by the parallel primitive may
/// require a temporary storage for thread communication. The storage should be allocated
/// using keywords <tt>__shared__</tt>. It can be aliased to
/// an externally allocated memory, or be a part of a union type with other storage types
/// to increase shared memory reusability.
#ifndef DOXYGEN_SHOULD_SKIP_THIS // hides storage_type implementation for Doxygen
using storage_type = detail::raw_storage<storage_type_>;
#else // DOXYGEN_SHOULD_SKIP_THIS
using storage_type = storage_type_;
#endif // DOXYGEN_SHOULD_SKIP_THIS
/// \brief Tags \p head_flags that indicate discontinuities between items partitioned
/// across the thread block, where the first item has no reference and is always
/// flagged.
///
/// \tparam ItemsPerThread - [inferred] the number of items to be processed by
/// each thread.
/// \tparam Flag - [inferred] the flag type.
/// \tparam FlagOp - [inferred] type of binary function used for flagging.
///
/// \param [out] head_flags - array that contains the head flags.
/// \param [in] input - array that data is loaded from.
/// \param [in] flag_op - binary operation function object that will be used for flagging.
/// The signature of the function should be equivalent to the following:
/// <tt>bool f(const T &a, const T &b);</tt> or <tt>bool f(const T &a, const T &b, unsigned int b_index);</tt>.
/// The signature does not need to have <tt>const &</tt>, but function object
/// must not modify the objects passed to it.
/// \param [in] storage - reference to a temporary storage object of type storage_type.
///
/// \par Storage reusage
/// Synchronization barrier should be placed before \p storage is reused
/// or repurposed: \p __syncthreads() or \p rocprim::syncthreads().
///
/// \par Example.
/// \code{.cpp}
/// __global__ void example_kernel(...)
/// {
/// // specialize discontinuity for int and a block of 128 threads
/// using block_discontinuity_int = rocprim::block_discontinuity<int, 128>;
/// // allocate storage in shared memory
/// __shared__ block_discontinuity_int::storage_type storage;
///
/// // segment of consecutive items to be used
/// int input[8];
/// ...
/// int head_flags[8];
/// block_discontinuity_int b_discontinuity;
/// using flag_op_type = typename rocprim::greater<int>;
/// b_discontinuity.flag_heads(head_flags, input, flag_op_type(), storage);
/// ...
/// }
/// \endcode
template<unsigned int ItemsPerThread, class Flag, class FlagOp>
ROCPRIM_DEVICE ROCPRIM_INLINE
void flag_heads(Flag (&head_flags)[ItemsPerThread],
                const T (&input)[ItemsPerThread],
                FlagOp flag_op,
                storage_type& storage)
{
    // Head flags without a tile predecessor. The base implementation is instantiated with
    // as_flags = true, reversed = false and with_predecessor = false; input[0] only fills
    // the predecessor argument slot.
    base_type::template apply_left</* as_flags */ true,
                                   /* reversed */ false,
                                   /* with_predecessor */ false>(
        input, head_flags, flag_op, input[0], storage.get().left);
}
/// \overload
/// This overload does not take a reference to temporary storage, instead it is declared as
/// part of the function itself. Note that this does NOT decrease the shared memory requirements
/// of a kernel using this function.
template<unsigned int ItemsPerThread, class Flag, class FlagOp>
ROCPRIM_DEVICE ROCPRIM_FORCE_INLINE
void flag_heads(Flag (&head_flags)[ItemsPerThread],
                const T (&input)[ItemsPerThread],
                FlagOp flag_op)
{
    // The temporary storage is declared inside this function instead of being provided
    // by the caller; the actual work is done by the storage-taking overload.
    ROCPRIM_SHARED_MEMORY storage_type internal_storage;
    this->flag_heads(head_flags, input, flag_op, internal_storage);
}
/// \brief Tags \p head_flags that indicate discontinuities between items partitioned
/// across the thread block, where the first item of the first thread is compared against
/// a \p tile_predecessor_item.
///
/// \tparam ItemsPerThread - [inferred] the number of items to be processed by
/// each thread.
/// \tparam Flag - [inferred] the flag type.
/// \tparam FlagOp - [inferred] type of binary function used for flagging.
///
/// \param [out] head_flags - array that contains the head flags.
/// \param [in] tile_predecessor_item - item that precedes the tile; the first item of the
/// first thread is compared against it.
/// \param [in] input - array that data is loaded from.
/// \param [in] flag_op - binary operation function object that will be used for flagging.
/// The signature of the function should be equivalent to the following:
/// <tt>bool f(const T &a, const T &b);</tt> or <tt>bool f(const T &a, const T &b, unsigned int b_index);</tt>.
/// The signature does not need to have <tt>const &</tt>, but function object
/// must not modify the objects passed to it.
/// \param [in] storage - reference to a temporary storage object of type storage_type.
///
/// \par Storage reusage
/// Synchronization barrier should be placed before \p storage is reused
/// or repurposed: \p __syncthreads() or \p rocprim::syncthreads().
///
/// \par Example.
/// \code{.cpp}
/// __global__ void example_kernel(...)
/// {
/// // specialize discontinuity for int and a block of 128 threads
/// using block_discontinuity_int = rocprim::block_discontinuity<int, 128>;
/// // allocate storage in shared memory
/// __shared__ block_discontinuity_int::storage_type storage;
///
/// // segment of consecutive items to be used
/// int input[8];
/// int tile_item = 0;
/// if (threadIdx.x == 0)
/// {
/// tile_item = ...
/// }
/// ...
/// int head_flags[8];
/// block_discontinuity_int b_discontinuity;
/// using flag_op_type = typename rocprim::greater<int>;
/// b_discontinuity.flag_heads(head_flags, tile_item, input, flag_op_type(),
/// storage);
/// ...
/// }
/// \endcode
template<unsigned int ItemsPerThread, class Flag, class FlagOp>
ROCPRIM_DEVICE ROCPRIM_INLINE
void flag_heads(Flag (&head_flags)[ItemsPerThread],
                T tile_predecessor_item,
                const T (&input)[ItemsPerThread],
                FlagOp flag_op,
                storage_type& storage)
{
    // Head flags with an explicit tile predecessor. The base implementation is
    // instantiated with as_flags = true, reversed = false and with_predecessor = true,
    // and the caller's tile_predecessor_item is forwarded unchanged.
    base_type::template apply_left</* as_flags */ true,
                                   /* reversed */ false,
                                   /* with_predecessor */ true>(
        input, head_flags, flag_op, tile_predecessor_item, storage.get().left);
}
/// \overload
/// This overload does not accept a reference to temporary storage, instead it is declared as
/// part of the function itself. Note that this does NOT decrease the shared memory requirements
/// of a kernel using this function.
template<unsigned int ItemsPerThread, class Flag, class FlagOp>
ROCPRIM_DEVICE ROCPRIM_FORCE_INLINE
void flag_heads(Flag (&head_flags)[ItemsPerThread],
                T tile_predecessor_item,
                const T (&input)[ItemsPerThread],
                FlagOp flag_op)
{
    // The temporary storage is declared inside this function instead of being provided
    // by the caller; the actual work is done by the storage-taking overload.
    ROCPRIM_SHARED_MEMORY storage_type internal_storage;
    this->flag_heads(head_flags, tile_predecessor_item, input, flag_op, internal_storage);
}
/// \brief Tags \p tail_flags that indicate discontinuities between items partitioned
/// across the thread block, where the last item has no reference and is always
/// flagged.
///
/// \tparam ItemsPerThread - [inferred] the number of items to be processed by
/// each thread.
/// \tparam Flag - [inferred] the flag type.
/// \tparam FlagOp - [inferred] type of binary function used for flagging.
///
/// \param [out] tail_flags - array that contains the tail flags.
/// \param [in] input - array that data is loaded from.
/// \param [in] flag_op - binary operation function object that will be used for flagging.
/// The signature of the function should be equivalent to the following:
/// <tt>bool f(const T &a, const T &b);</tt> or <tt>bool f(const T &a, const T &b, unsigned int b_index);</tt>.
/// The signature does not need to have <tt>const &</tt>, but function object
/// must not modify the objects passed to it.
/// \param [in] storage - reference to a temporary storage object of type storage_type.
///
/// \par Storage reusage
/// Synchronization barrier should be placed before \p storage is reused
/// or repurposed: \p __syncthreads() or \p rocprim::syncthreads().
///
/// \par Example.
/// \code{.cpp}
/// __global__ void example_kernel(...)
/// {
/// // specialize discontinuity for int and a block of 128 threads
/// using block_discontinuity_int = rocprim::block_discontinuity<int, 128>;
/// // allocate storage in shared memory
/// __shared__ block_discontinuity_int::storage_type storage;
///
/// // segment of consecutive items to be used
/// int input[8];
/// ...
/// int tail_flags[8];
/// block_discontinuity_int b_discontinuity;
/// using flag_op_type = typename rocprim::greater<int>;
/// b_discontinuity.flag_tails(tail_flags, input, flag_op_type(), storage);
/// ...
/// }
/// \endcode
template<unsigned int ItemsPerThread, class Flag, class FlagOp>
ROCPRIM_DEVICE ROCPRIM_INLINE
void flag_tails(Flag (&tail_flags)[ItemsPerThread],
                const T (&input)[ItemsPerThread],
                FlagOp flag_op,
                storage_type& storage)
{
    // Tail flags without a tile successor. The base implementation is instantiated with
    // as_flags = true, reversed = false and with_successor = false; input[0] only fills
    // the successor argument slot.
    base_type::template apply_right</* as_flags */ true,
                                    /* reversed */ false,
                                    /* with_successor */ false>(
        input, tail_flags, flag_op, input[0], storage.get().right);
}
/// \overload
/// This overload does not accept a reference to temporary storage, instead it is declared as
/// part of the function itself. Note that this does NOT decrease the shared memory requirements
/// of a kernel using this function.
template<unsigned int ItemsPerThread, class Flag, class FlagOp>
ROCPRIM_DEVICE ROCPRIM_FORCE_INLINE
void flag_tails(Flag (&tail_flags)[ItemsPerThread],
                const T (&input)[ItemsPerThread],
                FlagOp flag_op)
{
    // The temporary storage is declared inside this function instead of being provided
    // by the caller; the actual work is done by the storage-taking overload.
    ROCPRIM_SHARED_MEMORY storage_type internal_storage;
    this->flag_tails(tail_flags, input, flag_op, internal_storage);
}
/// \brief Tags \p tail_flags that indicate discontinuities between items partitioned
/// across the thread block, where the last item of the last thread is compared against
/// a \p tile_successor_item.
///
/// \tparam ItemsPerThread - [inferred] the number of items to be processed by
/// each thread.
/// \tparam Flag - [inferred] the flag type.
/// \tparam FlagOp - [inferred] type of binary function used for flagging.
///
/// \param [out] tail_flags - array that contains the tail flags.
/// \param [in] tile_successor_item - item that follows the tile; the last item of the
/// last thread is compared against it.
/// \param [in] input - array that data is loaded from.
/// \param [in] flag_op - binary operation function object that will be used for flagging.
/// The signature of the function should be equivalent to the following:
/// <tt>bool f(const T &a, const T &b);</tt> or <tt>bool f(const T &a, const T &b, unsigned int b_index);</tt>.
/// The signature does not need to have <tt>const &</tt>, but function object
/// must not modify the objects passed to it.
/// \param [in] storage - reference to a temporary storage object of type storage_type.
///
/// \par Storage reusage
/// Synchronization barrier should be placed before \p storage is reused
/// or repurposed: \p __syncthreads() or \p rocprim::syncthreads().
///
/// \par Example.
/// \code{.cpp}
/// __global__ void example_kernel(...)
/// {
/// // specialize discontinuity for int and a block of 128 threads
/// using block_discontinuity_int = rocprim::block_discontinuity<int, 128>;
/// // allocate storage in shared memory
/// __shared__ block_discontinuity_int::storage_type storage;
///
/// // segment of consecutive items to be used
/// int input[8];
/// int tile_item = 0;
/// if (threadIdx.x == 0)
/// {
/// tile_item = ...
/// }
/// ...
/// int tail_flags[8];
/// block_discontinuity_int b_discontinuity;
/// using flag_op_type = typename rocprim::greater<int>;
/// b_discontinuity.flag_tails(tail_flags, tile_item, input, flag_op_type(),
/// storage);
/// ...
/// }
/// \endcode
template<unsigned int ItemsPerThread, class Flag, class FlagOp>
ROCPRIM_DEVICE ROCPRIM_INLINE
void flag_tails(Flag (&tail_flags)[ItemsPerThread],
                T tile_successor_item,
                const T (&input)[ItemsPerThread],
                FlagOp flag_op,
                storage_type& storage)
{
    // Tail flags with an explicit tile successor. The base implementation is instantiated
    // with as_flags = true, reversed = false and with_successor = true, and the caller's
    // tile_successor_item is forwarded unchanged.
    base_type::template apply_right</* as_flags */ true,
                                    /* reversed */ false,
                                    /* with_successor */ true>(
        input, tail_flags, flag_op, tile_successor_item, storage.get().right);
}
/// \overload
/// This overload does not accept a reference to temporary storage, instead it is declared as
/// part of the function itself. Note that this does NOT decrease the shared memory requirements
/// of a kernel using this function.
template<unsigned int ItemsPerThread, class Flag, class FlagOp>
ROCPRIM_DEVICE ROCPRIM_FORCE_INLINE
void flag_tails(Flag (&tail_flags)[ItemsPerThread],
                T tile_successor_item,
                const T (&input)[ItemsPerThread],
                FlagOp flag_op)
{
    // The temporary storage is declared inside this function instead of being provided
    // by the caller; the actual work is done by the storage-taking overload.
    ROCPRIM_SHARED_MEMORY storage_type internal_storage;
    this->flag_tails(tail_flags, tile_successor_item, input, flag_op, internal_storage);
}
/// \brief Tags both \p head_flags and \p tail_flags that indicate discontinuities
/// between items partitioned across the thread block.
///
/// \tparam ItemsPerThread - [inferred] the number of items to be processed by
/// each thread.
/// \tparam Flag - [inferred] the flag type.
/// \tparam FlagOp - [inferred] type of binary function used for flagging.
///
/// \param [out] head_flags - array that contains the head flags.
/// \param [out] tail_flags - array that contains the tail flags.
/// \param [in] input - array that data is loaded from.
/// \param [in] flag_op - binary operation function object that will be used for flagging.
/// The signature of the function should be equivalent to the following:
/// <tt>bool f(const T &a, const T &b);</tt> or <tt>bool f(const T &a, const T &b, unsigned int b_index);</tt>.
/// The signature does not need to have <tt>const &</tt>, but function object
/// must not modify the objects passed to it.
/// \param [in] storage - reference to a temporary storage object of type storage_type.
///
/// \par Storage reusage
/// Synchronization barrier should be placed before \p storage is reused
/// or repurposed: \p __syncthreads() or \p rocprim::syncthreads().
///
/// \par Example.
/// \code{.cpp}
/// __global__ void example_kernel(...)
/// {
/// // specialize discontinuity for int and a block of 128 threads
/// using block_discontinuity_int = rocprim::block_discontinuity<int, 128>;
/// // allocate storage in shared memory
/// __shared__ block_discontinuity_int::storage_type storage;
///
/// // segment of consecutive items to be used
/// int input[8];
/// ...
/// int head_flags[8];
/// int tail_flags[8];
/// block_discontinuity_int b_discontinuity;
/// using flag_op_type = typename rocprim::greater<int>;
/// b_discontinuity.flag_heads_and_tails(head_flags, tail_flags, input,
/// flag_op_type(), storage);
/// ...
/// }
/// \endcode
template<unsigned int ItemsPerThread, class Flag, class FlagOp>
ROCPRIM_DEVICE ROCPRIM_INLINE
void flag_heads_and_tails(Flag (&head_flags)[ItemsPerThread],
                          Flag (&tail_flags)[ItemsPerThread],
                          const T (&input)[ItemsPerThread],
                          FlagOp flag_op,
                          storage_type& storage)
{
    // Take a private copy of the input first: head_flags may alias input, and the tail
    // pass below still needs the original items after head_flags has been written.
    T local_items[ItemsPerThread];
    ROCPRIM_UNROLL
    for(unsigned int idx = 0; idx < ItemsPerThread; ++idx)
    {
        local_items[idx] = input[idx];
    }

    // Head pass: as_flags = true, reversed = false, with_predecessor = false.
    // local_items[0] only fills the predecessor argument slot.
    base_type::template apply_left</* as_flags */ true,
                                   /* reversed */ false,
                                   /* with_predecessor */ false>(
        local_items, head_flags, flag_op, local_items[0], storage.get().left);

    // Tail pass: as_flags = true, reversed = false, with_successor = false.
    // local_items[0] only fills the successor argument slot.
    base_type::template apply_right</* as_flags */ true,
                                    /* reversed */ false,
                                    /* with_successor */ false>(
        local_items, tail_flags, flag_op, local_items[0], storage.get().right);
}
/// \overload
/// This overload does not accept a reference to temporary storage, instead it is declared as
/// part of the function itself. Note that this does NOT decrease the shared memory requirements
/// of a kernel using this function.
template<unsigned int ItemsPerThread, class Flag, class FlagOp>
ROCPRIM_DEVICE ROCPRIM_FORCE_INLINE
void flag_heads_and_tails(Flag (&head_flags)[ItemsPerThread],
                          Flag (&tail_flags)[ItemsPerThread],
                          const T (&input)[ItemsPerThread],
                          FlagOp flag_op)
{
    // The temporary storage is declared inside this function instead of being provided
    // by the caller; the actual work is done by the storage-taking overload.
    ROCPRIM_SHARED_MEMORY storage_type internal_storage;
    this->flag_heads_and_tails(head_flags, tail_flags, input, flag_op, internal_storage);
}
/// \brief Tags both \p head_flags and \p tail_flags that indicate discontinuities
/// between items partitioned across the thread block, where the last item of the
/// last thread is compared against a \p tile_successor_item.
///
/// \tparam ItemsPerThread - [inferred] the number of items to be processed by
/// each thread.
/// \tparam Flag - [inferred] the flag type.
/// \tparam FlagOp - [inferred] type of binary function used for flagging.
///
/// \param [out] head_flags - array that contains the head flags.
/// \param [out] tail_flags - array that contains the tail flags.
/// \param [in] tile_successor_item - item that follows the tile; the last item of the
/// last thread is compared against it.
/// \param [in] input - array that data is loaded from.
/// \param [in] flag_op - binary operation function object that will be used for flagging.
/// The signature of the function should be equivalent to the following:
/// <tt>bool f(const T &a, const T &b);</tt> or <tt>bool f(const T &a, const T &b, unsigned int b_index);</tt>.
/// The signature does not need to have <tt>const &</tt>, but function object
/// must not modify the objects passed to it.
/// \param [in] storage - reference to a temporary storage object of type storage_type.
///
/// \par Storage reusage
/// Synchronization barrier should be placed before \p storage is reused
/// or repurposed: \p __syncthreads() or \p rocprim::syncthreads().
///
/// \par Example.
/// \code{.cpp}
/// __global__ void example_kernel(...)
/// {
/// // specialize discontinuity for int and a block of 128 threads
/// using block_discontinuity_int = rocprim::block_discontinuity<int, 128>;
/// // allocate storage in shared memory
/// __shared__ block_discontinuity_int::storage_type storage;
///
/// // segment of consecutive items to be used
/// int input[8];
/// int tile_item = 0;
/// if (threadIdx.x == 0)
/// {
/// tile_item = ...
/// }
/// ...
/// int head_flags[8];
/// int tail_flags[8];
/// block_discontinuity_int b_discontinuity;
/// using flag_op_type = typename rocprim::greater<int>;
/// b_discontinuity.flag_heads_and_tails(head_flags, tail_flags, tile_item,
/// input, flag_op_type(),
/// storage);
/// ...
/// }
/// \endcode
template<unsigned int ItemsPerThread, class Flag, class FlagOp>
ROCPRIM_DEVICE ROCPRIM_INLINE
void flag_heads_and_tails(Flag (&head_flags)[ItemsPerThread],
                          Flag (&tail_flags)[ItemsPerThread],
                          T tile_successor_item,
                          const T (&input)[ItemsPerThread],
                          FlagOp flag_op,
                          storage_type& storage)
{
    // Take a private copy of the input first: head_flags may alias input, and the tail
    // pass below still needs the original items after head_flags has been written.
    T local_items[ItemsPerThread];
    ROCPRIM_UNROLL
    for(unsigned int idx = 0; idx < ItemsPerThread; ++idx)
    {
        local_items[idx] = input[idx];
    }

    // Head pass: as_flags = true, reversed = false, with_predecessor = false.
    // local_items[0] only fills the predecessor argument slot.
    base_type::template apply_left</* as_flags */ true,
                                   /* reversed */ false,
                                   /* with_predecessor */ false>(
        local_items, head_flags, flag_op, local_items[0], storage.get().left);

    // Tail pass: as_flags = true, reversed = false, with_successor = true.
    // The caller's tile_successor_item is forwarded unchanged.
    base_type::template apply_right</* as_flags */ true,
                                    /* reversed */ false,
                                    /* with_successor */ true>(
        local_items, tail_flags, flag_op, tile_successor_item, storage.get().right);
}
/// \overload
/// This overload does not accept a reference to temporary storage, instead it is declared as
/// part of the function itself. Note that this does NOT decrease the shared memory requirements
/// of a kernel using this function.
template<unsigned int ItemsPerThread, class Flag, class FlagOp>
ROCPRIM_DEVICE ROCPRIM_FORCE_INLINE
void flag_heads_and_tails(Flag (&head_flags)[ItemsPerThread],
                          Flag (&tail_flags)[ItemsPerThread],
                          T tile_successor_item,
                          const T (&input)[ItemsPerThread],
                          FlagOp flag_op)
{
    // The temporary storage is declared inside this function instead of being provided
    // by the caller; the actual work is done by the storage-taking overload.
    ROCPRIM_SHARED_MEMORY storage_type internal_storage;
    this->flag_heads_and_tails(
        head_flags, tail_flags, tile_successor_item, input, flag_op, internal_storage);
}
/// \brief Tags both \p head_flags and \p tail_flags that indicate discontinuities
/// between items partitioned across the thread block, where the first item of the
/// first thread is compared against a \p tile_predecessor_item.
///
/// \tparam ItemsPerThread - [inferred] the number of items to be processed by
/// each thread.
/// \tparam Flag - [inferred] the flag type.
/// \tparam FlagOp - [inferred] type of binary function used for flagging.
///
/// \param [out] head_flags - array that contains the head flags.
/// \param [in] tile_predecessor_item - item that precedes the tile; the first item of the
/// first thread is compared against it.
/// \param [out] tail_flags - array that contains the tail flags.
/// \param [in] input - array that data is loaded from.
/// \param [in] flag_op - binary operation function object that will be used for flagging.
/// The signature of the function should be equivalent to the following:
/// <tt>bool f(const T &a, const T &b);</tt> or <tt>bool f(const T &a, const T &b, unsigned int b_index);</tt>.
/// The signature does not need to have <tt>const &</tt>, but function object
/// must not modify the objects passed to it.
/// \param [in] storage - reference to a temporary storage object of type storage_type.
///
/// \par Storage reusage
/// Synchronization barrier should be placed before \p storage is reused
/// or repurposed: \p __syncthreads() or \p rocprim::syncthreads().
///
/// \par Example.
/// \code{.cpp}
/// __global__ void example_kernel(...)
/// {
/// // specialize discontinuity for int and a block of 128 threads
/// using block_discontinuity_int = rocprim::block_discontinuity<int, 128>;
/// // allocate storage in shared memory
/// __shared__ block_discontinuity_int::storage_type storage;
///
/// // segment of consecutive items to be used
/// int input[8];
/// int tile_item = 0;
/// if (threadIdx.x == 0)
/// {
/// tile_item = ...
/// }
/// ...
/// int head_flags[8];
/// int tail_flags[8];
/// block_discontinuity_int b_discontinuity;
/// using flag_op_type = typename rocprim::greater<int>;
/// b_discontinuity.flag_heads_and_tails(head_flags, tile_item, tail_flags,
/// input, flag_op_type(),
/// storage);
/// ...
/// }
/// \endcode
template<unsigned int ItemsPerThread, class Flag, class FlagOp>
ROCPRIM_DEVICE ROCPRIM_INLINE
void flag_heads_and_tails(Flag (&head_flags)[ItemsPerThread],
                          T tile_predecessor_item,
                          Flag (&tail_flags)[ItemsPerThread],
                          const T (&input)[ItemsPerThread],
                          FlagOp flag_op,
                          storage_type& storage)
{
    // Compile-time switches forwarded to the base-class helpers.
    constexpr bool emit_flags      = true;
    constexpr bool reverse_order   = false;
    constexpr bool use_predecessor = true;  // heads are compared against tile_predecessor_item
    constexpr bool use_successor   = false; // no tile successor is supplied for the tails

    // Work on a private snapshot of the input so that writing head_flags
    // cannot disturb values still to be read when head_flags aliases input.
    T snapshot[ItemsPerThread];
    ROCPRIM_UNROLL
    for(unsigned int k = 0; k < ItemsPerThread; ++k)
    {
        snapshot[k] = input[k];
    }

    base_type::template apply_left<emit_flags, reverse_order, use_predecessor>(
        snapshot, head_flags, flag_op, tile_predecessor_item, storage.get().left);

    // snapshot[0] only fills the successor slot of the call; presumably it is
    // ignored by apply_right because use_successor is false.
    base_type::template apply_right<emit_flags, reverse_order, use_successor>(
        snapshot, tail_flags, flag_op, snapshot[0], storage.get().right);
}
/// \overload
/// This overload does not accept a reference to temporary storage, instead it is declared as
/// part of the function itself. Note that this does NOT decrease the shared memory requirements
/// of a kernel using this function.
template<unsigned int ItemsPerThread, class Flag, class FlagOp>
ROCPRIM_DEVICE ROCPRIM_FORCE_INLINE
void flag_heads_and_tails(Flag (&head_flags)[ItemsPerThread],
                          T tile_predecessor_item,
                          Flag (&tail_flags)[ItemsPerThread],
                          const T (&input)[ItemsPerThread],
                          FlagOp flag_op)
{
    // Declare the temporary storage locally and defer to the overload that
    // receives it as an explicit argument.
    ROCPRIM_SHARED_MEMORY storage_type local_storage;
    flag_heads_and_tails(
        head_flags, tile_predecessor_item, tail_flags, input, flag_op, local_storage);
}
/// \brief Tags both \p head_flags and \p tail_flags that indicate discontinuities
/// between items partitioned across the thread block, where the first item of the
/// first thread is compared against a \p tile_predecessor_item and the last item of
/// the last thread is compared against a \p tile_successor_item.
///
/// \tparam ItemsPerThread - [inferred] the number of items to be processed by
/// each thread.
/// \tparam Flag - [inferred] the flag type.
/// \tparam FlagOp - [inferred] type of binary function used for flagging.
///
/// \param [out] head_flags - array that contains the head flags.
/// \param [in] tile_predecessor_item - first tile item from thread to be compared
/// against.
/// \param [out] tail_flags - array that contains the tail flags.
/// \param [in] tile_successor_item - last tile item from thread to be compared
/// against.
/// \param [in] input - array that data is loaded from.
/// \param [in] flag_op - binary operation function object that will be used for flagging.
/// The signature of the function should be equivalent to the following:
/// <tt>bool f(const T &a, const T &b);</tt> or <tt>bool f(const T& a, const T& b, unsigned int b_index);</tt>.
/// The signature does not need to have <tt>const &</tt>, but function object
/// must not modify the objects passed to it.
/// \param [in] storage - reference to a temporary storage object of type storage_type.
///
/// \par Storage reusage
/// Synchronization barrier should be placed before \p storage is reused
/// or repurposed: \p __syncthreads() or \p rocprim::syncthreads().
///
/// \par Example.
/// \code{.cpp}
/// __global__ void example_kernel(...)
/// {
/// // specialize discontinuity for int and a block of 128 threads
/// using block_discontinuity_int = rocprim::block_discontinuity<int, 128>;
/// // allocate storage in shared memory
/// __shared__ block_discontinuity_int::storage_type storage;
///
/// // segment of consecutive items to be used
/// int input[8];
/// int tile_predecessor_item = 0;
/// int tile_successor_item = 0;
/// if (threadIdx.x == 0)
/// {
/// tile_predecessor_item = ...
/// tile_successor_item = ...
/// }
/// ...
/// int head_flags[8];
/// int tail_flags[8];
/// block_discontinuity_int b_discontinuity;
/// using flag_op_type = typename rocprim::greater<int>;
/// b_discontinuity.flag_heads_and_tails(head_flags, tile_predecessor_item,
/// tail_flags, tile_successor_item,
/// input, flag_op_type(),
/// storage);
/// ...
/// }
/// \endcode
template<unsigned int ItemsPerThread, class Flag, class FlagOp>
ROCPRIM_DEVICE ROCPRIM_INLINE
void flag_heads_and_tails(Flag (&head_flags)[ItemsPerThread],
                          T tile_predecessor_item,
                          Flag (&tail_flags)[ItemsPerThread],
                          T tile_successor_item,
                          const T (&input)[ItemsPerThread],
                          FlagOp flag_op,
                          storage_type& storage)
{
    // Compile-time switches forwarded to the base-class helpers.
    constexpr bool emit_flags      = true;
    constexpr bool reverse_order   = false;
    constexpr bool use_predecessor = true; // heads are compared against tile_predecessor_item
    constexpr bool use_successor   = true; // tails are compared against tile_successor_item

    // Work on a private snapshot of the input so that writing head_flags
    // cannot disturb values still to be read when head_flags aliases input.
    T snapshot[ItemsPerThread];
    ROCPRIM_UNROLL
    for(unsigned int k = 0; k < ItemsPerThread; ++k)
    {
        snapshot[k] = input[k];
    }

    // Heads use the "left" part of the storage, tails the "right" part.
    base_type::template apply_left<emit_flags, reverse_order, use_predecessor>(
        snapshot, head_flags, flag_op, tile_predecessor_item, storage.get().left);
    base_type::template apply_right<emit_flags, reverse_order, use_successor>(
        snapshot, tail_flags, flag_op, tile_successor_item, storage.get().right);
}
/// \overload
/// This overload does not accept a reference to temporary storage, instead it is declared as
/// part of the function itself. Note that this does NOT decrease the shared memory requirements
/// of a kernel using this function.
template<unsigned int ItemsPerThread, class Flag, class FlagOp>
ROCPRIM_DEVICE ROCPRIM_FORCE_INLINE
void flag_heads_and_tails(Flag (&head_flags)[ItemsPerThread],
                          T tile_predecessor_item,
                          Flag (&tail_flags)[ItemsPerThread],
                          T tile_successor_item,
                          const T (&input)[ItemsPerThread],
                          FlagOp flag_op)
{
    // Declare the temporary storage locally and defer to the overload that
    // receives it as an explicit argument.
    ROCPRIM_SHARED_MEMORY storage_type local_storage;
    flag_heads_and_tails(head_flags,
                         tile_predecessor_item,
                         tail_flags,
                         tile_successor_item,
                         input,
                         flag_op,
                         local_storage);
}
};
END_ROCPRIM_NAMESPACE
/// @}
// end of group blockmodule
#endif // ROCPRIM_BLOCK_BLOCK_DISCONTINUITY_HPP_
// Copyright (c) 2017-2021 Advanced Micro Devices, Inc. All rights reserved.
//
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to deal
// in the Software without restriction, including without limitation the rights
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
// copies of the Software, and to permit persons to whom the Software is
// furnished to do so, subject to the following conditions:
//
// The above copyright notice and this permission notice shall be included in
// all copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
// THE SOFTWARE.
#ifndef ROCPRIM_BLOCK_BLOCK_EXCHANGE_HPP_
#define ROCPRIM_BLOCK_BLOCK_EXCHANGE_HPP_
#include "../config.hpp"
#include "../detail/various.hpp"
#include "../intrinsics.hpp"
#include "../functional.hpp"
#include "../types.hpp"
/// \addtogroup blockmodule
/// @{
BEGIN_ROCPRIM_NAMESPACE
/// \brief The \p block_exchange class is a block level parallel primitive which provides
/// methods for rearranging items partitioned across threads in a block.
///
/// \tparam T - the input type.
/// \tparam BlockSize - the number of threads in a block.
/// \tparam ItemsPerThread - the number of items contributed by each thread.
///
/// \par Overview
/// * The \p block_exchange class supports the following rearrangement methods:
/// * Transposing a blocked arrangement to a striped arrangement.
/// * Transposing a striped arrangement to a blocked arrangement.
/// * Transposing a blocked arrangement to a warp-striped arrangement.
/// * Transposing a warp-striped arrangement to a blocked arrangement.
/// * Scattering items to a blocked arrangement.
/// * Scattering items to a striped arrangement.
/// * Data is automatically padded to ensure zero bank conflicts.
///
/// \par Examples
/// \parblock
/// In the examples exchange operation is performed on block of 128 threads, using type
/// \p int with 8 items per thread.
///
/// \code{.cpp}
/// __global__ void example_kernel(...)
/// {
/// // specialize block_exchange for int, block of 128 threads and 8 items per thread
/// using block_exchange_int = rocprim::block_exchange<int, 128, 8>;
/// // allocate storage in shared memory
/// __shared__ block_exchange_int::storage_type storage;
///
/// int items[8];
/// ...
/// block_exchange_int b_exchange;
/// b_exchange.blocked_to_striped(items, items, storage);
/// ...
/// }
/// \endcode
/// \endparblock
template<
class T,
unsigned int BlockSizeX,
unsigned int ItemsPerThread,
unsigned int BlockSizeY = 1,
unsigned int BlockSizeZ = 1
>
class block_exchange
{
// Total number of threads in the block (product of the three block dimensions).
static constexpr unsigned int BlockSize = BlockSizeX * BlockSizeY * BlockSizeZ;
// Select warp size: obtained from detail::get_min_warp_size() given the block size
// and the device warp size. Used by the warp-striped exchanges.
static constexpr unsigned int warp_size =
detail::get_min_warp_size(BlockSize, ::rocprim::device_warp_size());
// Number of warps in block (rounded up, so a trailing partial warp is counted).
static constexpr unsigned int warps_no = (BlockSize + warp_size - 1) / warp_size;
// Minimize LDS bank conflicts for power-of-two strides, i.e. when items accessed
// using `thread_id * ItemsPerThread` pattern where ItemsPerThread is power of two
// (all exchanges from/to blocked).
static constexpr bool has_bank_conflicts =
ItemsPerThread >= 2 && ::rocprim::detail::is_power_of_two(ItemsPerThread);
// Number of LDS banks as reported by detail::get_lds_banks_no().
static constexpr unsigned int banks_no = ::rocprim::detail::get_lds_banks_no();
// Extra buffer elements needed by index(): one additional slot for every banks_no
// items, or none when no index remapping is performed.
static constexpr unsigned int bank_conflicts_padding =
has_bank_conflicts ? (BlockSize * ItemsPerThread / banks_no) : 0;
// Struct used for creating a raw_storage object for this primitive's temporary storage.
struct storage_type_
{
// One slot per item of the block plus the padding consumed by index().
T buffer[BlockSize * ItemsPerThread + bank_conflicts_padding];
};
public:
/// \brief Struct used to allocate a temporary memory that is required for thread
/// communication during operations provided by related parallel primitive.
///
/// Depending on the implementation the operations exposed by parallel primitive may
/// require a temporary storage for thread communication. The storage should be allocated
/// using keywords <tt>__shared__</tt>. It can be aliased to
/// an externally allocated memory, or be a part of a union type with other storage types
/// to increase shared memory reusability.
#ifndef DOXYGEN_SHOULD_SKIP_THIS // hides storage_type implementation for Doxygen
using storage_type = detail::raw_storage<storage_type_>;
#else
using storage_type = storage_type_; // only for Doxygen
#endif
/// \brief Transposes a blocked arrangement of items to a striped arrangement
/// across the thread block.
///
/// \tparam U - [inferred] the output type.
///
/// \param [in] input - array that data is loaded from.
/// \param [out] output - array that data is loaded to.
template<class U>
ROCPRIM_DEVICE ROCPRIM_FORCE_INLINE
void blocked_to_striped(const T (&input)[ItemsPerThread],
                        U (&output)[ItemsPerThread])
{
    // The temporary storage is declared here; the exchange itself is done by
    // the overload that takes the storage explicitly.
    ROCPRIM_SHARED_MEMORY storage_type local_storage;
    this->blocked_to_striped(input, output, local_storage);
}
/// \brief Transposes a blocked arrangement of items to a striped arrangement
/// across the thread block, using temporary storage.
///
/// \tparam U - [inferred] the output type.
///
/// \param [in] input - array that data is loaded from.
/// \param [out] output - array that data is loaded to.
/// \param [in] storage - reference to a temporary storage object of type storage_type.
///
/// \par Storage reusage
/// Synchronization barrier should be placed before \p storage is reused
/// or repurposed: \p __syncthreads() or \p rocprim::syncthreads().
///
/// \par Example.
/// \code{.cpp}
/// __global__ void example_kernel(...)
/// {
/// // specialize block_exchange for int, block of 128 threads and 8 items per thread
/// using block_exchange_int = rocprim::block_exchange<int, 128, 8>;
/// // allocate storage in shared memory
/// __shared__ block_exchange_int::storage_type storage;
///
/// int items[8];
/// ...
/// block_exchange_int b_exchange;
/// b_exchange.blocked_to_striped(items, items, storage);
/// ...
/// }
/// \endcode
template<class U>
ROCPRIM_DEVICE ROCPRIM_INLINE
void blocked_to_striped(const T (&input)[ItemsPerThread],
                        U (&output)[ItemsPerThread],
                        storage_type& storage)
{
    const unsigned int tid
        = ::rocprim::flat_block_thread_id<BlockSizeX, BlockSizeY, BlockSizeZ>();
    storage_type_& smem = storage.get();

    // Phase 1: each thread deposits its items contiguously (blocked layout).
    const unsigned int blocked_base = tid * ItemsPerThread;
    for(unsigned int item = 0; item < ItemsPerThread; ++item)
    {
        smem.buffer[index(blocked_base + item)] = input[item];
    }

    // Block-wide barrier between the write phase and the read phase.
    ::rocprim::syncthreads();

    // Phase 2: read back with a stride of BlockSize (striped layout).
    for(unsigned int item = 0; item < ItemsPerThread; ++item)
    {
        output[item] = smem.buffer[index(item * BlockSize + tid)];
    }
}
/// \brief Transposes a striped arrangement of items to a blocked arrangement
/// across the thread block.
///
/// \tparam U - [inferred] the output type.
///
/// \param [in] input - array that data is loaded from.
/// \param [out] output - array that data is loaded to.
template<class U>
ROCPRIM_DEVICE ROCPRIM_FORCE_INLINE
void striped_to_blocked(const T (&input)[ItemsPerThread],
                        U (&output)[ItemsPerThread])
{
    // The temporary storage is declared here; the exchange itself is done by
    // the overload that takes the storage explicitly.
    ROCPRIM_SHARED_MEMORY storage_type local_storage;
    this->striped_to_blocked(input, output, local_storage);
}
/// \brief Transposes a striped arrangement of items to a blocked arrangement
/// across the thread block, using temporary storage.
///
/// \tparam U - [inferred] the output type.
///
/// \param [in] input - array that data is loaded from.
/// \param [out] output - array that data is loaded to.
/// \param [in] storage - reference to a temporary storage object of type storage_type.
///
/// \par Storage reusage
/// Synchronization barrier should be placed before \p storage is reused
/// or repurposed: \p __syncthreads() or \p rocprim::syncthreads().
///
/// \par Example.
/// \code{.cpp}
/// __global__ void example_kernel(...)
/// {
/// // specialize block_exchange for int, block of 128 threads and 8 items per thread
/// using block_exchange_int = rocprim::block_exchange<int, 128, 8>;
/// // allocate storage in shared memory
/// __shared__ block_exchange_int::storage_type storage;
///
/// int items[8];
/// ...
/// block_exchange_int b_exchange;
/// b_exchange.striped_to_blocked(items, items, storage);
/// ...
/// }
/// \endcode
template<class U>
ROCPRIM_DEVICE ROCPRIM_INLINE
void striped_to_blocked(const T (&input)[ItemsPerThread],
                        U (&output)[ItemsPerThread],
                        storage_type& storage)
{
    const unsigned int tid
        = ::rocprim::flat_block_thread_id<BlockSizeX, BlockSizeY, BlockSizeZ>();
    storage_type_& smem = storage.get();

    // Phase 1: write with a stride of BlockSize (striped layout).
    for(unsigned int item = 0; item < ItemsPerThread; ++item)
    {
        smem.buffer[index(item * BlockSize + tid)] = input[item];
    }

    // Block-wide barrier between the write phase and the read phase.
    ::rocprim::syncthreads();

    // Phase 2: each thread collects ItemsPerThread consecutive items (blocked layout).
    const unsigned int blocked_base = tid * ItemsPerThread;
    for(unsigned int item = 0; item < ItemsPerThread; ++item)
    {
        output[item] = smem.buffer[index(blocked_base + item)];
    }
}
/// \brief Transposes a blocked arrangement of items to a warp-striped arrangement
/// across the thread block.
///
/// \tparam U - [inferred] the output type.
///
/// \param [in] input - array that data is loaded from.
/// \param [out] output - array that data is loaded to.
template<class U>
ROCPRIM_DEVICE ROCPRIM_FORCE_INLINE
void blocked_to_warp_striped(const T (&input)[ItemsPerThread],
                             U (&output)[ItemsPerThread])
{
    // The temporary storage is declared here; the exchange itself is done by
    // the overload that takes the storage explicitly.
    ROCPRIM_SHARED_MEMORY storage_type local_storage;
    this->blocked_to_warp_striped(input, output, local_storage);
}
/// \brief Transposes a blocked arrangement of items to a warp-striped arrangement
/// across the thread block, using temporary storage.
///
/// \tparam U - [inferred] the output type.
///
/// \param [in] input - array that data is loaded from.
/// \param [out] output - array that data is loaded to.
/// \param [in] storage - reference to a temporary storage object of type storage_type.
///
/// \par Storage reusage
/// Synchronization barrier should be placed before \p storage is reused
/// or repurposed: \p __syncthreads() or \p rocprim::syncthreads().
///
/// \par Example.
/// \code{.cpp}
/// __global__ void example_kernel(...)
/// {
/// // specialize block_exchange for int, block of 128 threads and 8 items per thread
/// using block_exchange_int = rocprim::block_exchange<int, 128, 8>;
/// // allocate storage in shared memory
/// __shared__ block_exchange_int::storage_type storage;
///
/// int items[8];
/// ...
/// block_exchange_int b_exchange;
/// b_exchange.blocked_to_warp_striped(items, items, storage);
/// ...
/// }
/// \endcode
template<class U>
ROCPRIM_DEVICE ROCPRIM_INLINE
void blocked_to_warp_striped(const T (&input)[ItemsPerThread],
                             U (&output)[ItemsPerThread],
                             storage_type& storage)
{
    // Number of buffer slots reserved for one full warp.
    constexpr unsigned int warp_items = warp_size * ItemsPerThread;

    const unsigned int lane = ::rocprim::lane_id();
    const unsigned int warp = ::rocprim::warp_id<BlockSizeX, BlockSizeY, BlockSizeZ>();
    // Stride of the striped layout inside this warp (smaller for a partial last warp).
    const unsigned int active_lanes = get_current_warp_size();
    // First buffer slot belonging to this warp.
    const unsigned int warp_base = warp * warp_items;
    storage_type_& smem = storage.get();

    // Phase 1: blocked layout inside the warp's slice.
    const unsigned int blocked_base = warp_base + lane * ItemsPerThread;
    for(unsigned int item = 0; item < ItemsPerThread; ++item)
    {
        smem.buffer[index(blocked_base + item)] = input[item];
    }

    // Only a warp-level barrier separates the two phases; every index used in
    // this function is relative to this warp's own warp_base.
    ::rocprim::wave_barrier();

    // Phase 2: striped layout inside the warp's slice.
    for(unsigned int item = 0; item < ItemsPerThread; ++item)
    {
        output[item] = smem.buffer[index(warp_base + item * active_lanes + lane)];
    }
}
/// \brief Transposes a warp-striped arrangement of items to a blocked arrangement
/// across the thread block.
///
/// \tparam U - [inferred] the output type.
///
/// \param [in] input - array that data is loaded from.
/// \param [out] output - array that data is loaded to.
template<class U>
ROCPRIM_DEVICE ROCPRIM_FORCE_INLINE
void warp_striped_to_blocked(const T (&input)[ItemsPerThread],
                             U (&output)[ItemsPerThread])
{
    // The temporary storage is declared here; the exchange itself is done by
    // the overload that takes the storage explicitly.
    ROCPRIM_SHARED_MEMORY storage_type local_storage;
    this->warp_striped_to_blocked(input, output, local_storage);
}
/// \brief Transposes a warp-striped arrangement of items to a blocked arrangement
/// across the thread block, using temporary storage.
///
/// \tparam U - [inferred] the output type.
///
/// \param [in] input - array that data is loaded from.
/// \param [out] output - array that data is loaded to.
/// \param [in] storage - reference to a temporary storage object of type storage_type.
///
/// \par Storage reusage
/// Synchronization barrier should be placed before \p storage is reused
/// or repurposed: \p __syncthreads() or \p rocprim::syncthreads().
///
/// \par Example.
/// \code{.cpp}
/// __global__ void example_kernel(...)
/// {
/// // specialize block_exchange for int, block of 128 threads and 8 items per thread
/// using block_exchange_int = rocprim::block_exchange<int, 128, 8>;
/// // allocate storage in shared memory
/// __shared__ block_exchange_int::storage_type storage;
///
/// int items[8];
/// ...
/// block_exchange_int b_exchange;
/// b_exchange.warp_striped_to_blocked(items, items, storage);
/// ...
/// }
/// \endcode
template<class U>
ROCPRIM_DEVICE ROCPRIM_INLINE
void warp_striped_to_blocked(const T (&input)[ItemsPerThread],
                             U (&output)[ItemsPerThread],
                             storage_type& storage)
{
    // Number of buffer slots reserved for one full warp.
    constexpr unsigned int warp_items = warp_size * ItemsPerThread;

    const unsigned int lane = ::rocprim::lane_id();
    const unsigned int warp = ::rocprim::warp_id<BlockSizeX, BlockSizeY, BlockSizeZ>();
    // Stride of the striped layout inside this warp (smaller for a partial last warp).
    const unsigned int active_lanes = get_current_warp_size();
    // First buffer slot belonging to this warp.
    const unsigned int warp_base = warp * warp_items;
    storage_type_& smem = storage.get();

    // Phase 1: striped layout inside the warp's slice.
    for(unsigned int item = 0; item < ItemsPerThread; ++item)
    {
        smem.buffer[index(warp_base + item * active_lanes + lane)] = input[item];
    }

    // Only a warp-level barrier separates the two phases; every index used in
    // this function is relative to this warp's own warp_base.
    ::rocprim::wave_barrier();

    // Phase 2: blocked layout inside the warp's slice.
    const unsigned int blocked_base = warp_base + lane * ItemsPerThread;
    for(unsigned int item = 0; item < ItemsPerThread; ++item)
    {
        output[item] = smem.buffer[index(blocked_base + item)];
    }
}
/// \brief Scatters items to a blocked arrangement based on their ranks
/// across the thread block.
///
/// \tparam U - [inferred] the output type.
/// \tparam Offset - [inferred] the rank type.
///
/// \param [in] input - array that data is loaded from.
/// \param [out] output - array that data is loaded to.
/// \param [in] ranks - array that has rank of data.
template<class U, class Offset>
ROCPRIM_DEVICE ROCPRIM_FORCE_INLINE
void scatter_to_blocked(const T (&input)[ItemsPerThread],
                        U (&output)[ItemsPerThread],
                        const Offset (&ranks)[ItemsPerThread])
{
    // The temporary storage is declared here; the scatter itself is done by
    // the overload that takes the storage explicitly.
    ROCPRIM_SHARED_MEMORY storage_type local_storage;
    this->scatter_to_blocked(input, output, ranks, local_storage);
}
/// \brief Gathers items from a striped arrangement based on their ranks
/// across the thread block.
///
/// \tparam U - [inferred] the output type.
/// \tparam Offset - [inferred] the rank type.
///
/// \param [in] input - array that data is loaded from.
/// \param [out] output - array that data is loaded to.
/// \param [in] ranks - array holding, for each output item, the position to read from.
template<class U, class Offset>
ROCPRIM_DEVICE ROCPRIM_FORCE_INLINE
void gather_from_striped(const T (&input)[ItemsPerThread],
                         U (&output)[ItemsPerThread],
                         const Offset (&ranks)[ItemsPerThread])
{
    // The temporary storage is declared here; the gather itself is done by
    // the overload that takes the storage explicitly.
    ROCPRIM_SHARED_MEMORY storage_type local_storage;
    this->gather_from_striped(input, output, ranks, local_storage);
}
/// \brief Scatters items to a blocked arrangement based on their ranks
/// across the thread block, using temporary storage.
///
/// \tparam U - [inferred] the output type.
/// \tparam Offset - [inferred] the rank type.
///
/// \param [in] input - array that data is loaded from.
/// \param [out] output - array that data is loaded to.
/// \param [in] ranks - array that has rank of data.
/// \param [in] storage - reference to a temporary storage object of type storage_type.
///
/// \par Storage reusage
/// Synchronization barrier should be placed before \p storage is reused
/// or repurposed: \p __syncthreads() or \p rocprim::syncthreads().
///
/// \par Example.
/// \code{.cpp}
/// __global__ void example_kernel(...)
/// {
/// // specialize block_exchange for int, block of 128 threads and 8 items per thread
/// using block_exchange_int = rocprim::block_exchange<int, 128, 8>;
/// // allocate storage in shared memory
/// __shared__ block_exchange_int::storage_type storage;
///
/// int items[8];
/// int ranks[8];
/// ...
/// block_exchange_int b_exchange;
/// b_exchange.scatter_to_blocked(items, items, ranks, storage);
/// ...
/// }
/// \endcode
template<class U, class Offset>
ROCPRIM_DEVICE ROCPRIM_INLINE
void scatter_to_blocked(const T (&input)[ItemsPerThread],
                        U (&output)[ItemsPerThread],
                        const Offset (&ranks)[ItemsPerThread],
                        storage_type& storage)
{
    const unsigned int tid
        = ::rocprim::flat_block_thread_id<BlockSizeX, BlockSizeY, BlockSizeZ>();
    storage_type_& smem = storage.get();

    // Phase 1: every item is written to the slot selected by its rank.
    for(unsigned int item = 0; item < ItemsPerThread; ++item)
    {
        const Offset destination = ranks[item];
        smem.buffer[index(destination)] = input[item];
    }

    // Block-wide barrier between the write phase and the read phase.
    ::rocprim::syncthreads();

    // Phase 2: each thread collects ItemsPerThread consecutive items (blocked layout).
    const unsigned int blocked_base = tid * ItemsPerThread;
    for(unsigned int item = 0; item < ItemsPerThread; ++item)
    {
        output[item] = smem.buffer[index(blocked_base + item)];
    }
}
/// \brief Gathers items from a striped arrangement based on their ranks
/// across the thread block, using temporary storage.
///
/// The input is first written to \p storage in striped order; afterwards each
/// thread reads, for every output item, the element found at the position
/// given by the corresponding entry of \p ranks.
///
/// \tparam U - [inferred] the output type.
/// \tparam Offset - [inferred] the rank type.
///
/// \param [in] input - array that data is loaded from.
/// \param [out] output - array that data is loaded to.
/// \param [in] ranks - array holding, for each output item, the position to read from.
/// \param [in] storage - reference to a temporary storage object of type storage_type.
///
/// \par Storage reusage
/// Synchronization barrier should be placed before \p storage is reused
/// or repurposed: \p __syncthreads() or \p rocprim::syncthreads().
template <class U, class Offset>
ROCPRIM_DEVICE ROCPRIM_INLINE
void gather_from_striped(const T (&input)[ItemsPerThread],
                         U (&output)[ItemsPerThread],
                         const Offset (&ranks)[ItemsPerThread],
                         storage_type& storage)
{
    const unsigned int tid
        = ::rocprim::flat_block_thread_id<BlockSizeX, BlockSizeY, BlockSizeZ>();
    storage_type_& smem = storage.get();

    // Phase 1: write with a stride of BlockSize (striped layout).
    for(unsigned int item = 0; item < ItemsPerThread; ++item)
    {
        smem.buffer[index(item * BlockSize + tid)] = input[item];
    }

    // Block-wide barrier between the write phase and the read phase.
    ::rocprim::syncthreads();

    // Phase 2: every output item is fetched from the slot selected by its rank.
    for(unsigned int item = 0; item < ItemsPerThread; ++item)
    {
        const Offset source = ranks[item];
        output[item] = smem.buffer[index(source)];
    }
}
/// \brief Scatters items to a striped arrangement based on their ranks
/// across the thread block.
///
/// \tparam U - [inferred] the output type.
/// \tparam Offset - [inferred] the rank type.
///
/// \param [in] input - array that data is loaded from.
/// \param [out] output - array that data is loaded to.
/// \param [in] ranks - array that has rank of data.
template<class U, class Offset>
ROCPRIM_DEVICE ROCPRIM_FORCE_INLINE
void scatter_to_striped(const T (&input)[ItemsPerThread],
                        U (&output)[ItemsPerThread],
                        const Offset (&ranks)[ItemsPerThread])
{
    // The temporary storage is declared here; the scatter itself is done by
    // the overload that takes the storage explicitly.
    ROCPRIM_SHARED_MEMORY storage_type local_storage;
    this->scatter_to_striped(input, output, ranks, local_storage);
}
/// \brief Scatters items to a striped arrangement based on their ranks
/// across the thread block, using temporary storage.
///
/// \tparam U - [inferred] the output type.
/// \tparam Offset - [inferred] the rank type.
///
/// \param [in] input - array that data is loaded from.
/// \param [out] output - array that data is loaded to.
/// \param [in] ranks - array that has rank of data.
/// \param [in] storage - reference to a temporary storage object of type storage_type.
///
/// \par Storage reusage
/// Synchronization barrier should be placed before \p storage is reused
/// or repurposed: \p __syncthreads() or \p rocprim::syncthreads().
///
/// \par Example.
/// \code{.cpp}
/// __global__ void example_kernel(...)
/// {
/// // specialize block_exchange for int, block of 128 threads and 8 items per thread
/// using block_exchange_int = rocprim::block_exchange<int, 128, 8>;
/// // allocate storage in shared memory
/// __shared__ block_exchange_int::storage_type storage;
///
/// int items[8];
/// int ranks[8];
/// ...
/// block_exchange_int b_exchange;
/// b_exchange.scatter_to_striped(items, items, ranks, storage);
/// ...
/// }
/// \endcode
template<class U, class Offset>
ROCPRIM_DEVICE ROCPRIM_INLINE
void scatter_to_striped(const T (&input)[ItemsPerThread],
                        U (&output)[ItemsPerThread],
                        const Offset (&ranks)[ItemsPerThread],
                        storage_type& storage)
{
    const unsigned int tid
        = ::rocprim::flat_block_thread_id<BlockSizeX, BlockSizeY, BlockSizeZ>();
    storage_type_& smem = storage.get();

    // Phase 1: every item is written to the slot selected by its rank.
    // The buffer is addressed directly here, without the index() remapping.
    for(unsigned int item = 0; item < ItemsPerThread; ++item)
    {
        const Offset destination = ranks[item];
        smem.buffer[destination] = input[item];
    }

    // Block-wide barrier between the write phase and the read phase.
    ::rocprim::syncthreads();

    // Phase 2: read back with a stride of BlockSize (striped layout).
    for(unsigned int item = 0; item < ItemsPerThread; ++item)
    {
        output[item] = smem.buffer[item * BlockSize + tid];
    }
}
/// \brief Scatters items to a striped arrangement based on their ranks
/// across the thread block, guarded by rank.
///
/// \par Overview
/// * Items with rank -1 are not scattered.
///
/// \tparam U - [inferred] the output type.
/// \tparam Offset - [inferred] the rank type.
///
/// \param [in] input - array that data is loaded from.
/// \param [out] output - array that data is loaded to.
/// \param [in] ranks - array that has rank of data.
template<class U, class Offset>
ROCPRIM_DEVICE ROCPRIM_FORCE_INLINE
void scatter_to_striped_guarded(const T (&input)[ItemsPerThread],
                                U (&output)[ItemsPerThread],
                                const Offset (&ranks)[ItemsPerThread])
{
    // The temporary storage is declared here; the scatter itself is done by
    // the overload that takes the storage explicitly.
    ROCPRIM_SHARED_MEMORY storage_type local_storage;
    this->scatter_to_striped_guarded(input, output, ranks, local_storage);
}
/// \brief Scatters items to a striped arrangement based on their ranks
/// across the thread block, guarded by rank, using temporary storage.
///
/// \par Overview
/// * Items with rank -1 are not scattered.
///
/// \tparam U - [inferred] the output type.
/// \tparam Offset - [inferred] the rank type.
///
/// \param [in] input - array that data is loaded from.
/// \param [out] output - array that data is loaded to.
/// \param [in] ranks - array that has rank of data.
/// \param [in] storage - reference to a temporary storage object of type storage_type.
///
/// \par Storage reusage
/// Synchronization barrier should be placed before \p storage is reused
/// or repurposed: \p __syncthreads() or \p rocprim::syncthreads().
///
/// \par Example.
/// \code{.cpp}
/// __global__ void example_kernel(...)
/// {
/// // specialize block_exchange for int, block of 128 threads and 8 items per thread
/// using block_exchange_int = rocprim::block_exchange<int, 128, 8>;
/// // allocate storage in shared memory
/// __shared__ block_exchange_int::storage_type storage;
///
/// int items[8];
/// int ranks[8];
/// ...
/// block_exchange_int b_exchange;
/// b_exchange.scatter_to_striped_guarded(items, items, ranks, storage);
/// ...
/// }
/// \endcode
template<class U, class Offset>
ROCPRIM_DEVICE ROCPRIM_INLINE
void scatter_to_striped_guarded(const T (&input)[ItemsPerThread],
                                U (&output)[ItemsPerThread],
                                const Offset (&ranks)[ItemsPerThread],
                                storage_type& storage)
{
    const unsigned int tid
        = ::rocprim::flat_block_thread_id<BlockSizeX, BlockSizeY, BlockSizeZ>();
    storage_type_& smem = storage.get();

    // Phase 1: items with a non-negative rank are written to the slot selected
    // by that rank; all other items are left out.
    // The buffer is addressed directly here, without the index() remapping.
    // NOTE(review): for an unsigned Offset the test below is always true, so a
    // rank of -1 would not be filtered - confirm callers use a signed rank type.
    for(unsigned int item = 0; item < ItemsPerThread; ++item)
    {
        const Offset destination = ranks[item];
        if(!(destination >= 0))
        {
            continue;
        }
        smem.buffer[destination] = input[item];
    }

    // Block-wide barrier between the write phase and the read phase.
    ::rocprim::syncthreads();

    // Phase 2: read back with a stride of BlockSize (striped layout).
    for(unsigned int item = 0; item < ItemsPerThread; ++item)
    {
        output[item] = smem.buffer[item * BlockSize + tid];
    }
}
/// \brief Scatters items to a striped arrangement based on their ranks
/// across the thread block, with a flag to denote validity.
///
/// \tparam U - [inferred] the output type.
/// \tparam Offset - [inferred] the rank type.
/// \tparam ValidFlag - [inferred] the validity flag type.
///
/// \param [in] input - array that data is loaded from.
/// \param [out] output - array that data is loaded to.
/// \param [in] ranks - array that has rank of data.
/// \param [in] is_valid - array that has flags to denote validity.
template<class U, class Offset, class ValidFlag>
ROCPRIM_DEVICE ROCPRIM_FORCE_INLINE
void scatter_to_striped_flagged(const T (&input)[ItemsPerThread],
                                U (&output)[ItemsPerThread],
                                const Offset (&ranks)[ItemsPerThread],
                                const ValidFlag (&is_valid)[ItemsPerThread])
{
    // The temporary storage is declared here; the scatter itself is done by
    // the overload that takes the storage explicitly.
    ROCPRIM_SHARED_MEMORY storage_type local_storage;
    this->scatter_to_striped_flagged(input, output, ranks, is_valid, local_storage);
}
/// \brief Scatters items to a striped arrangement based on their ranks
/// across the thread block, with a flag to denote validity, using temporary
/// storage.
///
/// \tparam U - [inferred] the output type.
/// \tparam Offset - [inferred] the rank type.
/// \tparam ValidFlag - [inferred] the validity flag type.
///
/// \param [in] input - array that data is loaded from.
/// \param [out] output - array that data is loaded to.
/// \param [in] ranks - array that has rank of data.
/// \param [in] is_valid - array that has flags to denote validity.
/// \param [in] storage - reference to a temporary storage object of type storage_type.
///
/// \par Storage reusage
/// Synchronization barrier should be placed before \p storage is reused
/// or repurposed: \p __syncthreads() or \p rocprim::syncthreads().
///
/// \par Example.
/// \code{.cpp}
/// __global__ void example_kernel(...)
/// {
/// // specialize block_exchange for int, block of 128 threads and 8 items per thread
/// using block_exchange_int = rocprim::block_exchange<int, 128, 8>;
/// // allocate storage in shared memory
/// __shared__ block_exchange_int::storage_type storage;
///
/// int items[8];
/// int ranks[8];
/// int flags[8];
/// ...
/// block_exchange_int b_exchange;
/// b_exchange.scatter_to_striped_flagged(items, items, ranks, flags, storage);
/// ...
/// }
/// \endcode
template<class U, class Offset, class ValidFlag>
ROCPRIM_DEVICE ROCPRIM_INLINE
void scatter_to_striped_flagged(const T (&input)[ItemsPerThread],
                                U (&output)[ItemsPerThread],
                                const Offset (&ranks)[ItemsPerThread],
                                const ValidFlag (&is_valid)[ItemsPerThread],
                                storage_type& storage)
{
    const unsigned int tid
        = ::rocprim::flat_block_thread_id<BlockSizeX, BlockSizeY, BlockSizeZ>();
    storage_type_& smem = storage.get();

    // Phase 1: only items whose validity flag is set are written, each to the
    // slot selected by its rank.
    // The buffer is addressed directly here, without the index() remapping.
    for(unsigned int item = 0; item < ItemsPerThread; ++item)
    {
        const Offset destination = ranks[item];
        if(is_valid[item])
        {
            smem.buffer[destination] = input[item];
        }
    }

    // Block-wide barrier between the write phase and the read phase.
    ::rocprim::syncthreads();

    // Phase 2: read back with a stride of BlockSize (striped layout).
    for(unsigned int item = 0; item < ItemsPerThread; ++item)
    {
        output[item] = smem.buffer[item * BlockSize + tid];
    }
}
private:
ROCPRIM_DEVICE ROCPRIM_INLINE
unsigned int get_current_warp_size() const
{
    // Threads left over after filling whole warps; zero when BlockSize is a
    // multiple of warp_size.
    constexpr unsigned int leftover_threads = BlockSize % warp_size;
    // Size of the last warp: the leftover count if there is one, otherwise a full warp.
    constexpr unsigned int last_warp_threads
        = leftover_threads > 0 ? leftover_threads : warp_size;

    const bool in_last_warp
        = ::rocprim::warp_id<BlockSizeX, BlockSizeY, BlockSizeZ>() == warps_no - 1;
    if(!in_last_warp)
    {
        return warp_size;
    }
    return last_warp_threads;
}
// Remaps a linear buffer position to minimize LDS bank conflicts. When
// has_bank_conflicts is false the position is returned unchanged.
ROCPRIM_DEVICE ROCPRIM_INLINE
unsigned int index(unsigned int n)
{
    if(!has_bank_conflicts)
    {
        return n;
    }
    // Shift by one extra item for every full "row" of banks_no items that
    // precedes n (the original note describes a row as 32 banks * 4 bytes).
    return n + n / banks_no;
}
};
END_ROCPRIM_NAMESPACE
/// @}
// end of group blockmodule
#endif // ROCPRIM_BLOCK_BLOCK_EXCHANGE_HPP_
// Copyright (c) 2017-2021 Advanced Micro Devices, Inc. All rights reserved.
//
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to deal
// in the Software without restriction, including without limitation the rights
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
// copies of the Software, and to permit persons to whom the Software is
// furnished to do so, subject to the following conditions:
//
// The above copyright notice and this permission notice shall be included in
// all copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
// THE SOFTWARE.
#ifndef ROCPRIM_BLOCK_BLOCK_HISTOGRAM_HPP_
#define ROCPRIM_BLOCK_BLOCK_HISTOGRAM_HPP_
#include <type_traits>
#include "../config.hpp"
#include "../detail/various.hpp"
#include "../intrinsics.hpp"
#include "../functional.hpp"
#include "detail/block_histogram_atomic.hpp"
#include "detail/block_histogram_sort.hpp"
BEGIN_ROCPRIM_NAMESPACE
/// \addtogroup blockmodule
/// @{
/// \brief Algorithms that the block_histogram primitive can be instantiated with.
enum class block_histogram_algorithm
{
    /// Bin counters are updated directly with atomic additions.
    /// \par Performance Notes:
    /// * Performance depends on the hardware implementation of atomic addition.
    /// * Performance may decrease for non-uniform random input distributions,
    /// where many concurrent updates may target the same bin counter.
    using_atomic,
    /// The histogram is built in two phases:
    /// * The data is sorted using radix-sort.
    /// * "Runs" of same-valued keys are detected using discontinuity; the
    /// run-lengths are the bin counts.
    /// \par Performance Notes:
    /// * Performance is consistent regardless of the sample bin distribution.
    using_sort,
    /// \brief Default block_histogram algorithm (same value as \p using_atomic).
    default_algorithm = using_atomic
};
namespace detail
{

// Compile-time selector: maps a block_histogram_algorithm enumerator to the class
// template implementing it, exposed as the nested alias template `type`.
// The primary template is only declared; each supported enumerator has an
// explicit specialization below.
template<block_histogram_algorithm Algorithm>
struct select_block_histogram_impl;

// using_atomic selects block_histogram_atomic.
template<>
struct select_block_histogram_impl<block_histogram_algorithm::using_atomic>
{
    template<
        class T,
        unsigned int BlockSizeX,
        unsigned int BlockSizeY,
        unsigned int BlockSizeZ,
        unsigned int ItemsPerThread,
        unsigned int Bins
    >
    using type = block_histogram_atomic<T, BlockSizeX, BlockSizeY, BlockSizeZ, ItemsPerThread, Bins>;
};

// using_sort selects block_histogram_sort.
template<>
struct select_block_histogram_impl<block_histogram_algorithm::using_sort>
{
    template<
        class T,
        unsigned int BlockSizeX,
        unsigned int BlockSizeY,
        unsigned int BlockSizeZ,
        unsigned int ItemsPerThread,
        unsigned int Bins
    >
    using type = block_histogram_sort<T, BlockSizeX, BlockSizeY, BlockSizeZ, ItemsPerThread, Bins>;
};

} // end namespace detail
/// \brief The block_histogram class is a block level parallel primitive which provides
/// methods for constructing block-wide histograms from items partitioned across the
/// threads of a block.
///
/// \tparam T - the input/output type.
/// \tparam BlockSizeX - the size of the thread block along the x dimension.
/// \tparam ItemsPerThread - the number of items to be processed by each thread.
/// \tparam Bins - the number of bins within the histogram.
/// \tparam Algorithm - selected histogram algorithm,
/// block_histogram_algorithm::default_algorithm by default.
/// \tparam BlockSizeY - the size of the thread block along the y dimension, 1 by default.
/// \tparam BlockSizeZ - the size of the thread block along the z dimension, 1 by default.
///
/// \par Overview
/// * block_histogram has two alternative implementations:
/// \p block_histogram_algorithm::using_atomic and \p block_histogram_algorithm::using_sort.
/// * The number of threads in the block is <tt>BlockSizeX * BlockSizeY * BlockSizeZ</tt>.
///
/// \par Examples
/// \parblock
/// In the example a histogram with 192 bins is built by a block of 192 threads. Each
/// thread provides two \p int values and the bin counts are written to the \p hist array.
///
/// \code{.cpp}
/// __global__ void example_kernel(...)
/// {
///     // specialize block_histogram for int, logical block of 192 threads,
///     // 2 items per thread and 192 bins.
///     using block_histogram_int = rocprim::block_histogram<int, 192, 2, 192>;
///     // allocate storage in shared memory
///     __shared__ block_histogram_int::storage_type storage;
///     __shared__ int hist[192];
///
///     int value[2];
///     ...
///     // execute histogram
///     block_histogram_int().histogram(
///         value, // input
///         hist, // output
///         storage
///     );
///     ...
/// }
/// \endcode
/// \endparblock
template<
    class T,
    unsigned int BlockSizeX,
    unsigned int ItemsPerThread,
    unsigned int Bins,
    block_histogram_algorithm Algorithm = block_histogram_algorithm::default_algorithm,
    unsigned int BlockSizeY = 1,
    unsigned int BlockSizeZ = 1
>
class block_histogram
#ifndef DOXYGEN_SHOULD_SKIP_THIS
    : private detail::select_block_histogram_impl<Algorithm>::template type<T, BlockSizeX, BlockSizeY, BlockSizeZ, ItemsPerThread, Bins>
#endif
{
    // Implementation class chosen by Algorithm; composite() calls are forwarded to it.
    using base_type = typename detail::select_block_histogram_impl<Algorithm>::template type<T, BlockSizeX, BlockSizeY, BlockSizeZ, ItemsPerThread, Bins>;

    // Total number of threads in the block.
    static constexpr unsigned int BlockSize = BlockSizeX * BlockSizeY * BlockSizeZ;

public:
    /// \brief Struct used to allocate a temporary memory that is required for thread
    /// communication during operations provided by related parallel primitive.
    ///
    /// Depending on the implementation the operations exposed by parallel primitive may
    /// require a temporary storage for thread communication. The storage should be
    /// allocated using keywords <tt>__shared__</tt>. It can be aliased to
    /// an externally allocated memory, or be a part of a union type with other storage
    /// types to increase shared memory reusability.
    using storage_type = typename base_type::storage_type;

    /// \brief Initialize histogram counters to zero.
    ///
    /// Every bin of \p hist is assigned a value-initialized \p Counter. The bins are
    /// distributed over the threads of the block; no synchronization is performed here.
    ///
    /// \tparam Counter - [inferred] counter type of histogram.
    ///
    /// \param [out] hist - histogram bin count.
    template<class Counter>
    ROCPRIM_DEVICE ROCPRIM_INLINE
    void init_histogram(Counter hist[Bins])
    {
        const auto thread_rank = ::rocprim::flat_block_thread_id<BlockSizeX, BlockSizeY, BlockSizeZ>();

        // Walk the bins in chunks of BlockSize; each thread clears one bin per chunk.
        ROCPRIM_UNROLL
        for(unsigned int chunk_begin = 0; chunk_begin < Bins; chunk_begin += BlockSize)
        {
            const unsigned int bin = chunk_begin + thread_rank;
            // The last chunk may be partial when Bins is not a multiple of BlockSize.
            if(bin < Bins)
            {
                hist[bin] = Counter();
            }
        }
    }

    /// \brief Update an existing block-wide histogram. Each thread composites an array of
    /// input elements.
    ///
    /// \tparam Counter - [inferred] counter type of histogram.
    ///
    /// \param [in] input - reference to an array containing thread input values.
    /// \param [out] hist - histogram bin count.
    /// \param [in] storage - reference to a temporary storage object of type storage_type.
    ///
    /// \par Storage reusage
    /// Synchronization barrier should be placed before \p storage is reused
    /// or repurposed: \p __syncthreads() or \p rocprim::syncthreads().
    ///
    /// \par Examples
    /// \parblock
    /// In the example a histogram with 192 bins is first cleared and then updated by a
    /// block of 192 threads, each of which provides two \p int values.
    ///
    /// \code{.cpp}
    /// __global__ void example_kernel(...)
    /// {
    ///     // specialize block_histogram for int, logical block of 192 threads,
    ///     // 2 items per thread and 192 bins.
    ///     using block_histogram_int = rocprim::block_histogram<int, 192, 2, 192>;
    ///     // allocate storage in shared memory
    ///     __shared__ block_histogram_int::storage_type storage;
    ///     __shared__ int hist[192];
    ///
    ///     int value[2];
    ///     ...
    ///     // initialize histogram
    ///     block_histogram_int().init_histogram(
    ///         hist // output
    ///     );
    ///
    ///     rocprim::syncthreads();
    ///
    ///     // update histogram
    ///     block_histogram_int().composite(
    ///         value, // input
    ///         hist, // output
    ///         storage
    ///     );
    ///     ...
    /// }
    /// \endcode
    /// \endparblock
    template<class Counter>
    ROCPRIM_DEVICE ROCPRIM_INLINE
    void composite(T (&input)[ItemsPerThread],
                   Counter hist[Bins],
                   storage_type& storage)
    {
        // Forwarded unchanged to the implementation selected by Algorithm.
        base_type::composite(input, hist, storage);
    }

    /// \overload
    /// \brief Update an existing block-wide histogram. Each thread composites an array of
    /// input elements.
    ///
    /// * This overload does not accept storage argument. Required shared memory is
    /// allocated by the method itself.
    ///
    /// \tparam Counter - [inferred] counter type of histogram.
    ///
    /// \param [in] input - reference to an array containing thread input values.
    /// \param [out] hist - histogram bin count.
    template<class Counter>
    ROCPRIM_DEVICE ROCPRIM_FORCE_INLINE
    void composite(T (&input)[ItemsPerThread],
                   Counter hist[Bins])
    {
        // Forwarded unchanged to the implementation selected by Algorithm.
        base_type::composite(input, hist);
    }

    /// \brief Construct a new block-wide histogram. Each thread contributes an array of
    /// input elements.
    ///
    /// The bins are cleared with init_histogram(), the block is synchronized, and the
    /// input is then added with composite().
    ///
    /// \tparam Counter - [inferred] counter type of histogram.
    ///
    /// \param [in] input - reference to an array containing thread input values.
    /// \param [out] hist - histogram bin count.
    /// \param [in] storage - reference to a temporary storage object of type storage_type.
    ///
    /// \par Storage reusage
    /// Synchronization barrier should be placed before \p storage is reused
    /// or repurposed: \p __syncthreads() or \p rocprim::syncthreads().
    ///
    /// \par Examples
    /// \parblock
    /// In the example a histogram with 192 bins is built by a block of 192 threads. Each
    /// thread provides two \p int values and the bin counts are written to the \p hist array.
    ///
    /// \code{.cpp}
    /// __global__ void example_kernel(...)
    /// {
    ///     // specialize block_histogram for int, logical block of 192 threads,
    ///     // 2 items per thread and 192 bins.
    ///     using block_histogram_int = rocprim::block_histogram<int, 192, 2, 192>;
    ///     // allocate storage in shared memory
    ///     __shared__ block_histogram_int::storage_type storage;
    ///     __shared__ int hist[192];
    ///
    ///     int value[2];
    ///     ...
    ///     // execute histogram
    ///     block_histogram_int().histogram(
    ///         value, // input
    ///         hist, // output
    ///         storage
    ///     );
    ///     ...
    /// }
    /// \endcode
    /// \endparblock
    template<class Counter>
    ROCPRIM_DEVICE ROCPRIM_INLINE
    void histogram(T (&input)[ItemsPerThread],
                   Counter hist[Bins],
                   storage_type& storage)
    {
        init_histogram(hist);
        // All bins must be cleared before any thread starts counting into them.
        ::rocprim::syncthreads();
        composite(input, hist, storage);
    }

    /// \overload
    /// \brief Construct a new block-wide histogram. Each thread contributes an array of
    /// input elements.
    ///
    /// * This overload does not accept storage argument. Required shared memory is
    /// allocated by the method itself.
    ///
    /// \tparam Counter - [inferred] counter type of histogram.
    ///
    /// \param [in] input - reference to an array containing thread input values.
    /// \param [out] hist - histogram bin count.
    template<class Counter>
    ROCPRIM_DEVICE ROCPRIM_FORCE_INLINE
    void histogram(T (&input)[ItemsPerThread],
                   Counter hist[Bins])
    {
        init_histogram(hist);
        // All bins must be cleared before any thread starts counting into them.
        ::rocprim::syncthreads();
        composite(input, hist);
    }
};
END_ROCPRIM_NAMESPACE
/// @}
// end of group blockmodule
#endif // ROCPRIM_BLOCK_BLOCK_HISTOGRAM_HPP_
// Copyright (c) 2017-2021 Advanced Micro Devices, Inc. All rights reserved.
//
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to deal
// in the Software without restriction, including without limitation the rights
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
// copies of the Software, and to permit persons to whom the Software is
// furnished to do so, subject to the following conditions:
//
// The above copyright notice and this permission notice shall be included in
// all copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
// THE SOFTWARE.
#ifndef ROCPRIM_BLOCK_BLOCK_LOAD_HPP_
#define ROCPRIM_BLOCK_BLOCK_LOAD_HPP_
#include "../config.hpp"
#include "../detail/various.hpp"
#include "../intrinsics.hpp"
#include "../functional.hpp"
#include "../types.hpp"
#include "block_load_func.hpp"
#include "block_exchange.hpp"
/// \addtogroup blockmodule
/// @{
BEGIN_ROCPRIM_NAMESPACE
/// \brief \p block_load_method enumerates the methods available to load data
/// from continuous memory into a blocked arrangement of items across the thread block.
enum class block_load_method
{
    /// Data from continuous memory is loaded into a blocked arrangement of items.
    /// \par Performance Notes:
    /// * Performance decreases as the number of items per thread (the stride
    /// between reads) grows, because memory coalescing is reduced.
    block_load_direct,
    /// A striped arrangement of data is read directly from memory.
    block_load_striped,
    /// Data from continuous memory is loaded into a blocked arrangement of items
    /// using vectorization as an optimization.
    /// \par Performance Notes:
    /// * Performance remains high due to increased memory coalescing, provided that
    /// vectorization requirements are fulfilled. Otherwise, performance will default
    /// to \p block_load_direct.
    /// \par Requirements:
    /// * The input offset (\p block_input) must be quad-item aligned.
    /// * The following conditions will prevent vectorization and switch to default
    /// \p block_load_direct:
    ///   * \p ItemsPerThread is odd.
    ///   * The datatype \p T is not a primitive or a HIP vector type (e.g. int2,
    ///   int4, etc.).
    block_load_vectorize,
    /// A striped arrangement of data from continuous memory is locally transposed
    /// into a blocked arrangement of items.
    /// \par Performance Notes:
    /// * Performance remains high due to increased memory coalescing, regardless of the
    /// number of items per thread.
    /// * Performance may be better compared to \p block_load_direct and
    /// \p block_load_vectorize due to reordering on local memory.
    block_load_transpose,
    /// A warp-striped arrangement of data from continuous memory is locally transposed
    /// into a blocked arrangement of items.
    /// \par Requirements:
    /// * The number of threads in the block must be a multiple of the size of hardware warp.
    /// \par Performance Notes:
    /// * Performance remains high due to increased memory coalescing, regardless of the
    /// number of items per thread.
    /// * Performance may be better compared to \p block_load_direct and
    /// \p block_load_vectorize due to reordering on local memory.
    block_load_warp_transpose,
    /// Defaults to \p block_load_direct (same value).
    default_method = block_load_direct
};
/// \brief The \p block_load class is a block level parallel primitive which provides methods
/// for loading data from continuous memory into a blocked arrangement of items across the
/// thread block.
///
/// \tparam T - the input/output type.
/// \tparam BlockSizeX - the size of the thread block along the x dimension.
/// \tparam ItemsPerThread - the number of items to be processed by
/// each thread.
/// \tparam Method - the method to load data, block_load_method::block_load_direct by default.
/// \tparam BlockSizeY - the size of the thread block along the y dimension, 1 by default.
/// \tparam BlockSizeZ - the size of the thread block along the z dimension, 1 by default.
///
/// \par Overview
/// * The \p block_load class has a number of different methods to load data:
///   * [block_load_direct](\ref ::block_load_method::block_load_direct)
///   * [block_load_striped](\ref ::block_load_method::block_load_striped)
///   * [block_load_vectorize](\ref ::block_load_method::block_load_vectorize)
///   * [block_load_transpose](\ref ::block_load_method::block_load_transpose)
///   * [block_load_warp_transpose](\ref ::block_load_method::block_load_warp_transpose)
///
/// \par Example:
/// \parblock
/// In the examples load operation is performed on block of 128 threads, using type
/// \p int and 8 items per thread.
///
/// \code{.cpp}
/// __global__ void example_kernel(int * input, ...)
/// {
///     const int offset = blockIdx.x * 128 * 8;
///     int items[8];
///     rocprim::block_load<int, 128, 8, load_method> blockload;
///     blockload.load(input + offset, items);
///     ...
/// }
/// \endcode
/// \endparblock
template<
    class T,
    unsigned int BlockSizeX,
    unsigned int ItemsPerThread,
    block_load_method Method = block_load_method::block_load_direct,
    unsigned int BlockSizeY = 1,
    unsigned int BlockSizeZ = 1
>
class block_load
{
private:
    using storage_type_ = typename ::rocprim::detail::empty_storage_type;

public:
    /// \brief Struct used to allocate a temporary memory that is required for thread
    /// communication during operations provided by related parallel primitive.
    ///
    /// Depending on the implementation the operations exposed by parallel primitive may
    /// require a temporary storage for thread communication. The storage should be allocated
    /// using keywords \p __shared__. It can be aliased to
    /// an externally allocated memory, or be a part of a union with other storage types
    /// to increase shared memory reusability.
    #ifndef DOXYGEN_SHOULD_SKIP_THIS // hides storage_type implementation for Doxygen
    using storage_type = typename ::rocprim::detail::empty_storage_type;
    #else
    using storage_type = storage_type_; // only for Doxygen
    #endif

    /// \brief Loads data from continuous memory into an arrangement of items across the
    /// thread block.
    ///
    /// \tparam InputIterator - [inferred] an iterator type for input (can be a simple
    /// pointer).
    ///
    /// \param [in] block_input - the input iterator from the thread block to load from.
    /// \param [out] items - array that data is loaded to.
    ///
    /// \par Overview
    /// * The type \p T must be such that an object of type \p InputIterator
    /// can be dereferenced and then implicitly converted to \p T.
    template<class InputIterator>
    ROCPRIM_DEVICE ROCPRIM_INLINE
    void load(InputIterator block_input,
              T (&items)[ItemsPerThread])
    {
        using input_value_type = typename std::iterator_traits<InputIterator>::value_type;
        static_assert(std::is_convertible<input_value_type, T>::value,
                      "The type T must be such that an object of type InputIterator "
                      "can be dereferenced and then implicitly converted to T.");

        const unsigned int thread_rank
            = ::rocprim::flat_block_thread_id<BlockSizeX, BlockSizeY, BlockSizeZ>();
        block_load_direct_blocked(thread_rank, block_input, items);
    }

    /// \brief Loads data from continuous memory into an arrangement of items across the
    /// thread block, which is guarded by range \p valid.
    ///
    /// \tparam InputIterator - [inferred] an iterator type for input (can be a simple
    /// pointer).
    ///
    /// \param [in] block_input - the input iterator from the thread block to load from.
    /// \param [out] items - array that data is loaded to.
    /// \param [in] valid - maximum range of valid numbers to load.
    ///
    /// \par Overview
    /// * The type \p T must be such that an object of type \p InputIterator
    /// can be dereferenced and then implicitly converted to \p T.
    template<class InputIterator>
    ROCPRIM_DEVICE ROCPRIM_INLINE
    void load(InputIterator block_input,
              T (&items)[ItemsPerThread],
              unsigned int valid)
    {
        using input_value_type = typename std::iterator_traits<InputIterator>::value_type;
        static_assert(std::is_convertible<input_value_type, T>::value,
                      "The type T must be such that an object of type InputIterator "
                      "can be dereferenced and then implicitly converted to T.");

        const unsigned int thread_rank
            = ::rocprim::flat_block_thread_id<BlockSizeX, BlockSizeY, BlockSizeZ>();
        block_load_direct_blocked(thread_rank, block_input, items, valid);
    }

    /// \brief Loads data from continuous memory into an arrangement of items across the
    /// thread block, which is guarded by range with a fall-back value for out-of-bound
    /// elements.
    ///
    /// \tparam InputIterator - [inferred] an iterator type for input (can be a simple
    /// pointer).
    /// \tparam Default - [inferred] The data type of the default value.
    ///
    /// \param [in] block_input - the input iterator from the thread block to load from.
    /// \param [out] items - array that data is loaded to.
    /// \param [in] valid - maximum range of valid numbers to load.
    /// \param [in] out_of_bounds - default value assigned to out-of-bound items.
    ///
    /// \par Overview
    /// * The type \p T must be such that an object of type \p InputIterator
    /// can be dereferenced and then implicitly converted to \p T.
    template<
        class InputIterator,
        class Default
    >
    ROCPRIM_DEVICE ROCPRIM_INLINE
    void load(InputIterator block_input,
              T (&items)[ItemsPerThread],
              unsigned int valid,
              Default out_of_bounds)
    {
        using input_value_type = typename std::iterator_traits<InputIterator>::value_type;
        static_assert(std::is_convertible<input_value_type, T>::value,
                      "The type T must be such that an object of type InputIterator "
                      "can be dereferenced and then implicitly converted to T.");

        const unsigned int thread_rank
            = ::rocprim::flat_block_thread_id<BlockSizeX, BlockSizeY, BlockSizeZ>();
        block_load_direct_blocked(thread_rank, block_input, items, valid, out_of_bounds);
    }

    /// \brief Loads data from continuous memory into an arrangement of items across the
    /// thread block, using temporary storage.
    ///
    /// \tparam InputIterator - [inferred] an iterator type for input (can be a simple
    /// pointer).
    ///
    /// \param [in] block_input - the input iterator from the thread block to load from.
    /// \param [out] items - array that data is loaded to.
    /// \param [in] storage - temporary storage for inputs.
    ///
    /// \par Overview
    /// * The type \p T must be such that an object of type \p InputIterator
    /// can be dereferenced and then implicitly converted to \p T.
    ///
    /// \par Storage reusage
    /// Synchronization barrier should be placed before \p storage is reused
    /// or repurposed: \p __syncthreads() or \p rocprim::syncthreads().
    ///
    /// \par Example.
    /// \code{.cpp}
    /// __global__ void example_kernel(...)
    /// {
    ///     int items[8];
    ///     using block_load_int = rocprim::block_load<int, 128, 8>;
    ///     block_load_int bload;
    ///     __shared__ typename block_load_int::storage_type storage;
    ///     bload.load(..., items, storage);
    ///     ...
    /// }
    /// \endcode
    template<class InputIterator>
    ROCPRIM_DEVICE ROCPRIM_INLINE
    void load(InputIterator block_input,
              T (&items)[ItemsPerThread],
              storage_type& storage)
    {
        using input_value_type = typename std::iterator_traits<InputIterator>::value_type;
        static_assert(std::is_convertible<input_value_type, T>::value,
                      "The type T must be such that an object of type InputIterator "
                      "can be dereferenced and then implicitly converted to T.");

        // The storage argument is not used here; the call is forwarded to the
        // overload without storage.
        static_cast<void>(storage);
        load(block_input, items);
    }

    /// \brief Loads data from continuous memory into an arrangement of items across the
    /// thread block, which is guarded by range \p valid, using temporary storage.
    ///
    /// \tparam InputIterator - [inferred] an iterator type for input (can be a simple
    /// pointer).
    ///
    /// \param [in] block_input - the input iterator from the thread block to load from.
    /// \param [out] items - array that data is loaded to.
    /// \param [in] valid - maximum range of valid numbers to load.
    /// \param [in] storage - temporary storage for inputs.
    ///
    /// \par Overview
    /// * The type \p T must be such that an object of type \p InputIterator
    /// can be dereferenced and then implicitly converted to \p T.
    ///
    /// \par Storage reusage
    /// Synchronization barrier should be placed before \p storage is reused
    /// or repurposed: \p __syncthreads() or \p rocprim::syncthreads().
    ///
    /// \par Example.
    /// \code{.cpp}
    /// __global__ void example_kernel(...)
    /// {
    ///     int items[8];
    ///     using block_load_int = rocprim::block_load<int, 128, 8>;
    ///     block_load_int bload;
    ///     __shared__ typename block_load_int::storage_type storage;
    ///     bload.load(..., items, valid, storage);
    ///     ...
    /// }
    /// \endcode
    template<class InputIterator>
    ROCPRIM_DEVICE ROCPRIM_INLINE
    void load(InputIterator block_input,
              T (&items)[ItemsPerThread],
              unsigned int valid,
              storage_type& storage)
    {
        using input_value_type = typename std::iterator_traits<InputIterator>::value_type;
        static_assert(std::is_convertible<input_value_type, T>::value,
                      "The type T must be such that an object of type InputIterator "
                      "can be dereferenced and then implicitly converted to T.");

        // The storage argument is not used here; the call is forwarded to the
        // overload without storage.
        static_cast<void>(storage);
        load(block_input, items, valid);
    }

    /// \brief Loads data from continuous memory into an arrangement of items across the
    /// thread block, which is guarded by range with a fall-back value for out-of-bound
    /// elements, using temporary storage.
    ///
    /// \tparam InputIterator - [inferred] an iterator type for input (can be a simple
    /// pointer).
    /// \tparam Default - [inferred] The data type of the default value.
    ///
    /// \param [in] block_input - the input iterator from the thread block to load from.
    /// \param [out] items - array that data is loaded to.
    /// \param [in] valid - maximum range of valid numbers to load.
    /// \param [in] out_of_bounds - default value assigned to out-of-bound items.
    /// \param [in] storage - temporary storage for inputs.
    ///
    /// \par Overview
    /// * The type \p T must be such that an object of type \p InputIterator
    /// can be dereferenced and then implicitly converted to \p T.
    ///
    /// \par Storage reusage
    /// Synchronization barrier should be placed before \p storage is reused
    /// or repurposed: \p __syncthreads() or \p rocprim::syncthreads().
    ///
    /// \par Example.
    /// \code{.cpp}
    /// __global__ void example_kernel(...)
    /// {
    ///     int items[8];
    ///     using block_load_int = rocprim::block_load<int, 128, 8>;
    ///     block_load_int bload;
    ///     __shared__ typename block_load_int::storage_type storage;
    ///     bload.load(..., items, valid, out_of_bounds, storage);
    ///     ...
    /// }
    /// \endcode
    template<
        class InputIterator,
        class Default
    >
    ROCPRIM_DEVICE ROCPRIM_INLINE
    void load(InputIterator block_input,
              T (&items)[ItemsPerThread],
              unsigned int valid,
              Default out_of_bounds,
              storage_type& storage)
    {
        using input_value_type = typename std::iterator_traits<InputIterator>::value_type;
        static_assert(std::is_convertible<input_value_type, T>::value,
                      "The type T must be such that an object of type InputIterator "
                      "can be dereferenced and then implicitly converted to T.");

        // The storage argument is not used here; the call is forwarded to the
        // overload without storage.
        static_cast<void>(storage);
        load(block_input, items, valid, out_of_bounds);
    }
};
/// @}
// end of group blockmodule
#ifndef DOXYGEN_SHOULD_SKIP_THIS
// Partial specialization of block_load for block_load_method::block_load_striped.
// Every load() overload forwards to block_load_direct_striped<BlockSize>, passing the
// flat thread id of the calling thread. storage_type is an empty type and the
// overloads that accept it ignore the argument.
template<
    class T,
    unsigned int BlockSizeX,
    unsigned int ItemsPerThread,
    unsigned int BlockSizeY,
    unsigned int BlockSizeZ
>
class block_load<T, BlockSizeX, ItemsPerThread, block_load_method::block_load_striped, BlockSizeY, BlockSizeZ>
{
    // Total number of threads in the block.
    static constexpr unsigned int BlockSize = BlockSizeX * BlockSizeY * BlockSizeZ;

private:
    using storage_type_ = typename ::rocprim::detail::empty_storage_type;

public:
    #ifndef DOXYGEN_SHOULD_SKIP_THIS // hides storage_type implementation for Doxygen
    using storage_type = typename ::rocprim::detail::empty_storage_type;
    #else
    using storage_type = storage_type_; // only for Doxygen
    #endif

    // Loads ItemsPerThread items for the calling thread from block_input into items.
    // InputIterator's value_type must be implicitly convertible to T.
    template<class InputIterator>
    ROCPRIM_DEVICE ROCPRIM_INLINE
    void load(InputIterator block_input,
              T (&items)[ItemsPerThread])
    {
        using value_type = typename std::iterator_traits<InputIterator>::value_type;
        static_assert(std::is_convertible<value_type, T>::value,
                      "The type T must be such that an object of type InputIterator "
                      "can be dereferenced and then implicitly converted to T.");
        const unsigned int flat_id = ::rocprim::flat_block_thread_id<BlockSizeX, BlockSizeY, BlockSizeZ>();
        block_load_direct_striped<BlockSize>(flat_id, block_input, items);
    }

    // Same as load(block_input, items), with the load guarded by the range valid.
    template<class InputIterator>
    ROCPRIM_DEVICE ROCPRIM_INLINE
    void load(InputIterator block_input,
              T (&items)[ItemsPerThread],
              unsigned int valid)
    {
        using value_type = typename std::iterator_traits<InputIterator>::value_type;
        static_assert(std::is_convertible<value_type, T>::value,
                      "The type T must be such that an object of type InputIterator "
                      "can be dereferenced and then implicitly converted to T.");
        const unsigned int flat_id = ::rocprim::flat_block_thread_id<BlockSizeX, BlockSizeY, BlockSizeZ>();
        block_load_direct_striped<BlockSize>(flat_id, block_input, items, valid);
    }

    // Guarded load that additionally passes out_of_bounds, the fall-back value for
    // out-of-bound items, to block_load_direct_striped.
    template<
        class InputIterator,
        class Default
    >
    ROCPRIM_DEVICE ROCPRIM_INLINE
    void load(InputIterator block_input,
              T (&items)[ItemsPerThread],
              unsigned int valid,
              Default out_of_bounds)
    {
        using value_type = typename std::iterator_traits<InputIterator>::value_type;
        static_assert(std::is_convertible<value_type, T>::value,
                      "The type T must be such that an object of type InputIterator "
                      "can be dereferenced and then implicitly converted to T.");
        const unsigned int flat_id = ::rocprim::flat_block_thread_id<BlockSizeX, BlockSizeY, BlockSizeZ>();
        block_load_direct_striped<BlockSize>(flat_id, block_input, items, valid,
                                             out_of_bounds);
    }

    // Storage-taking variant of load(block_input, items); storage is ignored.
    template<class InputIterator>
    ROCPRIM_DEVICE ROCPRIM_INLINE
    void load(InputIterator block_input,
              T (&items)[ItemsPerThread],
              storage_type& storage)
    {
        using value_type = typename std::iterator_traits<InputIterator>::value_type;
        static_assert(std::is_convertible<value_type, T>::value,
                      "The type T must be such that an object of type InputIterator "
                      "can be dereferenced and then implicitly converted to T.");
        (void) storage;
        const unsigned int flat_id = ::rocprim::flat_block_thread_id<BlockSizeX, BlockSizeY, BlockSizeZ>();
        block_load_direct_striped<BlockSize>(flat_id, block_input, items);
    }

    // Storage-taking variant of load(block_input, items, valid); storage is ignored.
    template<class InputIterator>
    ROCPRIM_DEVICE ROCPRIM_INLINE
    void load(InputIterator block_input,
              T (&items)[ItemsPerThread],
              unsigned int valid,
              storage_type& storage)
    {
        using value_type = typename std::iterator_traits<InputIterator>::value_type;
        static_assert(std::is_convertible<value_type, T>::value,
                      "The type T must be such that an object of type InputIterator "
                      "can be dereferenced and then implicitly converted to T.");
        (void) storage;
        const unsigned int flat_id = ::rocprim::flat_block_thread_id<BlockSizeX, BlockSizeY, BlockSizeZ>();
        block_load_direct_striped<BlockSize>(flat_id, block_input, items, valid);
    }

    // Storage-taking variant of load(block_input, items, valid, out_of_bounds);
    // storage is ignored.
    template<
        class InputIterator,
        class Default
    >
    ROCPRIM_DEVICE ROCPRIM_INLINE
    void load(InputIterator block_input,
              T (&items)[ItemsPerThread],
              unsigned int valid,
              Default out_of_bounds,
              storage_type& storage)
    {
        using value_type = typename std::iterator_traits<InputIterator>::value_type;
        static_assert(std::is_convertible<value_type, T>::value,
                      "The type T must be such that an object of type InputIterator "
                      "can be dereferenced and then implicitly converted to T.");
        (void) storage;
        const unsigned int flat_id = ::rocprim::flat_block_thread_id<BlockSizeX, BlockSizeY, BlockSizeZ>();
        block_load_direct_striped<BlockSize>(flat_id, block_input, items, valid,
                                             out_of_bounds);
    }
};
// block_load specialization for block_load_method::block_load_vectorize.
//
// The unguarded overload taking a raw T* forwards to
// block_load_direct_blocked_vectorized; every other overload forwards to
// block_load_direct_blocked. storage_type is
// ::rocprim::detail::empty_storage_type, and the overloads that accept a
// storage argument ignore it and delegate to the storage-free overloads.
template<
    class T,
    unsigned int BlockSizeX,
    unsigned int ItemsPerThread,
    unsigned int BlockSizeY,
    unsigned int BlockSizeZ
>
class block_load<T, BlockSizeX, ItemsPerThread, block_load_method::block_load_vectorize, BlockSizeY, BlockSizeZ>
{
private:
    using storage_type_ = typename ::rocprim::detail::empty_storage_type;

    // Type obtained by dereferencing an InputIterator.
    template<class InputIterator>
    using input_value_type = typename std::iterator_traits<InputIterator>::value_type;

public:
#ifndef DOXYGEN_SHOULD_SKIP_THIS // hides storage_type implementation for Doxygen
    using storage_type = typename ::rocprim::detail::empty_storage_type;
#else
    using storage_type = storage_type_; // only for Doxygen
#endif

    // Unguarded load from a raw pointer (vectorized path).
    ROCPRIM_DEVICE ROCPRIM_INLINE
    void load(T* block_input,
              T (&_items)[ItemsPerThread])
    {
        block_load_direct_blocked_vectorized(
            ::rocprim::flat_block_thread_id<BlockSizeX, BlockSizeY, BlockSizeZ>(),
            block_input,
            _items);
    }

    // Unguarded load from an arbitrary iterator into an array of U.
    template<class InputIterator, class U>
    ROCPRIM_DEVICE ROCPRIM_INLINE
    void load(InputIterator block_input,
              U (&items)[ItemsPerThread])
    {
        static_assert(std::is_convertible<input_value_type<InputIterator>, T>::value,
                      "The type T must be such that an object of type InputIterator "
                      "can be dereferenced and then implicitly converted to T.");
        block_load_direct_blocked(
            ::rocprim::flat_block_thread_id<BlockSizeX, BlockSizeY, BlockSizeZ>(),
            block_input,
            items);
    }

    // Load guarded by valid.
    template<class InputIterator>
    ROCPRIM_DEVICE ROCPRIM_INLINE
    void load(InputIterator block_input,
              T (&items)[ItemsPerThread],
              unsigned int valid)
    {
        static_assert(std::is_convertible<input_value_type<InputIterator>, T>::value,
                      "The type T must be such that an object of type InputIterator "
                      "can be dereferenced and then implicitly converted to T.");
        block_load_direct_blocked(
            ::rocprim::flat_block_thread_id<BlockSizeX, BlockSizeY, BlockSizeZ>(),
            block_input,
            items,
            valid);
    }

    // Load guarded by valid, with out_of_bounds forwarded as the default value.
    template<
        class InputIterator,
        class Default
    >
    ROCPRIM_DEVICE ROCPRIM_INLINE
    void load(InputIterator block_input,
              T (&items)[ItemsPerThread],
              unsigned int valid,
              Default out_of_bounds)
    {
        static_assert(std::is_convertible<input_value_type<InputIterator>, T>::value,
                      "The type T must be such that an object of type InputIterator "
                      "can be dereferenced and then implicitly converted to T.");
        block_load_direct_blocked(
            ::rocprim::flat_block_thread_id<BlockSizeX, BlockSizeY, BlockSizeZ>(),
            block_input,
            items,
            valid,
            out_of_bounds);
    }

    // Storage-taking variant of the raw-pointer load; storage is ignored.
    ROCPRIM_DEVICE ROCPRIM_INLINE
    void load(T* block_input,
              T (&items)[ItemsPerThread],
              storage_type& storage)
    {
        static_cast<void>(storage);
        load(block_input, items);
    }

    // Storage-taking variant of the iterator load; storage is ignored.
    template<class InputIterator, class U>
    ROCPRIM_DEVICE ROCPRIM_INLINE
    void load(InputIterator block_input,
              U (&items)[ItemsPerThread],
              storage_type& storage)
    {
        static_assert(std::is_convertible<input_value_type<InputIterator>, T>::value,
                      "The type T must be such that an object of type InputIterator "
                      "can be dereferenced and then implicitly converted to T.");
        static_cast<void>(storage);
        load(block_input, items);
    }

    // Storage-taking variant of the guarded load; storage is ignored.
    template<class InputIterator>
    ROCPRIM_DEVICE ROCPRIM_INLINE
    void load(InputIterator block_input,
              T (&items)[ItemsPerThread],
              unsigned int valid,
              storage_type& storage)
    {
        static_assert(std::is_convertible<input_value_type<InputIterator>, T>::value,
                      "The type T must be such that an object of type InputIterator "
                      "can be dereferenced and then implicitly converted to T.");
        static_cast<void>(storage);
        load(block_input, items, valid);
    }

    // Storage-taking variant of the guarded load with default; storage is ignored.
    template<
        class InputIterator,
        class Default
    >
    ROCPRIM_DEVICE ROCPRIM_INLINE
    void load(InputIterator block_input,
              T (&items)[ItemsPerThread],
              unsigned int valid,
              Default out_of_bounds,
              storage_type& storage)
    {
        static_assert(std::is_convertible<input_value_type<InputIterator>, T>::value,
                      "The type T must be such that an object of type InputIterator "
                      "can be dereferenced and then implicitly converted to T.");
        static_cast<void>(storage);
        load(block_input, items, valid, out_of_bounds);
    }
};
// block_load specialization for block_load_method::block_load_transpose.
//
// Every overload first loads with block_load_direct_striped<BlockSize> and then
// calls block_exchange's striped_to_blocked on the same items array.
// Overloads without a storage parameter declare their own
// ROCPRIM_SHARED_MEMORY storage; the others use the storage passed by the caller.
template<
    class T,
    unsigned int BlockSizeX,
    unsigned int ItemsPerThread,
    unsigned int BlockSizeY,
    unsigned int BlockSizeZ
>
class block_load<T, BlockSizeX, ItemsPerThread, block_load_method::block_load_transpose, BlockSizeY, BlockSizeZ>
{
    // Total number of threads in the block; used as the stride of the striped load.
    static constexpr unsigned int BlockSize = BlockSizeX * BlockSizeY * BlockSizeZ;

private:
    // block_exchange receives the per-dimension block sizes (as the
    // warp_transpose specialization does) instead of the flattened BlockSize,
    // so it is instantiated for the same block shape that
    // flat_block_thread_id<BlockSizeX, BlockSizeY, BlockSizeZ>() is evaluated
    // with in the load functions below.
    using block_exchange_type = block_exchange<T, BlockSizeX, ItemsPerThread, BlockSizeY, BlockSizeZ>;

public:
    // Temporary storage type required by the exchange step.
    using storage_type = typename block_exchange_type::storage_type;

    // Unguarded load using function-local shared storage.
    template<class InputIterator>
    ROCPRIM_DEVICE ROCPRIM_FORCE_INLINE
    void load(InputIterator block_input,
              T (&items)[ItemsPerThread])
    {
        using value_type = typename std::iterator_traits<InputIterator>::value_type;
        static_assert(std::is_convertible<value_type, T>::value,
                      "The type T must be such that an object of type InputIterator "
                      "can be dereferenced and then implicitly converted to T.");
        ROCPRIM_SHARED_MEMORY storage_type storage;
        const unsigned int flat_id = ::rocprim::flat_block_thread_id<BlockSizeX, BlockSizeY, BlockSizeZ>();
        block_load_direct_striped<BlockSize>(flat_id, block_input, items);
        block_exchange_type().striped_to_blocked(items, items, storage);
    }

    // Load guarded by valid, using function-local shared storage.
    template<class InputIterator>
    ROCPRIM_DEVICE ROCPRIM_FORCE_INLINE
    void load(InputIterator block_input,
              T (&items)[ItemsPerThread],
              unsigned int valid)
    {
        using value_type = typename std::iterator_traits<InputIterator>::value_type;
        static_assert(std::is_convertible<value_type, T>::value,
                      "The type T must be such that an object of type InputIterator "
                      "can be dereferenced and then implicitly converted to T.");
        ROCPRIM_SHARED_MEMORY storage_type storage;
        const unsigned int flat_id = ::rocprim::flat_block_thread_id<BlockSizeX, BlockSizeY, BlockSizeZ>();
        block_load_direct_striped<BlockSize>(flat_id, block_input, items, valid);
        block_exchange_type().striped_to_blocked(items, items, storage);
    }

    // Load guarded by valid with a default value, using function-local shared storage.
    template<
        class InputIterator,
        class Default
    >
    ROCPRIM_DEVICE ROCPRIM_FORCE_INLINE
    void load(InputIterator block_input,
              T (&items)[ItemsPerThread],
              unsigned int valid,
              Default out_of_bounds)
    {
        using value_type = typename std::iterator_traits<InputIterator>::value_type;
        static_assert(std::is_convertible<value_type, T>::value,
                      "The type T must be such that an object of type InputIterator "
                      "can be dereferenced and then implicitly converted to T.");
        ROCPRIM_SHARED_MEMORY storage_type storage;
        const unsigned int flat_id = ::rocprim::flat_block_thread_id<BlockSizeX, BlockSizeY, BlockSizeZ>();
        block_load_direct_striped<BlockSize>(flat_id, block_input, items, valid,
                                             out_of_bounds);
        block_exchange_type().striped_to_blocked(items, items, storage);
    }

    // Unguarded load using caller-provided storage.
    template<class InputIterator>
    ROCPRIM_DEVICE ROCPRIM_INLINE
    void load(InputIterator block_input,
              T (&items)[ItemsPerThread],
              storage_type& storage)
    {
        using value_type = typename std::iterator_traits<InputIterator>::value_type;
        static_assert(std::is_convertible<value_type, T>::value,
                      "The type T must be such that an object of type InputIterator "
                      "can be dereferenced and then implicitly converted to T.");
        const unsigned int flat_id = ::rocprim::flat_block_thread_id<BlockSizeX, BlockSizeY, BlockSizeZ>();
        block_load_direct_striped<BlockSize>(flat_id, block_input, items);
        block_exchange_type().striped_to_blocked(items, items, storage);
    }

    // Load guarded by valid, using caller-provided storage.
    template<class InputIterator>
    ROCPRIM_DEVICE ROCPRIM_INLINE
    void load(InputIterator block_input,
              T (&items)[ItemsPerThread],
              unsigned int valid,
              storage_type& storage)
    {
        using value_type = typename std::iterator_traits<InputIterator>::value_type;
        static_assert(std::is_convertible<value_type, T>::value,
                      "The type T must be such that an object of type InputIterator "
                      "can be dereferenced and then implicitly converted to T.");
        const unsigned int flat_id = ::rocprim::flat_block_thread_id<BlockSizeX, BlockSizeY, BlockSizeZ>();
        block_load_direct_striped<BlockSize>(flat_id, block_input, items, valid);
        block_exchange_type().striped_to_blocked(items, items, storage);
    }

    // Load guarded by valid with a default value, using caller-provided storage.
    template<
        class InputIterator,
        class Default
    >
    ROCPRIM_DEVICE ROCPRIM_INLINE
    void load(InputIterator block_input,
              T (&items)[ItemsPerThread],
              unsigned int valid,
              Default out_of_bounds,
              storage_type& storage)
    {
        using value_type = typename std::iterator_traits<InputIterator>::value_type;
        static_assert(std::is_convertible<value_type, T>::value,
                      "The type T must be such that an object of type InputIterator "
                      "can be dereferenced and then implicitly converted to T.");
        const unsigned int flat_id = ::rocprim::flat_block_thread_id<BlockSizeX, BlockSizeY, BlockSizeZ>();
        block_load_direct_striped<BlockSize>(flat_id, block_input, items, valid,
                                             out_of_bounds);
        block_exchange_type().striped_to_blocked(items, items, storage);
    }
};
// block_load specialization for block_load_method::block_load_warp_transpose.
//
// Every overload loads with block_load_direct_warp_striped and then calls
// block_exchange's warp_striped_to_blocked on the same items array.
// Overloads without a storage parameter declare their own
// ROCPRIM_SHARED_MEMORY storage; the others use the storage passed by the caller.
template<
    class T,
    unsigned int BlockSizeX,
    unsigned int ItemsPerThread,
    unsigned int BlockSizeY,
    unsigned int BlockSizeZ
>
class block_load<T, BlockSizeX, ItemsPerThread, block_load_method::block_load_warp_transpose, BlockSizeY, BlockSizeZ>
{
    // Total number of threads in the block.
    static constexpr unsigned int BlockSize = BlockSizeX * BlockSizeY * BlockSizeZ;

private:
    using block_exchange_type = block_exchange<T, BlockSizeX, ItemsPerThread, BlockSizeY, BlockSizeZ>;

    // Type obtained by dereferencing an InputIterator.
    template<class InputIterator>
    using input_value_type = typename std::iterator_traits<InputIterator>::value_type;

public:
    static_assert(BlockSize % ::rocprim::device_warp_size() == 0,
                  "BlockSize must be a multiple of hardware warpsize");

    // Temporary storage type required by the exchange step.
    using storage_type = typename block_exchange_type::storage_type;

    // Unguarded load using function-local shared storage.
    template<class InputIterator>
    ROCPRIM_DEVICE ROCPRIM_FORCE_INLINE
    void load(InputIterator block_input,
              T (&items)[ItemsPerThread])
    {
        static_assert(std::is_convertible<input_value_type<InputIterator>, T>::value,
                      "The type T must be such that an object of type InputIterator "
                      "can be dereferenced and then implicitly converted to T.");
        ROCPRIM_SHARED_MEMORY storage_type storage;
        const unsigned int thread_rank
            = ::rocprim::flat_block_thread_id<BlockSizeX, BlockSizeY, BlockSizeZ>();
        block_load_direct_warp_striped(thread_rank, block_input, items);
        block_exchange_type().warp_striped_to_blocked(items, items, storage);
    }

    // Load guarded by valid, using function-local shared storage.
    template<class InputIterator>
    ROCPRIM_DEVICE ROCPRIM_FORCE_INLINE
    void load(InputIterator block_input,
              T (&items)[ItemsPerThread],
              unsigned int valid)
    {
        static_assert(std::is_convertible<input_value_type<InputIterator>, T>::value,
                      "The type T must be such that an object of type InputIterator "
                      "can be dereferenced and then implicitly converted to T.");
        ROCPRIM_SHARED_MEMORY storage_type storage;
        const unsigned int thread_rank
            = ::rocprim::flat_block_thread_id<BlockSizeX, BlockSizeY, BlockSizeZ>();
        block_load_direct_warp_striped(thread_rank, block_input, items, valid);
        block_exchange_type().warp_striped_to_blocked(items, items, storage);
    }

    // Load guarded by valid with a default value, using function-local shared storage.
    template<
        class InputIterator,
        class Default
    >
    ROCPRIM_DEVICE ROCPRIM_FORCE_INLINE
    void load(InputIterator block_input,
              T (&items)[ItemsPerThread],
              unsigned int valid,
              Default out_of_bounds)
    {
        static_assert(std::is_convertible<input_value_type<InputIterator>, T>::value,
                      "The type T must be such that an object of type InputIterator "
                      "can be dereferenced and then implicitly converted to T.");
        ROCPRIM_SHARED_MEMORY storage_type storage;
        const unsigned int thread_rank
            = ::rocprim::flat_block_thread_id<BlockSizeX, BlockSizeY, BlockSizeZ>();
        block_load_direct_warp_striped(thread_rank, block_input, items, valid, out_of_bounds);
        block_exchange_type().warp_striped_to_blocked(items, items, storage);
    }

    // Unguarded load using caller-provided storage.
    template<class InputIterator>
    ROCPRIM_DEVICE ROCPRIM_INLINE
    void load(InputIterator block_input,
              T (&items)[ItemsPerThread],
              storage_type& storage)
    {
        static_assert(std::is_convertible<input_value_type<InputIterator>, T>::value,
                      "The type T must be such that an object of type InputIterator "
                      "can be dereferenced and then implicitly converted to T.");
        const unsigned int thread_rank
            = ::rocprim::flat_block_thread_id<BlockSizeX, BlockSizeY, BlockSizeZ>();
        block_load_direct_warp_striped(thread_rank, block_input, items);
        block_exchange_type().warp_striped_to_blocked(items, items, storage);
    }

    // Load guarded by valid, using caller-provided storage.
    template<class InputIterator>
    ROCPRIM_DEVICE ROCPRIM_INLINE
    void load(InputIterator block_input,
              T (&items)[ItemsPerThread],
              unsigned int valid,
              storage_type& storage)
    {
        static_assert(std::is_convertible<input_value_type<InputIterator>, T>::value,
                      "The type T must be such that an object of type InputIterator "
                      "can be dereferenced and then implicitly converted to T.");
        const unsigned int thread_rank
            = ::rocprim::flat_block_thread_id<BlockSizeX, BlockSizeY, BlockSizeZ>();
        block_load_direct_warp_striped(thread_rank, block_input, items, valid);
        block_exchange_type().warp_striped_to_blocked(items, items, storage);
    }

    // Load guarded by valid with a default value, using caller-provided storage.
    template<
        class InputIterator,
        class Default
    >
    ROCPRIM_DEVICE ROCPRIM_INLINE
    void load(InputIterator block_input,
              T (&items)[ItemsPerThread],
              unsigned int valid,
              Default out_of_bounds,
              storage_type& storage)
    {
        static_assert(std::is_convertible<input_value_type<InputIterator>, T>::value,
                      "The type T must be such that an object of type InputIterator "
                      "can be dereferenced and then implicitly converted to T.");
        const unsigned int thread_rank
            = ::rocprim::flat_block_thread_id<BlockSizeX, BlockSizeY, BlockSizeZ>();
        block_load_direct_warp_striped(thread_rank, block_input, items, valid, out_of_bounds);
        block_exchange_type().warp_striped_to_blocked(items, items, storage);
    }
};
#endif // DOXYGEN_SHOULD_SKIP_THIS
END_ROCPRIM_NAMESPACE
#endif // ROCPRIM_BLOCK_BLOCK_LOAD_HPP_
// Copyright (c) 2017-2021 Advanced Micro Devices, Inc. All rights reserved.
//
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to deal
// in the Software without restriction, including without limitation the rights
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
// copies of the Software, and to permit persons to whom the Software is
// furnished to do so, subject to the following conditions:
//
// The above copyright notice and this permission notice shall be included in
// all copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
// THE SOFTWARE.
#ifndef ROCPRIM_BLOCK_BLOCK_LOAD_FUNC_HPP_
#define ROCPRIM_BLOCK_BLOCK_LOAD_FUNC_HPP_
#include "../config.hpp"
#include "../detail/various.hpp"
#include "../intrinsics.hpp"
#include "../functional.hpp"
#include "../types.hpp"
BEGIN_ROCPRIM_NAMESPACE
/// \addtogroup blockmodule
/// @{
/// \brief Loads a blocked arrangement of items from contiguous memory,
/// without any bounds check.
///
/// The thread identified by \p flat_id copies the \p ItemsPerThread consecutive
/// input items starting at position (\p flat_id * \p ItemsPerThread) into
/// \p items.
///
/// \tparam InputIterator - [inferred] an iterator type for input (can be a simple
/// pointer)
/// \tparam T - [inferred] the data type
/// \tparam ItemsPerThread - [inferred] the number of items to be processed by
/// each thread
///
/// \param flat_id - a local flat 1D thread id in a block (tile) for the calling thread
/// \param block_input - the input iterator from the thread block to load from
/// \param items - array that data is loaded to
template<
    class InputIterator,
    class T,
    unsigned int ItemsPerThread
>
ROCPRIM_DEVICE ROCPRIM_INLINE
void block_load_direct_blocked(unsigned int flat_id,
                               InputIterator block_input,
                               T (&items)[ItemsPerThread])
{
    // First input position owned by this thread.
    const unsigned int first = flat_id * ItemsPerThread;
    InputIterator source = block_input + first;
    ROCPRIM_UNROLL
    for (unsigned int i = 0; i < ItemsPerThread; ++i)
    {
        items[i] = source[i];
    }
}
/// \brief Loads a blocked arrangement of items from contiguous memory,
/// guarded by the range \p valid.
///
/// The thread identified by \p flat_id owns the \p ItemsPerThread consecutive
/// input positions starting at (\p flat_id * \p ItemsPerThread). Only positions
/// smaller than \p valid are read; the corresponding entries of \p items for
/// all other positions are left unmodified.
///
/// \tparam InputIterator - [inferred] an iterator type for input (can be a simple
/// pointer)
/// \tparam T - [inferred] the data type
/// \tparam ItemsPerThread - [inferred] the number of items to be processed by
/// each thread
///
/// \param flat_id - a local flat 1D thread id in a block (tile) for the calling thread
/// \param block_input - the input iterator from the thread block to load from
/// \param items - array that data is loaded to
/// \param valid - maximum range of valid numbers to load
template<
    class InputIterator,
    class T,
    unsigned int ItemsPerThread
>
ROCPRIM_DEVICE ROCPRIM_INLINE
void block_load_direct_blocked(unsigned int flat_id,
                               InputIterator block_input,
                               T (&items)[ItemsPerThread],
                               unsigned int valid)
{
    // First input position owned by this thread.
    const unsigned int first = flat_id * ItemsPerThread;
    InputIterator source = block_input + first;
    ROCPRIM_UNROLL
    for (unsigned int i = 0; i < ItemsPerThread; ++i)
    {
        const bool in_range = (i + first) < valid;
        if (in_range)
        {
            items[i] = source[i];
        }
    }
}
/// \brief Loads a blocked arrangement of items from contiguous memory,
/// guarded by the range \p valid, with a fall-back value for out-of-bound
/// elements.
///
/// Every entry of \p items is first set to \p out_of_bounds (converted to \p T);
/// the guarded blocked load is then applied on top, so only entries whose input
/// position is below \p valid are overwritten with loaded data.
///
/// \tparam InputIterator - [inferred] an iterator type for input (can be a simple
/// pointer)
/// \tparam T - [inferred] the data type
/// \tparam ItemsPerThread - [inferred] the number of items to be processed by
/// each thread
/// \tparam Default - [inferred] The data type of the default value
///
/// \param flat_id - a local flat 1D thread id in a block (tile) for the calling thread
/// \param block_input - the input iterator from the thread block to load from
/// \param items - array that data is loaded to
/// \param valid - maximum range of valid numbers to load
/// \param out_of_bounds - default value assigned to out-of-bound items
template<
    class InputIterator,
    class T,
    unsigned int ItemsPerThread,
    class Default
>
ROCPRIM_DEVICE ROCPRIM_INLINE
void block_load_direct_blocked(unsigned int flat_id,
                               InputIterator block_input,
                               T (&items)[ItemsPerThread],
                               unsigned int valid,
                               Default out_of_bounds)
{
    // Pre-fill with the fall-back value.
    // TODO: Consider using std::fill for HIP-CPU, as uses memset() where appropriate
    ROCPRIM_UNROLL
    for (unsigned int i = 0; i < ItemsPerThread; ++i)
    {
        items[i] = static_cast<T>(out_of_bounds);
    }

    // Overwrite the in-range entries with loaded data.
    block_load_direct_blocked(flat_id, block_input, items, valid);
}
/// \brief Loads a blocked arrangement of items from contiguous memory using
/// vector-typed reads.
///
/// This overload is selected when detail::is_vectorizable<T, ItemsPerThread>
/// holds. \p block_input is reinterpreted as a pointer to the vector type chosen
/// by detail::match_vector_type<T, ItemsPerThread>; the calling thread reads its
/// vectors into a local buffer and then copies that buffer element-wise, viewed
/// as \p T, into \p items.
///
/// The input offset (\p block_input + offset) must be quad-item aligned.
///
/// The following conditions will prevent vectorization and switch to default
/// block_load_direct_blocked:
/// * \p ItemsPerThread is odd.
/// * The datatype \p T is not a primitive or a HIP vector type (e.g. int2,
/// int4, etc.)
///
/// \tparam T - [inferred] the input data type
/// \tparam U - [inferred] the output data type
/// \tparam ItemsPerThread - [inferred] the number of items to be processed by
/// each thread
///
/// The type \p T must be such that it can be implicitly converted to \p U.
///
/// \param flat_id - a local flat 1D thread id in a block (tile) for the calling thread
/// \param block_input - the input iterator from the thread block to load from
/// \param items - array that data is loaded to
template<
    class T,
    class U,
    unsigned int ItemsPerThread
>
ROCPRIM_DEVICE ROCPRIM_INLINE
auto
block_load_direct_blocked_vectorized(unsigned int flat_id,
                                     T* block_input,
                                     U (&items)[ItemsPerThread]) -> typename std::enable_if<detail::is_vectorizable<T, ItemsPerThread>::value>::type
{
    using vector_type = typename detail::match_vector_type<T, ItemsPerThread>::type;

    // Number of vector_type reads that cover this thread's ItemsPerThread items.
    constexpr unsigned int vectors_per_thread = (sizeof(T) * ItemsPerThread) / sizeof(vector_type);

    vector_type staged[vectors_per_thread];
    const vector_type* source
        = reinterpret_cast<const vector_type*>(block_input) + (flat_id * vectors_per_thread);

    ROCPRIM_UNROLL
    for (unsigned int v = 0; v < vectors_per_thread; ++v)
    {
        staged[v] = source[v];
    }

    // View the staged vectors as individual T elements for the final copy.
    T* scalars = reinterpret_cast<T*>(staged);
    ROCPRIM_UNROLL
    for (unsigned int i = 0; i < ItemsPerThread; ++i)
    {
        items[i] = scalars[i];
    }
}
/// \brief Non-vectorized counterpart of block_load_direct_blocked_vectorized.
///
/// Selected when detail::is_vectorizable<T, ItemsPerThread> does not hold;
/// simply forwards to block_load_direct_blocked.
///
/// \param flat_id - a local flat 1D thread id in a block (tile) for the calling thread
/// \param block_input - the input pointer from the thread block to load from
/// \param items - array that data is loaded to
template<class T, class U, unsigned int ItemsPerThread>
ROCPRIM_DEVICE ROCPRIM_INLINE
auto block_load_direct_blocked_vectorized(
    unsigned int flat_id,
    T* block_input,
    U (&items)[ItemsPerThread]) -> typename std::enable_if<!detail::is_vectorizable<T, ItemsPerThread>::value>::type
{
    block_load_direct_blocked(flat_id, block_input, items);
}
/// \brief Loads a striped arrangement of items from contiguous memory,
/// without any bounds check.
///
/// The thread identified by \p flat_id reads input positions
/// \p flat_id, \p flat_id + \p BlockSize, \p flat_id + 2 * \p BlockSize, ...
/// (\p ItemsPerThread positions in total) into consecutive entries of \p items.
///
/// \tparam BlockSize - the number of threads in a block
/// \tparam InputIterator - [inferred] an iterator type for input (can be a simple
/// pointer)
/// \tparam T - [inferred] the data type
/// \tparam ItemsPerThread - [inferred] the number of items to be processed by
/// each thread
///
/// \param flat_id - a local flat 1D thread id in a block (tile) for the calling thread
/// \param block_input - the input iterator from the thread block to load from
/// \param items - array that data is loaded to
template<
    unsigned int BlockSize,
    class InputIterator,
    class T,
    unsigned int ItemsPerThread
>
ROCPRIM_DEVICE ROCPRIM_INLINE
void block_load_direct_striped(unsigned int flat_id,
                               InputIterator block_input,
                               T (&items)[ItemsPerThread])
{
    InputIterator source = block_input + flat_id;
    ROCPRIM_UNROLL
    for (unsigned int i = 0; i < ItemsPerThread; ++i)
    {
        // Consecutive items of one thread are BlockSize positions apart.
        items[i] = source[i * BlockSize];
    }
}
/// \brief Loads a striped arrangement of items from contiguous memory,
/// guarded by the range \p valid.
///
/// The thread identified by \p flat_id owns input positions
/// \p flat_id, \p flat_id + \p BlockSize, \p flat_id + 2 * \p BlockSize, ...
/// (\p ItemsPerThread positions in total). Only positions smaller than \p valid
/// are read; the corresponding entries of \p items for all other positions are
/// left unmodified.
///
/// \tparam BlockSize - the number of threads in a block
/// \tparam InputIterator - [inferred] an iterator type for input (can be a simple
/// pointer)
/// \tparam T - [inferred] the data type
/// \tparam ItemsPerThread - [inferred] the number of items to be processed by
/// each thread
///
/// \param flat_id - a local flat 1D thread id in a block (tile) for the calling thread
/// \param block_input - the input iterator from the thread block to load from
/// \param items - array that data is loaded to
/// \param valid - maximum range of valid numbers to load
template<
    unsigned int BlockSize,
    class InputIterator,
    class T,
    unsigned int ItemsPerThread
>
ROCPRIM_DEVICE ROCPRIM_INLINE
void block_load_direct_striped(unsigned int flat_id,
                               InputIterator block_input,
                               T (&items)[ItemsPerThread],
                               unsigned int valid)
{
    InputIterator source = block_input + flat_id;
    ROCPRIM_UNROLL
    for (unsigned int i = 0; i < ItemsPerThread; ++i)
    {
        // Distance of the i-th item from this thread's first position.
        const unsigned int stride_offset = i * BlockSize;
        const bool in_range = (flat_id + stride_offset) < valid;
        if (in_range)
        {
            items[i] = source[stride_offset];
        }
    }
}
/// \brief Loads data from continuous memory into a striped arrangement of items
/// across the thread block, which is guarded by range with a fall-back value
/// for out-of-bound elements.
///
/// The striped arrangement is assumed to be (\p BlockSize * \p ItemsPerThread) items
/// across a thread block. Each thread uses a \p flat_id to load a range of
/// \p ItemsPerThread into \p items.
///
/// Every entry of \p items is first set to \p out_of_bounds, explicitly
/// converted to \p T (the same way the blocked variant does); the guarded
/// striped load is then applied on top, so only entries whose input position
/// is below \p valid are overwritten with loaded data.
///
/// \tparam BlockSize - the number of threads in a block
/// \tparam InputIterator - [inferred] an iterator type for input (can be a simple
/// pointer)
/// \tparam T - [inferred] the data type
/// \tparam ItemsPerThread - [inferred] the number of items to be processed by
/// each thread
/// \tparam Default - [inferred] The data type of the default value
///
/// \param flat_id - a local flat 1D thread id in a block (tile) for the calling thread
/// \param block_input - the input iterator from the thread block to load from
/// \param items - array that data is loaded to
/// \param valid - maximum range of valid numbers to load
/// \param out_of_bounds - default value assigned to out-of-bound items
template<
    unsigned int BlockSize,
    class InputIterator,
    class T,
    unsigned int ItemsPerThread,
    class Default
>
ROCPRIM_DEVICE ROCPRIM_INLINE
void block_load_direct_striped(unsigned int flat_id,
                               InputIterator block_input,
                               T (&items)[ItemsPerThread],
                               unsigned int valid,
                               Default out_of_bounds)
{
    // Pre-fill with the fall-back value; the explicit cast matches
    // block_load_direct_blocked's out_of_bounds overload.
    ROCPRIM_UNROLL
    for (unsigned int item = 0; item < ItemsPerThread; item++)
    {
        items[item] = static_cast<T>(out_of_bounds);
    }
    // Overwrite the in-range entries with loaded data.
    block_load_direct_striped<BlockSize>(flat_id, block_input, items, valid);
}
/// \brief Loads data from continuous memory into a warp-striped arrangement of items
/// across the thread block.
///
/// The warp-striped arrangement is assumed to be (\p WarpSize * \p ItemsPerThread) items
/// across a thread block. Each thread uses a \p flat_id to load a range of
/// \p ItemsPerThread into \p items.
///
/// The warp index is \p flat_id / \p WarpSize and the lane index comes from
/// detail::logical_lane_id<WarpSize>(). Each warp covers a contiguous segment of
/// (\p WarpSize * \p ItemsPerThread) input items, within which a lane reads
/// every \p WarpSize-th item starting at its lane index.
///
/// * The number of threads in the block must be a multiple of \p WarpSize.
/// * The default \p WarpSize is a hardware warpsize and is an optimal value.
/// * \p WarpSize must be a power of two and equal or less than the size of
/// hardware warp.
/// * Using \p WarpSize smaller than hardware warpsize could result in lower
/// performance.
///
/// \tparam WarpSize - [optional] the number of threads in a warp
/// \tparam InputIterator - [inferred] an iterator type for input (can be a simple
/// pointer)
/// \tparam T - [inferred] the data type
/// \tparam ItemsPerThread - [inferred] the number of items to be processed by
/// each thread
///
/// \param flat_id - a local flat 1D thread id in a block (tile) for the calling thread
/// \param block_input - the input iterator from the thread block to load from
/// \param items - array that data is loaded to
template<
    unsigned int WarpSize = device_warp_size(),
    class InputIterator,
    class T,
    unsigned int ItemsPerThread
>
ROCPRIM_DEVICE ROCPRIM_INLINE
void block_load_direct_warp_striped(unsigned int flat_id,
                                    InputIterator block_input,
                                    T (&items)[ItemsPerThread])
{
    // Adjacent string literals are concatenated verbatim, so the first one
    // must end with a space.
    static_assert(detail::is_power_of_two(WarpSize) && WarpSize <= device_warp_size(),
                  "WarpSize must be a power of two and equal or less "
                  "than the size of hardware warp.");
    unsigned int thread_id = detail::logical_lane_id<WarpSize>();
    unsigned int warp_id = flat_id / WarpSize;
    // Start of the input segment handled by this warp.
    unsigned int warp_offset = warp_id * WarpSize * ItemsPerThread;
    InputIterator thread_iter = block_input + thread_id + warp_offset;
    ROCPRIM_UNROLL
    for (unsigned int item = 0; item < ItemsPerThread; item++)
    {
        items[item] = thread_iter[item * WarpSize];
    }
}
/// \brief Loads data from continuous memory into a warp-striped arrangement of items
/// across the thread block, which is guarded by range \p valid.
///
/// The warp-striped arrangement is assumed to be (\p WarpSize * \p ItemsPerThread) items
/// across a thread block. Each thread uses a \p flat_id to load a range of
/// \p ItemsPerThread into \p items.
///
/// The warp index is \p flat_id / \p WarpSize and the lane index comes from
/// detail::logical_lane_id<WarpSize>(). Only input positions smaller than
/// \p valid are read; the corresponding entries of \p items for all other
/// positions are left unmodified.
///
/// * The number of threads in the block must be a multiple of \p WarpSize.
/// * The default \p WarpSize is a hardware warpsize and is an optimal value.
/// * \p WarpSize must be a power of two and equal or less than the size of
/// hardware warp.
/// * Using \p WarpSize smaller than hardware warpsize could result in lower
/// performance.
///
/// \tparam WarpSize - [optional] the number of threads in a warp
/// \tparam InputIterator - [inferred] an iterator type for input (can be a simple
/// pointer)
/// \tparam T - [inferred] the data type
/// \tparam ItemsPerThread - [inferred] the number of items to be processed by
/// each thread
///
/// \param flat_id - a local flat 1D thread id in a block (tile) for the calling thread
/// \param block_input - the input iterator from the thread block to load from
/// \param items - array that data is loaded to
/// \param valid - maximum range of valid numbers to load
template<
    unsigned int WarpSize = device_warp_size(),
    class InputIterator,
    class T,
    unsigned int ItemsPerThread
>
ROCPRIM_DEVICE ROCPRIM_INLINE
void block_load_direct_warp_striped(unsigned int flat_id,
                                    InputIterator block_input,
                                    T (&items)[ItemsPerThread],
                                    unsigned int valid)
{
    // Adjacent string literals are concatenated verbatim, so the first one
    // must end with a space.
    static_assert(detail::is_power_of_two(WarpSize) && WarpSize <= device_warp_size(),
                  "WarpSize must be a power of two and equal or less "
                  "than the size of hardware warp.");
    unsigned int thread_id = detail::logical_lane_id<WarpSize>();
    unsigned int warp_id = flat_id / WarpSize;
    // Start of the input segment handled by this warp.
    unsigned int warp_offset = warp_id * WarpSize * ItemsPerThread;
    InputIterator thread_iter = block_input + thread_id + warp_offset;
    ROCPRIM_UNROLL
    for (unsigned int item = 0; item < ItemsPerThread; item++)
    {
        unsigned int offset = item * WarpSize;
        // Guard on the absolute input position of this item.
        if (warp_offset + thread_id + offset < valid)
        {
            items[item] = thread_iter[offset];
        }
    }
}
/// \brief Loads data from continuous memory into a warp-striped arrangement of items
/// across the thread block, which is guarded by range with a fall-back value
/// for out-of-bound elements.
///
/// The warp-striped arrangement is assumed to be (\p WarpSize * \p ItemsPerThread) items
/// across a thread block. Each thread uses a \p flat_id to load a range of
/// \p ItemsPerThread into \p items.
///
/// Every entry of \p items is first set to \p out_of_bounds, explicitly
/// converted to \p T (the same way the blocked variant does); the guarded
/// warp-striped load is then applied on top, so only entries whose input
/// position is below \p valid are overwritten with loaded data.
///
/// * The number of threads in the block must be a multiple of \p WarpSize.
/// * The default \p WarpSize is a hardware warpsize and is an optimal value.
/// * \p WarpSize must be a power of two and equal or less than the size of
/// hardware warp.
/// * Using \p WarpSize smaller than hardware warpsize could result in lower
/// performance.
///
/// \tparam WarpSize - [optional] the number of threads in a warp
/// \tparam InputIterator - [inferred] an iterator type for input (can be a simple
/// pointer)
/// \tparam T - [inferred] the data type
/// \tparam ItemsPerThread - [inferred] the number of items to be processed by
/// each thread
/// \tparam Default - [inferred] The data type of the default value
///
/// \param flat_id - a local flat 1D thread id in a block (tile) for the calling thread
/// \param block_input - the input iterator from the thread block to load from
/// \param items - array that data is loaded to
/// \param valid - maximum range of valid numbers to load
/// \param out_of_bounds - default value assigned to out-of-bound items
template<
    unsigned int WarpSize = device_warp_size(),
    class InputIterator,
    class T,
    unsigned int ItemsPerThread,
    class Default
>
ROCPRIM_DEVICE ROCPRIM_INLINE
void block_load_direct_warp_striped(unsigned int flat_id,
                                    InputIterator block_input,
                                    T (&items)[ItemsPerThread],
                                    unsigned int valid,
                                    Default out_of_bounds)
{
    // Adjacent string literals are concatenated verbatim, so the first one
    // must end with a space.
    static_assert(detail::is_power_of_two(WarpSize) && WarpSize <= device_warp_size(),
                  "WarpSize must be a power of two and equal or less "
                  "than the size of hardware warp.");
    // Pre-fill with the fall-back value; the explicit cast matches
    // block_load_direct_blocked's out_of_bounds overload.
    ROCPRIM_UNROLL
    for (unsigned int item = 0; item < ItemsPerThread; item++)
    {
        items[item] = static_cast<T>(out_of_bounds);
    }
    // Overwrite the in-range entries with loaded data.
    block_load_direct_warp_striped<WarpSize>(flat_id, block_input, items, valid);
}
END_ROCPRIM_NAMESPACE
/// @}
// end of group blockmodule
#endif // ROCPRIM_BLOCK_BLOCK_LOAD_FUNC_HPP_
// Copyright (c) 2017-2021 Advanced Micro Devices, Inc. All rights reserved.
//
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to deal
// in the Software without restriction, including without limitation the rights
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
// copies of the Software, and to permit persons to whom the Software is
// furnished to do so, subject to the following conditions:
//
// The above copyright notice and this permission notice shall be included in
// all copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
// THE SOFTWARE.
#ifndef ROCPRIM_BLOCK_BLOCK_RADIX_SORT_HPP_
#define ROCPRIM_BLOCK_BLOCK_RADIX_SORT_HPP_
#include <type_traits>
#include "../config.hpp"
#include "../detail/various.hpp"
#include "../detail/radix_sort.hpp"
#include "../warp/detail/warp_scan_crosslane.hpp"
#include "../intrinsics.hpp"
#include "../functional.hpp"
#include "../types.hpp"
#include "block_exchange.hpp"
/// \addtogroup blockmodule
/// @{
BEGIN_ROCPRIM_NAMESPACE
namespace detail
{
/// Specialized block scan of bool (1 bit values).
///
/// It uses warp scan and reduce functions of bool (1 bit values) based on ballot and bit count.
/// They have much better performance (several times faster) than generic scan and reduce classes
/// because of using hardware ability to calculate which lanes have true predicate values.
///
/// \tparam BlockSizeX - size of the block in the X dimension.
/// \tparam BlockSizeY - size of the block in the Y dimension, 1 by default.
/// \tparam BlockSizeZ - size of the block in the Z dimension, 1 by default.
template<
unsigned int BlockSizeX,
unsigned int BlockSizeY = 1,
unsigned int BlockSizeZ = 1
>
class block_bit_plus_scan
{
// Total number of threads in the block (product of the three dimensions).
static constexpr unsigned int BlockSize = BlockSizeX * BlockSizeY * BlockSizeZ;
// Select warp size
static constexpr unsigned int warp_size =
detail::get_min_warp_size(BlockSize, ::rocprim::device_warp_size());
// Number of warps in block (BlockSize divided by warp_size, rounded up)
static constexpr unsigned int warps_no = (BlockSize + warp_size - 1) / warp_size;
// typedef of warp_scan primitive that will be used to get prefix values for
// each warp (scanned carry-outs from warps before it)
// warp_scan_crosslane is an implementation of warp_scan that does not need storage,
// but requires logical warp size to be a power of two.
// That is why warps_no is rounded up with next_power_of_two below.
using warp_scan_prefix_type =
::rocprim::detail::warp_scan_crosslane<unsigned int, detail::next_power_of_two(warps_no)>;
public:
// Layout of the temporary storage used by exclusive_scan.
struct storage_type_
{
// One slot per warp. exclusive_scan first stores each warp's count of set flags here,
// then overwrites the slots with the inclusive scan of those counts.
unsigned int warp_prefixes[warps_no];
// ---------- Shared memory optimisation ----------
// Since we use warp_scan_crosslane for warp scan, we don't need to allocate
// any temporary memory for it.
};
// Storage type passed in by callers; exclusive_scan reaches the wrapped
// storage_type_ through get().
using storage_type = detail::raw_storage<storage_type_>;
/// \brief Exclusive plus-scan of the flags provided by all threads of the block.
///
/// Every flag is passed to ::rocprim::ballot, i.e. it is used as a predicate, while the
/// running sum inside a thread adds the raw \p input values. The flags are therefore
/// presumably expected to be 0 or 1 (NOTE(review): confirm against callers).
///
/// The method executes two ::rocprim::syncthreads() barriers.
///
/// \tparam ItemsPerThread - number of flags provided by each thread.
/// \param [in] input - flags provided by the calling thread.
/// \param [out] output - scan results for the calling thread's flags: \p output[0] is the
/// scanned total of the preceding warp (none for warp 0) plus the ballot-based count
/// within the warp, each following element adds the thread's previous flag.
/// \param [out] reduction - inclusive prefix stored for the last warp.
/// \param [in] storage - temporary storage; its warp_prefixes array is overwritten.
template<unsigned int ItemsPerThread>
ROCPRIM_DEVICE ROCPRIM_INLINE
void exclusive_scan(const unsigned int (&input)[ItemsPerThread],
unsigned int (&output)[ItemsPerThread],
unsigned int& reduction,
storage_type& storage)
{
const unsigned int flat_id = ::rocprim::flat_block_thread_id<BlockSizeX, BlockSizeY, BlockSizeZ>();
const unsigned int lane_id = ::rocprim::lane_id();
const unsigned int warp_id = ::rocprim::warp_id(flat_id);
storage_type_& storage_ = storage.get();
// Warp total: for each item, count the bits set in the ballot mask and sum the counts.
unsigned int warp_reduction = ::rocprim::bit_count(::rocprim::ballot(input[0]));
for(unsigned int i = 1; i < ItemsPerThread; i++)
{
warp_reduction += ::rocprim::bit_count(::rocprim::ballot(input[i]));
}
// Only lane 0 of each warp publishes the warp total.
if(lane_id == 0)
{
storage_.warp_prefixes[warp_id] = warp_reduction;
}
// All warp totals must be stored before they are scanned.
::rocprim::syncthreads();
// Scan the warp reduction results to calculate warp prefixes
// (threads 0 .. warps_no - 1 each handle one warp total).
if(flat_id < warps_no)
{
unsigned int prefix = storage_.warp_prefixes[flat_id];
warp_scan_prefix_type().inclusive_scan(prefix, prefix, ::rocprim::plus<unsigned int>());
storage_.warp_prefixes[flat_id] = prefix;
}
#ifdef __HIP_CPU_RT__
else
{
// HIP-CPU doesn't implement lockstep behavior. Need to invoke the same number sync ops in divergent branch.
empty_type empty;
::rocprim::detail::warp_scan_crosslane<empty_type, detail::next_power_of_two(warps_no)>().inclusive_scan(empty, empty, empty_binary_op{});
}
#endif
// The scanned prefixes must be stored before any thread reads them below.
::rocprim::syncthreads();
// Perform exclusive warp scan of bit values
// The running value is passed as the second argument of masked_bit_count; presumably it
// returns that value plus the number of set bits that belong to lower lanes
// (masked_bit_count is defined elsewhere - TODO confirm).
unsigned int lane_prefix = 0;
for(unsigned int i = 0; i < ItemsPerThread; i++)
{
lane_prefix = ::rocprim::masked_bit_count(::rocprim::ballot(input[i]), lane_prefix);
}
// Scan the lane's items and calculate final scan results
// Warp 0 has no preceding warp; other warps add the inclusive prefix of warp_id - 1.
output[0] = warp_id == 0
? lane_prefix
: lane_prefix + storage_.warp_prefixes[warp_id - 1];
// Remaining items: running sum over the thread's own flags.
for(unsigned int i = 1; i < ItemsPerThread; i++)
{
output[i] = output[i - 1] + input[i - 1];
}
// Get the final inclusive reduction result
reduction = storage_.warp_prefixes[warps_no - 1];
}
};
} // end namespace detail
/// \brief The block_radix_sort class is a block level parallel primitive which provides
/// methods sorting items (keys or key-value pairs) partitioned across threads in a block
/// using radix sort algorithm.
///
/// \tparam Key - the key type.
/// \tparam BlockSize - the number of threads in a block.
/// \tparam ItemsPerThread - the number of items contributed by each thread.
/// \tparam Value - the value type. Default type empty_type indicates
/// a keys-only sort.
///
/// \par Overview
/// * \p Key type must be an arithmetic type (that is, an integral type or a floating-point
/// type).
/// * Performance depends on \p BlockSize and \p ItemsPerThread.
/// * It is usually better if \p BlockSize is a multiple of the size of the hardware warp.
/// * Performance usually increases when \p ItemsPerThread is greater than one. However, when there
/// are too many items per thread, each thread may need so many registers and/or so much shared memory
/// that occupancy will fall too low, decreasing the performance.
/// * If \p Key is an integer type and the range of keys is known in advance, the performance
/// can be improved by setting \p begin_bit and \p end_bit, for example if all keys are in range
/// [100, 10000], <tt>begin_bit = 0</tt> and <tt>end_bit = 14</tt> will cover the whole range.
///
/// \par Examples
/// \parblock
/// In the examples radix sort is performed on a block of 256 threads, each thread provides
/// eight \p int values, results are returned using the same array as for input.
///
/// \code{.cpp}
/// __global__ void example_kernel(...)
/// {
/// // specialize block_radix_sort for int, block of 256 threads,
/// // and eight items per thread; key-only sort
/// using block_rsort_int = rocprim::block_radix_sort<int, 256, 8>;
/// // allocate storage in shared memory
/// __shared__ block_rsort_int::storage_type storage;
///
/// int input[8] = ...;
/// // execute block radix sort (ascending)
/// block_rsort_int().sort(
/// input,
/// storage
/// );
/// ...
/// }
/// \endcode
/// \endparblock
template<
class Key,
unsigned int BlockSizeX,
unsigned int ItemsPerThread,
class Value = empty_type,
unsigned int BlockSizeY = 1,
unsigned int BlockSizeZ = 1
>
class block_radix_sort
{
// Total number of threads in the block (product of the three block dimensions).
static constexpr unsigned int BlockSize = BlockSizeX * BlockSizeY * BlockSizeZ;
// True unless Value is empty_type; used as the default of the WithValues template
// parameter that enables the key-value overloads.
static constexpr bool with_values = !std::is_same<Value, empty_type>::value;
// Key representation provided by radix_key_codec<Key> (defined elsewhere); presumably the
// bit pattern the sort operates on - TODO confirm against radix_key_codec.
using bit_key_type = typename ::rocprim::detail::radix_key_codec<Key>::bit_key_type;
// Scan of 1-bit values for a block with the same dimensions as this primitive.
using bit_block_scan = detail::block_bit_plus_scan<BlockSizeX, BlockSizeY, BlockSizeZ>;
// block_exchange specializations for the converted keys and for the values.
using bit_keys_exchange_type = ::rocprim::block_exchange<bit_key_type, BlockSizeX, ItemsPerThread, BlockSizeY, BlockSizeZ>;
using values_exchange_type = ::rocprim::block_exchange<Value, BlockSizeX, ItemsPerThread, BlockSizeY, BlockSizeZ>;
// Struct used for creating a raw_storage object for this primitive's temporary storage.
struct storage_type_
{
// The storages of the two exchange primitives are members of one anonymous union,
// so they occupy the same memory.
union
{
typename bit_keys_exchange_type::storage_type bit_keys_exchange;
typename values_exchange_type::storage_type values_exchange;
};
// Storage of the bit scan is a separate member (not part of the union above).
// NOTE(review): the type is presumably spelled through the fully qualified class name
// because the member reuses the name of the bit_block_scan alias.
typename block_radix_sort<Key,BlockSizeX,ItemsPerThread,Value,BlockSizeY,BlockSizeZ>::bit_block_scan::storage_type bit_block_scan;
};
public:
/// \brief Struct used to allocate a temporary memory that is required for thread
/// communication during operations provided by related parallel primitive.
///
/// Depending on the implementation the operations exposed by parallel primitive may
/// require a temporary storage for thread communication. The storage should be allocated
/// using the keyword <tt>__shared__</tt>. It can be aliased to
/// an externally allocated memory, or be a part of a union type with other storage types
/// to increase shared memory reusability.
#ifndef DOXYGEN_SHOULD_SKIP_THIS // hides storage_type implementation for Doxygen
// Regular builds: storage_type_ wrapped in detail::raw_storage.
using storage_type = detail::raw_storage<storage_type_>;
#else
using storage_type = storage_type_; // only for Doxygen
#endif
/// \brief Performs ascending radix sort over keys partitioned across threads in a block.
///
/// \param [in, out] keys - reference to an array of keys provided by a thread.
/// \param [in] storage - reference to a temporary storage object of type storage_type.
/// \param [in] begin_bit - [optional] index of the first (least significant) bit used in
/// key comparison. Must be in range <tt>[0; 8 * sizeof(Key))</tt>. Default value: \p 0.
/// Non-default value not supported for floating-point key-types.
/// \param [in] end_bit - [optional] past-the-end index (most significant) bit used in
/// key comparison. Must be in range <tt>(begin_bit; 8 * sizeof(Key)]</tt>. Default
/// value: \p <tt>8 * sizeof(Key)</tt>. Non-default value not supported for floating-point key-types.
///
/// \par Storage reusage
/// Synchronization barrier should be placed before \p storage is reused
/// or repurposed: \p __syncthreads() or \p rocprim::syncthreads().
///
/// \par Examples
/// \parblock
/// In the examples radix sort is performed on a block of 128 threads, each thread provides
/// two \p float values, results are returned using the same array as for input.
///
/// \code{.cpp}
/// __global__ void example_kernel(...)
/// {
/// // specialize block_radix_sort for float, block of 128 threads,
/// // and two items per thread; key-only sort
/// using block_rsort_float = rocprim::block_radix_sort<float, 128, 2>;
/// // allocate storage in shared memory
/// __shared__ block_rsort_float::storage_type storage;
///
/// float input[2] = ...;
/// // execute block radix sort (ascending)
/// block_rsort_float().sort(
/// input,
/// storage
/// );
/// ...
/// }
/// \endcode
///
/// If the \p input values across threads in a block are <tt>{[256, 255], ..., [4, 3], [2, 1]}</tt>,
/// then after sort they will be equal <tt>{[1, 2], [3, 4] ..., [255, 256]}</tt>.
/// \endparblock
ROCPRIM_DEVICE ROCPRIM_INLINE
void sort(Key (&keys)[ItemsPerThread],
          storage_type& storage,
          unsigned int begin_bit = 0,
          unsigned int end_bit = 8 * sizeof(Key))
{
    // Keys-only, ascending. sort_impl takes a values array as well, so an
    // array of empty_type is passed in its place.
    constexpr bool descending = false;
    empty_type placeholder_values[ItemsPerThread];
    sort_impl<descending>(keys, placeholder_values, storage, begin_bit, end_bit);
}
/// \overload
/// \brief Performs ascending radix sort over keys partitioned across threads in a block.
///
/// * This overload does not accept storage argument. Required shared memory is
/// allocated by the method itself.
///
/// \param [in, out] keys - reference to an array of keys provided by a thread.
/// \param [in] begin_bit - [optional] index of the first (least significant) bit used in
/// key comparison. Must be in range <tt>[0; 8 * sizeof(Key))</tt>. Default value: \p 0.
/// Non-default value not supported for floating-point key-types.
/// \param [in] end_bit - [optional] past-the-end index (most significant) bit used in
/// key comparison. Must be in range <tt>(begin_bit; 8 * sizeof(Key)]</tt>. Default
/// value: \p <tt>8 * sizeof(Key)</tt>. Non-default value not supported for floating-point key-types.
ROCPRIM_DEVICE ROCPRIM_FORCE_INLINE
void sort(Key (&keys)[ItemsPerThread],
          unsigned int begin_bit = 0,
          unsigned int end_bit = 8 * sizeof(Key))
{
    // The temporary storage is declared here with ROCPRIM_SHARED_MEMORY and
    // handed to the overload that takes a storage reference.
    ROCPRIM_SHARED_MEMORY storage_type local_storage;
    sort(keys, local_storage, begin_bit, end_bit);
}
/// \brief Performs descending radix sort over keys partitioned across threads in a block.
///
/// \param [in, out] keys - reference to an array of keys provided by a thread.
/// \param [in] storage - reference to a temporary storage object of type storage_type.
/// \param [in] begin_bit - [optional] index of the first (least significant) bit used in
/// key comparison. Must be in range <tt>[0; 8 * sizeof(Key))</tt>. Default value: \p 0.
/// Non-default value not supported for floating-point key-types.
/// \param [in] end_bit - [optional] past-the-end index (most significant) bit used in
/// key comparison. Must be in range <tt>(begin_bit; 8 * sizeof(Key)]</tt>. Default
/// value: \p <tt>8 * sizeof(Key)</tt>. Non-default value not supported for floating-point key-types.
///
/// \par Storage reusage
/// Synchronization barrier should be placed before \p storage is reused
/// or repurposed: \p __syncthreads() or \p rocprim::syncthreads().
///
/// \par Examples
/// \parblock
/// In the examples radix sort is performed on a block of 128 threads, each thread provides
/// two \p float values, results are returned using the same array as for input.
///
/// \code{.cpp}
/// __global__ void example_kernel(...)
/// {
/// // specialize block_radix_sort for float, block of 128 threads,
/// // and two items per thread; key-only sort
/// using block_rsort_float = rocprim::block_radix_sort<float, 128, 2>;
/// // allocate storage in shared memory
/// __shared__ block_rsort_float::storage_type storage;
///
/// float input[2] = ...;
/// // execute block radix sort (descending)
/// block_rsort_float().sort_desc(
/// input,
/// storage
/// );
/// ...
/// }
/// \endcode
///
/// If the \p input values across threads in a block are <tt>{[1, 2], [3, 4] ..., [255, 256]}</tt>,
/// then after sort they will be equal <tt>{[256, 255], ..., [4, 3], [2, 1]}</tt>.
/// \endparblock
ROCPRIM_DEVICE ROCPRIM_INLINE
void sort_desc(Key (&keys)[ItemsPerThread],
               storage_type& storage,
               unsigned int begin_bit = 0,
               unsigned int end_bit = 8 * sizeof(Key))
{
    // Keys-only, descending. sort_impl takes a values array as well, so an
    // array of empty_type is passed in its place.
    constexpr bool descending = true;
    empty_type placeholder_values[ItemsPerThread];
    sort_impl<descending>(keys, placeholder_values, storage, begin_bit, end_bit);
}
/// \overload
/// \brief Performs descending radix sort over keys partitioned across threads in a block.
///
/// * This overload does not accept storage argument. Required shared memory is
/// allocated by the method itself.
///
/// \param [in, out] keys - reference to an array of keys provided by a thread.
/// \param [in] begin_bit - [optional] index of the first (least significant) bit used in
/// key comparison. Must be in range <tt>[0; 8 * sizeof(Key))</tt>. Default value: \p 0.
/// Non-default value not supported for floating-point key-types.
/// \param [in] end_bit - [optional] past-the-end index (most significant) bit used in
/// key comparison. Must be in range <tt>(begin_bit; 8 * sizeof(Key)]</tt>. Default
/// value: \p <tt>8 * sizeof(Key)</tt>. Non-default value not supported for floating-point key-types.
ROCPRIM_DEVICE ROCPRIM_FORCE_INLINE
void sort_desc(Key (&keys)[ItemsPerThread],
               unsigned int begin_bit = 0,
               unsigned int end_bit = 8 * sizeof(Key))
{
    // The temporary storage is declared here with ROCPRIM_SHARED_MEMORY and
    // handed to the overload that takes a storage reference.
    ROCPRIM_SHARED_MEMORY storage_type local_storage;
    sort_desc(keys, local_storage, begin_bit, end_bit);
}
/// \brief Performs ascending radix sort over key-value pairs partitioned across
/// threads in a block.
///
/// \pre Method is enabled only if \p Value type is different than empty_type.
///
/// \param [in, out] keys - reference to an array of keys provided by a thread.
/// \param [in, out] values - reference to an array of values provided by a thread.
/// \param [in] storage - reference to a temporary storage object of type storage_type.
/// \param [in] begin_bit - [optional] index of the first (least significant) bit used in
/// key comparison. Must be in range <tt>[0; 8 * sizeof(Key))</tt>. Default value: \p 0.
/// Non-default value not supported for floating-point key-types.
/// \param [in] end_bit - [optional] past-the-end index (most significant) bit used in
/// key comparison. Must be in range <tt>(begin_bit; 8 * sizeof(Key)]</tt>. Default
/// value: \p <tt>8 * sizeof(Key)</tt>. Non-default value not supported for floating-point key-types.
///
/// \par Storage reusage
/// Synchronization barrier should be placed before \p storage is reused
/// or repurposed: \p __syncthreads() or \p rocprim::syncthreads().
///
/// \par Examples
/// \parblock
/// In the examples radix sort is performed on a block of 128 threads, each thread provides
/// two key-value <tt>int</tt>-<tt>float</tt> pairs, results are returned using the same
/// arrays as for input.
///
/// \code{.cpp}
/// __global__ void example_kernel(...)
/// {
/// // specialize block_radix_sort for int-float pairs, block of 128
/// // threads, and two items per thread
/// using block_rsort_ii = rocprim::block_radix_sort<int, 128, 2, float>;
/// // allocate storage in shared memory
/// __shared__ block_rsort_ii::storage_type storage;
///
/// int keys[2] = ...;
/// float values[2] = ...;
/// // execute block radix sort-by-key (ascending)
/// block_rsort_ii().sort(
/// keys, values,
/// storage
/// );
/// ...
/// }
/// \endcode
///
/// If the \p keys across threads in a block are <tt>{[256, 255], ..., [4, 3], [2, 1]}</tt> and
/// the \p values are <tt>{[1, 1], [2, 2] ..., [128, 128]}</tt>, then after sort the \p keys
/// will be equal <tt>{[1, 2], [3, 4] ..., [255, 256]}</tt> and the \p values will be
/// equal <tt>{[128, 128], [127, 127] ..., [2, 2], [1, 1]}</tt>.
/// \endparblock
template<bool WithValues = with_values>
ROCPRIM_DEVICE ROCPRIM_INLINE
void sort(Key (&keys)[ItemsPerThread],
          typename std::enable_if<WithValues, Value>::type (&values)[ItemsPerThread],
          storage_type& storage,
          unsigned int begin_bit = 0,
          unsigned int end_bit = 8 * sizeof(Key))
{
    // Key-value, ascending: both arrays go to sort_impl unchanged.
    constexpr bool descending = false;
    sort_impl<descending>(keys, values, storage, begin_bit, end_bit);
}
/// \overload
/// \brief Performs ascending radix sort over key-value pairs partitioned across
/// threads in a block.
///
/// * This overload does not accept storage argument. Required shared memory is
/// allocated by the method itself.
///
/// \pre Method is enabled only if \p Value type is different than empty_type.
///
/// \param [in, out] keys - reference to an array of keys provided by a thread.
/// \param [in, out] values - reference to an array of values provided by a thread.
/// \param [in] begin_bit - [optional] index of the first (least significant) bit used in
/// key comparison. Must be in range <tt>[0; 8 * sizeof(Key))</tt>. Default value: \p 0.
/// Non-default value not supported for floating-point key-types.
/// \param [in] end_bit - [optional] past-the-end index (most significant) bit used in
/// key comparison. Must be in range <tt>(begin_bit; 8 * sizeof(Key)]</tt>. Default
/// value: \p <tt>8 * sizeof(Key)</tt>. Non-default value not supported for floating-point key-types.
template<bool WithValues = with_values>
ROCPRIM_DEVICE ROCPRIM_FORCE_INLINE
void sort(Key (&keys)[ItemsPerThread],
          typename std::enable_if<WithValues, Value>::type (&values)[ItemsPerThread],
          unsigned int begin_bit = 0,
          unsigned int end_bit = 8 * sizeof(Key))
{
    // The temporary storage is declared here with ROCPRIM_SHARED_MEMORY and
    // handed to the key-value overload that takes a storage reference.
    ROCPRIM_SHARED_MEMORY storage_type local_storage;
    sort(keys, values, local_storage, begin_bit, end_bit);
}
/// \brief Performs descending radix sort over key-value pairs partitioned across
/// threads in a block.
///
/// \pre Method is enabled only if \p Value type is different than empty_type.
///
/// \param [in, out] keys - reference to an array of keys provided by a thread.
/// \param [in, out] values - reference to an array of values provided by a thread.
/// \param [in] storage - reference to a temporary storage object of type storage_type.
/// \param [in] begin_bit - [optional] index of the first (least significant) bit used in
/// key comparison. Must be in range <tt>[0; 8 * sizeof(Key))</tt>. Default value: \p 0.
/// Non-default value not supported for floating-point key-types.
/// \param [in] end_bit - [optional] past-the-end index (most significant) bit used in
/// key comparison. Must be in range <tt>(begin_bit; 8 * sizeof(Key)]</tt>. Default
/// value: \p <tt>8 * sizeof(Key)</tt>. Non-default value not supported for floating-point key-types.
///
/// \par Storage reusage
/// Synchronization barrier should be placed before \p storage is reused
/// or repurposed: \p __syncthreads() or \p rocprim::syncthreads().
///
/// \par Examples
/// \parblock
/// In the examples radix sort is performed on a block of 128 threads, each thread provides
/// two key-value <tt>int</tt>-<tt>float</tt> pairs, results are returned using the same
/// arrays as for input.
///
/// \code{.cpp}
/// __global__ void example_kernel(...)
/// {
/// // specialize block_radix_sort for int-float pairs, block of 128
/// // threads, and two items per thread
/// using block_rsort_ii = rocprim::block_radix_sort<int, 128, 2, float>;
/// // allocate storage in shared memory
/// __shared__ block_rsort_ii::storage_type storage;
///
/// int keys[2] = ...;
/// float values[2] = ...;
/// // execute block radix sort-by-key (descending)
/// block_rsort_ii().sort_desc(
/// keys, values,
/// storage
/// );
/// ...
/// }
/// \endcode
///
/// If the \p keys across threads in a block are <tt>{[1, 2], [3, 4] ..., [255, 256]}</tt> and
/// the \p values are <tt>{[128, 128], [127, 127] ..., [2, 2], [1, 1]}</tt>, then after sort
/// the \p keys will be equal <tt>{[256, 255], ..., [4, 3], [2, 1]}</tt> and the \p values
/// will be equal <tt>{[1, 1], [2, 2] ..., [128, 128]}</tt>.
/// \endparblock
template<bool WithValues = with_values>
ROCPRIM_DEVICE ROCPRIM_INLINE
void sort_desc(Key (&keys)[ItemsPerThread],
               typename std::enable_if<WithValues, Value>::type (&values)[ItemsPerThread],
               storage_type& storage,
               unsigned int begin_bit = 0,
               unsigned int end_bit = 8 * sizeof(Key))
{
    // Key-value, descending: both arrays go to sort_impl unchanged.
    constexpr bool descending = true;
    sort_impl<descending>(keys, values, storage, begin_bit, end_bit);
}
/// \overload
/// \brief Performs descending radix sort over key-value pairs partitioned across
/// threads in a block.
///
/// * This overload does not accept storage argument. Required shared memory is
/// allocated by the method itself.
///
/// \pre Method is enabled only if \p Value type is different than empty_type.
///
/// \param [in, out] keys - reference to an array of keys provided by a thread.
/// \param [in, out] values - reference to an array of values provided by a thread.
/// \param [in] begin_bit - [optional] index of the first (least significant) bit used in
/// key comparison. Must be in range <tt>[0; 8 * sizeof(Key))</tt>. Default value: \p 0.
/// Non-default value not supported for floating-point key-types.
/// \param [in] end_bit - [optional] past-the-end index (most significant) bit used in
/// key comparison. Must be in range <tt>(begin_bit; 8 * sizeof(Key)]</tt>. Default
/// value: \p <tt>8 * sizeof(Key)</tt>. Non-default value not supported for floating-point key-types.
template<bool WithValues = with_values>
ROCPRIM_DEVICE ROCPRIM_FORCE_INLINE
void sort_desc(Key (&keys)[ItemsPerThread],
               typename std::enable_if<WithValues, Value>::type (&values)[ItemsPerThread],
               unsigned int begin_bit = 0,
               unsigned int end_bit = 8 * sizeof(Key))
{
    // The temporary storage is declared here with ROCPRIM_SHARED_MEMORY and
    // handed to the key-value overload that takes a storage reference.
    ROCPRIM_SHARED_MEMORY storage_type local_storage;
    sort_desc(keys, values, local_storage, begin_bit, end_bit);
}
/// \brief Performs ascending radix sort over keys partitioned across threads in a block,
/// results are saved in a striped arrangement.
///
/// \param [in, out] keys - reference to an array of keys provided by a thread.
/// \param [in] storage - reference to a temporary storage object of type storage_type.
/// \param [in] begin_bit - [optional] index of the first (least significant) bit used in
/// key comparison. Must be in range <tt>[0; 8 * sizeof(Key))</tt>. Default value: \p 0.
/// Non-default value not supported for floating-point key-types.
/// \param [in] end_bit - [optional] past-the-end index (most significant) bit used in
/// key comparison. Must be in range <tt>(begin_bit; 8 * sizeof(Key)]</tt>. Default
/// value: \p <tt>8 * sizeof(Key)</tt>. Non-default value not supported for floating-point key-types.
///
/// \par Storage reusage
/// Synchronization barrier should be placed before \p storage is reused
/// or repurposed: \p __syncthreads() or \p rocprim::syncthreads().
///
/// \par Examples
/// \parblock
/// In the examples radix sort is performed on a block of 128 threads, each thread provides
/// two \p float values, results are returned using the same array as for input.
///
/// \code{.cpp}
/// __global__ void example_kernel(...)
/// {
/// // specialize block_radix_sort for float, block of 128 threads,
/// // and two items per thread; key-only sort
/// using block_rsort_float = rocprim::block_radix_sort<float, 128, 2>;
/// // allocate storage in shared memory
/// __shared__ block_rsort_float::storage_type storage;
///
/// float keys[2] = ...;
/// // execute block radix sort (ascending)
/// block_rsort_float().sort_to_striped(
/// keys,
/// storage
/// );
/// ...
/// }
/// \endcode
///
/// If the \p input values across threads in a block are <tt>{[256, 255], ..., [4, 3], [2, 1]}</tt>,
/// then after sort they will be equal <tt>{[1, 129], [2, 130] ..., [128, 256]}</tt>.
/// \endparblock
ROCPRIM_DEVICE ROCPRIM_INLINE
void sort_to_striped(Key (&keys)[ItemsPerThread],
                     storage_type& storage,
                     unsigned int begin_bit = 0,
                     unsigned int end_bit = 8 * sizeof(Key))
{
    // Keys-only, ascending, striped output. sort_impl takes a values array as
    // well, so an array of empty_type is passed in its place.
    constexpr bool descending = false;
    constexpr bool to_striped = true;
    empty_type placeholder_values[ItemsPerThread];
    sort_impl<descending, to_striped>(keys, placeholder_values, storage, begin_bit, end_bit);
}
/// \overload
/// \brief Performs ascending radix sort over keys partitioned across threads in a block,
/// results are saved in a striped arrangement.
///
/// * This overload does not accept storage argument. Required shared memory is
/// allocated by the method itself.
///
/// \param [in, out] keys - reference to an array of keys provided by a thread.
/// \param [in] begin_bit - [optional] index of the first (least significant) bit used in
/// key comparison. Must be in range <tt>[0; 8 * sizeof(Key))</tt>. Default value: \p 0.
/// Non-default value not supported for floating-point key-types.
/// \param [in] end_bit - [optional] past-the-end index (most significant) bit used in
/// key comparison. Must be in range <tt>(begin_bit; 8 * sizeof(Key)]</tt>. Default
/// value: \p <tt>8 * sizeof(Key)</tt>. Non-default value not supported for floating-point key-types.
ROCPRIM_DEVICE ROCPRIM_FORCE_INLINE
void sort_to_striped(Key (&keys)[ItemsPerThread],
                     unsigned int begin_bit = 0,
                     unsigned int end_bit = 8 * sizeof(Key))
{
    // The temporary storage is declared here with ROCPRIM_SHARED_MEMORY and
    // handed to the overload that takes a storage reference.
    ROCPRIM_SHARED_MEMORY storage_type local_storage;
    sort_to_striped(keys, local_storage, begin_bit, end_bit);
}
/// \brief Performs descending radix sort over keys partitioned across threads in a block,
/// results are saved in a striped arrangement.
///
/// \param [in, out] keys - reference to an array of keys provided by a thread.
/// \param [in] storage - reference to a temporary storage object of type storage_type.
/// \param [in] begin_bit - [optional] index of the first (least significant) bit used in
/// key comparison. Must be in range <tt>[0; 8 * sizeof(Key))</tt>. Default value: \p 0.
/// Non-default value not supported for floating-point key-types.
/// \param [in] end_bit - [optional] past-the-end index (most significant) bit used in
/// key comparison. Must be in range <tt>(begin_bit; 8 * sizeof(Key)]</tt>. Default
/// value: \p <tt>8 * sizeof(Key)</tt>. Non-default value not supported for floating-point key-types.
///
/// \par Storage reusage
/// Synchronization barrier should be placed before \p storage is reused
/// or repurposed: \p __syncthreads() or \p rocprim::syncthreads().
///
/// \par Examples
/// \parblock
/// In the examples radix sort is performed on a block of 128 threads, each thread provides
/// two \p float values, results are returned using the same array as for input.
///
/// \code{.cpp}
/// __global__ void example_kernel(...)
/// {
/// // specialize block_radix_sort for float, block of 128 threads,
/// // and two items per thread; key-only sort
/// using block_rsort_float = rocprim::block_radix_sort<float, 128, 2>;
/// // allocate storage in shared memory
/// __shared__ block_rsort_float::storage_type storage;
///
/// float input[2] = ...;
/// // execute block radix sort (descending)
/// block_rsort_float().sort_desc_to_striped(
/// input,
/// storage
/// );
/// ...
/// }
/// \endcode
///
/// If the \p input values across threads in a block are <tt>{[1, 2], [3, 4] ..., [255, 256]}</tt>,
/// then after sort they will be equal <tt>{[256, 128], ..., [130, 2], [129, 1]}</tt>.
/// \endparblock
ROCPRIM_DEVICE ROCPRIM_INLINE
void sort_desc_to_striped(Key (&keys)[ItemsPerThread],
                          storage_type& storage,
                          unsigned int begin_bit = 0,
                          unsigned int end_bit = 8 * sizeof(Key))
{
    // Keys-only, descending, striped output. sort_impl takes a values array as
    // well, so an array of empty_type is passed in its place.
    constexpr bool descending = true;
    constexpr bool to_striped = true;
    empty_type placeholder_values[ItemsPerThread];
    sort_impl<descending, to_striped>(keys, placeholder_values, storage, begin_bit, end_bit);
}
/// \overload
/// \brief Performs descending radix sort over keys partitioned across threads in a block,
/// results are saved in a striped arrangement.
///
/// * This overload does not accept storage argument. Required shared memory is
/// allocated by the method itself.
///
/// \param [in, out] keys - reference to an array of keys provided by a thread.
/// \param [in] begin_bit - [optional] index of the first (least significant) bit used in
/// key comparison. Must be in range <tt>[0; 8 * sizeof(Key))</tt>. Default value: \p 0.
/// Non-default value not supported for floating-point key-types.
/// \param [in] end_bit - [optional] past-the-end index (most significant) bit used in
/// key comparison. Must be in range <tt>(begin_bit; 8 * sizeof(Key)]</tt>. Default
/// value: \p <tt>8 * sizeof(Key)</tt>. Non-default value not supported for floating-point key-types.
ROCPRIM_DEVICE ROCPRIM_FORCE_INLINE
void sort_desc_to_striped(Key (&keys)[ItemsPerThread],
                          unsigned int begin_bit = 0,
                          unsigned int end_bit = 8 * sizeof(Key))
{
    // The temporary storage is declared here with ROCPRIM_SHARED_MEMORY and
    // handed to the overload that takes a storage reference.
    ROCPRIM_SHARED_MEMORY storage_type local_storage;
    sort_desc_to_striped(keys, local_storage, begin_bit, end_bit);
}
/// \brief Performs ascending radix sort over key-value pairs partitioned across
/// threads in a block, results are saved in a striped arrangement.
///
/// \pre Method is enabled only if \p Value type is different than empty_type.
///
/// \param [in, out] keys - reference to an array of keys provided by a thread.
/// \param [in, out] values - reference to an array of values provided by a thread.
/// \param [in] storage - reference to a temporary storage object of type storage_type.
/// \param [in] begin_bit - [optional] index of the first (least significant) bit used in
/// key comparison. Must be in range <tt>[0; 8 * sizeof(Key))</tt>. Default value: \p 0.
/// Non-default value not supported for floating-point key-types.
/// \param [in] end_bit - [optional] past-the-end index (most significant) bit used in
/// key comparison. Must be in range <tt>(begin_bit; 8 * sizeof(Key)]</tt>. Default
/// value: \p <tt>8 * sizeof(Key)</tt>. Non-default value not supported for floating-point key-types.
///
/// \par Storage reusage
/// Synchronization barrier should be placed before \p storage is reused
/// or repurposed: \p __syncthreads() or \p rocprim::syncthreads().
///
/// \par Examples
/// \parblock
/// In the examples radix sort is performed on a block of 4 threads, each thread provides
/// two key-value <tt>int</tt>-<tt>float</tt> pairs, results are returned using the same
/// arrays as for input.
///
/// \code{.cpp}
/// __global__ void example_kernel(...)
/// {
/// // specialize block_radix_sort for int-float pairs, block of 4
/// // threads, and two items per thread
/// using block_rsort_ii = rocprim::block_radix_sort<int, 4, 2, float>;
/// // allocate storage in shared memory
/// __shared__ block_rsort_ii::storage_type storage;
///
/// int keys[2] = ...;
/// float values[2] = ...;
/// // execute block radix sort-by-key (ascending)
/// block_rsort_ii().sort_to_striped(
/// keys, values,
/// storage
/// );
/// ...
/// }
/// \endcode
///
/// If the \p keys across threads in a block are <tt>{[8, 7], [6, 5], [4, 3], [2, 1]}</tt> and
/// the \p values are <tt>{[-1, -2], [-3, -4], [-5, -6], [-7, -8]}</tt>, then after sort the
/// \p keys will be equal <tt>{[1, 5], [2, 6], [3, 7], [4, 8]}</tt> and the \p values will be
/// equal <tt>{[-8, -4], [-7, -3], [-6, -2], [-5, -1]}</tt>.
/// \endparblock
template<bool WithValues = with_values>
ROCPRIM_DEVICE ROCPRIM_INLINE
void sort_to_striped(Key (&keys)[ItemsPerThread],
                     typename std::enable_if<WithValues, Value>::type (&values)[ItemsPerThread],
                     storage_type& storage,
                     unsigned int begin_bit = 0,
                     unsigned int end_bit = 8 * sizeof(Key))
{
    // Key-value, ascending, striped output: both arrays go to sort_impl unchanged.
    constexpr bool descending = false;
    constexpr bool to_striped = true;
    sort_impl<descending, to_striped>(keys, values, storage, begin_bit, end_bit);
}
/// \overload
/// \brief Sorts key-value pairs held by the threads of a block in ascending key order
/// using radix sort; the results are left in a striped arrangement.
///
/// * This overload takes no storage argument. The temporary storage is declared
/// inside the method and passed on to the overload that accepts storage.
///
/// \param [in, out] keys - reference to an array of keys provided by a thread.
/// \param [in, out] values - reference to an array of values provided by a thread.
/// \param [in] begin_bit - [optional] index of the first (least significant) bit used in
/// key comparison. Must be in range <tt>[0; 8 * sizeof(Key))</tt>. Default value: \p 0.
/// Non-default value not supported for floating-point key-types.
/// \param [in] end_bit - [optional] past-the-end index (most significant) bit used in
/// key comparison. Must be in range <tt>(begin_bit; 8 * sizeof(Key)]</tt>. Default
/// value: \p <tt>8 * sizeof(Key)</tt>. Non-default value not supported for floating-point key-types.
template<bool WithValues = with_values>
ROCPRIM_DEVICE ROCPRIM_FORCE_INLINE
void sort_to_striped(Key (&keys)[ItemsPerThread],
                     typename std::enable_if<WithValues, Value>::type (&values)[ItemsPerThread],
                     unsigned int begin_bit = 0,
                     unsigned int end_bit = 8 * sizeof(Key))
{
    // Scratch storage owned by this call; forwarded to the storage-taking overload.
    ROCPRIM_SHARED_MEMORY storage_type shared_storage;
    sort_to_striped(keys, values, shared_storage, begin_bit, end_bit);
}
/// \brief Sorts key-value pairs held by the threads of a block in descending key order
/// using radix sort; the results are left in a striped arrangement.
///
/// \pre Method is enabled only if \p Value type is different than empty_type.
///
/// \param [in, out] keys - reference to an array of keys provided by a thread.
/// \param [in, out] values - reference to an array of values provided by a thread.
/// \param [in] storage - reference to a temporary storage object of type storage_type.
/// \param [in] begin_bit - [optional] index of the first (least significant) bit used in
/// key comparison. Must be in range <tt>[0; 8 * sizeof(Key))</tt>. Default value: \p 0.
/// Non-default value not supported for floating-point key-types.
/// \param [in] end_bit - [optional] past-the-end index (most significant) bit used in
/// key comparison. Must be in range <tt>(begin_bit; 8 * sizeof(Key)]</tt>. Default
/// value: \p <tt>8 * sizeof(Key)</tt>. Non-default value not supported for floating-point key-types.
///
/// \par Storage reusage
/// A synchronization barrier (\p __syncthreads() or \p rocprim::syncthreads()) should be
/// placed before \p storage is reused or repurposed.
///
/// \par Examples
/// \parblock
/// In the example radix sort is performed on a block of 4 threads, each thread provides
/// two key-value <tt>int</tt>-<tt>float</tt> pairs, results are returned using the same
/// arrays as for input.
///
/// \code{.cpp}
/// __global__ void example_kernel(...)
/// {
/// // specialize block_radix_sort for int-float pairs, block of 4
/// // threads, and two items per thread
/// using block_rsort_if = rocprim::block_radix_sort<int, 4, 2, float>;
/// // allocate storage in shared memory
/// __shared__ block_rsort_if::storage_type storage;
///
/// int keys[2] = ...;
/// float values[2] = ...;
/// // execute block radix sort-by-key (descending)
/// block_rsort_if().sort_desc_to_striped(
/// keys, values,
/// storage
/// );
/// ...
/// }
/// \endcode
///
/// If the \p keys across threads in a block are <tt>{[1, 2], [3, 4], [5, 6], [7, 8]}</tt> and
/// the \p values are <tt>{[80, 70], [60, 50], [40, 30], [20, 10]}</tt>, then after sort the
/// \p keys will be equal <tt>{[8, 4], [7, 3], [6, 2], [5, 1]}</tt> and the \p values will be
/// equal <tt>{[10, 50], [20, 60], [30, 70], [40, 80]}</tt>.
/// \endparblock
template<bool WithValues = with_values>
ROCPRIM_DEVICE ROCPRIM_INLINE
void sort_desc_to_striped(Key (&keys)[ItemsPerThread],
                          typename std::enable_if<WithValues, Value>::type (&values)[ItemsPerThread],
                          storage_type& storage,
                          unsigned int begin_bit = 0,
                          unsigned int end_bit = 8 * sizeof(Key))
{
    // Template switches handed to sort_impl: descending order (Descending = true),
    // striped output arrangement (ToStriped = true).
    constexpr bool descending = true;
    constexpr bool to_striped = true;
    sort_impl<descending, to_striped>(keys, values, storage, begin_bit, end_bit);
}
/// \overload
/// \brief Sorts key-value pairs held by the threads of a block in descending key order
/// using radix sort; the results are left in a striped arrangement.
///
/// * This overload takes no storage argument. The temporary storage is declared
/// inside the method and passed on to the overload that accepts storage.
///
/// \param [in, out] keys - reference to an array of keys provided by a thread.
/// \param [in, out] values - reference to an array of values provided by a thread.
/// \param [in] begin_bit - [optional] index of the first (least significant) bit used in
/// key comparison. Must be in range <tt>[0; 8 * sizeof(Key))</tt>. Default value: \p 0.
/// \param [in] end_bit - [optional] past-the-end index (most significant) bit used in
/// key comparison. Must be in range <tt>(begin_bit; 8 * sizeof(Key)]</tt>. Default
/// value: \p <tt>8 * sizeof(Key)</tt>.
template<bool WithValues = with_values>
ROCPRIM_DEVICE ROCPRIM_FORCE_INLINE
void sort_desc_to_striped(Key (&keys)[ItemsPerThread],
                          typename std::enable_if<WithValues, Value>::type (&values)[ItemsPerThread],
                          unsigned int begin_bit = 0,
                          unsigned int end_bit = 8 * sizeof(Key))
{
    // Scratch storage owned by this call; forwarded to the storage-taking overload.
    ROCPRIM_SHARED_MEMORY storage_type shared_storage;
    sort_desc_to_striped(keys, values, shared_storage, begin_bit, end_bit);
}
private:
// Shared implementation behind the public sort methods.
//
// Descending  - forwarded to radix_key_codec, which encodes/decodes the keys.
// ToStriped   - when true, keys and values go through the to_striped_* helpers
//               after the last pass.
// SortedValue - element type of the values array; the exchange_values and
//               to_striped_values overloads taking empty_type do nothing.
//
// The sort runs one pass per bit in [begin_bit, end_bit); every pass moves
// the items to positions derived from the current bit of their encoded key.
template<bool Descending, bool ToStriped = false, class SortedValue>
ROCPRIM_DEVICE ROCPRIM_INLINE
void sort_impl(Key (&keys)[ItemsPerThread],
               SortedValue (&values)[ItemsPerThread],
               storage_type& storage,
               unsigned int begin_bit,
               unsigned int end_bit)
{
    using key_codec = ::rocprim::detail::radix_key_codec<Key, Descending>;

    storage_type_& storage_ = storage.get();
    const unsigned int flat_id = ::rocprim::flat_block_thread_id<BlockSizeX, BlockSizeY, BlockSizeZ>();

    // Work on the codec's bit representation of the keys until the very end.
    bit_key_type encoded[ItemsPerThread];
    for(unsigned int item = 0; item < ItemsPerThread; ++item)
    {
        encoded[item] = key_codec::encode(keys[item]);
    }

    // Binary digits: each pass looks at a single bit (digit is 0 or 1).
    for(unsigned int pass_bit = begin_bit; pass_bit < end_bit; ++pass_bit)
    {
        unsigned int digits[ItemsPerThread];
        for(unsigned int item = 0; item < ItemsPerThread; ++item)
        {
            digits[item] = key_codec::extract_digit(encoded[item], pass_bit, 1);
        }

        unsigned int ranks[ItemsPerThread];
        #ifdef __HIP_CPU_RT__
        // TODO: Check if really necessary
        // Initialize contents, as non-hipcc compilers don't unconditionally zero out allocated memory
        std::memset(ranks, 0, sizeof(ranks));
        #endif

        // exclusive_scan fills ranks and count; the scan helper is defined
        // outside this view - presumably ranks[i] is the number of 1-digits
        // preceding item i and count the block-wide total of 1-digits.
        unsigned int count;
        bit_block_scan().exclusive_scan(digits, ranks, count, storage_.bit_block_scan);

        // Items with digit 1 are placed after all items with digit 0.
        const unsigned int start = BlockSize * ItemsPerThread - count;
        for(unsigned int item = 0; item < ItemsPerThread; ++item)
        {
            if(digits[item] != 0)
            {
                ranks[item] = start + ranks[item];
            }
            else
            {
                // Position of a 0-digit item: its blocked index minus the
                // scan result for that item.
                ranks[item] = flat_id * ItemsPerThread + item - ranks[item];
            }
        }

        // Keys first, then values; both helpers take the same ranks.
        exchange_keys(storage, encoded, ranks);
        exchange_values(storage, values, ranks);
    }

    if(ToStriped)
    {
        to_striped_keys(storage, encoded);
        to_striped_values(storage, values);
    }

    // Convert the bit representation back into the caller's key type.
    for(unsigned int item = 0; item < ItemsPerThread; ++item)
    {
        keys[item] = key_codec::decode(encoded[item]);
    }
}
// Moves the encoded keys of this thread to the positions given by ranks.
// The exchange uses the bit_keys_exchange member of the storage object.
// bit_keys is passed as both the first and the second argument of
// scatter_to_blocked (presumably input and output, i.e. an in-place exchange;
// the exchange type is defined outside this view).
ROCPRIM_DEVICE ROCPRIM_INLINE
void exchange_keys(storage_type& storage,
bit_key_type (&bit_keys)[ItemsPerThread],
const unsigned int (&ranks)[ItemsPerThread])
{
storage_type_& storage_ = storage.get();
// Synchronization is omitted here because bit_block_scan already calls it
bit_keys_exchange_type().scatter_to_blocked(bit_keys, bit_keys, ranks, storage_.bit_keys_exchange);
}
// Moves the values of this thread to the positions given by ranks, using the
// values_exchange member of the storage object. values is passed as both the
// first and the second argument of scatter_to_blocked (presumably input and
// output; the exchange type is defined outside this view).
// A block-wide barrier is issued before the exchange touches the storage.
template<class SortedValue>
ROCPRIM_DEVICE ROCPRIM_INLINE
void exchange_values(storage_type& storage,
SortedValue (&values)[ItemsPerThread],
const unsigned int (&ranks)[ItemsPerThread])
{
storage_type_& storage_ = storage.get();
::rocprim::syncthreads(); // Storage will be reused (union), synchronization is needed
values_exchange_type().scatter_to_blocked(values, values, ranks, storage_.values_exchange);
}
// Overload chosen when the values array holds empty_type elements:
// nothing is exchanged, the arguments are only marked as used.
ROCPRIM_DEVICE ROCPRIM_INLINE
void exchange_values(storage_type& storage,
                     empty_type (&values)[ItemsPerThread],
                     const unsigned int (&ranks)[ItemsPerThread])
{
    static_cast<void>(storage);
    static_cast<void>(values);
    static_cast<void>(ranks);
}
// Rearranges the encoded keys through blocked_to_striped, using the
// bit_keys_exchange member of the storage object. bit_keys is passed as both
// the first and the second argument (presumably input and output).
// A block-wide barrier is issued before the storage is touched.
ROCPRIM_DEVICE ROCPRIM_INLINE
void to_striped_keys(storage_type& storage,
bit_key_type (&bit_keys)[ItemsPerThread])
{
storage_type_& storage_ = storage.get();
::rocprim::syncthreads();
bit_keys_exchange_type().blocked_to_striped(bit_keys, bit_keys, storage_.bit_keys_exchange);
}
// Rearranges the values through blocked_to_striped, using the values_exchange
// member of the storage object. values is passed as both the first and the
// second argument (presumably input and output).
// A block-wide barrier is issued before the storage is touched.
template<class SortedValue>
ROCPRIM_DEVICE ROCPRIM_INLINE
void to_striped_values(storage_type& storage,
SortedValue (&values)[ItemsPerThread])
{
storage_type_& storage_ = storage.get();
::rocprim::syncthreads(); // Storage will be reused (union), synchronization is needed
values_exchange_type().blocked_to_striped(values, values, storage_.values_exchange);
}
// Overload chosen when the values are of empty_type: nothing is rearranged,
// the arguments are only marked as used.
ROCPRIM_DEVICE ROCPRIM_INLINE
void to_striped_values(storage_type& storage,
                       empty_type * values)
{
    static_cast<void>(storage);
    static_cast<void>(values);
}
};
END_ROCPRIM_NAMESPACE
/// @}
// end of group blockmodule
#endif // ROCPRIM_BLOCK_BLOCK_RADIX_SORT_HPP_
// Copyright (c) 2017-2021 Advanced Micro Devices, Inc. All rights reserved.
//
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to deal
// in the Software without restriction, including without limitation the rights
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
// copies of the Software, and to permit persons to whom the Software is
// furnished to do so, subject to the following conditions:
//
// The above copyright notice and this permission notice shall be included in
// all copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
// THE SOFTWARE.
#ifndef ROCPRIM_BLOCK_BLOCK_REDUCE_HPP_
#define ROCPRIM_BLOCK_BLOCK_REDUCE_HPP_
#include <type_traits>
#include "../config.hpp"
#include "../detail/various.hpp"
#include "../intrinsics.hpp"
#include "../functional.hpp"
#include "detail/block_reduce_warp_reduce.hpp"
#include "detail/block_reduce_raking_reduce.hpp"
/// \addtogroup blockmodule
/// @{
BEGIN_ROCPRIM_NAMESPACE
/// \brief Algorithms that can be selected for the block_reduce primitive.
enum class block_reduce_algorithm
{
    /// \brief Algorithm built on top of warp_reduce.
    using_warp_reduce = 0,
    /// \brief Algorithm that limits calculations to a single hardware warp.
    raking_reduce = 1,
    /// \brief Raking reduce variant that supports only commutative operators.
    raking_reduce_commutative_only = 2,
    /// \brief Algorithm used by block_reduce when none is specified.
    default_algorithm = using_warp_reduce,
};
namespace detail
{

// Maps a block_reduce_algorithm enumerator to the class template implementing
// it. The primary template is only declared; each supported enumerator has an
// explicit specialization exposing the implementation as the alias `type`.
template<block_reduce_algorithm Algorithm>
struct select_block_reduce_impl;

// using_warp_reduce -> block_reduce_warp_reduce
template<>
struct select_block_reduce_impl<block_reduce_algorithm::using_warp_reduce>
{
    template<typename T, unsigned int BlockSizeX, unsigned int BlockSizeY, unsigned int BlockSizeZ>
    using type = block_reduce_warp_reduce<T, BlockSizeX, BlockSizeY, BlockSizeZ>;
};

// raking_reduce -> block_reduce_raking_reduce
template<>
struct select_block_reduce_impl<block_reduce_algorithm::raking_reduce>
{
    template<typename T, unsigned int BlockSizeX, unsigned int BlockSizeY, unsigned int BlockSizeZ>
    using type = block_reduce_raking_reduce<T, BlockSizeX, BlockSizeY, BlockSizeZ>;
};

// raking_reduce_commutative_only -> block_reduce_raking_reduce with `true` as
// the trailing template argument (presumably the commutative-only switch).
template<>
struct select_block_reduce_impl<block_reduce_algorithm::raking_reduce_commutative_only>
{
    template<typename T, unsigned int BlockSizeX, unsigned int BlockSizeY, unsigned int BlockSizeZ>
    using type = block_reduce_raking_reduce<T, BlockSizeX, BlockSizeY, BlockSizeZ, true>;
};

} // end namespace detail
/// \brief The block_reduce class is a block level parallel primitive which provides methods
/// for performing reduction operations on items partitioned across threads in a block.
///
/// \tparam T - the input/output type.
/// \tparam BlockSizeX - the number of threads in a block's x dimension.
/// \tparam Algorithm - selected reduce algorithm, block_reduce_algorithm::default_algorithm by default.
/// \tparam BlockSizeY - the number of threads in a block's y dimension, defaults to 1.
/// \tparam BlockSizeZ - the number of threads in a block's z dimension, defaults to 1.
///
/// \par Overview
/// * Supports non-commutative reduce operators. However, a reduce operator should be
/// associative. When used with non-associative functions the results may be non-deterministic
/// and/or vary in precision.
/// * Computation can be more efficient when:
/// * \p ItemsPerThread is greater than one,
/// * \p T is an arithmetic type,
/// * reduce operation is simple addition operator, and
/// * the number of threads in the block is a multiple of the hardware warp size (see rocprim::device_warp_size()).
/// * block_reduce has three alternative implementations: \p block_reduce_algorithm::using_warp_reduce,
/// block_reduce_algorithm::raking_reduce and block_reduce_algorithm::raking_reduce_commutative_only.
/// * If the block size is less than 64, only one warp reduction is used. When the block size is
/// larger than the warp size, the block reduction algorithm stores the result only in the
/// first thread (lane_id = 0, warp_id = 0).
///
/// \par Examples
/// \parblock
/// In the examples reduce operation is performed on block of 192 threads, each provides
/// one \p int value, result is returned using the same variable as for input.
///
/// \code{.cpp}
/// __global__ void example_kernel(...)
/// {
/// // specialize block_reduce for int and block of 192 threads
/// using block_reduce_int = rocprim::block_reduce<int, 192>;
/// // allocate storage in shared memory
/// __shared__ block_reduce_int::storage_type storage;
///
/// int value = ...;
/// // execute reduce
/// block_reduce_int().reduce(
/// value, // input
/// value, // output
/// storage
/// );
/// ...
/// }
/// \endcode
/// \endparblock
template<
class T,
unsigned int BlockSizeX,
block_reduce_algorithm Algorithm = block_reduce_algorithm::default_algorithm,
unsigned int BlockSizeY = 1,
unsigned int BlockSizeZ = 1
>
class block_reduce
#ifndef DOXYGEN_SHOULD_SKIP_THIS
: private detail::select_block_reduce_impl<Algorithm>::template type<T, BlockSizeX, BlockSizeY, BlockSizeZ>
#endif
{
// Implementation class chosen by Algorithm; every public method below forwards to it.
using base_type = typename detail::select_block_reduce_impl<Algorithm>::template type<T, BlockSizeX, BlockSizeY, BlockSizeZ>;
public:
/// \brief Struct used to allocate a temporary memory that is required for thread
/// communication during operations provided by related parallel primitive.
///
/// Depending on the implementation the operations exposed by parallel primitive may
/// require a temporary storage for thread communication. The storage should be allocated
/// using keywords <tt>__shared__</tt>. It can be aliased to
/// an externally allocated memory, or be a part of a union type with other storage types
/// to increase shared memory reusability.
using storage_type = typename base_type::storage_type;
/// \brief Performs reduction across threads in a block.
///
/// \tparam BinaryFunction - type of binary function used for reduce. Default type
/// is rocprim::plus<T>.
///
/// \param [in] input - thread input value.
/// \param [out] output - reference to a thread output value. May be aliased with \p input.
/// \param [in] storage - reference to a temporary storage object of type storage_type.
/// \param [in] reduce_op - binary operation function object that will be used for reduce.
/// The signature of the function should be equivalent to the following:
/// <tt>T f(const T &a, const T &b);</tt>. The signature does not need to have
/// <tt>const &</tt>, but function object must not modify the objects passed to it.
///
/// \par Storage reusage
/// Synchronization barrier should be placed before \p storage is reused
/// or repurposed: \p __syncthreads() or \p rocprim::syncthreads().
///
/// \par Examples
/// \parblock
/// The examples present min reduce operations performed on a block of 256 threads,
/// each provides one \p float value.
///
/// \code{.cpp}
/// __global__ void example_kernel(...) // blockDim.x = 256
/// {
/// // specialize block_reduce for float and block of 256 threads
/// using block_reduce_float = rocprim::block_reduce<float, 256>;
/// // allocate storage in shared memory for the block
/// __shared__ block_reduce_float::storage_type storage;
///
/// float input = ...;
/// float output;
/// // execute min reduce
/// block_reduce_float().reduce(
/// input,
/// output,
/// storage,
/// rocprim::minimum<float>()
/// );
/// ...
/// }
/// \endcode
///
/// If the \p input values across threads in a block are <tt>{1, -2, 3, -4, ..., 255, -256}</tt>, then
/// \p output value will be <tt>{-256}</tt>.
/// \endparblock
template<class BinaryFunction = ::rocprim::plus<T>>
ROCPRIM_DEVICE ROCPRIM_INLINE
void reduce(T input,
T& output,
storage_type& storage,
BinaryFunction reduce_op = BinaryFunction())
{
base_type::reduce(input, output, storage, reduce_op);
}
/// \overload
/// \brief Performs reduction across threads in a block.
///
/// * This overload does not accept storage argument. The call is forwarded to the
/// storage-less overload of the selected implementation, which provides the
/// required shared memory itself.
///
/// \tparam BinaryFunction - type of binary function used for reduce. Default type
/// is rocprim::plus<T>.
///
/// \param [in] input - thread input value.
/// \param [out] output - reference to a thread output value. May be aliased with \p input.
/// \param [in] reduce_op - binary operation function object that will be used for reduce.
/// The signature of the function should be equivalent to the following:
/// <tt>T f(const T &a, const T &b);</tt>. The signature does not need to have
/// <tt>const &</tt>, but function object must not modify the objects passed to it.
template<class BinaryFunction = ::rocprim::plus<T>>
ROCPRIM_DEVICE ROCPRIM_FORCE_INLINE
void reduce(T input,
T& output,
BinaryFunction reduce_op = BinaryFunction())
{
base_type::reduce(input, output, reduce_op);
}
/// \brief Performs reduction across threads in a block.
///
/// \tparam ItemsPerThread - number of items in the \p input array.
/// \tparam BinaryFunction - type of binary function used for reduce. Default type
/// is rocprim::plus<T>.
///
/// \param [in] input - reference to an array containing thread input values.
/// \param [out] output - reference to a thread output value.
/// \param [in] storage - reference to a temporary storage object of type storage_type.
/// \param [in] reduce_op - binary operation function object that will be used for reduce.
/// The signature of the function should be equivalent to the following:
/// <tt>T f(const T &a, const T &b);</tt>. The signature does not need to have
/// <tt>const &</tt>, but function object must not modify the objects passed to it.
///
/// \par Storage reusage
/// Synchronization barrier should be placed before \p storage is reused
/// or repurposed: \p __syncthreads() or \p rocprim::syncthreads().
///
/// \par Examples
/// \parblock
/// The examples present maximum reduce operations performed on a block of 128 threads,
/// each provides two \p long values.
///
/// \code{.cpp}
/// __global__ void example_kernel(...) // blockDim.x = 128
/// {
/// // specialize block_reduce for long and block of 128 threads
/// using block_reduce_long = rocprim::block_reduce<long, 128>;
/// // allocate storage in shared memory for the block
/// __shared__ block_reduce_long::storage_type storage;
///
/// long input[2] = ...;
/// long output;
/// // execute max reduce
/// block_reduce_long().reduce(
/// input,
/// output,
/// storage,
/// rocprim::maximum<long>()
/// );
/// ...
/// }
/// \endcode
///
/// If the \p input values across threads in a block are <tt>{-1, 2, -3, 4, ..., -255, 256}</tt>, then
/// \p output value will be <tt>{256}</tt>.
/// \endparblock
template<
unsigned int ItemsPerThread,
class BinaryFunction = ::rocprim::plus<T>
>
ROCPRIM_DEVICE ROCPRIM_INLINE
void reduce(T (&input)[ItemsPerThread],
T& output,
storage_type& storage,
BinaryFunction reduce_op = BinaryFunction())
{
base_type::reduce(input, output, storage, reduce_op);
}
/// \overload
/// \brief Performs reduction across threads in a block.
///
/// * This overload does not accept storage argument. The call is forwarded to the
/// storage-less overload of the selected implementation, which provides the
/// required shared memory itself.
///
/// \tparam ItemsPerThread - number of items in the \p input array.
/// \tparam BinaryFunction - type of binary function used for reduce. Default type
/// is rocprim::plus<T>.
///
/// \param [in] input - reference to an array containing thread input values.
/// \param [out] output - reference to a thread output value.
/// \param [in] reduce_op - binary operation function object that will be used for reduce.
/// The signature of the function should be equivalent to the following:
/// <tt>T f(const T &a, const T &b);</tt>. The signature does not need to have
/// <tt>const &</tt>, but function object must not modify the objects passed to it.
template<
unsigned int ItemsPerThread,
class BinaryFunction = ::rocprim::plus<T>
>
ROCPRIM_DEVICE ROCPRIM_FORCE_INLINE
void reduce(T (&input)[ItemsPerThread],
T& output,
BinaryFunction reduce_op = BinaryFunction())
{
base_type::reduce(input, output, reduce_op);
}
/// \brief Performs reduction across threads in a block.
///
/// \tparam BinaryFunction - type of binary function used for reduce. Default type
/// is rocprim::plus<T>.
///
/// \param [in] input - thread input value.
/// \param [out] output - reference to a thread output value. May be aliased with \p input.
/// \param [in] valid_items - number of items that will be reduced in the block.
/// \param [in] storage - reference to a temporary storage object of type storage_type.
/// \param [in] reduce_op - binary operation function object that will be used for reduce.
/// The signature of the function should be equivalent to the following:
/// <tt>T f(const T &a, const T &b);</tt>. The signature does not need to have
/// <tt>const &</tt>, but function object must not modify the objects passed to it.
///
/// \par Storage reusage
/// Synchronization barrier should be placed before \p storage is reused
/// or repurposed: \p __syncthreads() or \p rocprim::syncthreads().
///
/// \par Examples
/// \parblock
/// The examples present min reduce operations performed on a block of 256 threads,
/// each provides one \p float value.
///
/// \code{.cpp}
/// __global__ void example_kernel(...) // blockDim.x = 256
/// {
/// // specialize block_reduce for float and block of 256 threads
/// using block_reduce_float = rocprim::block_reduce<float, 256>;
/// // allocate storage in shared memory for the block
/// __shared__ block_reduce_float::storage_type storage;
///
/// float input = ...;
/// unsigned int valid_items = 250;
/// float output;
/// // execute min reduce
/// block_reduce_float().reduce(
/// input,
/// output,
/// valid_items,
/// storage,
/// rocprim::minimum<float>()
/// );
/// ...
/// }
/// \endcode
/// \endparblock
template<class BinaryFunction = ::rocprim::plus<T>>
ROCPRIM_DEVICE ROCPRIM_INLINE
void reduce(T input,
T& output,
unsigned int valid_items,
storage_type& storage,
BinaryFunction reduce_op = BinaryFunction())
{
base_type::reduce(input, output, valid_items, storage, reduce_op);
}
/// \overload
/// \brief Performs reduction across threads in a block.
///
/// * This overload does not accept storage argument. The call is forwarded to the
/// storage-less overload of the selected implementation, which provides the
/// required shared memory itself.
///
/// \tparam BinaryFunction - type of binary function used for reduce. Default type
/// is rocprim::plus<T>.
///
/// \param [in] input - thread input value.
/// \param [out] output - reference to a thread output value. May be aliased with \p input.
/// \param [in] valid_items - number of items that will be reduced in the block.
/// \param [in] reduce_op - binary operation function object that will be used for reduce.
/// The signature of the function should be equivalent to the following:
/// <tt>T f(const T &a, const T &b);</tt>. The signature does not need to have
/// <tt>const &</tt>, but function object must not modify the objects passed to it.
template<class BinaryFunction = ::rocprim::plus<T>>
ROCPRIM_DEVICE ROCPRIM_FORCE_INLINE
void reduce(T input,
T& output,
unsigned int valid_items,
BinaryFunction reduce_op = BinaryFunction())
{
base_type::reduce(input, output, valid_items, reduce_op);
}
};
END_ROCPRIM_NAMESPACE
/// @}
// end of group blockmodule
#endif // ROCPRIM_BLOCK_BLOCK_REDUCE_HPP_
// Copyright (c) 2017-2021 Advanced Micro Devices, Inc. All rights reserved.
//
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to deal
// in the Software without restriction, including without limitation the rights
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
// copies of the Software, and to permit persons to whom the Software is
// furnished to do so, subject to the following conditions:
//
// The above copyright notice and this permission notice shall be included in
// all copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
// THE SOFTWARE.
#ifndef ROCPRIM_BLOCK_BLOCK_SCAN_HPP_
#define ROCPRIM_BLOCK_BLOCK_SCAN_HPP_
#include <type_traits>
#include "../config.hpp"
#include "../detail/various.hpp"
#include "../intrinsics.hpp"
#include "../functional.hpp"
#include "detail/block_scan_warp_scan.hpp"
#include "detail/block_scan_reduce_then_scan.hpp"
/// \addtogroup blockmodule
/// @{
BEGIN_ROCPRIM_NAMESPACE
/// \brief Algorithms that can be selected for the block_scan primitive.
enum class block_scan_algorithm
{
    /// \brief Algorithm built on top of warp_scan.
    using_warp_scan = 0,
    /// \brief Algorithm that limits calculations to a single hardware warp.
    reduce_then_scan = 1,
    /// \brief Algorithm used by block_scan when none is specified.
    default_algorithm = using_warp_scan,
};
namespace detail
{

// Maps a block_scan_algorithm enumerator to the class template implementing
// it. The primary template is only declared; each supported enumerator has an
// explicit specialization exposing the implementation as the alias `type`.
template<block_scan_algorithm Algorithm>
struct select_block_scan_impl;

// using_warp_scan -> block_scan_warp_scan
template<>
struct select_block_scan_impl<block_scan_algorithm::using_warp_scan>
{
    template<typename T, unsigned int BlockSizeX, unsigned int BlockSizeY, unsigned int BlockSizeZ>
    using type = block_scan_warp_scan<T, BlockSizeX, BlockSizeY, BlockSizeZ>;
};

// reduce_then_scan -> block_scan_reduce_then_scan, except for small blocks.
// When the total block size does not exceed the hardware warp size,
// block_scan_warp_scan performs better than block_scan_reduce_then_scan by
// specializing for warps, so it is selected instead.
template<>
struct select_block_scan_impl<block_scan_algorithm::reduce_then_scan>
{
    template<typename T, unsigned int BlockSizeX, unsigned int BlockSizeY, unsigned int BlockSizeZ>
    using type = typename std::conditional<
        (BlockSizeX * BlockSizeY * BlockSizeZ <= ::rocprim::device_warp_size()),
        block_scan_warp_scan<T, BlockSizeX, BlockSizeY, BlockSizeZ>,
        block_scan_reduce_then_scan<T, BlockSizeX, BlockSizeY, BlockSizeZ>
    >::type;
};

} // end namespace detail
/// \brief The block_scan class is a block level parallel primitive which provides methods
/// for performing inclusive and exclusive scan operations of items partitioned across
/// threads in a block.
///
/// \tparam T - the input/output type.
/// \tparam BlockSizeX - the number of threads in a block's x dimension.
/// \tparam Algorithm - selected scan algorithm, block_scan_algorithm::default_algorithm by default.
/// \tparam BlockSizeY - the number of threads in a block's y dimension, defaults to 1.
/// \tparam BlockSizeZ - the number of threads in a block's z dimension, defaults to 1.
///
/// \par Overview
/// * Supports non-commutative scan operators. However, a scan operator should be
/// associative. When used with non-associative functions the results may be non-deterministic
/// and/or vary in precision.
/// * Computation can be more efficient when:
/// * \p ItemsPerThread is greater than one,
/// * \p T is an arithmetic type,
/// * scan operation is simple addition operator, and
/// * the number of threads in the block is a multiple of the hardware warp size (see rocprim::device_warp_size()).
/// * block_scan has two alternative implementations: \p block_scan_algorithm::using_warp_scan
/// and block_scan_algorithm::reduce_then_scan.
///
/// \par Examples
/// \parblock
/// In the examples scan operation is performed on block of 192 threads, each provides
/// one \p int value, result is returned using the same variable as for input.
///
/// \code{.cpp}
/// __global__ void example_kernel(...)
/// {
/// // specialize block_scan for int and block of 192 threads
/// using block_scan_int = rocprim::block_scan<int, 192>;
/// // allocate storage in shared memory
/// __shared__ block_scan_int::storage_type storage;
///
/// int value = ...;
/// // execute inclusive scan
/// block_scan_int().inclusive_scan(
/// value, // input
/// value, // output
/// storage
/// );
/// ...
/// }
/// \endcode
/// \endparblock
template<
class T,
unsigned int BlockSizeX,
block_scan_algorithm Algorithm = block_scan_algorithm::default_algorithm,
unsigned int BlockSizeY = 1,
unsigned int BlockSizeZ = 1
>
class block_scan
#ifndef DOXYGEN_SHOULD_SKIP_THIS
: private detail::select_block_scan_impl<Algorithm>::template type<T, BlockSizeX, BlockSizeY, BlockSizeZ>
#endif
{
using base_type = typename detail::select_block_scan_impl<Algorithm>::template type<T, BlockSizeX, BlockSizeY, BlockSizeZ>;
public:
/// \brief Struct used to allocate a temporary memory that is required for thread
/// communication during operations provided by related parallel primitive.
///
/// Depending on the implemention the operations exposed by parallel primitive may
/// require a temporary storage for thread communication. The storage should be allocated
/// using keywords <tt>__shared__</tt>. It can be aliased to
/// an externally allocated memory, or be a part of a union type with other storage types
/// to increase shared memory reusability.
using storage_type = typename base_type::storage_type;
/// \brief Performs inclusive scan across threads in a block.
///
/// \tparam BinaryFunction - type of binary function used for scan. Default type
/// is rocprim::plus<T>.
///
/// \param [in] input - thread input value.
/// \param [out] output - reference to a thread output value. May be aliased with \p input.
/// \param [in] storage - reference to a temporary storage object of type storage_type.
/// \param [in] scan_op - binary operation function object that will be used for scan.
/// The signature of the function should be equivalent to the following:
/// <tt>T f(const T &a, const T &b);</tt>. The signature does not need to have
/// <tt>const &</tt>, but function object must not modify the objects passed to it.
///
/// \par Storage reusage
/// Synchronization barrier should be placed before \p storage is reused
/// or repurposed: \p __syncthreads() or \p rocprim::syncthreads().
///
/// \par Examples
/// \parblock
/// The examples present inclusive min scan operations performed on a block of 256 threads,
/// each provides one \p float value.
///
/// \code{.cpp}
/// __global__ void example_kernel(...) // blockDim.x = 256
/// {
/// // specialize block_scan for float and block of 256 threads
/// using block_scan_float = rocprim::block_scan<float, 256>;
/// // allocate storage in shared memory for the block
/// __shared__ block_scan_float::storage_type storage;
///
/// float input = ...;
/// float output;
/// // execute inclusive min scan
/// block_scan_float().inclusive_scan(
/// input,
/// output,
/// storage,
/// rocprim::minimum<float>()
/// );
/// ...
/// }
/// \endcode
///
/// If the \p input values across threads in a block are <tt>{1, -2, 3, -4, ..., 255, -256}</tt>, then
/// \p output values will be <tt>{1, -2, -2, -4, ..., -254, -256}</tt>.
/// \endparblock
template<class BinaryFunction = ::rocprim::plus<T>>
ROCPRIM_DEVICE ROCPRIM_INLINE
void inclusive_scan(T input,
T& output,
storage_type& storage,
BinaryFunction scan_op = BinaryFunction())
{
// Thin forwarding wrapper: delegates to base_type::inclusive_scan, passing the
// caller-provided storage through unchanged.
base_type::inclusive_scan(input, output, storage, scan_op);
}
/// \overload
/// \brief Performs inclusive scan across threads in a block.
///
/// * This overload does not accept storage argument. Required shared memory is
/// allocated by the method itself.
///
/// \tparam BinaryFunction - type of binary function used for scan. Default type
/// is rocprim::plus<T>.
///
/// \param [in] input - thread input value.
/// \param [out] output - reference to a thread output value. May be aliased with \p input.
/// \param [in] scan_op - binary operation function object that will be used for scan.
/// The signature of the function should be equivalent to the following:
/// <tt>T f(const T &a, const T &b);</tt>. The signature does not need to have
/// <tt>const &</tt>, but function object must not modify the objects passed to it.
template<class BinaryFunction = ::rocprim::plus<T>>
ROCPRIM_DEVICE ROCPRIM_FORCE_INLINE
void inclusive_scan(T input,
T& output,
BinaryFunction scan_op = BinaryFunction())
{
// Storage-less variant: forwards to the base_type overload that takes no
// storage argument.
base_type::inclusive_scan(input, output, scan_op);
}
/// \brief Performs inclusive scan and reduction across threads in a block.
///
/// \tparam BinaryFunction - type of binary function used for scan. Default type
/// is rocprim::plus<T>.
///
/// \param [in] input - thread input value.
/// \param [out] output - reference to a thread output value. May be aliased with \p input.
/// \param [out] reduction - result of reducing of all \p input values in a block.
/// \param [in] storage - reference to a temporary storage object of type storage_type.
/// \param [in] scan_op - binary operation function object that will be used for scan.
/// The signature of the function should be equivalent to the following:
/// <tt>T f(const T &a, const T &b);</tt>. The signature does not need to have
/// <tt>const &</tt>, but function object must not modify the objects passed to it.
///
/// \par Storage reusage
/// Synchronization barrier should be placed before \p storage is reused
/// or repurposed: \p __syncthreads() or \p rocprim::syncthreads().
///
/// \par Examples
/// \parblock
/// The examples present inclusive min scan operations performed on a block of 256 threads,
/// each provides one \p float value.
///
/// \code{.cpp}
/// __global__ void example_kernel(...) // blockDim.x = 256
/// {
/// // specialize block_scan for float and block of 256 threads
/// using block_scan_float = rocprim::block_scan<float, 256>;
/// // allocate storage in shared memory for the block
/// __shared__ block_scan_float::storage_type storage;
///
/// float input = ...;
/// float output;
/// float reduction;
/// // execute inclusive min scan
/// block_scan_float().inclusive_scan(
/// input,
/// output,
/// reduction,
/// storage,
/// rocprim::minimum<float>()
/// );
/// ...
/// }
/// \endcode
///
/// If the \p input values across threads in a block are <tt>{1, -2, 3, -4, ..., 255, -256}</tt>, then
/// the \p output values will be <tt>{1, -2, -2, -4, ..., -254, -256}</tt>, and the \p reduction will
/// be <tt>-256</tt>.
/// \endparblock
template<class BinaryFunction = ::rocprim::plus<T>>
ROCPRIM_DEVICE ROCPRIM_INLINE
void inclusive_scan(T input,
T& output,
T& reduction,
storage_type& storage,
BinaryFunction scan_op = BinaryFunction())
{
// Forwards to base_type::inclusive_scan; both output references (the per-thread
// scan result and the block reduction) and the caller's storage are passed through.
base_type::inclusive_scan(input, output, reduction, storage, scan_op);
}
/// \overload
/// \brief Performs inclusive scan and reduction across threads in a block.
///
/// * This overload does not accept storage argument. Required shared memory is
/// allocated by the method itself.
///
/// \tparam BinaryFunction - type of binary function used for scan. Default type
/// is rocprim::plus<T>.
///
/// \param [in] input - thread input value.
/// \param [out] output - reference to a thread output value. May be aliased with \p input.
/// \param [out] reduction - result of reducing of all \p input values in a block.
/// \param [in] scan_op - binary operation function object that will be used for scan.
/// The signature of the function should be equivalent to the following:
/// <tt>T f(const T &a, const T &b);</tt>. The signature does not need to have
/// <tt>const &</tt>, but function object must not modify the objects passed to it.
template<class BinaryFunction = ::rocprim::plus<T>>
ROCPRIM_DEVICE ROCPRIM_FORCE_INLINE
void inclusive_scan(T input,
T& output,
T& reduction,
BinaryFunction scan_op = BinaryFunction())
{
// Storage-less scan-and-reduce variant: forwards to the base_type overload that
// takes no storage argument.
base_type::inclusive_scan(input, output, reduction, scan_op);
}
/// \brief Performs inclusive scan across threads in a block, and uses
/// \p prefix_callback_op to generate prefix value for the whole block.
///
/// \tparam PrefixCallback - type of the unary function object used for generating
/// block-wide prefix value for the scan operation.
/// \tparam BinaryFunction - type of binary function used for scan. Default type
/// is rocprim::plus<T>.
///
/// \param [in] input - thread input value.
/// \param [out] output - reference to a thread output value. May be aliased with \p input.
/// \param [in] storage - reference to a temporary storage object of type storage_type.
/// \param [in,out] prefix_callback_op - function object for generating block prefix value.
/// The signature of the \p prefix_callback_op should be equivalent to the following:
/// <tt>T f(const T &block_reduction);</tt>. The signature does not need to have
/// <tt>const &</tt>, but function object must not modify the objects passed to it.
/// The object will be called by the first warp of the block with block reduction of
/// \p input values as input argument. The result of the first thread will be used as the
/// block-wide prefix.
/// \param [in] scan_op - binary operation function object that will be used for scan.
/// The signature of the function should be equivalent to the following:
/// <tt>T f(const T &a, const T &b);</tt>. The signature does not need to have
/// <tt>const &</tt>, but function object must not modify the objects passed to it.
///
/// \par Storage reusage
/// Synchronization barrier should be placed before \p storage is reused
/// or repurposed: \p __syncthreads() or \p rocprim::syncthreads().
///
/// \par Examples
/// \parblock
/// The examples present inclusive prefix sum operations performed on a block of 256 threads,
/// each thread provides one \p int value.
///
/// \code{.cpp}
///
/// struct my_block_prefix
/// {
/// int prefix;
///
/// __device__ my_block_prefix(int prefix) : prefix(prefix) {}
///
/// __device__ int operator()(int block_reduction)
/// {
/// int old_prefix = prefix;
/// prefix = prefix + block_reduction;
/// return old_prefix;
/// }
/// };
///
/// __global__ void example_kernel(...) // blockDim.x = 256
/// {
/// // specialize block_scan for int and block of 256 threads
/// using block_scan_int = rocprim::block_scan<int, 256>;
/// // allocate storage in shared memory for the block
/// __shared__ block_scan_int::storage_type storage;
///
/// // init prefix functor
/// my_block_prefix prefix_callback(10);
///
/// int input;
/// int output;
/// // execute inclusive prefix sum
/// block_scan_int().inclusive_scan(
/// input,
/// output,
/// storage,
/// prefix_callback,
/// rocprim::plus<int>()
/// );
/// ...
/// }
/// \endcode
///
/// If the \p input values across threads in a block are <tt>{1, 1, 1, ..., 1}</tt>, then
/// the \p output values will be <tt>{11, 12, 13, ..., 266}</tt>, and the \p prefix will
/// be <tt>266</tt>.
/// \endparblock
template<
class PrefixCallback,
class BinaryFunction = ::rocprim::plus<T>
>
ROCPRIM_DEVICE ROCPRIM_INLINE
void inclusive_scan(T input,
T& output,
storage_type& storage,
PrefixCallback& prefix_callback_op,
BinaryFunction scan_op)
{
// NOTE(review): unlike the overloads without a prefix callback, scan_op has no
// default argument here, so callers must pass the scan operator explicitly.
// prefix_callback_op is received and forwarded by non-const reference.
base_type::inclusive_scan(input, output, storage, prefix_callback_op, scan_op);
}
/// \brief Performs inclusive scan across threads in a block.
///
/// \tparam ItemsPerThread - number of items in the \p input array.
/// \tparam BinaryFunction - type of binary function used for scan. Default type
/// is rocprim::plus<T>.
///
/// \param [in] input - reference to an array containing thread input values.
/// \param [out] output - reference to a thread output array. May be aliased with \p input.
/// \param [in] storage - reference to a temporary storage object of type storage_type.
/// \param [in] scan_op - binary operation function object that will be used for scan.
/// The signature of the function should be equivalent to the following:
/// <tt>T f(const T &a, const T &b);</tt>. The signature does not need to have
/// <tt>const &</tt>, but function object must not modify the objects passed to it.
///
/// \par Storage reusage
/// Synchronization barrier should be placed before \p storage is reused
/// or repurposed: \p __syncthreads() or \p rocprim::syncthreads().
///
/// \par Examples
/// \parblock
/// The examples present inclusive maximum scan operations performed on a block of 128 threads,
/// each thread provides two \p long values.
///
/// \code{.cpp}
/// __global__ void example_kernel(...) // blockDim.x = 128
/// {
/// // specialize block_scan for long and block of 128 threads
/// using block_scan_long = rocprim::block_scan<long, 128>;
/// // allocate storage in shared memory for the block
/// __shared__ block_scan_long::storage_type storage;
///
/// long input[2] = ...;
/// long output[2];
/// // execute inclusive max scan
/// block_scan_long().inclusive_scan(
/// input,
/// output,
/// storage,
/// rocprim::maximum<long>()
/// );
/// ...
/// }
/// \endcode
///
/// If the \p input values across threads in a block are <tt>{-1, 2, -3, 4, ..., -255, 256}</tt>, then
/// the \p output values will be <tt>{-1, 2, 2, 4, ..., 254, 256}</tt>.
/// \endparblock
template<
    unsigned int ItemsPerThread,
    class BinaryFunction = ::rocprim::plus<T>
>
ROCPRIM_DEVICE ROCPRIM_INLINE
void inclusive_scan(T (&input)[ItemsPerThread],
                    T (&output)[ItemsPerThread],
                    storage_type& storage,
                    BinaryFunction scan_op = BinaryFunction())
{
    // More than one item per thread: forward both arrays as-is to base_type.
    if(ItemsPerThread != 1)
    {
        base_type::inclusive_scan(input, output, storage, scan_op);
        return;
    }

    // Exactly one item per thread: unwrap the arrays and use the scalar overload.
    base_type::inclusive_scan(input[0], output[0], storage, scan_op);
}
/// \overload
/// \brief Performs inclusive scan across threads in a block.
///
/// * This overload does not accept storage argument. Required shared memory is
/// allocated by the method itself.
///
/// \tparam ItemsPerThread - number of items in the \p input array.
/// \tparam BinaryFunction - type of binary function used for scan. Default type
/// is rocprim::plus<T>.
///
/// \param [in] input - reference to an array containing thread input values.
/// \param [out] output - reference to a thread output array. May be aliased with \p input.
/// \param [in] scan_op - binary operation function object that will be used for scan.
/// The signature of the function should be equivalent to the following:
/// <tt>T f(const T &a, const T &b);</tt>. The signature does not need to have
/// <tt>const &</tt>, but function object must not modify the objects passed to it.
template<
    unsigned int ItemsPerThread,
    class BinaryFunction = ::rocprim::plus<T>
>
ROCPRIM_DEVICE ROCPRIM_FORCE_INLINE
void inclusive_scan(T (&input)[ItemsPerThread],
                    T (&output)[ItemsPerThread],
                    BinaryFunction scan_op = BinaryFunction())
{
    // More than one item per thread: forward both arrays as-is to the
    // storage-less base_type overload.
    if(ItemsPerThread != 1)
    {
        base_type::inclusive_scan(input, output, scan_op);
        return;
    }

    // Exactly one item per thread: unwrap the arrays and use the scalar overload.
    base_type::inclusive_scan(input[0], output[0], scan_op);
}
/// \brief Performs inclusive scan and reduction across threads in a block.
///
/// \tparam ItemsPerThread - number of items in the \p input array.
/// \tparam BinaryFunction - type of binary function used for scan. Default type
/// is rocprim::plus<T>.
///
/// \param [in] input - reference to an array containing thread input values.
/// \param [out] output - reference to a thread output array. May be aliased with \p input.
/// \param [out] reduction - result of reducing of all \p input values in a block.
/// \param [in] storage - reference to a temporary storage object of type storage_type.
/// \param [in] scan_op - binary operation function object that will be used for scan.
/// The signature of the function should be equivalent to the following:
/// <tt>T f(const T &a, const T &b);</tt>. The signature does not need to have
/// <tt>const &</tt>, but function object must not modify the objects passed to it.
///
/// \par Storage reusage
/// Synchronization barrier should be placed before \p storage is reused
/// or repurposed: \p __syncthreads() or \p rocprim::syncthreads().
///
/// \par Examples
/// \parblock
/// The examples present inclusive maximum scan operations performed on a block of 128 threads,
/// each thread provides two \p long values.
///
/// \code{.cpp}
/// __global__ void example_kernel(...) // blockDim.x = 128
/// {
/// // specialize block_scan for long and block of 128 threads
/// using block_scan_long = rocprim::block_scan<long, 128>;
/// // allocate storage in shared memory for the block
/// __shared__ block_scan_long::storage_type storage;
///
/// long input[2] = ...;
/// long output[2];
/// long reduction;
/// // execute inclusive max scan
/// block_scan_long().inclusive_scan(
/// input,
/// output,
/// reduction,
/// storage,
/// rocprim::maximum<long>()
/// );
/// ...
/// }
/// \endcode
///
/// If the \p input values across threads in a block are <tt>{-1, 2, -3, 4, ..., -255, 256}</tt>, then
/// the \p output values will be <tt>{-1, 2, 2, 4, ..., 254, 256}</tt> and the \p reduction will be \p 256.
/// \endparblock
template<
    unsigned int ItemsPerThread,
    class BinaryFunction = ::rocprim::plus<T>
>
ROCPRIM_DEVICE ROCPRIM_INLINE
void inclusive_scan(T (&input)[ItemsPerThread],
                    T (&output)[ItemsPerThread],
                    T& reduction,
                    storage_type& storage,
                    BinaryFunction scan_op = BinaryFunction())
{
    // More than one item per thread: forward both arrays as-is to base_type.
    if(ItemsPerThread != 1)
    {
        base_type::inclusive_scan(input, output, reduction, storage, scan_op);
        return;
    }

    // Exactly one item per thread: unwrap the arrays and use the scalar overload.
    base_type::inclusive_scan(input[0], output[0], reduction, storage, scan_op);
}
/// \overload
/// \brief Performs inclusive scan and reduction across threads in a block.
///
/// * This overload does not accept storage argument. Required shared memory is
/// allocated by the method itself.
///
/// \tparam ItemsPerThread - number of items in the \p input array.
/// \tparam BinaryFunction - type of binary function used for scan. Default type
/// is rocprim::plus<T>.
///
/// \param [in] input - reference to an array containing thread input values.
/// \param [out] output - reference to a thread output array. May be aliased with \p input.
/// \param [out] reduction - result of reducing of all \p input values in a block.
/// \param [in] scan_op - binary operation function object that will be used for scan.
/// The signature of the function should be equivalent to the following:
/// <tt>T f(const T &a, const T &b);</tt>. The signature does not need to have
/// <tt>const &</tt>, but function object must not modify the objects passed to it.
template<
    unsigned int ItemsPerThread,
    class BinaryFunction = ::rocprim::plus<T>
>
ROCPRIM_DEVICE ROCPRIM_FORCE_INLINE
void inclusive_scan(T (&input)[ItemsPerThread],
                    T (&output)[ItemsPerThread],
                    T& reduction,
                    BinaryFunction scan_op = BinaryFunction())
{
    // More than one item per thread: forward both arrays as-is to the
    // storage-less base_type overload.
    if(ItemsPerThread != 1)
    {
        base_type::inclusive_scan(input, output, reduction, scan_op);
        return;
    }

    // Exactly one item per thread: unwrap the arrays and use the scalar overload.
    base_type::inclusive_scan(input[0], output[0], reduction, scan_op);
}
/// \brief Performs inclusive scan across threads in a block, and uses
/// \p prefix_callback_op to generate prefix value for the whole block.
///
/// \tparam ItemsPerThread - number of items in the \p input array.
/// \tparam PrefixCallback - type of the unary function object used for generating
/// block-wide prefix value for the scan operation.
/// \tparam BinaryFunction - type of binary function used for scan. Default type
/// is rocprim::plus<T>.
///
/// \param [in] input - reference to an array containing thread input values.
/// \param [out] output - reference to a thread output array. May be aliased with \p input.
/// \param [in] storage - reference to a temporary storage object of type storage_type.
/// \param [in,out] prefix_callback_op - function object for generating block prefix value.
/// The signature of the \p prefix_callback_op should be equivalent to the following:
/// <tt>T f(const T &block_reduction);</tt>. The signature does not need to have
/// <tt>const &</tt>, but function object must not modify the objects passed to it.
/// The object will be called by the first warp of the block with block reduction of
/// \p input values as input argument. The result of the first thread will be used as the
/// block-wide prefix.
/// \param [in] scan_op - binary operation function object that will be used for scan.
/// The signature of the function should be equivalent to the following:
/// <tt>T f(const T &a, const T &b);</tt>. The signature does not need to have
/// <tt>const &</tt>, but function object must not modify the objects passed to it.
///
/// \par Storage reusage
/// Synchronization barrier should be placed before \p storage is reused
/// or repurposed: \p __syncthreads() or \p rocprim::syncthreads().
///
/// \par Examples
/// \parblock
/// The examples present inclusive prefix sum operations performed on a block of 128 threads,
/// each thread provides two \p int values.
///
/// \code{.cpp}
///
/// struct my_block_prefix
/// {
/// int prefix;
///
/// __device__ my_block_prefix(int prefix) : prefix(prefix) {}
///
/// __device__ int operator()(int block_reduction)
/// {
/// int old_prefix = prefix;
/// prefix = prefix + block_reduction;
/// return old_prefix;
/// }
/// };
///
/// __global__ void example_kernel(...) // blockDim.x = 128
/// {
/// // specialize block_scan for int and block of 128 threads
/// using block_scan_int = rocprim::block_scan<int, 128>;
/// // allocate storage in shared memory for the block
/// __shared__ block_scan_int::storage_type storage;
///
/// // init prefix functor
/// my_block_prefix prefix_callback(10);
///
/// int input[2] = ...;
/// int output[2];
/// // execute inclusive prefix sum
/// block_scan_int().inclusive_scan(
/// input,
/// output,
/// storage,
/// prefix_callback,
/// rocprim::plus<int>()
/// );
/// ...
/// }
/// \endcode
///
/// If the \p input values across threads in a block are <tt>{1, 1, 1, ..., 1}</tt>, then
/// the \p output values will be <tt>{11, 12, 13, ..., 266}</tt>, and the \p prefix will
/// be <tt>266</tt>.
/// \endparblock
template<
    unsigned int ItemsPerThread,
    class PrefixCallback,
    class BinaryFunction
>
ROCPRIM_DEVICE ROCPRIM_INLINE
void inclusive_scan(T (&input)[ItemsPerThread],
                    T (&output)[ItemsPerThread],
                    storage_type& storage,
                    PrefixCallback& prefix_callback_op,
                    BinaryFunction scan_op)
{
    // More than one item per thread: forward both arrays as-is to base_type,
    // together with the caller's prefix callback (passed by reference).
    if(ItemsPerThread != 1)
    {
        base_type::inclusive_scan(input, output, storage, prefix_callback_op, scan_op);
        return;
    }

    // Exactly one item per thread: unwrap the arrays and use the scalar overload.
    base_type::inclusive_scan(input[0], output[0], storage, prefix_callback_op, scan_op);
}
/// \brief Performs exclusive scan across threads in a block.
///
/// \tparam BinaryFunction - type of binary function used for scan. Default type
/// is rocprim::plus<T>.
///
/// \param [in] input - thread input value.
/// \param [out] output - reference to a thread output value. May be aliased with \p input.
/// \param [in] storage - reference to a temporary storage object of type storage_type.
/// \param [in] init - initial value used to start the exclusive scan. Should be the same
/// for all threads in a block.
/// \param [in] scan_op - binary operation function object that will be used for scan.
/// The signature of the function should be equivalent to the following:
/// <tt>T f(const T &a, const T &b);</tt>. The signature does not need to have
/// <tt>const &</tt>, but function object must not modify the objects passed to it.
///
/// \par Storage reusage
/// Synchronization barrier should be placed before \p storage is reused
/// or repurposed: \p __syncthreads() or \p rocprim::syncthreads().
///
/// \par Examples
/// \parblock
/// The examples present exclusive min scan operations performed on a block of 256 threads,
/// each provides one \p float value.
///
/// \code{.cpp}
/// __global__ void example_kernel(...) // blockDim.x = 256
/// {
/// // specialize block_scan for float and block of 256 threads
/// using block_scan_float = rocprim::block_scan<float, 256>;
/// // allocate storage in shared memory for the block
/// __shared__ block_scan_float::storage_type storage;
///
/// float init = ...;
/// float input = ...;
/// float output;
/// // execute exclusive min scan
/// block_scan_float().exclusive_scan(
/// input,
/// output,
/// init,
/// storage,
/// rocprim::minimum<float>()
/// );
/// ...
/// }
/// \endcode
///
/// If the \p input values across threads in a block are <tt>{1, -2, 3, -4, ..., 255, -256}</tt>
/// and \p init is \p 0, then the \p output values will be <tt>{0, 0, -2, -2, -4, ..., -254, -254}</tt>.
/// \endparblock
template<class BinaryFunction = ::rocprim::plus<T>>
ROCPRIM_DEVICE ROCPRIM_INLINE
void exclusive_scan(T input,
T& output,
T init,
storage_type& storage,
BinaryFunction scan_op = BinaryFunction())
{
// Thin forwarding wrapper: delegates to base_type::exclusive_scan, passing init
// and the caller-provided storage through unchanged.
base_type::exclusive_scan(input, output, init, storage, scan_op);
}
/// \overload
/// \brief Performs exclusive scan across threads in a block.
///
/// * This overload does not accept storage argument. Required shared memory is
/// allocated by the method itself.
///
/// \tparam BinaryFunction - type of binary function used for scan. Default type
/// is rocprim::plus<T>.
///
/// \param [in] input - thread input value.
/// \param [out] output - reference to a thread output value. May be aliased with \p input.
/// \param [in] init - initial value used to start the exclusive scan. Should be the same
/// for all threads in a block.
/// \param [in] scan_op - binary operation function object that will be used for scan.
/// The signature of the function should be equivalent to the following:
/// <tt>T f(const T &a, const T &b);</tt>. The signature does not need to have
/// <tt>const &</tt>, but function object must not modify the objects passed to it.
template<class BinaryFunction = ::rocprim::plus<T>>
ROCPRIM_DEVICE ROCPRIM_FORCE_INLINE
void exclusive_scan(T input,
T& output,
T init,
BinaryFunction scan_op = BinaryFunction())
{
// Storage-less variant: forwards to the base_type overload that takes no
// storage argument.
base_type::exclusive_scan(input, output, init, scan_op);
}
/// \brief Performs exclusive scan and reduction across threads in a block.
///
/// \tparam BinaryFunction - type of binary function used for scan. Default type
/// is rocprim::plus<T>.
///
/// \param [in] input - thread input value.
/// \param [out] output - reference to a thread output value. May be aliased with \p input.
/// \param [in] init - initial value used to start the exclusive scan. Should be the same
/// for all threads in a block.
/// \param [out] reduction - result of reducing of all \p input values in a block.
/// \param [in] storage - reference to a temporary storage object of type storage_type.
/// \param [in] scan_op - binary operation function object that will be used for scan.
/// The signature of the function should be equivalent to the following:
/// <tt>T f(const T &a, const T &b);</tt>. The signature does not need to have
/// <tt>const &</tt>, but function object must not modify the objects passed to it.
///
/// \par Storage reusage
/// Synchronization barrier should be placed before \p storage is reused
/// or repurposed: \p __syncthreads() or \p rocprim::syncthreads().
///
/// \par Examples
/// \parblock
/// The examples present exclusive min scan operations performed on a block of 256 threads,
/// each provides one \p float value.
///
/// \code{.cpp}
/// __global__ void example_kernel(...) // blockDim.x = 256
/// {
/// // specialize block_scan for float and block of 256 threads
/// using block_scan_float = rocprim::block_scan<float, 256>;
/// // allocate storage in shared memory for the block
/// __shared__ block_scan_float::storage_type storage;
///
/// float init = 0;
/// float input = ...;
/// float output;
/// float reduction;
/// // execute exclusive min scan
/// block_scan_float().exclusive_scan(
/// input,
/// output,
/// init,
/// reduction,
/// storage,
/// rocprim::minimum<float>()
/// );
/// ...
/// }
/// \endcode
///
/// If the \p input values across threads in a block are <tt>{1, -2, 3, -4, ..., 255, -256}</tt>
/// and \p init is \p 0, then the \p output values will be <tt>{0, 0, -2, -2, -4, ..., -254, -254}</tt>
/// and the \p reduction will be \p -256.
/// \endparblock
template<class BinaryFunction = ::rocprim::plus<T>>
ROCPRIM_DEVICE ROCPRIM_INLINE
void exclusive_scan(T input,
T& output,
T init,
T& reduction,
storage_type& storage,
BinaryFunction scan_op = BinaryFunction())
{
// Forwards to base_type::exclusive_scan; init, both output references (the
// per-thread scan result and the block reduction) and the caller's storage are
// passed through.
base_type::exclusive_scan(input, output, init, reduction, storage, scan_op);
}
/// \overload
/// \brief Performs exclusive scan and reduction across threads in a block.
///
/// * This overload does not accept storage argument. Required shared memory is
/// allocated by the method itself.
///
/// \tparam BinaryFunction - type of binary function used for scan. Default type
/// is rocprim::plus<T>.
///
/// \param [in] input - thread input value.
/// \param [out] output - reference to a thread output value. May be aliased with \p input.
/// \param [in] init - initial value used to start the exclusive scan. Should be the same
/// for all threads in a block.
/// \param [out] reduction - result of reducing of all \p input values in a block.
/// \param [in] scan_op - binary operation function object that will be used for scan.
/// The signature of the function should be equivalent to the following:
/// <tt>T f(const T &a, const T &b);</tt>. The signature does not need to have
/// <tt>const &</tt>, but function object must not modify the objects passed to it.
template<class BinaryFunction = ::rocprim::plus<T>>
ROCPRIM_DEVICE ROCPRIM_FORCE_INLINE
void exclusive_scan(T input,
T& output,
T init,
T& reduction,
BinaryFunction scan_op = BinaryFunction())
{
// Storage-less scan-and-reduce variant: forwards to the base_type overload that
// takes no storage argument.
base_type::exclusive_scan(input, output, init, reduction, scan_op);
}
/// \brief Performs exclusive scan across threads in a block, and uses
/// \p prefix_callback_op to generate prefix value for the whole block.
///
/// \tparam PrefixCallback - type of the unary function object used for generating
/// block-wide prefix value for the scan operation.
/// \tparam BinaryFunction - type of binary function used for scan. Default type
/// is rocprim::plus<T>.
///
/// \param [in] input - thread input value.
/// \param [out] output - reference to a thread output value. May be aliased with \p input.
/// \param [in] storage - reference to a temporary storage object of type storage_type.
/// \param [in,out] prefix_callback_op - function object for generating block prefix value.
/// The signature of the \p prefix_callback_op should be equivalent to the following:
/// <tt>T f(const T &block_reduction);</tt>. The signature does not need to have
/// <tt>const &</tt>, but function object must not modify the objects passed to it.
/// The object will be called by the first warp of the block with block reduction of
/// \p input values as input argument. The result of the first thread will be used as the
/// block-wide prefix.
/// \param [in] scan_op - binary operation function object that will be used for scan.
/// The signature of the function should be equivalent to the following:
/// <tt>T f(const T &a, const T &b);</tt>. The signature does not need to have
/// <tt>const &</tt>, but function object must not modify the objects passed to it.
///
/// \par Storage reusage
/// Synchronization barrier should be placed before \p storage is reused
/// or repurposed: \p __syncthreads() or \p rocprim::syncthreads().
///
/// \par Examples
/// \parblock
/// The examples present exclusive prefix sum operations performed on a block of 256 threads,
/// each thread provides one \p int value.
///
/// \code{.cpp}
///
/// struct my_block_prefix
/// {
/// int prefix;
///
/// __device__ my_block_prefix(int prefix) : prefix(prefix) {}
///
/// __device__ int operator()(int block_reduction)
/// {
/// int old_prefix = prefix;
/// prefix = prefix + block_reduction;
/// return old_prefix;
/// }
/// };
///
/// __global__ void example_kernel(...) // blockDim.x = 256
/// {
/// // specialize block_scan for int and block of 256 threads
/// using block_scan_int = rocprim::block_scan<int, 256>;
/// // allocate storage in shared memory for the block
/// __shared__ block_scan_int::storage_type storage;
///
/// // init prefix functor
/// my_block_prefix prefix_callback(10);
///
/// int input;
/// int output;
/// // execute exclusive prefix sum
/// block_scan_int().exclusive_scan(
/// input,
/// output,
/// storage,
/// prefix_callback,
/// rocprim::plus<int>()
/// );
/// ...
/// }
/// \endcode
///
/// If the \p input values across threads in a block are <tt>{1, 1, 1, ..., 1}</tt>, then
/// the \p output values will be <tt>{10, 11, 12, 13, ..., 265}</tt>, and the \p prefix will
/// be <tt>266</tt>.
/// \endparblock
template<
class PrefixCallback,
class BinaryFunction = ::rocprim::plus<T>
>
ROCPRIM_DEVICE ROCPRIM_INLINE
void exclusive_scan(T input,
T& output,
storage_type& storage,
PrefixCallback& prefix_callback_op,
BinaryFunction scan_op)
{
// NOTE(review): this overload takes no init value, and scan_op has no default
// argument, so callers must pass the scan operator explicitly.
// prefix_callback_op is received and forwarded by non-const reference.
base_type::exclusive_scan(input, output, storage, prefix_callback_op, scan_op);
}
/// \brief Performs exclusive scan across threads in a block.
///
/// \tparam ItemsPerThread - number of items in the \p input array.
/// \tparam BinaryFunction - type of binary function used for scan. Default type
/// is rocprim::plus<T>.
///
/// \param [in] input - reference to an array containing thread input values.
/// \param [out] output - reference to a thread output array. May be aliased with \p input.
/// \param [in] init - initial value used to start the exclusive scan. Should be the same
/// for all threads in a block.
/// \param [in] storage - reference to a temporary storage object of type storage_type.
/// \param [in] scan_op - binary operation function object that will be used for scan.
/// The signature of the function should be equivalent to the following:
/// <tt>T f(const T &a, const T &b);</tt>. The signature does not need to have
/// <tt>const &</tt>, but function object must not modify the objects passed to it.
///
/// \par Storage reusage
/// Synchronization barrier should be placed before \p storage is reused
/// or repurposed: \p __syncthreads() or \p rocprim::syncthreads().
///
/// \par Examples
/// \parblock
/// The examples present exclusive maximum scan operations performed on a block of 128 threads,
/// each thread provides two \p long values.
///
/// \code{.cpp}
/// __global__ void example_kernel(...) // blockDim.x = 128
/// {
/// // specialize block_scan for long and block of 128 threads
/// using block_scan_long = rocprim::block_scan<long, 128>;
/// // allocate storage in shared memory for the block
/// __shared__ block_scan_long::storage_type storage;
///
/// long init = ...;
/// long input[2] = ...;
/// long output[2];
/// // execute exclusive max scan
/// block_scan_long().exclusive_scan(
/// input,
/// output,
/// init,
/// storage,
/// rocprim::maximum<long>()
/// );
/// ...
/// }
/// \endcode
///
/// If the \p input values across threads in a block are <tt>{-1, 2, -3, 4, ..., -255, 256}</tt>
/// and \p init is 0, then the \p output values will be <tt>{0, 0, 2, 2, 4, ..., 254, 254}</tt>.
/// \endparblock
template<
    unsigned int ItemsPerThread,
    class BinaryFunction = ::rocprim::plus<T>
>
ROCPRIM_DEVICE ROCPRIM_INLINE
void exclusive_scan(T (&input)[ItemsPerThread],
                    T (&output)[ItemsPerThread],
                    T init,
                    storage_type& storage,
                    BinaryFunction scan_op = BinaryFunction())
{
    // More than one item per thread: forward both arrays as-is to base_type.
    if(ItemsPerThread != 1)
    {
        base_type::exclusive_scan(input, output, init, storage, scan_op);
        return;
    }

    // Exactly one item per thread: unwrap the arrays and use the scalar overload.
    base_type::exclusive_scan(input[0], output[0], init, storage, scan_op);
}
/// \overload
/// \brief Performs exclusive scan across threads in a block.
///
/// * This overload does not accept storage argument. Required shared memory is
/// allocated by the method itself.
///
/// \tparam ItemsPerThread - number of items in the \p input array.
/// \tparam BinaryFunction - type of binary function used for scan. Default type
/// is rocprim::plus<T>.
///
/// \param [in] input - reference to an array containing thread input values.
/// \param [out] output - reference to a thread output array. May be aliased with \p input.
/// \param [in] init - initial value used to start the exclusive scan. Should be the same
/// for all threads in a block.
/// \param [in] scan_op - binary operation function object that will be used for scan.
/// The signature of the function should be equivalent to the following:
/// <tt>T f(const T &a, const T &b);</tt>. The signature does not need to have
/// <tt>const &</tt>, but function object must not modify the objects passed to it.
template<unsigned int ItemsPerThread, class BinaryFunction = ::rocprim::plus<T>>
ROCPRIM_DEVICE ROCPRIM_FORCE_INLINE
void exclusive_scan(T (&input)[ItemsPerThread],
                    T (&output)[ItemsPerThread],
                    T init,
                    BinaryFunction scan_op = BinaryFunction())
{
    // Several items per thread: hand the whole arrays to the base implementation.
    if(ItemsPerThread != 1)
    {
        base_type::exclusive_scan(input, output, init, scan_op);
        return;
    }
    // Exactly one item per thread: forward the single elements instead of the arrays.
    base_type::exclusive_scan(input[0], output[0], init, scan_op);
}
/// \brief Performs exclusive scan and reduction across threads in a block.
///
/// \tparam ItemsPerThread - number of items in the \p input array.
/// \tparam BinaryFunction - type of binary function used for scan. Default type
/// is rocprim::plus<T>.
///
/// \param [in] input - reference to an array containing thread input values.
/// \param [out] output - reference to a thread output array. May be aliased with \p input.
/// \param [in] init - initial value used to start the exclusive scan. Should be the same
/// for all threads in a block.
/// \param [out] reduction - result of reducing of all \p input values in a block.
/// \param [in] storage - reference to a temporary storage object of type storage_type.
/// \param [in] scan_op - binary operation function object that will be used for scan.
/// The signature of the function should be equivalent to the following:
/// <tt>T f(const T &a, const T &b);</tt>. The signature does not need to have
/// <tt>const &</tt>, but function object must not modify the objects passed to it.
///
/// \par Storage reusage
/// Synchronization barrier should be placed before \p storage is reused
/// or repurposed: \p __syncthreads() or \p rocprim::syncthreads().
///
/// \par Examples
/// \parblock
/// The examples present exclusive maximum scan operations performed on a block of 128 threads,
/// each provides two \p long value.
///
/// \code{.cpp}
/// __global__ void example_kernel(...) // blockDim.x = 128
/// {
/// // specialize block_scan for long and block of 128 threads
/// using block_scan_long = rocprim::block_scan<long, 128>;
/// // allocate storage in shared memory for the block
/// __shared__ block_scan_long::storage_type storage;
///
/// long init = ...;
/// long input[2] = ...;
/// long output[2];
/// long reduction;
/// // execute exclusive max scan
/// block_scan_long().exclusive_scan(
/// input,
/// output,
/// init,
/// reduction,
/// storage,
/// rocprim::maximum<long>()
/// );
/// ...
/// }
/// \endcode
///
/// If the \p input values across threads in a block are <tt>{-1, 2, -3, 4, ..., -255, 256}</tt>
/// and \p init is 0, then \p output values will be <tt>{0, 0, 2, 2, 4, ..., 254, 254}</tt>
/// and the \p reduction will be \p 256.
/// \endparblock
template<unsigned int ItemsPerThread, class BinaryFunction = ::rocprim::plus<T>>
ROCPRIM_DEVICE ROCPRIM_INLINE
void exclusive_scan(T (&input)[ItemsPerThread],
                    T (&output)[ItemsPerThread],
                    T init,
                    T& reduction,
                    storage_type& storage,
                    BinaryFunction scan_op = BinaryFunction())
{
    // Several items per thread: hand the whole arrays to the base implementation.
    if(ItemsPerThread != 1)
    {
        base_type::exclusive_scan(input, output, init, reduction, storage, scan_op);
        return;
    }
    // Exactly one item per thread: forward the single elements instead of the arrays.
    base_type::exclusive_scan(input[0], output[0], init, reduction, storage, scan_op);
}
/// \overload
/// \brief Performs exclusive scan and reduction across threads in a block.
///
/// * This overload does not accept storage argument. Required shared memory is
/// allocated by the method itself.
///
/// \tparam ItemsPerThread - number of items in the \p input array.
/// \tparam BinaryFunction - type of binary function used for scan. Default type
/// is rocprim::plus<T>.
///
/// \param [in] input - reference to an array containing thread input values.
/// \param [out] output - reference to a thread output array. May be aliased with \p input.
/// \param [in] init - initial value used to start the exclusive scan. Should be the same
/// for all threads in a block.
/// \param [out] reduction - result of reducing of all \p input values in a block.
/// \param [in] scan_op - binary operation function object that will be used for scan.
/// The signature of the function should be equivalent to the following:
/// <tt>T f(const T &a, const T &b);</tt>. The signature does not need to have
/// <tt>const &</tt>, but function object must not modify the objects passed to it.
template<unsigned int ItemsPerThread, class BinaryFunction = ::rocprim::plus<T>>
ROCPRIM_DEVICE ROCPRIM_FORCE_INLINE
void exclusive_scan(T (&input)[ItemsPerThread],
                    T (&output)[ItemsPerThread],
                    T init,
                    T& reduction,
                    BinaryFunction scan_op = BinaryFunction())
{
    // Several items per thread: hand the whole arrays to the base implementation.
    if(ItemsPerThread != 1)
    {
        base_type::exclusive_scan(input, output, init, reduction, scan_op);
        return;
    }
    // Exactly one item per thread: forward the single elements instead of the arrays.
    base_type::exclusive_scan(input[0], output[0], init, reduction, scan_op);
}
/// \brief Performs exclusive scan across threads in a block, and uses
/// \p prefix_callback_op to generate prefix value for the whole block.
///
/// \tparam ItemsPerThread - number of items in the \p input array.
/// \tparam PrefixCallback - type of the unary function object used for generating
/// block-wide prefix value for the scan operation.
/// \tparam BinaryFunction - type of binary function used for scan. Default type
/// is rocprim::plus<T>.
///
/// \param [in] input - reference to an array containing thread input values.
/// \param [out] output - reference to a thread output array. May be aliased with \p input.
/// \param [in] storage - reference to a temporary storage object of type storage_type.
/// \param [in,out] prefix_callback_op - function object for generating block prefix value.
/// The signature of the \p prefix_callback_op should be equivalent to the following:
/// <tt>T f(const T &block_reduction);</tt>. The signature does not need to have
/// <tt>const &</tt>, but function object must not modify the objects passed to it.
/// The object will be called by the first warp of the block with block reduction of
/// \p input values as input argument. The result of the first thread will be used as the
/// block-wide prefix.
/// \param [in] scan_op - binary operation function object that will be used for scan.
/// The signature of the function should be equivalent to the following:
/// <tt>T f(const T &a, const T &b);</tt>. The signature does not need to have
/// <tt>const &</tt>, but function object must not modify the objects passed to it.
///
/// \par Storage reusage
/// Synchronization barrier should be placed before \p storage is reused
/// or repurposed: \p __syncthreads() or \p rocprim::syncthreads().
///
/// \par Examples
/// \parblock
/// The examples present exclusive prefix sum operations performed on a block of 128 threads,
/// each thread provides two \p int value.
///
/// \code{.cpp}
///
/// struct my_block_prefix
/// {
/// int prefix;
///
/// __device__ my_block_prefix(int prefix) : prefix(prefix) {}
///
/// __device__ int operator()(int block_reduction)
/// {
/// int old_prefix = prefix;
/// prefix = prefix + block_reduction;
/// return old_prefix;
/// }
/// };
///
/// __global__ void example_kernel(...) // blockDim.x = 128
/// {
/// // specialize block_scan for int and block of 128 threads
/// using block_scan_int = rocprim::block_scan<int, 128>;
/// // allocate storage in shared memory for the block
/// __shared__ block_scan_int::storage_type storage;
///
/// // init prefix functor
/// my_block_prefix prefix_callback(10);
///
/// int input[2] = ...;
/// int output[2];
/// // execute exclusive prefix sum
/// block_scan_int().exclusive_scan(
/// input,
/// output,
/// storage,
/// prefix_callback,
/// rocprim::plus<int>()
/// );
/// ...
/// }
/// \endcode
///
/// If the \p input values across threads in a block are <tt>{1, 1, 1, ..., 1}</tt>, then
/// \p output values will be <tt>{10, 11, 12, 13, ..., 265}</tt>, and the \p prefix will
/// be <tt>266</tt>.
/// \endparblock
template<unsigned int ItemsPerThread, class PrefixCallback, class BinaryFunction>
ROCPRIM_DEVICE ROCPRIM_INLINE
void exclusive_scan(T (&input)[ItemsPerThread],
                    T (&output)[ItemsPerThread],
                    storage_type& storage,
                    PrefixCallback& prefix_callback_op,
                    BinaryFunction scan_op)
{
    // Several items per thread: hand the whole arrays to the base implementation.
    if(ItemsPerThread != 1)
    {
        base_type::exclusive_scan(input, output, storage, prefix_callback_op, scan_op);
        return;
    }
    // Exactly one item per thread: forward the single elements instead of the arrays.
    base_type::exclusive_scan(input[0], output[0], storage, prefix_callback_op, scan_op);
}
};
END_ROCPRIM_NAMESPACE
/// @}
// end of group blockmodule
#endif // ROCPRIM_BLOCK_BLOCK_SCAN_HPP_
/******************************************************************************
* Copyright (c) 2011, Duane Merrill. All rights reserved.
* Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved.
* Modifications Copyright (c) 2021, Advanced Micro Devices, Inc. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of the NVIDIA CORPORATION nor the
* names of its contributors may be used to endorse or promote products
* derived from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
* DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
******************************************************************************/
#ifndef ROCPRIM_BLOCK_BLOCK_SHUFFLE_HPP_
#define ROCPRIM_BLOCK_BLOCK_SHUFFLE_HPP_
#include <type_traits>
#include "../config.hpp"
#include "../detail/various.hpp"
#include "../intrinsics.hpp"
#include "../functional.hpp"
#include "detail/block_reduce_warp_reduce.hpp"
#include "detail/block_reduce_raking_reduce.hpp"
/// \addtogroup blockmodule
/// @{
BEGIN_ROCPRIM_NAMESPACE
/// \brief The block_shuffle class is a block level parallel primitive which provides methods
/// for shuffling data partitioned across a block
///
/// \tparam T - the input/output type.
/// \tparam BlockSizeX - the number of threads in a block's x dimension, it has no defaults value.
/// \tparam BlockSizeY - the number of threads in a block's y dimension, defaults to 1.
/// \tparam BlockSizeZ - the number of threads in a block's z dimension, defaults to 1.
///
/// \par Overview
/// It is commonplace for blocks of threads to rearrange data items between
/// threads. The BlockShuffle abstraction allows threads to efficiently shift items
/// either (a) up to their successor or (b) down to their predecessor.
/// * Computation can be more efficient when:
/// * \p ItemsPerThread is greater than one,
/// * \p T is an arithmetic type,
/// * the number of threads in the block is a multiple of the hardware warp size (see rocprim::warp_size()).
///
/// \par Examples
/// \parblock
/// In the examples shuffle operation is performed on block of 192 threads, each provides
/// one \p int value, result is returned using the same variable as for input.
///
/// \code{.cpp}
/// __global__ void example_kernel(...)
/// {
/// // specialize block_shuffle for int and a block of 192 threads
/// using block__shuffle_int = rocprim::block_shuffle<int, 192>;
/// // allocate storage in shared memory
/// __shared__ block__shuffle_int::storage_type storage;
///
/// int value = ...;
/// // execute block shuffle
/// block__shuffle_int().offset(
/// rocprim::flat_block_thread_id<192, 1, 1>(), // flat thread id
/// value, // input
/// value, // output
/// 1, // distance
/// storage
/// );
/// ...
/// }
/// \endcode
/// \endparblock
template<
class T,
unsigned int BlockSizeX,
unsigned int BlockSizeY = 1,
unsigned int BlockSizeZ = 1>
class block_shuffle
{
// Total number of threads in the block: product of the three block dimensions.
static constexpr unsigned int BlockSize = BlockSizeX * BlockSizeY * BlockSizeZ;
// Struct used for creating a raw_storage object for this primitive's temporary storage.
// Each array provides one slot of type T per thread of the block, indexed by flat thread id.
struct storage_type_
{
// Exchange slots written by offset(), rotate() and up().
T prev[BlockSize];
// Exchange slots written by down().
T next[BlockSize];
};
public:
/// \brief Struct used to allocate a temporary memory that is required for thread
/// communication during operations provided by related parallel primitive.
///
/// Depending on the implementation the operations exposed by parallel primitive may
/// require a temporary storage for thread communication. The storage should be allocated
/// using keywords <tt>__shared__</tt>. It can be aliased to
/// an externally allocated memory, or be a part of a union type with other storage types
/// to increase shared memory reusability.
#ifndef DOXYGEN_SHOULD_SKIP_THIS // hides storage_type implementation for Doxygen
using storage_type = detail::raw_storage<storage_type_>;
#else
using storage_type = storage_type_; // only for Doxygen
#endif
/// \brief Shuffles data across threads in a block, offset by the \p distance value.
///
/// \par A thread with flat thread id \p i receives the \p input of the thread with flat id (i + distance), where \p distance may be a negative value.
/// Required shared memory is allocated by the method itself.
/// \par If the source thread id is invalid, i.e. (i + distance) < 0 || (i + distance) >= BlockSize, the shuffle is not carried out and \p output is left unmodified for that thread.
///
/// \param [in] input - input data to be shuffled to another thread.
/// \param [out] output - reference to an output value that receives data from another thread.
/// \param [in] distance - offset added to the calling thread's flat id to obtain the id of the
/// thread whose \p input is read. Defaults to 1.
///
/// \par Example.
/// \code{.cpp}
/// __global__ void example_kernel(...)
/// {
/// // specialize block_shuffle for int and a block of 192 threads
/// using block__shuffle_int = rocprim::block_shuffle<int, 192>;
///
/// int value = ...;
/// // execute block shuffle
/// block__shuffle_int().offset(
/// value, // input
/// value // output
/// );
/// ...
/// }
/// \endcode
ROCPRIM_DEVICE ROCPRIM_FORCE_INLINE
void offset(T input,
            T& output,
            int distance = 1)
{
    // Look up the calling thread's flat id, then defer to the flat_id overload.
    const auto caller_id = ::rocprim::flat_block_thread_id<BlockSizeX, BlockSizeY, BlockSizeZ>();
    offset(caller_id, input, output, distance);
}
/// \overload
/// \brief Shuffles data across threads in a block for an explicitly given flat thread id.
///
/// * This overload does not accept storage argument. The temporary storage is declared
/// inside the method with ROCPRIM_SHARED_MEMORY and passed on to the storage overload.
///
/// \param [in] flat_id - flat id of the calling thread within the block.
/// \param [in] input - input data to be shuffled to another thread.
/// \param [out] output - reference to an output value that receives data from another thread.
/// \param [in] distance - offset added to \p flat_id to obtain the id of the source thread.
ROCPRIM_DEVICE ROCPRIM_FORCE_INLINE
void offset(const size_t& flat_id,
            T input,
            T& output,
            int distance)
{
    ROCPRIM_SHARED_MEMORY storage_type exchange_storage;
    offset(flat_id, input, output, distance, exchange_storage);
}
/// \overload
/// \brief Shuffles data across threads in a block using caller-provided \p storage.
///
/// Every thread writes \p input to its own slot of \p storage, the block is synchronized,
/// and the thread then reads the slot of thread <tt>flat_id + distance</tt>. When that id
/// lies outside the block, \p output is left unmodified.
///
/// \param [in] flat_id - flat id of the calling thread within the block.
/// \param [in] input - input data to be shuffled to another thread.
/// \param [out] output - reference to an output value that receives data from another thread.
/// \param [in] distance - offset added to \p flat_id to obtain the id of the source thread.
/// \param [in] storage - reference to a temporary storage object of type storage_type.
ROCPRIM_DEVICE ROCPRIM_INLINE
void offset(const size_t& flat_id,
            T input,
            T& output,
            int distance,
            storage_type& storage)
{
    storage_type_& slots = storage.get();

    // Publish this thread's value; the barrier makes all slots visible before any read.
    slots.prev[flat_id] = input;
    ::rocprim::syncthreads();

    const int source_id = static_cast<int>(flat_id) + distance;
    const bool source_outside_block
        = (source_id < 0) || (source_id >= static_cast<int>(BlockSize));
    if (source_outside_block)
    {
        // No valid source thread: keep output untouched.
        return;
    }
    output = slots.prev[static_cast<size_t>(source_id)];
}
/// \brief Rotates data across threads in a block by the \p distance value.
///
/// \par A thread with flat thread id \p i receives the \p input of the thread with flat id (i + distance) % BlockSize.
/// Required shared memory is allocated by the method itself.
/// \par Data is rotated around the block, using (threadId + distance) modulo BlockSize to ensure valid threadIds.
///
/// \param [in] input - input data to be shuffled to another thread.
/// \param [out] output - reference to an output value that receives data from another thread.
/// \param [in] distance - offset added to the calling thread's flat id (modulo BlockSize) to
/// obtain the id of the thread whose \p input is read. Defaults to 1.
///
/// \par Example.
/// \code{.cpp}
/// __global__ void example_kernel(...)
/// {
/// // specialize block_shuffle for int and a block of 192 threads
/// using block__shuffle_int = rocprim::block_shuffle<int, 192>;
///
/// int value = ...;
/// // execute block shuffle
/// block__shuffle_int().rotate(
/// value, // input
/// value // output
/// );
/// ...
/// }
/// \endcode
ROCPRIM_DEVICE ROCPRIM_FORCE_INLINE
void rotate(T input,
            T& output,
            unsigned int distance = 1)
{
    // Look up the calling thread's flat id, then defer to the flat_id overload.
    const auto caller_id = ::rocprim::flat_block_thread_id<BlockSizeX, BlockSizeY, BlockSizeZ>();
    rotate(caller_id, input, output, distance);
}
/// \overload
/// \brief Rotates data across threads in a block for an explicitly given flat thread id.
///
/// * This overload does not accept storage argument. The temporary storage is declared
/// inside the method with ROCPRIM_SHARED_MEMORY and passed on to the storage overload.
///
/// \param [in] flat_id - flat id of the calling thread within the block.
/// \param [in] input - input data to be shuffled to another thread.
/// \param [out] output - reference to an output value that receives data from another thread.
/// \param [in] distance - rotation distance.
ROCPRIM_DEVICE ROCPRIM_FORCE_INLINE
void rotate(const size_t& flat_id,
            T input,
            T& output,
            unsigned int distance)
{
    ROCPRIM_SHARED_MEMORY storage_type exchange_storage;
    rotate(flat_id, input, output, distance, exchange_storage);
}
/// \overload
/// \brief Rotates data across threads in a block using caller-provided \p storage.
///
/// Every thread writes \p input to its own slot of \p storage, the block is synchronized,
/// and the thread then reads the slot of thread <tt>(flat_id + distance) % BlockSize</tt>
/// into \p output.
///
/// \param [in] flat_id - flat id of the calling thread within the block.
/// \param [in] input - input data to be shuffled to another thread.
/// \param [out] output - reference to an output value that receives data from another thread.
/// \param [in] distance - rotation distance; values of BlockSize or more wrap around the block.
/// \param [in] storage - reference to a temporary storage object of type storage_type.
ROCPRIM_DEVICE ROCPRIM_INLINE
void rotate(const size_t& flat_id,
            T input,
            T& output,
            unsigned int distance,
            storage_type& storage)
{
    storage_type_& storage_ = storage.get();

    // Publish this thread's value; the barrier makes all slots visible before any read.
    storage_.prev[flat_id] = input;
    ::rocprim::syncthreads();

    // The source slot is derived from flat_id, the same index used for the write above, so
    // the rotation is consistent for blocks with more than one dimension as well.
    // Reducing distance modulo BlockSize first keeps the sum below 2 * BlockSize (assuming
    // flat_id < BlockSize, which the write above already requires), so a single subtraction
    // wraps it and the read always stays inside storage_.prev.
    size_t source_id = flat_id + (distance % BlockSize);
    if (source_id >= BlockSize)
    {
        source_id -= BlockSize;
    }
    output = storage_.prev[source_id];
}
/// \brief The thread block shifts a blocked arrangement of input items
/// up by one item.
///
/// \param [in] input - The calling thread's input items
/// \param [out] prev - The corresponding predecessor items (may be aliased to \p input).
/// The item \p prev[0] is not updated for <em>thread</em><sub>0</sub>.
///
/// \par Example.
/// \code{.cpp}
/// __global__ void example_kernel(...)
/// {
/// // specialize block_shuffle for int and a block of 192 threads
/// using block__shuffle_int = rocprim::block_shuffle<int, 192>;
///
/// int values[2] = ...;
/// // execute block shuffle
/// block__shuffle_int().up(
/// values, // input
/// values // output
/// );
/// ...
/// }
/// \endcode
template <unsigned int ItemsPerThread>
ROCPRIM_DEVICE ROCPRIM_FORCE_INLINE
void up(T (&input)[ItemsPerThread],
        T (&prev)[ItemsPerThread])
{
    // Look up the calling thread's flat id, then defer to the flat_id overload.
    const auto caller_id = ::rocprim::flat_block_thread_id<BlockSizeX, BlockSizeY, BlockSizeZ>();
    this->up(caller_id, input, prev);
}
/// \overload
/// \brief Shifts a blocked arrangement of items up by one for an explicitly given flat thread id.
///
/// * This overload does not accept storage argument. The temporary storage is declared
/// inside the method with ROCPRIM_SHARED_MEMORY and passed on to the storage overload.
///
/// \param [in] flat_id - flat id of the calling thread within the block.
/// \param [in] input - The calling thread's input items
/// \param [out] prev - The corresponding predecessor items (may be aliased to \p input).
template <unsigned int ItemsPerThread>
ROCPRIM_DEVICE ROCPRIM_FORCE_INLINE
void up(const size_t& flat_id,
        T (&input)[ItemsPerThread],
        T (&prev)[ItemsPerThread])
{
    ROCPRIM_SHARED_MEMORY storage_type exchange_storage;
    this->up(flat_id, input, prev, exchange_storage);
}
/// \overload
/// \brief Shifts a blocked arrangement of items up by one using caller-provided \p storage.
///
/// \param [in] flat_id - flat id of the calling thread within the block.
/// \param [in] input - The calling thread's input items
/// \param [out] prev - The corresponding predecessor items (may be aliased to \p input).
/// The item \p prev[0] is not updated for the thread with \p flat_id 0.
/// \param [in] storage - reference to a temporary storage object of type storage_type.
template <unsigned int ItemsPerThread>
ROCPRIM_DEVICE ROCPRIM_INLINE
void up(const size_t& flat_id,
        T (&input)[ItemsPerThread],
        T (&prev)[ItemsPerThread],
        storage_type& storage)
{
    storage_type_& slots = storage.get();

    // Publish this thread's last item for its successor. This happens before the local
    // shift below, which overwrites that item when prev aliases input.
    slots.prev[flat_id] = input[ItemsPerThread - 1];
    ::rocprim::syncthreads();

    // Shift the thread-local items from the back so an aliased input is read before it is
    // overwritten.
    ROCPRIM_UNROLL
    for (unsigned int dst = ItemsPerThread - 1; dst > 0; --dst)
    {
        prev[dst] = input[dst - 1];
    }

    // The first thread has no predecessor; its prev[0] is left as is.
    if (flat_id == 0)
    {
        return;
    }
    prev[0] = slots.prev[flat_id - 1];
}
/// \brief The thread block shifts a blocked arrangement of input items
/// up by one item and provides the block's last item to all threads.
///
/// \param [in] input - The calling thread's input items
/// \param [out] prev - The corresponding predecessor items (may be aliased to \p input).
/// The item \p prev[0] is not updated for <em>thread</em><sub>0</sub>.
/// \param [out] block_suffix - The item \p input[ItemsPerThread-1] from
/// <em>thread</em><sub><tt>BlockSize-1</tt></sub>, provided to all threads
template <unsigned int ItemsPerThread>
ROCPRIM_DEVICE ROCPRIM_FORCE_INLINE
void up(T (&input)[ItemsPerThread],
        T (&prev)[ItemsPerThread],
        T &block_suffix)
{
    // Look up the calling thread's flat id, then defer to the flat_id overload.
    const auto caller_id = ::rocprim::flat_block_thread_id<BlockSizeX, BlockSizeY, BlockSizeZ>();
    this->up(caller_id, input, prev, block_suffix);
}
/// \overload
/// \brief Shifts a blocked arrangement of items up by one and returns the block suffix,
/// for an explicitly given flat thread id.
///
/// * This overload does not accept storage argument. The temporary storage is declared
/// inside the method with ROCPRIM_SHARED_MEMORY and passed on to the storage overload.
///
/// \param [in] flat_id - flat id of the calling thread within the block.
/// \param [in] input - The calling thread's input items
/// \param [out] prev - The corresponding predecessor items (may be aliased to \p input).
/// \param [out] block_suffix - receives the block suffix computed by the storage overload.
template <unsigned int ItemsPerThread>
ROCPRIM_DEVICE ROCPRIM_FORCE_INLINE
void up(const size_t& flat_id,
        T (&input)[ItemsPerThread],
        T (&prev)[ItemsPerThread],
        T &block_suffix)
{
    ROCPRIM_SHARED_MEMORY storage_type exchange_storage;
    this->up(flat_id, input, prev, block_suffix, exchange_storage);
}
/// \overload
/// \brief Shifts a blocked arrangement of items up by one using caller-provided \p storage
/// and returns the block suffix.
///
/// \param [in] flat_id - flat id of the calling thread within the block.
/// \param [in] input - The calling thread's input items
/// \param [out] prev - The corresponding predecessor items (may be aliased to \p input).
/// The item \p prev[0] is not updated for the thread with \p flat_id 0.
/// \param [out] block_suffix - The item written to \p storage by the thread with flat id
/// <tt>BlockSize - 1</tt> (its \p input[ItemsPerThread-1]), provided to all threads.
/// \param [in] storage - reference to a temporary storage object of type storage_type.
template <unsigned int ItemsPerThread>
ROCPRIM_DEVICE ROCPRIM_INLINE
void up(const size_t& flat_id,
        T (&input)[ItemsPerThread],
        T (&prev)[ItemsPerThread],
        T &block_suffix,
        storage_type& storage)
{
    // The plain shift fills storage with each thread's last item and synchronizes the block.
    this->up(flat_id, input, prev, storage);

    // Update block suffix: read the slot of the last thread. The raw storage wrapper is
    // unwrapped with get(), as in the other overloads of this class.
    storage_type_& storage_ = storage.get();
    block_suffix = storage_.prev[BlockSize - 1];
}
/// \brief The thread block shifts a blocked arrangement of input items
/// down by one item.
///
/// \param [in] input - The calling thread's input items
/// \param [out] next - The corresponding successor items (may be aliased to \p input).
/// The item \p next[ItemsPerThread - 1] is not updated for <em>thread</em><sub>BlockSize - 1</sub>.
///
/// \par Example.
/// \code{.cpp}
/// __global__ void example_kernel(...)
/// {
/// // specialize block_shuffle for int and a block of 192 threads
/// using block__shuffle_int = rocprim::block_shuffle<int, 192>;
///
/// int values[2] = ...;
/// // execute block shuffle
/// block__shuffle_int().down(
/// values, // input
/// values // output
/// );
/// ...
/// }
/// \endcode
template <unsigned int ItemsPerThread>
ROCPRIM_DEVICE ROCPRIM_FORCE_INLINE
void down(T (&input)[ItemsPerThread],
          T (&next)[ItemsPerThread])
{
    // Look up the calling thread's flat id, then defer to the flat_id overload.
    const auto caller_id = ::rocprim::flat_block_thread_id<BlockSizeX, BlockSizeY, BlockSizeZ>();
    this->down(caller_id, input, next);
}
/// \overload
/// \brief Shifts a blocked arrangement of items down by one for an explicitly given flat thread id.
///
/// * This overload does not accept storage argument. The temporary storage is declared
/// inside the method with ROCPRIM_SHARED_MEMORY and passed on to the storage overload.
///
/// \param [in] flat_id - flat id of the calling thread within the block.
/// \param [in] input - The calling thread's input items
/// \param [out] next - The corresponding successor items (may be aliased to \p input).
template <unsigned int ItemsPerThread>
ROCPRIM_DEVICE ROCPRIM_FORCE_INLINE
void down(const size_t& flat_id,
          T (&input)[ItemsPerThread],
          T (&next)[ItemsPerThread])
{
    ROCPRIM_SHARED_MEMORY storage_type exchange_storage;
    this->down(flat_id, input, next, exchange_storage);
}
/// \overload
/// \brief Shifts a blocked arrangement of items down by one using caller-provided \p storage.
///
/// \param [in] flat_id - flat id of the calling thread within the block.
/// \param [in] input - The calling thread's input items
/// \param [out] next - The corresponding successor items (may be aliased to \p input).
/// The item \p next[ItemsPerThread - 1] is not updated for the thread with
/// \p flat_id <tt>BlockSize - 1</tt>.
/// \param [in] storage - reference to a temporary storage object of type storage_type.
template <unsigned int ItemsPerThread>
ROCPRIM_DEVICE ROCPRIM_INLINE
void down(const size_t& flat_id,
          T (&input)[ItemsPerThread],
          T (&next)[ItemsPerThread],
          storage_type& storage)
{
    storage_type_& slots = storage.get();

    // Publish this thread's first item for its predecessor. This happens before the local
    // shift below, which overwrites that item when next aliases input.
    slots.next[flat_id] = input[0];
    ::rocprim::syncthreads();

    // Shift the thread-local items from the front so an aliased input is read before it is
    // overwritten.
    ROCPRIM_UNROLL
    for (unsigned int src = 1; src < ItemsPerThread; ++src)
    {
        next[src - 1] = input[src];
    }

    // The last thread has no successor; its final item is left as is.
    if (flat_id >= (BlockSize - 1))
    {
        return;
    }
    next[ItemsPerThread - 1] = slots.next[flat_id + 1];
}
/// \brief The thread block shifts a blocked arrangement of input items
/// down by one item and provides the block's first item to all threads.
///
/// \param [in] input - The calling thread's input items
/// \param [out] next - The corresponding successor items (may be aliased to \p input).
/// The item \p next[ItemsPerThread - 1] is not updated for <em>thread</em><sub>BlockSize - 1</sub>.
/// \param [out] block_prefix - The item \p input[0] from <em>thread</em><sub><tt>0</tt></sub>, provided to all threads
template <unsigned int ItemsPerThread>
ROCPRIM_DEVICE ROCPRIM_FORCE_INLINE
void down(T (&input)[ItemsPerThread],
          T (&next)[ItemsPerThread],
          T &block_prefix)
{
    // Look up the calling thread's flat id, then defer to the flat_id overload.
    const auto caller_id = ::rocprim::flat_block_thread_id<BlockSizeX, BlockSizeY, BlockSizeZ>();
    this->down(caller_id, input, next, block_prefix);
}
/// \overload
/// \brief Shifts a blocked arrangement of items down by one and returns the block prefix,
/// for an explicitly given flat thread id.
///
/// * This overload does not accept storage argument. The temporary storage is declared
/// inside the method with ROCPRIM_SHARED_MEMORY and passed on to the storage overload.
///
/// \param [in] flat_id - flat id of the calling thread within the block.
/// \param [in] input - The calling thread's input items
/// \param [out] next - The corresponding successor items (may be aliased to \p input).
/// \param [out] block_prefix - receives the block prefix computed by the storage overload.
template <unsigned int ItemsPerThread>
ROCPRIM_DEVICE ROCPRIM_FORCE_INLINE
void down(const size_t& flat_id,
          T (&input)[ItemsPerThread],
          T (&next)[ItemsPerThread],
          T &block_prefix)
{
    ROCPRIM_SHARED_MEMORY storage_type exchange_storage;
    this->down(flat_id, input, next, block_prefix, exchange_storage);
}
/// \overload
/// \brief Shifts a blocked arrangement of items down by one using caller-provided \p storage
/// and returns the block prefix.
///
/// \param [in] flat_id - flat id of the calling thread within the block.
/// \param [in] input - The calling thread's input items
/// \param [out] next - The corresponding successor items (may be aliased to \p input).
/// The item \p next[ItemsPerThread - 1] is not updated for the thread with
/// \p flat_id <tt>BlockSize - 1</tt>.
/// \param [out] block_prefix - The item written to \p storage by the thread with flat id 0
/// (its \p input[0]), provided to all threads.
/// \param [in] storage - reference to a temporary storage object of type storage_type.
template <unsigned int ItemsPerThread>
ROCPRIM_DEVICE ROCPRIM_INLINE
void down(const size_t& flat_id,
          T (&input)[ItemsPerThread],
          T (&next)[ItemsPerThread],
          T &block_prefix,
          storage_type& storage)
{
    // The plain shift fills storage with each thread's first item and synchronizes the block.
    this->down(flat_id, input, next, storage);

    // Update block prefix: read the slot of the first thread. The raw storage wrapper is
    // unwrapped with get(), as in the other overloads of this class.
    storage_type_& storage_ = storage.get();
    block_prefix = storage_.next[0];
}
};
END_ROCPRIM_NAMESPACE
/// @}
// end of group blockmodule
#endif // ROCPRIM_BLOCK_BLOCK_SHUFFLE_HPP_
// Copyright (c) 2017-2020 Advanced Micro Devices, Inc. All rights reserved.
//
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to deal
// in the Software without restriction, including without limitation the rights
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
// copies of the Software, and to permit persons to whom the Software is
// furnished to do so, subject to the following conditions:
//
// The above copyright notice and this permission notice shall be included in
// all copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
// THE SOFTWARE.
#ifndef ROCPRIM_BLOCK_BLOCK_SORT_HPP_
#define ROCPRIM_BLOCK_BLOCK_SORT_HPP_
#include <type_traits>
#include "../config.hpp"
#include "../detail/various.hpp"
#include "../intrinsics.hpp"
#include "../functional.hpp"
#include "detail/block_sort_bitonic.hpp"
/// \addtogroup blockmodule
/// @{
BEGIN_ROCPRIM_NAMESPACE
/// \brief Algorithms that can be selected for the block_sort primitive.
enum class block_sort_algorithm
{
    /// \brief Sorting algorithm based on bitonic sort.
    bitonic_sort = 0,
    /// \brief Algorithm used when none is requested explicitly; alias of \p bitonic_sort.
    default_algorithm = bitonic_sort
};
namespace detail
{

// Maps a block_sort_algorithm enumerator to the class template that implements it.
// The primary template is only declared; each supported algorithm supplies a
// specialization exposing the implementation as the nested alias template `type`.
template<block_sort_algorithm Algorithm>
struct select_block_sort_impl;

// bitonic_sort is implemented by block_sort_bitonic.
template<>
struct select_block_sort_impl<block_sort_algorithm::bitonic_sort>
{
    template<class KeyType,
             unsigned int SizeX,
             unsigned int SizeY,
             unsigned int SizeZ,
             unsigned int ItemsPerThread,
             class ValueType>
    using type = block_sort_bitonic<KeyType, SizeX, SizeY, SizeZ, ItemsPerThread, ValueType>;
};

} // end namespace detail
/// \brief The block_sort class is a block level parallel primitive which provides
/// methods sorting items (keys or key-value pairs) partitioned across threads in a block
/// using comparison-based sort algorithm.
///
/// \tparam Key - the key type.
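/// \tparam BlockSizeX - the number of threads in a block's x dimension.
/// \tparam ItemsPerThread - number of items processed by each thread.
/// The total range will be BlockSize * ItemsPerThread long.
/// \tparam Value - the value type. Default type empty_type indicates
/// a keys-only sort.
/// \tparam Algorithm - selected sort algorithm, block_sort_algorithm::default_algorithm by default.
/// \tparam BlockSizeY - the number of threads in a block's y dimension, defaults to 1.
/// \tparam BlockSizeZ - the number of threads in a block's z dimension, defaults to 1.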
///
/// \par Overview
/// * Accepts custom compare_functions for sorting across a block.
/// * Performance depends on \p BlockSize.
/// * It is better if \p BlockSize is a power of two.
/// * If \p BlockSize is not a power of two, or when function with \p size overload is used
/// odd-even sort is used instead of bitonic sort, leading to decreased performance.
///
/// \par Examples
/// \parblock
/// In the examples sort is performed on a block of 256 threads, each thread provides
/// one \p int value, results are returned using the same variable as for input.
///
/// \code{.cpp}
/// __global__ void example_kernel(...)
/// {
/// // specialize block_sort for int, block of 256 threads,
/// // key-only sort
/// using block_sort_int = rocprim::block_sort<int, 256>;
/// // allocate storage in shared memory
/// __shared__ block_sort_int::storage_type storage;
///
/// int input = ...;
/// // execute block sort (ascending)
/// block_sort_int().sort(
/// input,
/// storage
/// );
/// ...
/// }
/// \endcode
/// \endparblock
template<
class Key,
unsigned int BlockSizeX,
unsigned int ItemsPerThread = 1,
class Value = empty_type,
block_sort_algorithm Algorithm = block_sort_algorithm::default_algorithm,
unsigned int BlockSizeY = 1,
unsigned int BlockSizeZ = 1
>
class block_sort
#ifndef DOXYGEN_SHOULD_SKIP_THIS
: private detail::select_block_sort_impl<Algorithm>::template type<Key, BlockSizeX, BlockSizeY, BlockSizeZ, ItemsPerThread, Value>
#endif
{
using base_type = typename detail::select_block_sort_impl<Algorithm>::template type<Key, BlockSizeX, BlockSizeY, BlockSizeZ, ItemsPerThread, Value>;
public:
/// \brief Struct used to allocate a temporary memory that is required for thread
/// communication during operations provided by related parallel primitive.
///
/// Depending on the implementation the operations exposed by parallel primitive may
/// require a temporary storage for thread communication. The storage should be allocated
/// using keywords <tt>__shared__</tt>. It can be aliased to
/// an externally allocated memory, or be a part of a union type with other storage types
/// to increase shared memory reusability.
using storage_type = typename base_type::storage_type;
/// \brief Block sort for any data type.
///
/// \tparam BinaryFunction - type of binary function used for sort. Default type
/// is rocprim::less<T>.
///
/// \param [in, out] thread_key - reference to a key provided by a thread.
/// \param [in] compare_function - comparison function object which returns true if the
/// first argument is ordered before the second.
/// The signature of the function should be equivalent to the following:
/// <tt>bool f(const T &a, const T &b);</tt>. The signature does not need to have
/// <tt>const &</tt>, but function object must not modify the objects passed to it.
template<class BinaryFunction = ::rocprim::less<Key>>
ROCPRIM_DEVICE ROCPRIM_FORCE_INLINE
void sort(Key& thread_key,
          BinaryFunction compare_function = BinaryFunction())
{
    // Keys-only sort, one key per thread, no storage argument: delegate to the
    // implementation selected by the Algorithm template parameter.
    base_type::sort(thread_key, compare_function);
}
/// \overload
/// \brief Block sort for any data type, with \p ItemsPerThread keys per thread.
///
/// \tparam BinaryFunction - type of binary function used for sort. Default type
/// is rocprim::less<Key>.
///
/// \param [in, out] thread_keys - reference to an array of keys provided by a thread.
/// \param [in] compare_function - comparison function object which returns true if the
/// first argument is ordered before the second.
template<class BinaryFunction = ::rocprim::less<Key>>
ROCPRIM_DEVICE ROCPRIM_FORCE_INLINE
void sort(Key (&thread_keys)[ItemsPerThread],
          BinaryFunction compare_function = BinaryFunction())
{
    // Delegate to the implementation selected by the Algorithm template parameter.
    base_type::sort(thread_keys, compare_function);
}
/// \brief Block sort for any data type.
///
/// \tparam BinaryFunction - type of binary function used for sort. Default type
/// is rocprim::less<T>.
///
/// \param [in, out] thread_key - reference to a key provided by a thread.
/// \param [in] storage - reference to a temporary storage object of type storage_type.
/// \param [in] compare_function - comparison function object which returns true if the
/// first argument is ordered before the second.
/// The signature of the function should be equivalent to the following:
/// <tt>bool f(const T &a, const T &b);</tt>. The signature does not need to have
/// <tt>const &</tt>, but function object must not modify the objects passed to it.
///
/// \par Storage reusage
/// Synchronization barrier should be placed before \p storage is reused
/// or repurposed: \p __syncthreads() or \p rocprim::syncthreads().
///
/// \par Examples
/// \parblock
/// In the examples sort is performed on a block of 256 threads, each thread provides
/// one \p int value, results are returned using the same variable as for input.
///
/// \code{.cpp}
/// __global__ void example_kernel(...)
/// {
/// // specialize block_sort for int, block of 256 threads,
/// // key-only sort
/// using block_sort_int = rocprim::block_sort<int, 256>;
/// // allocate storage in shared memory
/// __shared__ block_sort_int::storage_type storage;
///
/// int input = ...;
/// // execute block sort (ascending)
/// block_sort_int().sort(
/// input,
/// storage
/// );
/// ...
/// }
/// \endcode
/// \endparblock
template<class BinaryFunction = ::rocprim::less<Key>>
ROCPRIM_DEVICE ROCPRIM_INLINE
void sort(Key& thread_key,
          storage_type& storage,
          BinaryFunction compare_function = BinaryFunction())
{
    // Keys-only sort, one key per thread, caller-provided storage: delegate to the
    // implementation selected by the Algorithm template parameter.
    base_type::sort(thread_key, storage, compare_function);
}
/// \overload
/// \brief Block sort for any data type, with \p ItemsPerThread keys per thread and
/// caller-provided temporary storage.
///
/// \tparam BinaryFunction - type of binary function used for sort. Default type
/// is rocprim::less<Key>.
///
/// \param [in, out] thread_keys - reference to an array of keys provided by a thread.
/// \param [in] storage - reference to a temporary storage object of type storage_type.
/// \param [in] compare_function - comparison function object which returns true if the
/// first argument is ordered before the second.
template<class BinaryFunction = ::rocprim::less<Key>>
ROCPRIM_DEVICE ROCPRIM_INLINE
void sort(Key (&thread_keys)[ItemsPerThread],
          storage_type& storage,
          BinaryFunction compare_function = BinaryFunction())
{
    // Delegate to the implementation selected by the Algorithm template parameter.
    base_type::sort(thread_keys, storage, compare_function);
}
/// \brief Block sort by key for any data type.
///
/// This overload takes no storage argument; it forwards to the base_type overload
/// that has the same parameter list.
///
/// \tparam BinaryFunction - type of binary function used for sort. Default type
/// is rocprim::less<Key>.
///
/// \param [in, out] thread_key - reference to a key provided by a thread.
/// \param [in, out] thread_value - reference to a value provided by a thread.
/// \param [in] compare_function - comparison function object which returns true if the
/// first argument is ordered before the second.
/// The signature of the function should be equivalent to the following:
/// <tt>bool f(const Key &a, const Key &b);</tt>. The signature does not need to have
/// <tt>const &</tt>, but function object must not modify the objects passed to it.
template<class BinaryFunction = ::rocprim::less<Key>>
ROCPRIM_DEVICE ROCPRIM_FORCE_INLINE
void sort(Key& thread_key,
Value& thread_value,
BinaryFunction compare_function = BinaryFunction())
{
// Pure forwarding wrapper around the matching base_type::sort overload.
base_type::sort(thread_key, thread_value, compare_function);
}
/// \brief Block sort by key overload that takes arrays of \p ItemsPerThread keys and
/// values per thread and no storage argument.
///
/// \tparam BinaryFunction - type of binary function used for sort. Default type
/// is rocprim::less<Key>.
///
/// \param [in, out] thread_keys - reference to the array of keys provided by a thread.
/// \param [in, out] thread_values - reference to the array of values provided by a thread.
/// \param [in] compare_function - comparison function object, forwarded unchanged to
/// the base_type implementation.
template<class BinaryFunction = ::rocprim::less<Key>>
ROCPRIM_DEVICE ROCPRIM_FORCE_INLINE
void sort(Key (&thread_keys)[ItemsPerThread],
Value (&thread_values)[ItemsPerThread],
BinaryFunction compare_function = BinaryFunction())
{
// Pure forwarding wrapper around the matching base_type::sort overload.
base_type::sort(thread_keys, thread_values, compare_function);
}
/// \brief Block sort by key for any data type, using caller-provided temporary storage.
///
/// \tparam BinaryFunction - type of binary function used for sort. Default type
/// is rocprim::less<Key>.
///
/// \param [in, out] thread_key - reference to a key provided by a thread.
/// \param [in, out] thread_value - reference to a value provided by a thread.
/// \param [in] storage - reference to a temporary storage object of type storage_type.
/// \param [in] compare_function - comparison function object which returns true if the
/// first argument is ordered before the second.
/// The signature of the function should be equivalent to the following:
/// <tt>bool f(const Key &a, const Key &b);</tt>. The signature does not need to have
/// <tt>const &</tt>, but function object must not modify the objects passed to it.
///
/// \par Storage reusage
/// Synchronization barrier should be placed before \p storage is reused
/// or repurposed: \p __syncthreads() or \p rocprim::syncthreads().
///
/// \par Examples
/// \parblock
/// In the examples sort is performed on a block of 256 threads, each thread provides
/// one \p int key and one \p int value, results are returned using the same variable as for input.
///
/// \code{.cpp}
/// __global__ void example_kernel(...)
/// {
/// // specialize block_sort for int, block of 256 threads,
/// using block_sort_int = rocprim::block_sort<int, 256, int>;
/// // allocate storage in shared memory
/// __shared__ block_sort_int::storage_type storage;
///
/// int key = ...;
/// int value = ...;
/// // execute block sort (ascending)
/// block_sort_int().sort(
/// key,
/// value,
/// storage
/// );
/// ...
/// }
/// \endcode
/// \endparblock
template<class BinaryFunction = ::rocprim::less<Key>>
ROCPRIM_DEVICE ROCPRIM_INLINE
void sort(Key& thread_key,
Value& thread_value,
storage_type& storage,
BinaryFunction compare_function = BinaryFunction())
{
// Pure forwarding wrapper around the matching base_type::sort overload.
base_type::sort(thread_key, thread_value, storage, compare_function);
}
/// \brief Block sort by key overload that takes arrays of \p ItemsPerThread keys and
/// values per thread together with an explicit temporary storage object.
///
/// \tparam BinaryFunction - type of binary function used for sort. Default type
/// is rocprim::less<Key>.
///
/// \param [in, out] thread_keys - reference to the array of keys provided by a thread.
/// \param [in, out] thread_values - reference to the array of values provided by a thread.
/// \param [in] storage - reference to a temporary storage object of type storage_type.
/// \param [in] compare_function - comparison function object, forwarded unchanged to
/// the base_type implementation.
template<class BinaryFunction = ::rocprim::less<Key>>
ROCPRIM_DEVICE ROCPRIM_INLINE
void sort(Key (&thread_keys)[ItemsPerThread],
Value (&thread_values)[ItemsPerThread],
storage_type& storage,
BinaryFunction compare_function = BinaryFunction())
{
// Pure forwarding wrapper around the matching base_type::sort overload.
base_type::sort(thread_keys, thread_values, storage, compare_function);
}
/// \brief Block sort of keys for any data type with a custom number of items. If \p size is
/// greater than \p BlockSize, this function does nothing.
///
/// \tparam BinaryFunction - type of binary function used for sort. Default type
/// is rocprim::less<Key>.
///
/// \param [in, out] thread_key - reference to a key provided by a thread.
/// \param [in] storage - reference to a temporary storage object of type storage_type.
/// \param [in] size - custom size of block to be sorted.
/// \param [in] compare_function - comparison function object which returns true if the
/// first argument is ordered before the second.
/// The signature of the function should be equivalent to the following:
/// <tt>bool f(const Key &a, const Key &b);</tt>. The signature does not need to have
/// <tt>const &</tt>, but function object must not modify the objects passed to it.
template<class BinaryFunction = ::rocprim::less<Key>>
ROCPRIM_DEVICE ROCPRIM_FORCE_INLINE
void sort(Key& thread_key,
storage_type& storage,
const unsigned int size,
BinaryFunction compare_function = BinaryFunction())
{
// Pure forwarding wrapper around the matching base_type::sort overload.
base_type::sort(thread_key, storage, size, compare_function);
}
/// \brief Block sort by key for any data type with a custom number of items. If \p size is
/// greater than \p BlockSize, this function does nothing.
///
/// \tparam BinaryFunction - type of binary function used for sort. Default type
/// is rocprim::less<Key>.
///
/// \param [in, out] thread_key - reference to a key provided by a thread.
/// \param [in, out] thread_value - reference to a value provided by a thread.
/// \param [in] storage - reference to a temporary storage object of type storage_type.
/// \param [in] size - custom size of block to be sorted.
/// \param [in] compare_function - comparison function object which returns true if the
/// first argument is ordered before the second.
/// The signature of the function should be equivalent to the following:
/// <tt>bool f(const Key &a, const Key &b);</tt>. The signature does not need to have
/// <tt>const &</tt>, but function object must not modify the objects passed to it.
template<class BinaryFunction = ::rocprim::less<Key>>
ROCPRIM_DEVICE ROCPRIM_INLINE
void sort(Key& thread_key,
Value& thread_value,
storage_type& storage,
const unsigned int size,
BinaryFunction compare_function = BinaryFunction())
{
// Pure forwarding wrapper around the matching base_type::sort overload.
base_type::sort(thread_key, thread_value, storage, size, compare_function);
}
};
END_ROCPRIM_NAMESPACE
/// @}
// end of group blockmodule
#endif // ROCPRIM_BLOCK_BLOCK_SORT_HPP_
// Copyright (c) 2017-2021 Advanced Micro Devices, Inc. All rights reserved.
//
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to deal
// in the Software without restriction, including without limitation the rights
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
// copies of the Software, and to permit persons to whom the Software is
// furnished to do so, subject to the following conditions:
//
// The above copyright notice and this permission notice shall be included in
// all copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
// THE SOFTWARE.
#ifndef ROCPRIM_BLOCK_BLOCK_STORE_HPP_
#define ROCPRIM_BLOCK_BLOCK_STORE_HPP_
#include "../config.hpp"
#include "../detail/various.hpp"
#include "../intrinsics.hpp"
#include "../functional.hpp"
#include "../types.hpp"
#include "block_store_func.hpp"
#include "block_exchange.hpp"
/// \addtogroup blockmodule
/// @{
BEGIN_ROCPRIM_NAMESPACE
/// \brief \p block_store_method enumerates the methods available to store an arrangement
/// of items into a blocked/striped arrangement on continuous memory.
enum class block_store_method
{
/// A blocked arrangement of items is stored into a blocked arrangement on continuous
/// memory.
/// \par Performance Notes:
/// * Performance decreases with increasing number of items per thread (stride
/// between writes), because of reduced memory coalescing.
block_store_direct,
/// A striped arrangement of items is stored into a blocked arrangement on continuous
/// memory.
block_store_striped,
/// A blocked arrangement of items is stored into a blocked arrangement on continuous
/// memory using vectorization as an optimization.
/// \par Performance Notes:
/// * Performance remains high due to increased memory coalescing, provided that
/// vectorization requirements are fulfilled. Otherwise, performance will default
/// to \p block_store_direct.
/// \par Requirements:
/// * The output offset (\p block_output) must be quad-item aligned.
/// * The following conditions will prevent vectorization and switch to default
/// \p block_store_direct:
/// * \p ItemsPerThread is odd.
/// * The datatype \p T is not a primitive or a HIP vector type (e.g. int2,
/// int4, etc.).
block_store_vectorize,
/// A blocked arrangement of items is locally transposed and stored as a striped
/// arrangement of data on continuous memory.
/// \par Performance Notes:
/// * Performance remains high due to increased memory coalescing, regardless of the
/// number of items per thread.
/// * Performance may be better compared to \p block_store_direct and
/// \p block_store_vectorize due to reordering on local memory.
block_store_transpose,
/// A blocked arrangement of items is locally transposed and stored as a warp-striped
/// arrangement of data on continuous memory.
/// \par Requirements:
/// * The number of threads in the block must be a multiple of the size of hardware warp.
/// \par Performance Notes:
/// * Performance remains high due to increased memory coalescing, regardless of the
/// number of items per thread.
/// * Performance may be better compared to \p block_store_direct and
/// \p block_store_vectorize due to reordering on local memory.
block_store_warp_transpose,
/// Alias for the default method; has the same value as \p block_store_direct.
default_method = block_store_direct
};
/// \brief The \p block_store class is a block level parallel primitive which provides methods
/// for storing an arrangement of items into a blocked/striped arrangement on continuous memory.
///
/// This primary template implements the direct (blocked) store; the other values of
/// \p Method are handled by partial specializations.
///
/// \tparam T - the type of the items held by each thread.
/// \tparam BlockSizeX - the number of threads in a block's x dimension.
/// \tparam ItemsPerThread - the number of items to be processed by
/// each thread.
/// \tparam Method - the method to store data.
/// \tparam BlockSizeY - the number of threads in a block's y dimension, defaults to 1.
/// \tparam BlockSizeZ - the number of threads in a block's z dimension, defaults to 1.
///
/// \par Overview
/// * The \p block_store class has a number of different methods to store data:
///   * [block_store_direct](\ref ::block_store_method::block_store_direct)
///   * [block_store_striped](\ref ::block_store_method::block_store_striped)
///   * [block_store_vectorize](\ref ::block_store_method::block_store_vectorize)
///   * [block_store_transpose](\ref ::block_store_method::block_store_transpose)
///   * [block_store_warp_transpose](\ref ::block_store_method::block_store_warp_transpose)
///
/// \par Example:
/// \parblock
/// In the examples store operation is performed on block of 128 threads, using type
/// \p int and 8 items per thread.
///
/// \code{.cpp}
/// __global__ void kernel(int * output)
/// {
///     const int offset = blockIdx.x * 128 * 8;
///     int items[8];
///     rocprim::block_store<int, 128, 8, store_method> blockstore;
///     blockstore.store(output + offset, items);
///     ...
/// }
/// \endcode
/// \endparblock
template<
    class T,
    unsigned int BlockSizeX,
    unsigned int ItemsPerThread,
    block_store_method Method = block_store_method::block_store_direct,
    unsigned int BlockSizeY = 1,
    unsigned int BlockSizeZ = 1
>
class block_store
{
private:
    using storage_type_ = typename ::rocprim::detail::empty_storage_type;

public:
    /// \brief Struct used to allocate a temporary memory that is required for thread
    /// communication during operations provided by related parallel primitive.
    ///
    /// Depending on the implementation the operations exposed by parallel primitive may
    /// require a temporary storage for thread communication. The storage should be allocated
    /// using keywords \p __shared__. It can be aliased to
    /// an externally allocated memory, or be a part of a union with other storage types
    /// to increase shared memory reusability.
    #ifndef DOXYGEN_SHOULD_SKIP_THIS // hides storage_type implementation for Doxygen
    using storage_type = typename ::rocprim::detail::empty_storage_type;
    #else
    using storage_type = storage_type_; // only for Doxygen
    #endif

    /// \brief Stores an arrangement of items from across the thread block into an
    /// arrangement on continuous memory.
    ///
    /// \tparam OutputIterator - [inferred] an iterator type for output (can be a simple
    /// pointer).
    ///
    /// \param [out] block_output - the output iterator from the thread block to store to.
    /// \param [in] items - array that data is read from.
    ///
    /// \par Overview
    /// * The type \p T must be such that a dereferenced object of type \p OutputIterator
    /// can be assigned a value of type \p T.
    template<class OutputIterator>
    ROCPRIM_DEVICE ROCPRIM_INLINE
    void store(OutputIterator block_output,
               T (&items)[ItemsPerThread])
    {
        block_store_direct_blocked(
            ::rocprim::flat_block_thread_id<BlockSizeX, BlockSizeY, BlockSizeZ>(),
            block_output,
            items
        );
    }

    /// \brief Stores an arrangement of items from across the thread block into an
    /// arrangement on continuous memory, which is guarded by range \p valid.
    ///
    /// \tparam OutputIterator - [inferred] an iterator type for output (can be a simple
    /// pointer).
    ///
    /// \param [out] block_output - the output iterator from the thread block to store to.
    /// \param [in] items - array that data is read from.
    /// \param [in] valid - maximum range of valid numbers to store.
    ///
    /// \par Overview
    /// * The type \p T must be such that a dereferenced object of type \p OutputIterator
    /// can be assigned a value of type \p T.
    template<class OutputIterator>
    ROCPRIM_DEVICE ROCPRIM_INLINE
    void store(OutputIterator block_output,
               T (&items)[ItemsPerThread],
               unsigned int valid)
    {
        block_store_direct_blocked(
            ::rocprim::flat_block_thread_id<BlockSizeX, BlockSizeY, BlockSizeZ>(),
            block_output,
            items,
            valid
        );
    }

    /// \brief Stores an arrangement of items from across the thread block into an
    /// arrangement on continuous memory, using temporary storage.
    ///
    /// In this primary template \p storage is not used; the call is equivalent to the
    /// overload without a storage argument.
    ///
    /// \tparam OutputIterator - [inferred] an iterator type for output (can be a simple
    /// pointer).
    ///
    /// \param [out] block_output - the output iterator from the thread block to store to.
    /// \param [in] items - array that data is read from.
    /// \param [in] storage - temporary storage for outputs.
    ///
    /// \par Overview
    /// * The type \p T must be such that a dereferenced object of type \p OutputIterator
    /// can be assigned a value of type \p T.
    ///
    /// \par Storage reusage
    /// Synchronization barrier should be placed before \p storage is reused
    /// or repurposed: \p __syncthreads() or \p rocprim::syncthreads().
    ///
    /// \par Example.
    /// \code{.cpp}
    /// __global__ void kernel(...)
    /// {
    ///     int items[8];
    ///     using block_store_int = rocprim::block_store<int, 128, 8>;
    ///     block_store_int bstore;
    ///     __shared__ typename block_store_int::storage_type storage;
    ///     bstore.store(..., items, storage);
    ///     ...
    /// }
    /// \endcode
    template<class OutputIterator>
    ROCPRIM_DEVICE ROCPRIM_INLINE
    void store(OutputIterator block_output,
               T (&items)[ItemsPerThread],
               storage_type& storage)
    {
        static_cast<void>(storage);
        this->store(block_output, items);
    }

    /// \brief Stores an arrangement of items from across the thread block into an
    /// arrangement on continuous memory, which is guarded by range \p valid,
    /// using temporary storage.
    ///
    /// In this primary template \p storage is not used; the call is equivalent to the
    /// guarded overload without a storage argument.
    ///
    /// \tparam OutputIterator - [inferred] an iterator type for output (can be a simple
    /// pointer).
    ///
    /// \param [out] block_output - the output iterator from the thread block to store to.
    /// \param [in] items - array that data is read from.
    /// \param [in] valid - maximum range of valid numbers to store.
    /// \param [in] storage - temporary storage for outputs.
    ///
    /// \par Overview
    /// * The type \p T must be such that a dereferenced object of type \p OutputIterator
    /// can be assigned a value of type \p T.
    ///
    /// \par Storage reusage
    /// Synchronization barrier should be placed before \p storage is reused
    /// or repurposed: \p __syncthreads() or \p rocprim::syncthreads().
    ///
    /// \par Example.
    /// \code{.cpp}
    /// __global__ void kernel(...)
    /// {
    ///     int items[8];
    ///     using block_store_int = rocprim::block_store<int, 128, 8>;
    ///     block_store_int bstore;
    ///     __shared__ typename block_store_int::storage_type storage;
    ///     bstore.store(..., items, valid, storage);
    ///     ...
    /// }
    /// \endcode
    template<class OutputIterator>
    ROCPRIM_DEVICE ROCPRIM_INLINE
    void store(OutputIterator block_output,
               T (&items)[ItemsPerThread],
               unsigned int valid,
               storage_type& storage)
    {
        static_cast<void>(storage);
        this->store(block_output, items, valid);
    }
};
/// @}
// end of group blockmodule
#ifndef DOXYGEN_SHOULD_SKIP_THIS
// Specialization for block_store_method::block_store_striped: every overload writes
// the thread's items through block_store_direct_striped. The storage type is empty,
// so the overloads that accept storage ignore it.
template<
    class T,
    unsigned int BlockSizeX,
    unsigned int ItemsPerThread,
    unsigned int BlockSizeY,
    unsigned int BlockSizeZ
>
class block_store<T, BlockSizeX, ItemsPerThread, block_store_method::block_store_striped, BlockSizeY, BlockSizeZ>
{
    // Total number of threads in the block across all three dimensions.
    static constexpr unsigned int BlockSize = BlockSizeX * BlockSizeY * BlockSizeZ;

private:
    using storage_type_ = typename ::rocprim::detail::empty_storage_type;

public:
    #ifndef DOXYGEN_SHOULD_SKIP_THIS // hides storage_type implementation for Doxygen
    using storage_type = typename ::rocprim::detail::empty_storage_type;
    #else
    using storage_type = storage_type_; // only for Doxygen
    #endif

    // Unguarded striped store.
    template<class OutputIterator>
    ROCPRIM_DEVICE inline
    void store(OutputIterator block_output,
               T (&items)[ItemsPerThread])
    {
        block_store_direct_striped<BlockSize>(
            ::rocprim::flat_block_thread_id<BlockSizeX, BlockSizeY, BlockSizeZ>(),
            block_output,
            items
        );
    }

    // Striped store guarded by `valid`.
    template<class OutputIterator>
    ROCPRIM_DEVICE inline
    void store(OutputIterator block_output,
               T (&items)[ItemsPerThread],
               unsigned int valid)
    {
        block_store_direct_striped<BlockSize>(
            ::rocprim::flat_block_thread_id<BlockSizeX, BlockSizeY, BlockSizeZ>(),
            block_output,
            items,
            valid
        );
    }

    // Same as the unguarded store; `storage` is accepted for interface parity only.
    template<class OutputIterator>
    ROCPRIM_DEVICE inline
    void store(OutputIterator block_output,
               T (&items)[ItemsPerThread],
               storage_type& storage)
    {
        static_cast<void>(storage);
        this->store(block_output, items);
    }

    // Same as the guarded store; `storage` is accepted for interface parity only.
    template<class OutputIterator>
    ROCPRIM_DEVICE inline
    void store(OutputIterator block_output,
               T (&items)[ItemsPerThread],
               unsigned int valid,
               storage_type& storage)
    {
        static_cast<void>(storage);
        this->store(block_output, items, valid);
    }
};
// Specialization for block_store_method::block_store_vectorize.
// Only the overload taking a raw T* output and a T item array goes through
// block_store_direct_blocked_vectorized; every other overload (generic iterator,
// differing item type, or guarded by `valid`) uses block_store_direct_blocked.
// The storage type is empty, so the overloads that accept storage ignore it.
template<
    class T,
    unsigned int BlockSizeX,
    unsigned int ItemsPerThread,
    unsigned int BlockSizeY,
    unsigned int BlockSizeZ
>
class block_store<T, BlockSizeX, ItemsPerThread, block_store_method::block_store_vectorize, BlockSizeY, BlockSizeZ>
{
private:
    using storage_type_ = typename ::rocprim::detail::empty_storage_type;

public:
    #ifndef DOXYGEN_SHOULD_SKIP_THIS // hides storage_type implementation for Doxygen
    using storage_type = typename ::rocprim::detail::empty_storage_type;
    #else
    using storage_type = storage_type_; // only for Doxygen
    #endif

    // Raw-pointer output with matching item type: vectorized path.
    ROCPRIM_DEVICE ROCPRIM_INLINE
    void store(T* block_output,
               T (&_items)[ItemsPerThread])
    {
        block_store_direct_blocked_vectorized(
            ::rocprim::flat_block_thread_id<BlockSizeX, BlockSizeY, BlockSizeZ>(),
            block_output,
            _items
        );
    }

    // Generic output iterator and/or item type: plain blocked store.
    template<class OutputIterator, class U>
    ROCPRIM_DEVICE ROCPRIM_INLINE
    void store(OutputIterator block_output,
               U (&items)[ItemsPerThread])
    {
        block_store_direct_blocked(
            ::rocprim::flat_block_thread_id<BlockSizeX, BlockSizeY, BlockSizeZ>(),
            block_output,
            items
        );
    }

    // Store guarded by `valid`: plain blocked store.
    template<class OutputIterator>
    ROCPRIM_DEVICE ROCPRIM_INLINE
    void store(OutputIterator block_output,
               T (&items)[ItemsPerThread],
               unsigned int valid)
    {
        block_store_direct_blocked(
            ::rocprim::flat_block_thread_id<BlockSizeX, BlockSizeY, BlockSizeZ>(),
            block_output,
            items,
            valid
        );
    }

    // Storage-taking counterpart of the raw-pointer overload; `storage` is unused.
    ROCPRIM_DEVICE ROCPRIM_INLINE
    void store(T* block_output,
               T (&items)[ItemsPerThread],
               storage_type& storage)
    {
        static_cast<void>(storage);
        store(block_output, items);
    }

    // Storage-taking counterpart of the generic overload; `storage` is unused.
    template<class OutputIterator, class U>
    ROCPRIM_DEVICE ROCPRIM_INLINE
    void store(OutputIterator block_output,
               U (&items)[ItemsPerThread],
               storage_type& storage)
    {
        static_cast<void>(storage);
        store(block_output, items);
    }

    // Storage-taking counterpart of the guarded overload; `storage` is unused.
    template<class OutputIterator>
    ROCPRIM_DEVICE ROCPRIM_INLINE
    void store(OutputIterator block_output,
               T (&items)[ItemsPerThread],
               unsigned int valid,
               storage_type& storage)
    {
        static_cast<void>(storage);
        store(block_output, items, valid);
    }
};
// Specialization for block_store_method::block_store_transpose.
// Each store first runs block_exchange's blocked_to_striped on `items` (passed as
// both arguments, so the caller's array is overwritten) and then writes the result
// with block_store_direct_striped. The overloads without a storage parameter
// declare their own ROCPRIM_SHARED_MEMORY storage object.
template<
    class T,
    unsigned int BlockSizeX,
    unsigned int ItemsPerThread,
    unsigned int BlockSizeY,
    unsigned int BlockSizeZ
>
class block_store<T, BlockSizeX, ItemsPerThread, block_store_method::block_store_transpose, BlockSizeY, BlockSizeZ>
{
    // Total number of threads in the block across all three dimensions.
    static constexpr unsigned int BlockSize = BlockSizeX * BlockSizeY * BlockSizeZ;

private:
    using block_exchange_type = block_exchange<T, BlockSize, ItemsPerThread>;

public:
    using storage_type = typename block_exchange_type::storage_type;

    // Unguarded store using a function-local storage object.
    template<class OutputIterator>
    ROCPRIM_DEVICE ROCPRIM_FORCE_INLINE
    void store(OutputIterator block_output,
               T (&items)[ItemsPerThread])
    {
        ROCPRIM_SHARED_MEMORY storage_type storage;
        this->store(block_output, items, storage);
    }

    // Store guarded by `valid`, using a function-local storage object.
    template<class OutputIterator>
    ROCPRIM_DEVICE ROCPRIM_FORCE_INLINE
    void store(OutputIterator block_output,
               T (&items)[ItemsPerThread],
               unsigned int valid)
    {
        ROCPRIM_SHARED_MEMORY storage_type storage;
        this->store(block_output, items, valid, storage);
    }

    // Unguarded store using caller-provided storage.
    template<class OutputIterator>
    ROCPRIM_DEVICE ROCPRIM_INLINE
    void store(OutputIterator block_output,
               T (&items)[ItemsPerThread],
               storage_type& storage)
    {
        block_exchange_type().blocked_to_striped(items, items, storage);
        block_store_direct_striped<BlockSize>(
            ::rocprim::flat_block_thread_id<BlockSizeX, BlockSizeY, BlockSizeZ>(),
            block_output,
            items
        );
    }

    // Store guarded by `valid`, using caller-provided storage.
    template<class OutputIterator>
    ROCPRIM_DEVICE ROCPRIM_INLINE
    void store(OutputIterator block_output,
               T (&items)[ItemsPerThread],
               unsigned int valid,
               storage_type& storage)
    {
        block_exchange_type().blocked_to_striped(items, items, storage);
        block_store_direct_striped<BlockSize>(
            ::rocprim::flat_block_thread_id<BlockSizeX, BlockSizeY, BlockSizeZ>(),
            block_output,
            items,
            valid
        );
    }
};
// Specialization for block_store_method::block_store_warp_transpose.
// Each store first runs block_exchange's blocked_to_warp_striped on `items` (passed
// as both arguments, so the caller's array is overwritten) and then writes the
// result with block_store_direct_warp_striped. The overloads without a storage
// parameter declare their own ROCPRIM_SHARED_MEMORY storage object.
template<
    class T,
    unsigned int BlockSizeX,
    unsigned int ItemsPerThread,
    unsigned int BlockSizeY,
    unsigned int BlockSizeZ
>
class block_store<T, BlockSizeX, ItemsPerThread, block_store_method::block_store_warp_transpose, BlockSizeY, BlockSizeZ>
{
    // Total number of threads in the block across all three dimensions.
    static constexpr unsigned int BlockSize = BlockSizeX * BlockSizeY * BlockSizeZ;

private:
    using block_exchange_type = block_exchange<T, BlockSize, ItemsPerThread>;

public:
    // The block must consist of whole warps.
    static_assert(BlockSize % ::rocprim::device_warp_size() == 0,
                  "BlockSize must be a multiple of hardware warpsize");

    using storage_type = typename block_exchange_type::storage_type;

    // Unguarded store using a function-local storage object.
    template<class OutputIterator>
    ROCPRIM_DEVICE ROCPRIM_FORCE_INLINE
    void store(OutputIterator block_output,
               T (&items)[ItemsPerThread])
    {
        ROCPRIM_SHARED_MEMORY storage_type storage;
        this->store(block_output, items, storage);
    }

    // Store guarded by `valid`, using a function-local storage object.
    template<class OutputIterator>
    ROCPRIM_DEVICE ROCPRIM_FORCE_INLINE
    void store(OutputIterator block_output,
               T (&items)[ItemsPerThread],
               unsigned int valid)
    {
        ROCPRIM_SHARED_MEMORY storage_type storage;
        this->store(block_output, items, valid, storage);
    }

    // Unguarded store using caller-provided storage.
    template<class OutputIterator>
    ROCPRIM_DEVICE ROCPRIM_INLINE
    void store(OutputIterator block_output,
               T (&items)[ItemsPerThread],
               storage_type& storage)
    {
        block_exchange_type().blocked_to_warp_striped(items, items, storage);
        block_store_direct_warp_striped(
            ::rocprim::flat_block_thread_id<BlockSizeX, BlockSizeY, BlockSizeZ>(),
            block_output,
            items
        );
    }

    // Store guarded by `valid`, using caller-provided storage.
    template<class OutputIterator>
    ROCPRIM_DEVICE ROCPRIM_INLINE
    void store(OutputIterator block_output,
               T (&items)[ItemsPerThread],
               unsigned int valid,
               storage_type& storage)
    {
        block_exchange_type().blocked_to_warp_striped(items, items, storage);
        block_store_direct_warp_striped(
            ::rocprim::flat_block_thread_id<BlockSizeX, BlockSizeY, BlockSizeZ>(),
            block_output,
            items,
            valid
        );
    }
};
#endif // DOXYGEN_SHOULD_SKIP_THIS
END_ROCPRIM_NAMESPACE
#endif // ROCPRIM_BLOCK_BLOCK_STORE_HPP_
// Copyright (c) 2017-2019 Advanced Micro Devices, Inc. All rights reserved.
//
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to deal
// in the Software without restriction, including without limitation the rights
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
// copies of the Software, and to permit persons to whom the Software is
// furnished to do so, subject to the following conditions:
//
// The above copyright notice and this permission notice shall be included in
// all copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
// THE SOFTWARE.
#ifndef ROCPRIM_BLOCK_BLOCK_STORE_FUNC_HPP_
#define ROCPRIM_BLOCK_BLOCK_STORE_FUNC_HPP_
#include "../config.hpp"
#include "../detail/various.hpp"
#include "../intrinsics.hpp"
#include "../functional.hpp"
#include "../types.hpp"
BEGIN_ROCPRIM_NAMESPACE
/// \addtogroup blockmodule
/// @{
/// \brief Stores a blocked arrangement of items from across the thread block
/// into a blocked arrangement on continuous memory.
///
/// The thread identified by \p flat_id writes its \p ItemsPerThread \p items to the
/// consecutive output positions starting at <tt>flat_id * ItemsPerThread</tt>.
///
/// \tparam OutputIterator - [inferred] an iterator type for output (can be a simple
/// pointer)
/// \tparam T - [inferred] the data type
/// \tparam ItemsPerThread - [inferred] the number of items to be processed by
/// each thread
///
/// \param flat_id - a local flat 1D thread id in a block (tile) for the calling thread
/// \param block_output - the output iterator from the thread block to store to
/// \param items - array that data is read from
template<
    class OutputIterator,
    class T,
    unsigned int ItemsPerThread
>
ROCPRIM_DEVICE ROCPRIM_INLINE
void block_store_direct_blocked(unsigned int flat_id,
                                OutputIterator block_output,
                                T (&items)[ItemsPerThread])
{
    static_assert(std::is_assignable<decltype(block_output[0]), T>::value,
                  "The type T must be such that an object of type OutputIterator "
                  "can be dereferenced and assigned a value of type T.");

    // First output position owned by this thread.
    unsigned int first = flat_id * ItemsPerThread;
    OutputIterator dst = block_output + first;

    ROCPRIM_UNROLL
    for (unsigned int i = 0; i < ItemsPerThread; i++)
    {
        dst[i] = items[i];
    }
}
/// \brief Stores a blocked arrangement of items from across the thread block
/// into a blocked arrangement on continuous memory, which is guarded by range \p valid.
///
/// The thread identified by \p flat_id owns the consecutive output positions starting
/// at <tt>flat_id * ItemsPerThread</tt>; only positions strictly below \p valid
/// are written.
///
/// \tparam OutputIterator - [inferred] an iterator type for output (can be a simple
/// pointer)
/// \tparam T - [inferred] the data type
/// \tparam ItemsPerThread - [inferred] the number of items to be processed by
/// each thread
///
/// \param flat_id - a local flat 1D thread id in a block (tile) for the calling thread
/// \param block_output - the output iterator from the thread block to store to
/// \param items - array that data is read from
/// \param valid - maximum range of valid numbers to store
template<
    class OutputIterator,
    class T,
    unsigned int ItemsPerThread
>
ROCPRIM_DEVICE ROCPRIM_INLINE
void block_store_direct_blocked(unsigned int flat_id,
                                OutputIterator block_output,
                                T (&items)[ItemsPerThread],
                                unsigned int valid)
{
    static_assert(std::is_assignable<decltype(block_output[0]), T>::value,
                  "The type T must be such that an object of type OutputIterator "
                  "can be dereferenced and assigned a value of type T.");

    // First output position owned by this thread.
    unsigned int first = flat_id * ItemsPerThread;
    OutputIterator dst = block_output + first;

    ROCPRIM_UNROLL
    for (unsigned int i = 0; i < ItemsPerThread; i++)
    {
        // Skip positions that fall outside the valid range.
        if (i + first < valid)
        {
            dst[i] = items[i];
        }
    }
}
/// \brief Stores a blocked arrangement of items from across the thread block
/// into a blocked arrangement on continuous memory, using vector-typed writes.
///
/// The block arrangement is assumed to be (block-threads * \p ItemsPerThread) items
/// across a thread block. Each thread uses a \p flat_id to store a range of
/// \p ItemsPerThread \p items to the thread block.
///
/// The output offset (\p block_output + offset) must be quad-item aligned.
///
/// This overload is selected only when
/// <tt>detail::is_vectorizable<T, ItemsPerThread>::value</tt> is true.
/// The following conditions will prevent vectorization and switch to default
/// block_store_direct_blocked:
/// * \p ItemsPerThread is odd.
/// * The datatype \p T is not a primitive or a HIP vector type (e.g. int2,
/// int4, etc.).
///
/// \tparam T - [inferred] the output data type
/// \tparam U - [inferred] the input data type
/// \tparam ItemsPerThread - [inferred] the number of items to be processed by
/// each thread
///
/// The type \p U must be such that it can be implicitly converted to \p T.
///
/// \param flat_id - a local flat 1D thread id in a block (tile) for the calling thread
/// \param block_output - the output pointer from the thread block to store to
/// \param items - array that data is read from
template<
    class T,
    class U,
    unsigned int ItemsPerThread
>
ROCPRIM_DEVICE ROCPRIM_INLINE
auto
block_store_direct_blocked_vectorized(unsigned int flat_id,
                                      T* block_output,
                                      U (&items)[ItemsPerThread]) -> typename std::enable_if<detail::is_vectorizable<T, ItemsPerThread>::value>::type
{
    static_assert(std::is_convertible<U, T>::value,
                  "The type U must be such that it can be implicitly converted to T.");

    using vector_type = typename detail::match_vector_type<T, ItemsPerThread>::type;
    constexpr unsigned int vectors_per_thread = (sizeof(T) * ItemsPerThread) / sizeof(vector_type);

    // Stage the items in a vector-typed buffer, filled through a T-typed view of it.
    vector_type staged_vectors[vectors_per_thread];
    T* staged_items = reinterpret_cast<T*>(staged_vectors);

    ROCPRIM_UNROLL
    for (unsigned int i = 0; i < ItemsPerThread; i++)
    {
        staged_items[i] = items[i];
    }

    // Reinterpret the output as vectors and store the staged vectors as a blocked arrangement.
    vector_type* vector_output = reinterpret_cast<vector_type*>(block_output);
    block_store_direct_blocked(flat_id, vector_output, staged_vectors);
}
/// \brief Fallback overload of block_store_direct_blocked_vectorized, selected when
/// <tt>detail::is_vectorizable<T, ItemsPerThread>::value</tt> is false.
///
/// No vectorization is attempted: the call is forwarded unchanged to
/// block_store_direct_blocked.
///
/// \tparam T - [inferred] the output data type
/// \tparam U - [inferred] the input data type
/// \tparam ItemsPerThread - [inferred] the number of items to be processed by
/// each thread
///
/// \param flat_id - a local flat 1D thread id in a block (tile) for the calling thread
/// \param block_output - the output pointer from the thread block to store to
/// \param items - array that data is read from
template<
class T,
class U,
unsigned int ItemsPerThread
>
ROCPRIM_DEVICE ROCPRIM_INLINE
auto
block_store_direct_blocked_vectorized(unsigned int flat_id,
T* block_output,
U (&items)[ItemsPerThread]) -> typename std::enable_if<!detail::is_vectorizable<T, ItemsPerThread>::value>::type
{
block_store_direct_blocked(flat_id, block_output, items);
}
/// \brief Stores a striped arrangement of items from across the thread block
/// into a blocked arrangement on continuous memory.
///
/// The striped arrangement is assumed to be (\p BlockSize * \p ItemsPerThread) items
/// across a thread block. The thread identified by \p flat_id writes its i-th item
/// to output position <tt>flat_id + i * BlockSize</tt>.
///
/// \tparam BlockSize - the number of threads in a block
/// \tparam OutputIterator - [inferred] an iterator type for output (can be a simple
/// pointer)
/// \tparam T - [inferred] the data type
/// \tparam ItemsPerThread - [inferred] the number of items to be processed by
/// each thread
///
/// \param flat_id - a local flat 1D thread id in a block (tile) for the calling thread
/// \param block_output - the output iterator from the thread block to store to
/// \param items - array that data is read from
template<
    unsigned int BlockSize,
    class OutputIterator,
    class T,
    unsigned int ItemsPerThread
>
ROCPRIM_DEVICE ROCPRIM_INLINE
void block_store_direct_striped(unsigned int flat_id,
                                OutputIterator block_output,
                                T (&items)[ItemsPerThread])
{
    static_assert(std::is_assignable<decltype(block_output[0]), T>::value,
                  "The type T must be such that an object of type OutputIterator "
                  "can be dereferenced and assigned a value of type T.");

    // Each thread starts at its own id and advances by BlockSize per item.
    OutputIterator dst = block_output + flat_id;

    ROCPRIM_UNROLL
    for (unsigned int i = 0; i < ItemsPerThread; i++)
    {
        dst[i * BlockSize] = items[i];
    }
}
/// \brief Stores a striped arrangement of items from across the thread block
/// into a blocked arrangement on continuous memory, which is guarded by range \p valid.
///
/// The striped arrangement is assumed to be (\p BlockSize * \p ItemsPerThread) items
/// across a thread block. The thread identified by \p flat_id writes its i-th item
/// to position (\p flat_id + i * \p BlockSize) of \p block_output, but only when
/// that position is less than \p valid.
///
/// \tparam BlockSize - the number of threads in a block
/// \tparam OutputIterator - [inferred] an iterator type for output (can be a simple
/// pointer)
/// \tparam T - [inferred] the data type
/// \tparam ItemsPerThread - [inferred] the number of items to be processed by
/// each thread
///
/// \param flat_id - a local flat 1D thread id in a block (tile) for the calling thread
/// \param block_output - the output iterator from the thread block to store to
/// \param items - array of items that is stored to the thread block
/// \param valid - maximum range of valid numbers to store
template<
    unsigned int BlockSize,
    class OutputIterator,
    class T,
    unsigned int ItemsPerThread
>
ROCPRIM_DEVICE ROCPRIM_INLINE
void block_store_direct_striped(unsigned int flat_id,
                                OutputIterator block_output,
                                T (&items)[ItemsPerThread],
                                unsigned int valid)
{
    static_assert(std::is_assignable<decltype(block_output[0]), T>::value,
                  "The type T must be such that an object of type OutputIterator "
                  "can be dereferenced and assigned a value of type T.");

    // Start at this thread's own slot; successive items are BlockSize positions apart.
    OutputIterator dst = block_output + flat_id;
    ROCPRIM_UNROLL
    for(unsigned int i = 0; i < ItemsPerThread; ++i)
    {
        const unsigned int stride_offset = i * BlockSize;
        // Positions at or beyond the valid range are left untouched.
        if(flat_id + stride_offset >= valid)
        {
            continue;
        }
        dst[stride_offset] = items[i];
    }
}
/// \brief Stores a warp-striped arrangement of items from across the thread block
/// into a blocked arrangement on continuous memory.
///
/// The warp-striped arrangement is assumed to be (\p WarpSize * \p ItemsPerThread) items
/// across a thread block. Each thread uses a \p flat_id to store a range of
/// \p ItemsPerThread \p items to the thread block: with warp_id = \p flat_id / \p WarpSize
/// and lane = detail::logical_lane_id<WarpSize>(), the i-th item is written to position
/// (warp_id * \p WarpSize * \p ItemsPerThread + lane + i * \p WarpSize) of \p block_output.
///
/// * The number of threads in the block must be a multiple of \p WarpSize.
/// * The default \p WarpSize is a hardware warpsize and is an optimal value.
/// * \p WarpSize must be a power of two and equal or less than the size of
/// hardware warp.
/// * Using \p WarpSize smaller than hardware warpsize could result in lower
/// performance.
///
/// \tparam WarpSize - [optional] the number of threads in a warp
/// \tparam OutputIterator - [inferred] an iterator type for output (can be a simple
/// pointer)
/// \tparam T - [inferred] the data type
/// \tparam ItemsPerThread - [inferred] the number of items to be processed by
/// each thread
///
/// \param flat_id - a local flat 1D thread id in a block (tile) for the calling thread
/// \param block_output - the output iterator from the thread block to store to
/// \param items - array of items that is stored to the thread block
template<
    unsigned int WarpSize = device_warp_size(),
    class OutputIterator,
    class T,
    unsigned int ItemsPerThread
>
ROCPRIM_DEVICE ROCPRIM_INLINE
void block_store_direct_warp_striped(unsigned int flat_id,
                                     OutputIterator block_output,
                                     T (&items)[ItemsPerThread])
{
    static_assert(std::is_assignable<decltype(block_output[0]), T>::value,
                  "The type T must be such that an object of type OutputIterator "
                  "can be dereferenced and assigned a value of type T.");
    // Adjacent literals are concatenated, so the first one must end with a space;
    // otherwise the diagnostic reads "...equal or lessthan the size...".
    static_assert(detail::is_power_of_two(WarpSize) && WarpSize <= device_warp_size(),
                  "WarpSize must be a power of two and equal or less "
                  "than the size of hardware warp.");

    const unsigned int thread_id   = detail::logical_lane_id<WarpSize>();
    const unsigned int warp_id     = flat_id / WarpSize;
    // First output position of the segment owned by this (logical) warp.
    const unsigned int warp_offset = warp_id * WarpSize * ItemsPerThread;

    OutputIterator thread_iter = block_output + thread_id + warp_offset;
    ROCPRIM_UNROLL
    for(unsigned int item = 0; item < ItemsPerThread; item++)
    {
        // Successive items of one thread are WarpSize positions apart.
        thread_iter[item * WarpSize] = items[item];
    }
}
/// \brief Stores a warp-striped arrangement of items from across the thread block
/// into a blocked arrangement on continuous memory, which is guarded by range \p valid.
///
/// The warp-striped arrangement is assumed to be (\p WarpSize * \p ItemsPerThread) items
/// across a thread block. Each thread uses a \p flat_id to store a range of
/// \p ItemsPerThread \p items to the thread block: with warp_id = \p flat_id / \p WarpSize
/// and lane = detail::logical_lane_id<WarpSize>(), the i-th item is written to position
/// (warp_id * \p WarpSize * \p ItemsPerThread + lane + i * \p WarpSize) of \p block_output,
/// but only when that position is less than \p valid.
///
/// * The number of threads in the block must be a multiple of \p WarpSize.
/// * The default \p WarpSize is a hardware warpsize and is an optimal value.
/// * \p WarpSize must be a power of two and equal or less than the size of
/// hardware warp.
/// * Using \p WarpSize smaller than hardware warpsize could result in lower
/// performance.
///
/// \tparam WarpSize - [optional] the number of threads in a warp
/// \tparam OutputIterator - [inferred] an iterator type for output (can be a simple
/// pointer)
/// \tparam T - [inferred] the data type
/// \tparam ItemsPerThread - [inferred] the number of items to be processed by
/// each thread
///
/// \param flat_id - a local flat 1D thread id in a block (tile) for the calling thread
/// \param block_output - the output iterator from the thread block to store to
/// \param items - array of items that is stored to the thread block
/// \param valid - maximum range of valid numbers to store
template<
    unsigned int WarpSize = device_warp_size(),
    class OutputIterator,
    class T,
    unsigned int ItemsPerThread
>
ROCPRIM_DEVICE ROCPRIM_INLINE
void block_store_direct_warp_striped(unsigned int flat_id,
                                     OutputIterator block_output,
                                     T (&items)[ItemsPerThread],
                                     unsigned int valid)
{
    static_assert(std::is_assignable<decltype(block_output[0]), T>::value,
                  "The type T must be such that an object of type OutputIterator "
                  "can be dereferenced and assigned a value of type T.");
    // Adjacent literals are concatenated, so the first one must end with a space;
    // otherwise the diagnostic reads "...equal or lessthan the size...".
    static_assert(detail::is_power_of_two(WarpSize) && WarpSize <= device_warp_size(),
                  "WarpSize must be a power of two and equal or less "
                  "than the size of hardware warp.");

    const unsigned int thread_id   = detail::logical_lane_id<WarpSize>();
    const unsigned int warp_id     = flat_id / WarpSize;
    // First output position of the segment owned by this (logical) warp.
    const unsigned int warp_offset = warp_id * WarpSize * ItemsPerThread;

    OutputIterator thread_iter = block_output + thread_id + warp_offset;
    ROCPRIM_UNROLL
    for(unsigned int item = 0; item < ItemsPerThread; item++)
    {
        // Successive items of one thread are WarpSize positions apart.
        const unsigned int offset = item * WarpSize;
        // Only positions inside the valid range are written.
        if(warp_offset + thread_id + offset < valid)
        {
            thread_iter[offset] = items[item];
        }
    }
}
END_ROCPRIM_NAMESPACE
/// @}
// end of group blockmodule
#endif // ROCPRIM_BLOCK_BLOCK_STORE_FUNC_HPP_
// Copyright (c) 2017-2022 Advanced Micro Devices, Inc. All rights reserved.
//
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to deal
// in the Software without restriction, including without limitation the rights
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
// copies of the Software, and to permit persons to whom the Software is
// furnished to do so, subject to the following conditions:
//
// The above copyright notice and this permission notice shall be included in
// all copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
// THE SOFTWARE.
#ifndef ROCPRIM_BLOCK_DETAIL_BLOCK_ADJACENT_DIFFERENCE_IMPL_HPP_
#define ROCPRIM_BLOCK_DETAIL_BLOCK_ADJACENT_DIFFERENCE_IMPL_HPP_
#include "../../config.hpp"
#include "../../detail/various.hpp"
#include "../../intrinsics/thread.hpp"
#include <type_traits>
#include <cassert>
BEGIN_ROCPRIM_NAMESPACE
namespace detail
{
// Wrapper function that allows calling a BinaryFunction with either of these signatures:
// with b_index (a, b, b_index) or without it (a, b).
// Only in the case of discontinuity (when as_flags is true) is the operator allowed to take an
// index.
// block_discontinuity and block_adjacent_difference only differ in their implementations by the
// order in which the operator's parameters are passed, so this function deals with that as well.
// Indexed form, natural argument order: returns op(a, b, index).
// Selected by the tags as_flags = true and reversed = false, and only viable when
// op can be called with two T arguments and an index.
template <class T, class BinaryFunction>
ROCPRIM_DEVICE ROCPRIM_INLINE auto apply(BinaryFunction op,
                                         const T& a,
                                         const T& b,
                                         unsigned int index,
                                         bool_constant<true> /*as_flags*/,
                                         bool_constant<false> /*reversed*/)
    -> decltype(op(a, b, index))
{
    return op(a, b, index);
}
// Indexed form, swapped argument order: returns op(b, a, index).
// Selected by the tags as_flags = true and reversed = true, and only viable when
// op can be called with two T arguments and an index.
template <class T, class BinaryFunction>
ROCPRIM_DEVICE ROCPRIM_INLINE auto apply(BinaryFunction op,
                                         const T& a,
                                         const T& b,
                                         unsigned int index,
                                         bool_constant<true> /*as_flags*/,
                                         bool_constant<true> /*reversed*/)
    -> decltype(op(b, a, index))
{
    return op(b, a, index);
}
// Index-free form, natural argument order: returns op(a, b) and ignores the index.
// Accepts either value of as_flags; selected by the tag reversed = false and only
// viable when op can be called with two T arguments.
template <typename T, typename BinaryFunction, bool AsFlags>
ROCPRIM_DEVICE ROCPRIM_INLINE auto apply(BinaryFunction op,
                                         const T& a,
                                         const T& b,
                                         unsigned int /*index*/,
                                         bool_constant<AsFlags> /*as_flags*/,
                                         bool_constant<false> /*reversed*/)
    -> decltype(op(a, b))
{
    return op(a, b);
}
// Index-free form, swapped argument order: returns op(b, a) and ignores the index.
// Accepts either value of as_flags; selected by the tag reversed = true and only
// viable when op can be called with two T arguments.
template <typename T, typename BinaryFunction, bool AsFlags>
ROCPRIM_DEVICE ROCPRIM_INLINE auto apply(BinaryFunction op,
                                         const T& a,
                                         const T& b,
                                         unsigned int /*index*/,
                                         bool_constant<AsFlags> /*as_flags*/,
                                         bool_constant<true> /*reversed*/)
    -> decltype(op(b, a))
{
    return op(b, a);
}
// Shared implementation of the "combine every item with its neighbour" step.
//
// Item i of the calling thread is given the index (flat_id * ItemsPerThread + i).
// Neighbours inside one thread are read directly from `input`; the neighbour that
// belongs to the adjacent thread is exchanged through `storage_type`, which holds
// one item per thread of the block.
//
// Common template parameters of the member functions:
//   AsFlags  - forwarded to detail::apply as the as_flags tag; it also selects the
//              default item (1 when true, the input item itself when false).
//   Reversed - forwarded to detail::apply as the reversed tag (swaps op's operands).
//
// NOTE(review): every member function writes `storage`, issues one block barrier and
// then reads `storage`; no barrier follows the reads, so callers presumably have to
// synchronize before the same storage is reused - confirm against the callers.
template <typename T,
          unsigned int BlockSizeX,
          unsigned int BlockSizeY = 1,
          unsigned int BlockSizeZ = 1>
class block_adjacent_difference_impl
{
public:
    static constexpr unsigned int BlockSize = BlockSizeX * BlockSizeY * BlockSizeZ;

    // Exchange buffer: one boundary item per thread.
    struct storage_type
    {
        T items[BlockSize];
    };

    // Combines every item with its left neighbour.
    // output[i] (i > 0) is apply(op, input[i - 1], input[i]). output[0] uses the last
    // item of the previous thread; for thread 0 it uses tile_predecessor_item when
    // WithTilePredecessor is true, and is the default item otherwise.
    template <bool AsFlags,
              bool Reversed,
              bool WithTilePredecessor,
              unsigned int ItemsPerThread,
              typename Output,
              typename BinaryFunction>
    ROCPRIM_DEVICE void apply_left(const T (&input)[ItemsPerThread],
                                   Output (&output)[ItemsPerThread],
                                   BinaryFunction op,
                                   const T tile_predecessor_item,
                                   storage_type& storage)
    {
        static constexpr bool_constant<AsFlags>  flags_tag {};
        static constexpr bool_constant<Reversed> reversed_tag {};

        const unsigned int tid
            = ::rocprim::flat_block_thread_id<BlockSizeX, BlockSizeY, BlockSizeZ>();
        // Index of this thread's first item.
        const unsigned int first_index = tid * ItemsPerThread;

        // Publish this thread's last item; thread (tid + 1) reads it after the barrier.
        storage.items[tid] = input[ItemsPerThread - 1];

        // Walk from the back so that each input[i - 1] is read before output[i - 1]
        // is written.
        ROCPRIM_UNROLL
        for(unsigned int i = ItemsPerThread - 1; i > 0; --i)
        {
            output[i] = detail::apply(
                op, input[i - 1], input[i], first_index + i, flags_tag, reversed_tag);
        }

        ::rocprim::syncthreads();

        if ROCPRIM_IF_CONSTEXPR (WithTilePredecessor)
        {
            // Thread 0 has no previous thread and takes the tile predecessor instead.
            T predecessor_item = tile_predecessor_item;
            if(tid != 0)
            {
                predecessor_item = storage.items[tid - 1];
            }

            output[0] = detail::apply(
                op, predecessor_item, input[0], first_index, flags_tag, reversed_tag);
        }
        else
        {
            // Thread 0 keeps the default item.
            output[0] = get_default_item(input, 0, flags_tag);
            if(tid != 0)
            {
                output[0] = detail::apply(op,
                                          storage.items[tid - 1],
                                          input[0],
                                          first_index,
                                          flags_tag,
                                          reversed_tag);
            }
        }
    }

    // Same as apply_left, but an item is combined with its left neighbour only when
    // its index is less than valid_items; all other outputs are the default item.
    template <bool AsFlags,
              bool Reversed,
              bool WithTilePredecessor,
              unsigned int ItemsPerThread,
              typename Output,
              typename BinaryFunction>
    ROCPRIM_DEVICE void apply_left_partial(const T (&input)[ItemsPerThread],
                                           Output (&output)[ItemsPerThread],
                                           BinaryFunction op,
                                           const T tile_predecessor_item,
                                           const unsigned int valid_items,
                                           storage_type& storage)
    {
        static constexpr bool_constant<AsFlags>  flags_tag {};
        static constexpr bool_constant<Reversed> reversed_tag {};

        assert(valid_items <= BlockSize * ItemsPerThread);

        const unsigned int tid
            = ::rocprim::flat_block_thread_id<BlockSizeX, BlockSizeY, BlockSizeZ>();
        // Index of this thread's first item.
        const unsigned int first_index = tid * ItemsPerThread;

        // Publish this thread's last item; thread (tid + 1) reads it after the barrier.
        storage.items[tid] = input[ItemsPerThread - 1];

        ROCPRIM_UNROLL
        for(unsigned int i = ItemsPerThread - 1; i > 0; --i)
        {
            const unsigned int item_index = first_index + i;

            output[i] = get_default_item(input, i, flags_tag);
            if(item_index < valid_items)
            {
                output[i] = detail::apply(
                    op, input[i - 1], input[i], item_index, flags_tag, reversed_tag);
            }
        }

        ::rocprim::syncthreads();

        if ROCPRIM_IF_CONSTEXPR (WithTilePredecessor)
        {
            // Thread 0 has no previous thread and takes the tile predecessor instead.
            T predecessor_item = tile_predecessor_item;
            if(tid != 0)
            {
                predecessor_item = storage.items[tid - 1];
            }

            output[0] = get_default_item(input, 0, flags_tag);
            if(first_index < valid_items)
            {
                output[0] = detail::apply(
                    op, predecessor_item, input[0], first_index, flags_tag, reversed_tag);
            }
        }
        else
        {
            // Thread 0 keeps the default item.
            output[0] = get_default_item(input, 0, flags_tag);
            if(tid != 0 && first_index < valid_items)
            {
                output[0] = detail::apply(op,
                                          storage.items[tid - 1],
                                          input[0],
                                          first_index,
                                          flags_tag,
                                          reversed_tag);
            }
        }
    }

    // Combines every item with its right neighbour.
    // output[i] (i < ItemsPerThread - 1) is apply(op, input[i], input[i + 1]); the
    // index handed to op is the index of the right-hand item. The last output uses
    // the first item of the next thread; for the last thread of the block it uses
    // tile_successor_item when WithTileSuccessor is true, and is the default item
    // otherwise.
    template <bool AsFlags,
              bool Reversed,
              bool WithTileSuccessor,
              unsigned int ItemsPerThread,
              typename Output,
              typename BinaryFunction>
    ROCPRIM_DEVICE void apply_right(const T (&input)[ItemsPerThread],
                                    Output (&output)[ItemsPerThread],
                                    BinaryFunction op,
                                    const T tile_successor_item,
                                    storage_type& storage)
    {
        static constexpr bool_constant<AsFlags>  flags_tag {};
        static constexpr bool_constant<Reversed> reversed_tag {};

        const unsigned int tid
            = ::rocprim::flat_block_thread_id<BlockSizeX, BlockSizeY, BlockSizeZ>();
        // Index of this thread's first item.
        const unsigned int first_index = tid * ItemsPerThread;

        // Publish this thread's first item; thread (tid - 1) reads it after the barrier.
        storage.items[tid] = input[0];

        // Walk from the front so that each input[i + 1] is read before output[i + 1]
        // is written.
        ROCPRIM_UNROLL
        for(unsigned int i = 0; i < ItemsPerThread - 1; ++i)
        {
            output[i] = detail::apply(
                op, input[i], input[i + 1], first_index + i + 1, flags_tag, reversed_tag);
        }

        ::rocprim::syncthreads();

        if ROCPRIM_IF_CONSTEXPR (WithTileSuccessor)
        {
            // The last thread has no next thread and takes the tile successor instead.
            T successor_item = tile_successor_item;
            if(tid != BlockSize - 1)
            {
                successor_item = storage.items[tid + 1];
            }

            output[ItemsPerThread - 1] = detail::apply(op,
                                                       input[ItemsPerThread - 1],
                                                       successor_item,
                                                       first_index + ItemsPerThread,
                                                       flags_tag,
                                                       reversed_tag);
        }
        else
        {
            // The last thread keeps the default item.
            output[ItemsPerThread - 1]
                = get_default_item(input, ItemsPerThread - 1, flags_tag);
            if(tid != BlockSize - 1)
            {
                output[ItemsPerThread - 1] = detail::apply(op,
                                                           input[ItemsPerThread - 1],
                                                           storage.items[tid + 1],
                                                           first_index + ItemsPerThread,
                                                           flags_tag,
                                                           reversed_tag);
            }
        }
    }

    // Same as apply_right without a tile successor, but an item is combined with its
    // right neighbour only when the neighbour's index is less than valid_items; all
    // other outputs are the default item.
    template <bool AsFlags,
              bool Reversed,
              unsigned int ItemsPerThread,
              typename Output,
              typename BinaryFunction>
    ROCPRIM_DEVICE void apply_right_partial(const T (&input)[ItemsPerThread],
                                            Output (&output)[ItemsPerThread],
                                            BinaryFunction op,
                                            const unsigned int valid_items,
                                            storage_type& storage)
    {
        static constexpr bool_constant<AsFlags>  flags_tag {};
        static constexpr bool_constant<Reversed> reversed_tag {};

        assert(valid_items <= BlockSize * ItemsPerThread);

        const unsigned int tid
            = ::rocprim::flat_block_thread_id<BlockSizeX, BlockSizeY, BlockSizeZ>();
        // Index of this thread's first item.
        const unsigned int first_index = tid * ItemsPerThread;

        // Publish this thread's first item; thread (tid - 1) reads it after the barrier.
        storage.items[tid] = input[0];

        ROCPRIM_UNROLL
        for(unsigned int i = 0; i < ItemsPerThread - 1; ++i)
        {
            // Index of the right-hand neighbour of item i.
            const unsigned int neighbour_index = first_index + i + 1;

            output[i] = get_default_item(input, i, flags_tag);
            if(neighbour_index < valid_items)
            {
                output[i] = detail::apply(
                    op, input[i], input[i + 1], neighbour_index, flags_tag, reversed_tag);
            }
        }

        ::rocprim::syncthreads();

        output[ItemsPerThread - 1] = get_default_item(input, ItemsPerThread - 1, flags_tag);

        // Index of the next thread's first item. Given the assertion above it is never
        // less than valid_items for the last thread, so storage.items[tid + 1] is read
        // only while (tid + 1) is a valid thread id.
        const unsigned int next_thread_index = first_index + ItemsPerThread;
        if(next_thread_index < valid_items)
        {
            output[ItemsPerThread - 1] = detail::apply(op,
                                                       input[ItemsPerThread - 1],
                                                       storage.items[tid + 1],
                                                       next_thread_index,
                                                       flags_tag,
                                                       reversed_tag);
        }
    }

private:
    // Default item in flag mode: always 1.
    template <unsigned int ItemsPerThread>
    ROCPRIM_DEVICE int get_default_item(const T (&)[ItemsPerThread],
                                        unsigned int /*index*/,
                                        bool_constant<true> /*as_flags*/)
    {
        return 1;
    }

    // Default item in difference mode: the input item at `index`, unchanged.
    template <unsigned int ItemsPerThread>
    ROCPRIM_DEVICE T get_default_item(const T (&input)[ItemsPerThread],
                                      const unsigned int index,
                                      bool_constant<false> /*as_flags*/)
    {
        return input[index];
    }
};
} // namespace detail
END_ROCPRIM_NAMESPACE
#endif // ROCPRIM_BLOCK_DETAIL_BLOCK_ADJACENT_DIFFERENCE_IMPL_HPP_
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment