Commit f8a481f8 authored by zhouxiang's avatar zhouxiang
Browse files

添加dtk中的cub头文件

parent 7b7c64c5
// Copyright (c) 2017-2021 Advanced Micro Devices, Inc. All rights reserved.
//
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to deal
// in the Software without restriction, including without limitation the rights
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
// copies of the Software, and to permit persons to whom the Software is
// furnished to do so, subject to the following conditions:
//
// The above copyright notice and this permission notice shall be included in
// all copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
// THE SOFTWARE.
#ifndef ROCPRIM_DEVICE_DEVICE_SEGMENTED_REDUCE_HPP_
#define ROCPRIM_DEVICE_DEVICE_SEGMENTED_REDUCE_HPP_
#include <type_traits>
#include <iterator>
#include <chrono>
#include "device_reduce_config.hpp"
#include "../config.hpp"
#include "../functional.hpp"
#include "../detail/various.hpp"
#include "../detail/match_result_type.hpp"
#include "detail/device_segmented_reduce.hpp"
BEGIN_ROCPRIM_NAMESPACE
/// \addtogroup devicemodule
/// @{
namespace detail
{
// Kernel entry point of the segmented reduction.
//
// The kernel body contains no logic of its own: every argument is handed,
// unchanged and in the same order, to the device function
// segmented_reduce<Config> (declared in detail/device_segmented_reduce.hpp).
// The launch bound is the library-wide ROCPRIM_DEFAULT_MAX_BLOCK_SIZE.
template<
    typename Config,
    typename InputIterator,
    typename OutputIterator,
    typename OffsetIterator,
    typename ResultType,
    typename BinaryFunction
>
ROCPRIM_KERNEL
__launch_bounds__(ROCPRIM_DEFAULT_MAX_BLOCK_SIZE)
void segmented_reduce_kernel(InputIterator  input,
                             OutputIterator output,
                             OffsetIterator begin_offsets,
                             OffsetIterator end_offsets,
                             BinaryFunction reduce_op,
                             ResultType     initial_value)
{
    segmented_reduce<Config>(input, output, begin_offsets, end_offsets, reduce_op, initial_value);
}
// Post-launch error check used by segmented_reduce_impl.
//
// The macro relies on two names being visible at the expansion site:
// `debug_synchronous` (bool) and `stream` (cudaStream_t). On any failure it
// returns the cudaError_t from the *enclosing function*.
//
//   name  - label streamed to std::cout in debug mode
//   size  - value streamed in parentheses right after the label
//   start - std::chrono::high_resolution_clock time point taken before the
//           launch; the elapsed time up to the end of the stream
//           synchronization is printed in milliseconds
//
// Steps: (1) fetch the last runtime error and return it if it is not
// cudaSuccess; (2) only when debug_synchronous is true, print the label,
// synchronize `stream`, return on a synchronization error, otherwise print
// the elapsed time.
//
// The expansion is wrapped in do { ... } while(0) so it behaves as a single
// statement (safe inside an unbraced if/else), the arguments are
// parenthesized, and the locals avoid double-underscore names, which are
// reserved identifiers in C++.
#define ROCPRIM_DETAIL_HIP_SYNC_AND_RETURN_ON_ERROR(name, size, start) \
    do \
    { \
        const auto rocprim_launch_error_ = cudaGetLastError(); \
        if(rocprim_launch_error_ != cudaSuccess) return rocprim_launch_error_; \
        if(debug_synchronous) \
        { \
            std::cout << (name) << "(" << (size) << ")"; \
            const auto rocprim_sync_error_ = cudaStreamSynchronize(stream); \
            if(rocprim_sync_error_ != cudaSuccess) return rocprim_sync_error_; \
            const auto rocprim_end_ = std::chrono::high_resolution_clock::now(); \
            const auto rocprim_elapsed_ = \
                std::chrono::duration_cast<std::chrono::duration<double>>(rocprim_end_ - (start)); \
            std::cout << " " << rocprim_elapsed_.count() * 1000 << " ms" << '\n'; \
        } \
    } while(0)
// Host-side launcher of the segmented reduction.
//
// Behaviour, in order:
//  * temporary_storage == nullptr: writes 4 to storage_size and returns
//    cudaSuccess without launching anything (size-query call).
//  * segments == 0: returns cudaSuccess without launching anything.
//  * otherwise: launches segmented_reduce_kernel on `stream` with a grid of
//    `segments` blocks and config::block_size threads per block, then runs the
//    post-launch check macro, which may return a cudaError_t from here.
//
// temporary_storage is only compared against nullptr in this function.
// initial_value is converted to the reduction result type before it is
// passed to the kernel.
template<
    typename Config,
    typename InputIterator,
    typename OutputIterator,
    typename OffsetIterator,
    typename InitValueType,
    typename BinaryFunction
>
inline
cudaError_t segmented_reduce_impl(void *         temporary_storage,
                                  size_t&        storage_size,
                                  InputIterator  input,
                                  OutputIterator output,
                                  unsigned int   segments,
                                  OffsetIterator begin_offsets,
                                  OffsetIterator end_offsets,
                                  BinaryFunction reduce_op,
                                  InitValueType  initial_value,
                                  cudaStream_t   stream,
                                  bool           debug_synchronous)
{
    // Element type of the input range and the type the reduction is carried out in.
    using element_type = typename std::iterator_traits<InputIterator>::value_type;
    using result_type
        = typename ::rocprim::detail::match_result_type<element_type, BinaryFunction>::type;

    // A default_config request is resolved to the defaults for the target
    // architecture and result type; a custom Config is used as given.
    using selected_config
        = default_or_custom_config<Config,
                                   default_reduce_config<ROCPRIM_TARGET_ARCH, result_type>>;
    constexpr unsigned int threads_per_block = selected_config::block_size;

    // Size query. A non-zero size is reported so that callers never attempt a
    // zero-byte allocation (cudaMalloc yields nullptr for size zero).
    if(temporary_storage == nullptr)
    {
        storage_size = 4;
        return cudaSuccess;
    }

    // Nothing to reduce.
    if(segments == 0u)
    {
        return cudaSuccess;
    }

    // The start time is only sampled when debug output is requested.
    std::chrono::high_resolution_clock::time_point launch_begin;
    if(debug_synchronous)
    {
        launch_begin = std::chrono::high_resolution_clock::now();
    }

    const dim3 grid(segments);
    const dim3 block(threads_per_block);
    segmented_reduce_kernel<selected_config><<<grid, block, 0, stream>>>(
        input,
        output,
        begin_offsets,
        end_offsets,
        reduce_op,
        static_cast<result_type>(initial_value));
    ROCPRIM_DETAIL_HIP_SYNC_AND_RETURN_ON_ERROR("segmented_reduce", segments, launch_begin);

    return cudaSuccess;
}
#undef ROCPRIM_DETAIL_HIP_SYNC_AND_RETURN_ON_ERROR
} // end of detail namespace
/// \brief Parallel segmented reduction primitive for device level.
///
/// segmented_reduce function performs a device-wide reduction operation across multiple sequences
/// using binary \p reduce_op operator.
///
/// \par Overview
/// * Returns the required size of \p temporary_storage in \p storage_size
/// if \p temporary_storage is a null pointer.
/// * Ranges specified by \p input must have at least \p size elements, \p output must have
/// \p segments elements.
/// * Ranges specified by \p begin_offsets and \p end_offsets must have
/// at least \p segments elements. They may use the same sequence <tt>offsets</tt> of at least
/// <tt>segments + 1</tt> elements: <tt>offsets</tt> for \p begin_offsets and
/// <tt>offsets + 1</tt> for \p end_offsets.
///
/// \tparam Config - [optional] configuration of the primitive. It can be \p reduce_config or
/// a custom class with the same members.
/// \tparam InputIterator - random-access iterator type of the input range. Must meet the
/// requirements of a C++ InputIterator concept. It can be a simple pointer type.
/// \tparam OutputIterator - random-access iterator type of the output range. Must meet the
/// requirements of a C++ OutputIterator concept. It can be a simple pointer type.
/// \tparam OffsetIterator - random-access iterator type of segment offsets. Must meet the
/// requirements of a C++ RandomAccessIterator concept. It can be a simple pointer type.
/// \tparam BinaryFunction - type of binary function used for reduction. Default type
/// is \p rocprim::plus<T>, where \p T is a \p value_type of \p InputIterator.
/// \tparam InitValueType - type of the initial value.
///
/// \param [in] temporary_storage - pointer to a device-accessible temporary storage. When
/// a null pointer is passed, the required allocation size (in bytes) is written to
/// \p storage_size and function returns without performing the reduction operation.
/// \param [in,out] storage_size - reference to a size (in bytes) of \p temporary_storage.
/// \param [in] input - iterator to the first element in the range to reduce.
/// \param [out] output - iterator to the first element in the output range.
/// \param [in] segments - number of segments in the input range.
/// \param [in] begin_offsets - iterator to the first element in the range of beginning offsets.
/// \param [in] end_offsets - iterator to the first element in the range of ending offsets.
/// \param [in] initial_value - initial value to start the reduction.
/// \param [in] reduce_op - binary operation function object that will be used for reduction.
/// The signature of the function should be equivalent to the following:
/// <tt>T f(const T &a, const T &b);</tt>. The signature does not need to have
/// <tt>const &</tt>, but function object must not modify the objects passed to it.
/// The default value is \p BinaryFunction().
/// \param [in] stream - [optional] HIP stream object. The default is \p 0 (default stream).
/// \param [in] debug_synchronous - [optional] If true, synchronization after every kernel
/// launch is forced in order to check for errors. The default value is \p false.
///
/// \returns \p cudaSuccess (\p 0) after successful reduction; otherwise a HIP runtime error of
/// type \p cudaError_t.
///
/// \par Example
/// \parblock
/// In this example a device-level segmented min-reduction operation is performed on an array of
/// integer values (<tt>short</tt>s are reduced into <tt>int</tt>s) using custom operator.
///
/// \code{.cpp}
/// #include <rocprim/rocprim.hpp>
///
/// // custom reduce function
/// auto min_op =
/// [] __device__ (int a, int b) -> int
/// {
/// return a < b ? a : b;
/// };
///
/// // Prepare input and output (declare pointers, allocate device memory etc.)
/// unsigned int segments; // e.g., 3
/// short * input; // e.g., [4, 7, 6, 2, 5, 1, 3, 8]
/// int * output; // empty array of 3 elements
/// int * offsets; // e.g. [0, 2, 3, 8]
/// int init_value; // e.g., 9
///
/// size_t temporary_storage_size_bytes;
/// void * temporary_storage_ptr = nullptr;
/// // Get required size of the temporary storage
/// rocprim::segmented_reduce(
/// temporary_storage_ptr, temporary_storage_size_bytes,
/// input, output,
/// segments, offsets, offsets + 1,
/// min_op, init_value
/// );
///
/// // allocate temporary storage
/// cudaMalloc(&temporary_storage_ptr, temporary_storage_size_bytes);
///
/// // perform segmented reduction
/// rocprim::segmented_reduce(
/// temporary_storage_ptr, temporary_storage_size_bytes,
/// input, output,
/// segments, offsets, offsets + 1,
/// min_op, init_value
/// );
/// // output: [4, 6, 1]
/// \endcode
/// \endparblock
template<
    typename Config = default_config,
    typename InputIterator,
    typename OutputIterator,
    typename OffsetIterator,
    typename BinaryFunction
        = ::rocprim::plus<typename std::iterator_traits<InputIterator>::value_type>,
    typename InitValueType = typename std::iterator_traits<InputIterator>::value_type
>
inline
cudaError_t segmented_reduce(void *         temporary_storage,
                             size_t&        storage_size,
                             InputIterator  input,
                             OutputIterator output,
                             unsigned int   segments,
                             OffsetIterator begin_offsets,
                             OffsetIterator end_offsets,
                             BinaryFunction reduce_op         = BinaryFunction(),
                             InitValueType  initial_value     = InitValueType(),
                             cudaStream_t   stream            = 0,
                             bool           debug_synchronous = false)
{
    // Public entry point: all arguments are forwarded unchanged to the
    // implementation in the detail namespace, which handles the size query,
    // the empty-input case and the kernel launch.
    return detail::segmented_reduce_impl<Config>(temporary_storage,
                                                 storage_size,
                                                 input,
                                                 output,
                                                 segments,
                                                 begin_offsets,
                                                 end_offsets,
                                                 reduce_op,
                                                 initial_value,
                                                 stream,
                                                 debug_synchronous);
}
/// @}
// end of group devicemodule
END_ROCPRIM_NAMESPACE
#endif // ROCPRIM_DEVICE_DEVICE_SEGMENTED_REDUCE_HPP_
// Copyright (c) 2017-2021 Advanced Micro Devices, Inc. All rights reserved.
//
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to deal
// in the Software without restriction, including without limitation the rights
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
// copies of the Software, and to permit persons to whom the Software is
// furnished to do so, subject to the following conditions:
//
// The above copyright notice and this permission notice shall be included in
// all copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
// THE SOFTWARE.
#ifndef ROCPRIM_DEVICE_DEVICE_SEGMENTED_SCAN_HPP_
#define ROCPRIM_DEVICE_DEVICE_SEGMENTED_SCAN_HPP_
#include <type_traits>
#include <iterator>
#include "../config.hpp"
#include "../detail/various.hpp"
#include "../detail/match_result_type.hpp"
#include "../iterator/zip_iterator.hpp"
#include "../iterator/discard_iterator.hpp"
#include "../iterator/transform_iterator.hpp"
#include "../iterator/counting_iterator.hpp"
#include "../types/tuple.hpp"
#include "device_scan_config.hpp"
#include "device_scan.hpp"
#include "detail/device_segmented_scan.hpp"
BEGIN_ROCPRIM_NAMESPACE
/// \addtogroup devicemodule
/// @{
namespace detail
{
// Kernel entry point of the segmented scan.
//
// Forwards its arguments to the device function
// segmented_scan<Exclusive, Config, ResultType> (declared in
// detail/device_segmented_scan.hpp). The only work done here is converting
// initial_value to ResultType before it is passed on. The launch bound is the
// library-wide ROCPRIM_DEFAULT_MAX_BLOCK_SIZE.
template<
    bool Exclusive,
    typename Config,
    typename ResultType,
    typename InputIterator,
    typename OutputIterator,
    typename OffsetIterator,
    typename InitValueType,
    typename BinaryFunction
>
ROCPRIM_KERNEL
__launch_bounds__(ROCPRIM_DEFAULT_MAX_BLOCK_SIZE)
void segmented_scan_kernel(InputIterator  input,
                           OutputIterator output,
                           OffsetIterator begin_offsets,
                           OffsetIterator end_offsets,
                           InitValueType  initial_value,
                           BinaryFunction scan_op)
{
    segmented_scan<Exclusive, Config, ResultType>(input,
                                                  output,
                                                  begin_offsets,
                                                  end_offsets,
                                                  static_cast<ResultType>(initial_value),
                                                  scan_op);
}
// Post-launch error check used by segmented_scan_impl.
//
// The macro relies on two names being visible at the expansion site:
// `debug_synchronous` (bool) and `stream` (cudaStream_t). On any failure it
// returns the cudaError_t from the *enclosing function*.
//
//   name  - label streamed to std::cout in debug mode
//   size  - value streamed in parentheses right after the label
//   start - std::chrono::high_resolution_clock time point taken before the
//           launch; the elapsed time up to the end of the stream
//           synchronization is printed in milliseconds
//
// Steps: (1) fetch the last runtime error and return it if it is not
// cudaSuccess; (2) only when debug_synchronous is true, print the label,
// synchronize `stream`, return on a synchronization error, otherwise print
// the elapsed time.
//
// The expansion is wrapped in do { ... } while(0) so it behaves as a single
// statement (safe inside an unbraced if/else), the arguments are
// parenthesized, and the locals avoid double-underscore names, which are
// reserved identifiers in C++.
#define ROCPRIM_DETAIL_HIP_SYNC_AND_RETURN_ON_ERROR(name, size, start) \
    do \
    { \
        const auto rocprim_launch_error_ = cudaGetLastError(); \
        if(rocprim_launch_error_ != cudaSuccess) return rocprim_launch_error_; \
        if(debug_synchronous) \
        { \
            std::cout << (name) << "(" << (size) << ")"; \
            const auto rocprim_sync_error_ = cudaStreamSynchronize(stream); \
            if(rocprim_sync_error_ != cudaSuccess) return rocprim_sync_error_; \
            const auto rocprim_end_ = std::chrono::high_resolution_clock::now(); \
            const auto rocprim_elapsed_ = \
                std::chrono::duration_cast<std::chrono::duration<double>>(rocprim_end_ - (start)); \
            std::cout << " " << rocprim_elapsed_.count() * 1000 << " ms" << '\n'; \
        } \
    } while(0)
// Host-side launcher shared by the offset-based segmented inclusive and
// exclusive scans (selected by the Exclusive template flag).
//
// Behaviour, in order:
//  * temporary_storage == nullptr: writes 4 to storage_size and returns
//    cudaSuccess without launching anything (size-query call).
//  * segments == 0: returns cudaSuccess without launching anything.
//  * otherwise: launches segmented_scan_kernel on `stream` with a grid of
//    `segments` blocks and config::block_size threads per block, then runs the
//    post-launch check macro, which may return a cudaError_t from here.
//
// The scan result type is InitValueType for the exclusive variant and the
// input iterator's value_type for the inclusive variant. temporary_storage is
// only compared against nullptr in this function.
template<
    bool Exclusive,
    typename Config,
    typename InputIterator,
    typename OutputIterator,
    typename OffsetIterator,
    typename InitValueType,
    typename BinaryFunction
>
inline
cudaError_t segmented_scan_impl(void *              temporary_storage,
                                size_t&             storage_size,
                                InputIterator       input,
                                OutputIterator      output,
                                unsigned int        segments,
                                OffsetIterator      begin_offsets,
                                OffsetIterator      end_offsets,
                                const InitValueType initial_value,
                                BinaryFunction      scan_op,
                                cudaStream_t        stream,
                                bool                debug_synchronous)
{
    using element_type = typename std::iterator_traits<InputIterator>::value_type;
    using result_type =
        typename std::conditional<Exclusive, InitValueType, element_type>::type;

    // A default_config request is resolved to the defaults for the target
    // architecture and result type; a custom Config is used as given.
    using selected_config
        = default_or_custom_config<Config,
                                   default_scan_config<ROCPRIM_TARGET_ARCH, result_type>>;
    constexpr unsigned int threads_per_block = selected_config::block_size;

    // Size query. A non-zero size is reported so that callers never attempt a
    // zero-byte allocation (cudaMalloc yields nullptr for size zero).
    if(temporary_storage == nullptr)
    {
        storage_size = 4;
        return cudaSuccess;
    }

    // Nothing to scan.
    if(segments == 0u)
    {
        return cudaSuccess;
    }

    // The start time is only sampled when debug output is requested.
    std::chrono::high_resolution_clock::time_point launch_begin;
    if(debug_synchronous)
    {
        launch_begin = std::chrono::high_resolution_clock::now();
    }

    const dim3 grid(segments);
    const dim3 block(threads_per_block);
    segmented_scan_kernel<Exclusive, selected_config, result_type><<<grid, block, 0, stream>>>(
        input,
        output,
        begin_offsets,
        end_offsets,
        initial_value,
        scan_op);
    ROCPRIM_DETAIL_HIP_SYNC_AND_RETURN_ON_ERROR("segmented_scan", segments, launch_begin);

    return cudaSuccess;
}
#undef ROCPRIM_DETAIL_HIP_SYNC_AND_RETURN_ON_ERROR
} // end of detail namespace
/// \brief Parallel segmented inclusive scan primitive for device level.
///
/// segmented_inclusive_scan function performs a device-wide inclusive scan operation
/// across multiple sequences from \p input using binary \p scan_op operator.
///
/// \par Overview
/// * Returns the required size of \p temporary_storage in \p storage_size
/// if \p temporary_storage is a null pointer.
/// * Ranges specified by \p input and \p output must have at least \p size elements.
/// * Ranges specified by \p begin_offsets and \p end_offsets must have
/// at least \p segments elements. They may use the same sequence <tt>offsets</tt> of at least
/// <tt>segments + 1</tt> elements: <tt>offsets</tt> for \p begin_offsets and
/// <tt>offsets + 1</tt> for \p end_offsets.
///
/// \tparam Config - [optional] configuration of the primitive. It can be \p scan_config or
/// a custom class with the same members.
/// \tparam InputIterator - random-access iterator type of the input range. Must meet the
/// requirements of a C++ RandomAccessIterator concept. It can be a simple pointer type.
/// \tparam OutputIterator - random-access iterator type of the output range. Must meet the
/// requirements of a C++ RandomAccessIterator concept. It can be a simple pointer type.
/// \tparam OffsetIterator - random-access iterator type of segment offsets. Must meet the
/// requirements of a C++ RandomAccessIterator concept. It can be a simple pointer type.
/// \tparam BinaryFunction - type of binary function used for scan operation. Default type
/// is \p rocprim::plus<T>, where \p T is a \p value_type of \p InputIterator.
///
/// \param [in] temporary_storage - pointer to a device-accessible temporary storage. When
/// a null pointer is passed, the required allocation size (in bytes) is written to
/// \p storage_size and function returns without performing the scan operation.
/// \param [in,out] storage_size - reference to a size (in bytes) of \p temporary_storage.
/// \param [in] input - iterator to the first element in the range to scan.
/// \param [out] output - iterator to the first element in the output range.
/// \param [in] segments - number of segments in the input range.
/// \param [in] begin_offsets - iterator to the first element in the range of beginning offsets.
/// \param [in] end_offsets - iterator to the first element in the range of ending offsets.
/// \param [in] scan_op - binary operation function object that will be used for scan.
/// The signature of the function should be equivalent to the following:
/// <tt>T f(const T &a, const T &b);</tt>. The signature does not need to have
/// <tt>const &</tt>, but function object must not modify the objects passed to it.
/// The default value is \p BinaryFunction().
/// \param [in] stream - [optional] HIP stream object. The default is \p 0 (default stream).
/// \param [in] debug_synchronous - [optional] If true, synchronization after every kernel
/// launch is forced in order to check for errors. The default value is \p false.
///
/// \returns \p cudaSuccess (\p 0) after successful scan; otherwise a HIP runtime error of
/// type \p cudaError_t.
///
/// \par Example
/// \parblock
/// In this example a device-level segmented inclusive min-scan operation is performed on
/// an array of integer values (<tt>short</tt>s are scanned into <tt>int</tt>s) using custom operator.
///
/// \code{.cpp}
/// #include <rocprim/rocprim.hpp>
///
/// // custom scan function
/// auto min_op =
/// [] __device__ (int a, int b) -> int
/// {
/// return a < b ? a : b;
/// };
///
/// // Prepare input and output (declare pointers, allocate device memory etc.)
/// short * input; // e.g., [4, 7, 6, 2, 5, 1, 3, 8]
/// int * output; // empty array of 8 elements
/// size_t segments; // e.g., 3
/// int * offsets; // e.g. [0, 2, 4, 8]
///
/// size_t temporary_storage_size_bytes;
/// void * temporary_storage_ptr = nullptr;
/// // Get required size of the temporary storage
/// rocprim::segmented_inclusive_scan(
/// temporary_storage_ptr, temporary_storage_size_bytes,
/// input, output, segments, offsets, offsets + 1, min_op
/// );
///
/// // allocate temporary storage
/// cudaMalloc(&temporary_storage_ptr, temporary_storage_size_bytes);
///
/// // perform scan
/// rocprim::segmented_inclusive_scan(
/// temporary_storage_ptr, temporary_storage_size_bytes,
/// input, output, segments, offsets, offsets + 1, min_op
/// );
/// // output: [4, 4, 6, 2, 5, 1, 1, 1]
/// \endcode
/// \endparblock
template<
    typename Config = default_config,
    typename InputIterator,
    typename OutputIterator,
    typename OffsetIterator,
    typename BinaryFunction
        = ::rocprim::plus<typename std::iterator_traits<InputIterator>::value_type>
>
inline
cudaError_t segmented_inclusive_scan(void *         temporary_storage,
                                     size_t&        storage_size,
                                     InputIterator  input,
                                     OutputIterator output,
                                     unsigned int   segments,
                                     OffsetIterator begin_offsets,
                                     OffsetIterator end_offsets,
                                     BinaryFunction scan_op           = BinaryFunction(),
                                     cudaStream_t   stream            = 0,
                                     bool           debug_synchronous = false)
{
    // The shared implementation always takes an initial value. The inclusive
    // variant has none from the caller, so a value-initialized object of the
    // input's value_type is supplied.
    // NOTE(review): presumably ignored on the inclusive path - confirm in
    // detail/device_segmented_scan.hpp.
    using result_type = typename std::iterator_traits<InputIterator>::value_type;

    // Exclusive = false selects the inclusive scan.
    return detail::segmented_scan_impl<false, Config>(temporary_storage,
                                                      storage_size,
                                                      input,
                                                      output,
                                                      segments,
                                                      begin_offsets,
                                                      end_offsets,
                                                      result_type(),
                                                      scan_op,
                                                      stream,
                                                      debug_synchronous);
}
/// \brief Parallel segmented exclusive scan primitive for device level.
///
/// segmented_exclusive_scan function performs a device-wide exclusive scan operation
/// across multiple sequences from \p input using binary \p scan_op operator.
///
/// \par Overview
/// * Returns the required size of \p temporary_storage in \p storage_size
/// if \p temporary_storage is a null pointer.
/// * Ranges specified by \p input and \p output must have at least \p size elements.
/// * Ranges specified by \p begin_offsets and \p end_offsets must have
/// at least \p segments elements. They may use the same sequence <tt>offsets</tt> of at least
/// <tt>segments + 1</tt> elements: <tt>offsets</tt> for \p begin_offsets and
/// <tt>offsets + 1</tt> for \p end_offsets.
///
/// \tparam Config - [optional] configuration of the primitive. It can be \p scan_config or
/// a custom class with the same members.
/// \tparam InputIterator - random-access iterator type of the input range. Must meet the
/// requirements of a C++ RandomAccessIterator concept. It can be a simple pointer type.
/// \tparam OutputIterator - random-access iterator type of the output range. Must meet the
/// requirements of a C++ RandomAccessIterator concept. It can be a simple pointer type.
/// \tparam OffsetIterator - random-access iterator type of segment offsets. Must meet the
/// requirements of a C++ RandomAccessIterator concept. It can be a simple pointer type.
/// \tparam InitValueType - type of the initial value.
/// \tparam BinaryFunction - type of binary function used for scan operation. Default type
/// is \p rocprim::plus<T>, where \p T is a \p value_type of \p InputIterator.
///
/// \param [in] temporary_storage - pointer to a device-accessible temporary storage. When
/// a null pointer is passed, the required allocation size (in bytes) is written to
/// \p storage_size and function returns without performing the scan operation.
/// \param [in,out] storage_size - reference to a size (in bytes) of \p temporary_storage.
/// \param [in] input - iterator to the first element in the range to scan.
/// \param [out] output - iterator to the first element in the output range.
/// \param [in] segments - number of segments in the input range.
/// \param [in] begin_offsets - iterator to the first element in the range of beginning offsets.
/// \param [in] end_offsets - iterator to the first element in the range of ending offsets.
/// \param [in] initial_value - initial value to start the scan.
/// \param [in] scan_op - binary operation function object that will be used for scan.
/// The signature of the function should be equivalent to the following:
/// <tt>T f(const T &a, const T &b);</tt>. The signature does not need to have
/// <tt>const &</tt>, but function object must not modify the objects passed to it.
/// The default value is \p BinaryFunction().
/// \param [in] stream - [optional] HIP stream object. The default is \p 0 (default stream).
/// \param [in] debug_synchronous - [optional] If true, synchronization after every kernel
/// launch is forced in order to check for errors. The default value is \p false.
///
/// \returns \p cudaSuccess (\p 0) after successful scan; otherwise a HIP runtime error of
/// type \p cudaError_t.
///
/// \par Example
/// \parblock
/// In this example a device-level segmented exclusive min-scan operation is performed on
/// an array of integer values (<tt>short</tt>s are scanned into <tt>int</tt>s) using custom operator.
///
/// \code{.cpp}
/// #include <rocprim/rocprim.hpp>
///
/// // custom scan function
/// auto min_op =
/// [] __device__ (int a, int b) -> int
/// {
/// return a < b ? a : b;
/// };
///
/// // Prepare input and output (declare pointers, allocate device memory etc.)
/// int start_value; // e.g., 9
/// short * input; // e.g., [4, 7, 6, 2, 5, 1, 3, 8]
/// int * output; // empty array of 8 elements
/// size_t segments; // e.g., 3
/// int * offsets; // e.g. [0, 2, 4, 8]
///
/// size_t temporary_storage_size_bytes;
/// void * temporary_storage_ptr = nullptr;
/// // Get required size of the temporary storage
/// rocprim::segmented_exclusive_scan(
/// temporary_storage_ptr, temporary_storage_size_bytes,
/// input, output, segments, offsets, offsets + 1,
/// start_value, min_op
/// );
///
/// // allocate temporary storage
/// cudaMalloc(&temporary_storage_ptr, temporary_storage_size_bytes);
///
/// // perform scan
/// rocprim::segmented_exclusive_scan(
/// temporary_storage_ptr, temporary_storage_size_bytes,
/// input, output, segments, offsets, offsets + 1,
/// start_value, min_op
/// );
/// // output: [9, 4, 9, 6, 9, 5, 1, 1]
/// \endcode
/// \endparblock
template<
    typename Config = default_config,
    typename InputIterator,
    typename OutputIterator,
    typename OffsetIterator,
    typename InitValueType,
    typename BinaryFunction
        = ::rocprim::plus<typename std::iterator_traits<InputIterator>::value_type>
>
inline
cudaError_t segmented_exclusive_scan(void *              temporary_storage,
                                     size_t&             storage_size,
                                     InputIterator       input,
                                     OutputIterator      output,
                                     unsigned int        segments,
                                     OffsetIterator      begin_offsets,
                                     OffsetIterator      end_offsets,
                                     const InitValueType initial_value,
                                     BinaryFunction      scan_op           = BinaryFunction(),
                                     cudaStream_t        stream            = 0,
                                     bool                debug_synchronous = false)
{
    // Public entry point: Exclusive = true selects the exclusive scan; every
    // argument is forwarded unchanged to the shared implementation.
    return detail::segmented_scan_impl<true, Config>(temporary_storage,
                                                     storage_size,
                                                     input,
                                                     output,
                                                     segments,
                                                     begin_offsets,
                                                     end_offsets,
                                                     initial_value,
                                                     scan_op,
                                                     stream,
                                                     debug_synchronous);
}
/// \brief Parallel segmented inclusive scan primitive for device level.
///
/// segmented_inclusive_scan function performs a device-wide inclusive scan operation
/// across multiple sequences from \p input using binary \p scan_op operator. Beginnings
/// of the segments should be marked by value convertible to \p true at corresponding
/// position in \p head_flags range.
///
/// \par Overview
/// * Returns the required size of \p temporary_storage in \p storage_size
/// if \p temporary_storage is a null pointer.
/// * Ranges specified by \p input, \p output, and \p head_flags must have at least \p size elements.
/// * \p value_type of \p HeadFlagIterator iterator should be convertible to \p bool type.
///
/// \tparam Config - [optional] configuration of the primitive. It can be \p scan_config or
/// a custom class with the same members.
/// \tparam InputIterator - random-access iterator type of the input range. Must meet the
/// requirements of a C++ RandomAccessIterator concept. It can be a simple pointer type.
/// \tparam OutputIterator - random-access iterator type of the output range. Must meet the
/// requirements of a C++ RandomAccessIterator concept. It can be a simple pointer type.
/// \tparam HeadFlagIterator - random-access iterator type of flags. Must meet the
/// requirements of a C++ RandomAccessIterator concept. It can be a simple pointer type.
/// \tparam BinaryFunction - type of binary function used for scan operation. Default type
/// is \p rocprim::plus<T>, where \p T is a \p value_type of \p InputIterator.
///
/// \param [in] temporary_storage - pointer to a device-accessible temporary storage. When
/// a null pointer is passed, the required allocation size (in bytes) is written to
/// \p storage_size and function returns without performing the scan operation.
/// \param [in,out] storage_size - reference to a size (in bytes) of \p temporary_storage.
/// \param [in] input - iterator to the first element in the range to scan.
/// \param [out] output - iterator to the first element in the output range.
/// \param [in] head_flags - iterator to the first element in the range of head flags marking
/// beginnings of each segment in the input range.
/// \param [in] size - number of elements in the input range.
/// \param [in] scan_op - binary operation function object that will be used for scan.
/// The signature of the function should be equivalent to the following:
/// <tt>T f(const T &a, const T &b);</tt>. The signature does not need to have
/// <tt>const &</tt>, but function object must not modify the objects passed to it.
/// The default value is \p BinaryFunction().
/// \param [in] stream - [optional] HIP stream object. The default is \p 0 (default stream).
/// \param [in] debug_synchronous - [optional] If true, synchronization after every kernel
/// launch is forced in order to check for errors. The default value is \p false.
///
/// \returns \p cudaSuccess (\p 0) after successful scan; otherwise a HIP runtime error of
/// type \p cudaError_t.
///
/// \par Example
/// \parblock
/// In this example a device-level segmented inclusive sum operation is performed on
/// an array of integer values (<tt>short</tt>s are added into <tt>int</tt>s).
///
/// \code{.cpp}
/// #include <rocprim/rocprim.hpp>
///
/// // Prepare input and output (declare pointers, allocate device memory etc.)
/// size_t size; // e.g., 8
/// short * input; // e.g., [1, 2, 3, 4, 5, 6, 7, 8]
/// int * flags; // e.g., [1, 0, 0, 1, 0, 1, 0, 0]
/// int * output; // empty array of 8 elements
///
/// size_t temporary_storage_size_bytes;
/// void * temporary_storage_ptr = nullptr;
/// // Get required size of the temporary storage
/// rocprim::segmented_inclusive_scan(
/// temporary_storage_ptr, temporary_storage_size_bytes,
/// input, output, flags, size, ::rocprim::plus<int>()
/// );
///
/// // allocate temporary storage
/// cudaMalloc(&temporary_storage_ptr, temporary_storage_size_bytes);
///
/// // perform scan
/// rocprim::segmented_inclusive_scan(
/// temporary_storage_ptr, temporary_storage_size_bytes,
/// input, output, flags, size, ::rocprim::plus<int>()
/// );
/// // output: [1, 3, 6, 4, 9, 6, 13, 21]
/// \endcode
/// \endparblock
template<
    typename Config = default_config,
    typename InputIterator,
    typename OutputIterator,
    typename HeadFlagIterator,
    typename BinaryFunction
        = ::rocprim::plus<typename std::iterator_traits<InputIterator>::value_type>
>
inline
cudaError_t segmented_inclusive_scan(void *           temporary_storage,
                                     size_t&          storage_size,
                                     InputIterator    input,
                                     OutputIterator   output,
                                     HeadFlagIterator head_flags,
                                     size_t           size,
                                     BinaryFunction   scan_op           = BinaryFunction(),
                                     cudaStream_t     stream            = 0,
                                     bool             debug_synchronous = false)
{
    // The head-flag variant is expressed as a regular device-wide
    // inclusive_scan over (value, flag) pairs.
    using value_type = typename std::iterator_traits<InputIterator>::value_type;
    using flag_type  = typename std::iterator_traits<HeadFlagIterator>::value_type;

    // Adapter that applies scan_op to (value, flag) tuples.
    using wrapped_op_type
        = detail::headflag_scan_op_wrapper<value_type, flag_type, BinaryFunction>;

    // Input: each value zipped with its head flag.
    auto zipped_input = rocprim::make_zip_iterator(rocprim::make_tuple(input, head_flags));

    // Output: the scanned values go to `output`; the flag component of each
    // result tuple is written to a discard iterator.
    auto zipped_output = rocprim::make_zip_iterator(
        rocprim::make_tuple(output, rocprim::make_discard_iterator()));

    return inclusive_scan<Config>(temporary_storage,
                                  storage_size,
                                  zipped_input,
                                  zipped_output,
                                  size,
                                  wrapped_op_type(scan_op),
                                  stream,
                                  debug_synchronous);
}
/// \brief Parallel segmented exclusive scan primitive for device level.
///
/// segmented_exclusive_scan function performs a device-wide exclusive scan operation
/// across multiple sequences from \p input using binary \p scan_op operator. Beginnings
/// of the segments should be marked by value convertible to \p true at corresponding
/// position in \p flags range.
///
/// \par Overview
/// * Returns the required size of \p temporary_storage in \p storage_size
/// if \p temporary_storage in a null pointer.
/// * Ranges specified by \p input, \p output, and \p flags must have at least \p size elements.
/// * \p value_type of \p HeadFlagIterator iterator should be convertible to \p bool type.
///
/// \tparam Config - [optional] configuration of the primitive. It can be \p scan_config or
/// a custom class with the same members.
/// \tparam InputIterator - random-access iterator type of the input range. Must meet the
/// requirements of a C++ RandomAccessIterator concept. It can be a simple pointer type.
/// \tparam OutputIterator - random-access iterator type of the output range. Must meet the
/// requirements of a C++ RandomAccessIterator concept. It can be a simple pointer type.
/// \tparam HeadFlagIterator - random-access iterator type of flags. Must meet the
/// requirements of a C++ RandomAccessIterator concept. It can be a simple pointer type.
/// \tparam InitValueType - type of the initial value.
/// \tparam BinaryFunction - type of binary function used for scan operation. Default type
/// is \p rocprim::plus<T>, where \p T is a \p value_type of \p InputIterator.
///
/// \param [in] temporary_storage - pointer to a device-accessible temporary storage. When
/// a null pointer is passed, the required allocation size (in bytes) is written to
/// \p storage_size and function returns without performing the scan operation.
/// \param [in,out] storage_size - reference to a size (in bytes) of \p temporary_storage.
/// \param [in] input - iterator to the first element in the range to scan.
/// \param [out] output - iterator to the first element in the output range.
/// \param [in] head_flags - iterator to the first element in the range of head flags marking
/// beginnings of each segment in the input range.
/// \param [in] initial_value - initial value to start the scan.
/// \param [in] size - number of element in the input range.
/// \param [in] scan_op - binary operation function object that will be used for scan.
/// The signature of the function should be equivalent to the following:
/// <tt>T f(const T &a, const T &b);</tt>. The signature does not need to have
/// <tt>const &</tt>, but function object must not modify the objects passed to it.
/// The default value is \p BinaryFunction().
/// \param [in] stream - [optional] HIP stream object. The default is \p 0 (default stream).
/// \param [in] debug_synchronous - [optional] If true, synchronization after every kernel
/// launch is forced in order to check for errors. The default value is \p false.
///
/// \returns \p cudaSuccess (\p 0) after successful scan; otherwise a HIP runtime error of
/// type \p cudaError_t.
///
/// \par Example
/// \parblock
/// In this example a device-level segmented exclusive sum operation is performed on
/// an array of integer values (<tt>short</tt>s are added into <tt>int</tt>s).
///
/// \code{.cpp}
/// #include <rocprim/rocprim.hpp>
///
/// // Prepare input and output (declare pointers, allocate device memory etc.)
/// size_t size; // e.g., 8
/// short * input; // e.g., [1, 2, 3, 4, 5, 6, 7, 8]
/// int * flags; // e.g., [1, 0, 0, 1, 0, 1, 0, 0]
/// int init; // e.g., 9
/// int * output; // empty array of 8 elements
///
/// size_t temporary_storage_size_bytes;
/// void * temporary_storage_ptr = nullptr;
/// // Get required size of the temporary storage
/// rocprim::segmented_exclusive_scan(
/// temporary_storage_ptr, temporary_storage_size_bytes,
/// input, output, flags, init, size, ::rocprim::plus<int>()
/// );
///
/// // allocate temporary storage
/// cudaMalloc(&temporary_storage_ptr, temporary_storage_size_bytes);
///
/// // perform scan
/// rocprim::segmented_exclusive_scan(
/// temporary_storage_ptr, temporary_storage_size_bytes,
/// input, output, flags, init, size, ::rocprim::plus<int>()
/// );
/// // output: [9, 10, 12, 9, 13, 9, 15, 22]
/// \endcode
/// \endparblock
template<
    class Config = default_config,
    class InputIterator,
    class OutputIterator,
    class InitValueType,
    class HeadFlagIterator,
    class BinaryFunction = ::rocprim::plus<typename std::iterator_traits<InputIterator>::value_type>
>
inline
cudaError_t segmented_exclusive_scan(void * temporary_storage,
                                     size_t& storage_size,
                                     InputIterator input,
                                     OutputIterator output,
                                     HeadFlagIterator head_flags,
                                     const InitValueType initial_value,
                                     size_t size,
                                     BinaryFunction scan_op = BinaryFunction(),
                                     cudaStream_t stream = 0,
                                     bool debug_synchronous = false)
{
    // The scan runs over (value, flag) pairs; values are carried in InitValueType.
    using flag_type   = typename std::iterator_traits<HeadFlagIterator>::value_type;
    using result_type = InitValueType;
    using wrapped_scan_op_type
        = detail::headflag_scan_op_wrapper<result_type, flag_type, BinaryFunction>;

    const result_type segment_seed = static_cast<result_type>(initial_value);

    // Element idx of the generated sequence carries the head flag of element idx + 1
    // (false past the end of the range). Wherever that flag is set, the value is
    // replaced by the seed, so that after the exclusive scan below the first output
    // of the following segment is the initial value again.
    auto shifted_pairs = rocprim::make_transform_iterator(
        rocprim::make_counting_iterator<size_t>(0),
        [input, head_flags, segment_seed, size]
        ROCPRIM_DEVICE
        (const size_t idx)
        {
            flag_type next_is_head(false);
            if(idx + 1 < size)
            {
                next_is_head = head_flags[idx + 1];
            }

            // input[idx] is only read when the next element does not open a segment.
            result_type value = segment_seed;
            if(!next_is_head)
            {
                value = input[idx];
            }
            return rocprim::make_tuple(value, next_is_head);
        }
    );

    // Only the value component is stored; the flag component goes to a discard iterator.
    auto values_only_output = rocprim::make_zip_iterator(
        rocprim::make_tuple(output, rocprim::make_discard_iterator())
    );

    return exclusive_scan<Config>(
        temporary_storage, storage_size,
        shifted_pairs,
        values_only_output,
        // The scan's initial element acts as the head of the first segment.
        rocprim::make_tuple(segment_seed, flag_type(true)),
        size,
        wrapped_scan_op_type(scan_op),
        stream,
        debug_synchronous
    );
}
/// @}
// end of group devicemodule
END_ROCPRIM_NAMESPACE
#endif // ROCPRIM_DEVICE_DEVICE_SEGMENTED_SCAN_HPP_
// Copyright (c) 2018-2019 Advanced Micro Devices, Inc. All rights reserved.
//
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to deal
// in the Software without restriction, including without limitation the rights
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
// copies of the Software, and to permit persons to whom the Software is
// furnished to do so, subject to the following conditions:
//
// The above copyright notice and this permission notice shall be included in
// all copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
// THE SOFTWARE.
#ifndef ROCPRIM_DEVICE_DEVICE_SELECT_HPP_
#define ROCPRIM_DEVICE_DEVICE_SELECT_HPP_
#include <type_traits>
#include <iterator>
#include "../config.hpp"
#include "../detail/various.hpp"
#include "../detail/binary_op_wrappers.hpp"
#include "../iterator/transform_iterator.hpp"
#include "device_scan.hpp"
#include "device_partition.hpp"
BEGIN_ROCPRIM_NAMESPACE
/// \addtogroup devicemodule
/// @{
namespace detail
{
// Post-launch check helper. Expands to a block that:
//  * fetches the last runtime error and returns it from the enclosing function
//    if it is not cudaSuccess;
//  * when `debug_synchronous` is true, prints `name(size)`, synchronizes
//    `stream`, returns on a synchronization error, and prints the time elapsed
//    since `start` in milliseconds.
// The expansion site must have `stream` and `debug_synchronous` in scope, and
// `start` must be a std::chrono::high_resolution_clock time point.
// The macro declares its own `_error`, so the caller does not need an error
// variable of its own and nothing in the caller's scope is shadowed.
#define ROCPRIM_DETAIL_HIP_SYNC_AND_RETURN_ON_ERROR(name, size, start) \
    { \
        auto _error = cudaGetLastError(); \
        if(_error != cudaSuccess) return _error; \
        if(debug_synchronous) \
        { \
            std::cout << name << "(" << size << ")"; \
            _error = cudaStreamSynchronize(stream); \
            if(_error != cudaSuccess) return _error; \
            auto _end = std::chrono::high_resolution_clock::now(); \
            auto _d = std::chrono::duration_cast<std::chrono::duration<double>>(_end - start); \
            std::cout << " " << _d.count() * 1000 << " ms" << '\n'; \
        } \
    }
} // end detail namespace
/// \brief Parallel select primitive for device level using range of flags.
///
/// Performs a device-wide selection based on input \p flags. If a value from \p input
/// should be selected and copied into \p output range the corresponding item from
/// \p flags range should be set to such value that can be implicitly converted to
/// \p true (\p bool type).
///
/// \par Overview
/// * Returns the required size of \p temporary_storage in \p storage_size
/// if \p temporary_storage is a null pointer.
/// * Ranges specified by \p input and \p flags must have at least \p size elements.
/// * Range specified by \p output must have at least so many elements, that all positively
/// flagged values can be copied into it.
/// * Range specified by \p selected_count_output must have at least 1 element.
/// * Values of \p flag range should be implicitly convertible to `bool` type.
///
/// \tparam Config - [optional] configuration of the primitive. It can be \p select_config or
/// a custom class with the same members.
/// \tparam InputIterator - random-access iterator type of the input range. It can be
/// a simple pointer type.
/// \tparam FlagIterator - random-access iterator type of the flag range. It can be
/// a simple pointer type.
/// \tparam OutputIterator - random-access iterator type of the output range. It can be
/// a simple pointer type.
/// \tparam SelectedCountOutputIterator - random-access iterator type of the selected_count_output
/// value. It can be a simple pointer type.
///
/// \param [in] temporary_storage - pointer to a device-accessible temporary storage. When
/// a null pointer is passed, the required allocation size (in bytes) is written to
/// \p storage_size and function returns without performing the select operation.
/// \param [in,out] storage_size - reference to a size (in bytes) of \p temporary_storage.
/// \param [in] input - iterator to the first element in the range to select values from.
/// \param [in] flags - iterator to the selection flag corresponding to the first element from \p input range.
/// \param [out] output - iterator to the first element in the output range.
/// \param [out] selected_count_output - iterator to the total number of selected values (length of \p output).
/// \param [in] size - number of element in the input range.
/// \param [in] stream - [optional] HIP stream object. The default is \p 0 (default stream).
/// \param [in] debug_synchronous - [optional] If true, synchronization after every kernel
/// launch is forced in order to check for errors. The default value is \p false.
///
/// \par Example
/// \parblock
/// In this example a device-level select operation is performed on an array of
/// integer values with array of <tt>char</tt>s used as flags.
///
/// \code{.cpp}
/// #include <rocprim/rocprim.hpp>
///
/// // Prepare input and output (declare pointers, allocate device memory etc.)
/// size_t input_size; // e.g., 8
/// int * input; // e.g., [1, 2, 3, 4, 5, 6, 7, 8]
/// char * flags; // e.g., [0, 1, 1, 0, 0, 1, 0, 1]
/// int * output; // empty array of 8 elements
/// size_t * output_count; // empty array of 1 element
///
/// size_t temporary_storage_size_bytes;
/// void * temporary_storage_ptr = nullptr;
/// // Get required size of the temporary storage
/// rocprim::select(
/// temporary_storage_ptr, temporary_storage_size_bytes,
/// input, flags,
/// output, output_count,
/// input_size
/// );
///
/// // allocate temporary storage
/// cudaMalloc(&temporary_storage_ptr, temporary_storage_size_bytes);
///
/// // perform selection
/// rocprim::select(
/// temporary_storage_ptr, temporary_storage_size_bytes,
/// input, flags,
/// output, output_count,
/// input_size
/// );
/// // output: [2, 3, 6, 8]
/// // output_count: 4
/// \endcode
/// \endparblock
template<
    class Config = default_config,
    class InputIterator,
    class FlagIterator,
    class OutputIterator,
    class SelectedCountOutputIterator
>
inline
cudaError_t select(void * temporary_storage,
                   size_t& storage_size,
                   InputIterator input,
                   FlagIterator flags,
                   OutputIterator output,
                   SelectedCountOutputIterator selected_count_output,
                   const size_t size,
                   const cudaStream_t stream = 0,
                   const bool debug_synchronous = false)
{
    using offset_type = unsigned int;
    // Flag-based selection needs neither a predicate nor an inequality operator;
    // both slots are filled with empty_type placeholders.
    using placeholder_type = ::rocprim::empty_type;

    // Keys only: there is no value range on the input or the output side.
    placeholder_type* const absent_values = nullptr;

    return detail::partition_impl<detail::select_method::flag, true, Config, offset_type>(
        temporary_storage,
        storage_size,
        input,
        absent_values,
        flags,
        output,
        absent_values,
        selected_count_output,
        size,
        placeholder_type(), // inequality operator (unused)
        stream,
        debug_synchronous,
        placeholder_type() // unary predicate (unused)
    );
}
/// \brief Parallel select primitive for device level using selection operator.
///
/// Performs a device-wide selection using selection operator. If a value \p x from \p input
/// should be selected and copied into \p output range, then <tt>predicate(x)</tt> has to
/// return \p true.
///
/// \par Overview
/// * Returns the required size of \p temporary_storage in \p storage_size
/// if \p temporary_storage is a null pointer.
/// * Range specified by \p input must have at least \p size elements.
/// * Range specified by \p output must have at least so many elements, that all selected
/// values can be copied into it.
/// * Range specified by \p selected_count_output must have at least 1 element.
///
/// \tparam Config - [optional] configuration of the primitive. It can be \p select_config or
/// a custom class with the same members.
/// \tparam InputIterator - random-access iterator type of the input range. It can be
/// a simple pointer type.
/// \tparam OutputIterator - random-access iterator type of the output range. It can be
/// a simple pointer type.
/// \tparam SelectedCountOutputIterator - random-access iterator type of the selected_count_output
/// value. It can be a simple pointer type.
/// \tparam UnaryPredicate - type of a unary selection predicate.
///
/// \param [in] temporary_storage - pointer to a device-accessible temporary storage. When
/// a null pointer is passed, the required allocation size (in bytes) is written to
/// \p storage_size and function returns without performing the select operation.
/// \param [in,out] storage_size - reference to a size (in bytes) of \p temporary_storage.
/// \param [in] input - iterator to the first element in the range to select values from.
/// \param [out] output - iterator to the first element in the output range.
/// \param [out] selected_count_output - iterator to the total number of selected values (length of \p output).
/// \param [in] size - number of element in the input range.
/// \param [in] predicate - unary function object that will be used for selecting values.
/// The signature of the function should be equivalent to the following:
/// <tt>bool f(const T &a);</tt>. The signature does not need to have
/// <tt>const &</tt>, but function object must not modify the object passed to it.
/// \param [in] stream - [optional] HIP stream object. The default is \p 0 (default stream).
/// \param [in] debug_synchronous - [optional] If true, synchronization after every kernel
/// launch is forced in order to check for errors. The default value is \p false.
///
/// \par Example
/// \parblock
/// In this example a device-level select operation is performed on an array of
/// integer values, only even values are selected.
///
/// \code{.cpp}
/// #include <rocprim/rocprim.hpp>
///
/// auto predicate =
/// [] __device__ (int a) -> bool
/// {
/// return (a%2) == 0;
/// };
///
/// // Prepare input and output (declare pointers, allocate device memory etc.)
/// size_t input_size; // e.g., 8
/// int * input; // e.g., [1, 2, 3, 4, 5, 6, 7, 8]
/// int * output; // empty array of 8 elements
/// size_t * output_count; // empty array of 1 element
///
/// size_t temporary_storage_size_bytes;
/// void * temporary_storage_ptr = nullptr;
/// // Get required size of the temporary storage
/// rocprim::select(
/// temporary_storage_ptr, temporary_storage_size_bytes,
/// input, output, output_count,
/// input_size, predicate
/// );
///
/// // allocate temporary storage
/// cudaMalloc(&temporary_storage_ptr, temporary_storage_size_bytes);
///
/// // perform selection
/// rocprim::select(
/// temporary_storage_ptr, temporary_storage_size_bytes,
/// input, output, output_count,
/// input_size, predicate
/// );
/// // output: [2, 4, 6, 8]
/// // output_count: 4
/// \endcode
/// \endparblock
template<
    class Config = default_config,
    class InputIterator,
    class OutputIterator,
    class SelectedCountOutputIterator,
    class UnaryPredicate
>
inline
cudaError_t select(void * temporary_storage,
                   size_t& storage_size,
                   InputIterator input,
                   OutputIterator output,
                   SelectedCountOutputIterator selected_count_output,
                   const size_t size,
                   UnaryPredicate predicate,
                   const cudaStream_t stream = 0,
                   const bool debug_synchronous = false)
{
    using offset_type = unsigned int;
    // Predicate-based selection has no flag range and no inequality operator;
    // empty_type stands in for both.
    using placeholder_type = ::rocprim::empty_type;

    // No flag range: a null placeholder pointer is passed instead.
    placeholder_type * flags = nullptr;
    // Keys only: there is no value range on the input or the output side.
    placeholder_type* const absent_values = nullptr;

    return detail::partition_impl<detail::select_method::predicate, true, Config, offset_type>(
        temporary_storage,
        storage_size,
        input,
        absent_values,
        flags,
        output,
        absent_values,
        selected_count_output,
        size,
        placeholder_type(), // inequality operator (unused)
        stream,
        debug_synchronous,
        predicate
    );
}
/// \brief Device-level parallel unique primitive.
///
/// From given \p input range unique primitive eliminates all but the first element from every
/// consecutive group of equivalent elements and copies them into \p output.
///
/// \par Overview
/// * Returns the required size of \p temporary_storage in \p storage_size
/// if \p temporary_storage is a null pointer.
/// * Range specified by \p input must have at least \p size elements.
/// * Range specified by \p output must have at least so many elements, that all selected
/// values can be copied into it.
/// * Range specified by \p unique_count_output must have at least 1 element.
/// * By default <tt>InputIterator::value_type</tt>'s equality operator is used to check
/// if elements are equivalent.
///
/// \tparam InputIterator - random-access iterator type of the input range. It can be
/// a simple pointer type.
/// \tparam OutputIterator - random-access iterator type of the output range. It can be
/// a simple pointer type.
/// \tparam UniqueCountOutputIterator - random-access iterator type of the unique_count_output
/// value used to return number of unique values. It can be a simple pointer type.
/// \tparam EqualityOp - type of a binary operator used to compare values for equality.
///
/// \param [in] temporary_storage - pointer to a device-accessible temporary storage. When
/// a null pointer is passed, the required allocation size (in bytes) is written to
/// \p storage_size and function returns without performing the unique operation.
/// \param [in,out] storage_size - reference to a size (in bytes) of \p temporary_storage.
/// \param [in] input - iterator to the first element in the range to select values from.
/// \param [out] output - iterator to the first element in the output range.
/// \param [out] unique_count_output - iterator to the total number of selected values (length of \p output).
/// \param [in] size - number of element in the input range.
/// \param [in] equality_op - [optional] binary function object used to compare input values for equality.
/// The signature of the function should be equivalent to the following:
/// <tt>bool equal_to(const T &a, const T &b);</tt>. The signature does not need to have
/// <tt>const &</tt>, but function object must not modify the object passed to it.
/// \param [in] stream - [optional] HIP stream object. The default is \p 0 (default stream).
/// \param [in] debug_synchronous - [optional] If true, synchronization after every kernel
/// launch is forced in order to check for errors. The default value is \p false.
///
/// \par Example
/// \parblock
/// In this example a device-level unique operation is performed on an array of integer values.
///
/// \code{.cpp}
/// #include <rocprim/rocprim.hpp>
///
/// // Prepare input and output (declare pointers, allocate device memory etc.)
/// size_t input_size; // e.g., 8
/// int * input; // e.g., [1, 4, 2, 4, 4, 7, 7, 7]
/// int * output; // empty array of 8 elements
/// size_t * output_count; // empty array of 1 element
///
/// size_t temporary_storage_size_bytes;
/// void * temporary_storage_ptr = nullptr;
/// // Get required size of the temporary storage
/// rocprim::unique(
/// temporary_storage_ptr, temporary_storage_size_bytes,
/// input, output, output_count,
/// input_size
/// );
///
/// // allocate temporary storage
/// cudaMalloc(&temporary_storage_ptr, temporary_storage_size_bytes);
///
/// // perform unique operation
/// rocprim::unique(
/// temporary_storage_ptr, temporary_storage_size_bytes,
/// input, output, output_count,
/// input_size
/// );
/// // output: [1, 4, 2, 4, 7]
/// // output_count: 5
/// \endcode
/// \endparblock
template<
    class Config = default_config,
    class InputIterator,
    class OutputIterator,
    class UniqueCountOutputIterator,
    class EqualityOp = ::rocprim::equal_to<typename std::iterator_traits<InputIterator>::value_type>
>
inline
cudaError_t unique(void * temporary_storage,
                   size_t& storage_size,
                   InputIterator input,
                   OutputIterator output,
                   UniqueCountOutputIterator unique_count_output,
                   const size_t size,
                   EqualityOp equality_op = EqualityOp(),
                   const cudaStream_t stream = 0,
                   const bool debug_synchronous = false)
{
    using offset_type = unsigned int;
    // Neither flags nor a unary predicate are used here; empty_type fills those slots.
    using placeholder_type = ::rocprim::empty_type;

    // No flag range: a null placeholder pointer is passed instead.
    const placeholder_type * flags = nullptr;
    // Keys only: there is no value range on the input or the output side.
    placeholder_type* const absent_values = nullptr;

    // partition_impl is handed an inequality operator, so the user's equality
    // operator is wrapped in inequality_wrapper.
    detail::inequality_wrapper<EqualityOp> inequality_op(equality_op);

    return detail::partition_impl<detail::select_method::unique, true, Config, offset_type>(
        temporary_storage,
        storage_size,
        input,
        absent_values,
        flags,
        output,
        absent_values,
        unique_count_output,
        size,
        inequality_op,
        stream,
        debug_synchronous,
        placeholder_type() // unary predicate (unused)
    );
}
/// \brief Device-level parallel unique by key primitive.
///
/// From the given \p keys_input range, the primitive eliminates all but the first key from every
/// consecutive group of equivalent keys and copies the remaining keys and their corresponding
/// values into \p keys_output and \p values_output.
///
/// \par Overview
/// * Returns the required size of \p temporary_storage in \p storage_size
/// if \p temporary_storage is a null pointer.
/// * Ranges specified by \p keys_input and \p values_input must have at least \p size elements each.
/// * Ranges specified by \p keys_output and \p values_output each must have at least so many elements,
/// that all selected values can be copied into them.
/// * Range specified by \p unique_count_output must have at least 1 element.
/// * By default <tt>KeyIterator::value_type</tt>'s equality operator is used to check
/// if keys are equivalent.
///
/// \tparam KeyIterator - random-access iterator type of the input key range. It can be
/// a simple pointer type.
/// \tparam ValueIterator - random-access iterator type of the input value range. It can be
/// a simple pointer type.
/// \tparam OutputKeyIterator - random-access iterator type of the output key range. It can be
/// a simple pointer type.
/// \tparam OutputValueIterator - random-access iterator type of the output value range. It can be
/// a simple pointer type.
/// \tparam UniqueCountOutputIterator - random-access iterator type of the unique_count_output
/// value used to return number of unique keys and values. It can be a simple pointer type.
/// \tparam EqualityOp - type of a binary operator used to compare keys for equality.
///
/// \param [in] temporary_storage - pointer to a device-accessible temporary storage. When
/// a null pointer is passed, the required allocation size (in bytes) is written to
/// \p storage_size and function returns without performing the unique operation.
/// \param [in,out] storage_size - reference to a size (in bytes) of \p temporary_storage.
/// \param [in] keys_input - iterator to the first element in the range to select keys from.
/// \param [in] values_input - iterator to the first element in the range of values corresponding to keys
/// \param [out] keys_output - iterator to the first element in the output key range.
/// \param [out] values_output - iterator to the first element in the output value range.
/// \param [out] unique_count_output - iterator to the total number of selected values (length of \p output).
/// \param [in] size - number of element in the input range.
/// \param [in] equality_op - [optional] binary function object used to compare input values for equality.
/// The signature of the function should be equivalent to the following:
/// <tt>bool equal_to(const T &a, const T &b);</tt>. The signature does not need to have
/// <tt>const &</tt>, but function object must not modify the object passed to it.
/// \param [in] stream - [optional] HIP stream object. The default is \p 0 (default stream).
/// \param [in] debug_synchronous - [optional] If true, synchronization after every kernel
/// launch is forced in order to check for errors. The default value is \p false.
template <typename Config = default_config,
          typename KeyIterator,
          typename ValueIterator,
          typename OutputKeyIterator,
          typename OutputValueIterator,
          typename UniqueCountOutputIterator,
          typename EqualityOp
          = ::rocprim::equal_to<typename std::iterator_traits<KeyIterator>::value_type>>
inline cudaError_t unique_by_key(void*                           temporary_storage,
                                 size_t&                         storage_size,
                                 const KeyIterator               keys_input,
                                 const ValueIterator             values_input,
                                 const OutputKeyIterator         keys_output,
                                 const OutputValueIterator       values_output,
                                 const UniqueCountOutputIterator unique_count_output,
                                 const size_t                    size,
                                 const EqualityOp                equality_op = EqualityOp(),
                                 const cudaStream_t              stream      = 0,
                                 const bool                      debug_synchronous = false)
{
    using offset_type = unsigned int;
    // Neither flags nor a unary predicate are used here; empty_type fills those slots.
    using placeholder_type = ::rocprim::empty_type;

    placeholder_type* const absent_flags = nullptr;
    const placeholder_type  absent_predicate{};

    // partition_impl is handed an inequality operator, so the user's equality
    // operator is wrapped in inequality_wrapper.
    const detail::inequality_wrapper<EqualityOp> inequality_op(equality_op);

    return detail::partition_impl<detail::select_method::unique, true, Config, offset_type>(
        temporary_storage, storage_size,
        keys_input, values_input, absent_flags,
        keys_output, values_output, unique_count_output,
        size, inequality_op,
        stream, debug_synchronous,
        absent_predicate);
}
#undef ROCPRIM_DETAIL_HIP_SYNC_AND_RETURN_ON_ERROR
/// @}
// end of group devicemodule
END_ROCPRIM_NAMESPACE
#endif // ROCPRIM_DEVICE_DEVICE_SELECT_HPP_
// Copyright (c) 2018-2019 Advanced Micro Devices, Inc. All rights reserved.
//
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to deal
// in the Software without restriction, including without limitation the rights
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
// copies of the Software, and to permit persons to whom the Software is
// furnished to do so, subject to the following conditions:
//
// The above copyright notice and this permission notice shall be included in
// all copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
// THE SOFTWARE.
#ifndef ROCPRIM_DEVICE_DEVICE_SELECT_CONFIG_HPP_
#define ROCPRIM_DEVICE_DEVICE_SELECT_CONFIG_HPP_
#include <type_traits>
#include "../config.hpp"
#include "../detail/various.hpp"
#include "../block/block_load.hpp"
#include "../block/block_scan.hpp"
#include "config_types.hpp"
/// \addtogroup primitivesmodule_deviceconfigs
/// @{
BEGIN_ROCPRIM_NAMESPACE
/// \brief Configuration of device-level select operation.
///
/// \tparam BlockSize - number of threads in a block.
/// \tparam ItemsPerThread - number of items processed by each thread.
/// \tparam KeyBlockLoadMethod - method for loading input keys.
/// \tparam ValueBlockLoadMethod - method for loading input values.
/// \tparam FlagBlockLoadMethod - method for loading flag values.
/// \tparam BlockScanMethod - algorithm for block scan.
/// \tparam SizeLimit - limit on the number of items for a single select kernel launch.
template<
unsigned int BlockSize,
unsigned int ItemsPerThread,
::rocprim::block_load_method KeyBlockLoadMethod,
::rocprim::block_load_method ValueBlockLoadMethod,
::rocprim::block_load_method FlagBlockLoadMethod,
::rocprim::block_scan_algorithm BlockScanMethod,
unsigned int SizeLimit = ROCPRIM_GRID_SIZE_LIMIT
>
struct select_config
{
/// \brief Number of threads in a block.
static constexpr unsigned int block_size = BlockSize;
/// \brief Number of items processed by each thread.
static constexpr unsigned int items_per_thread = ItemsPerThread;
/// \brief Method for loading input keys.
static constexpr block_load_method key_block_load_method = KeyBlockLoadMethod;
/// \brief Method for loading input values.
static constexpr block_load_method value_block_load_method = ValueBlockLoadMethod;
/// \brief Method for loading flag values.
static constexpr block_load_method flag_block_load_method = FlagBlockLoadMethod;
/// \brief Algorithm for block scan.
static constexpr block_scan_algorithm block_scan_method = BlockScanMethod;
/// \brief Limit on the number of items for a single select kernel launch.
static constexpr unsigned int size_limit = SizeLimit;
};
namespace detail
{
// select_config used for architecture case 803 (also the fallback in default_select_config).
template<typename Key>
struct select_config_803
{
    // sizeof(Key) expressed in int-sized units (via ceiling_div).
    static constexpr unsigned int item_scale
        = ::rocprim::detail::ceiling_div<unsigned int>(sizeof(Key), sizeof(int));

    // 13 items per thread for keys of at most int size, scaled down for
    // larger keys but never below 1.
    using type = select_config<
        limit_block_size<256U, sizeof(Key), ROCPRIM_WARP_SIZE_64>::value,
        ::rocprim::max(1U, 13U / item_scale),
        ::rocprim::block_load_method::block_load_transpose, // keys
        ::rocprim::block_load_method::block_load_transpose, // values
        ::rocprim::block_load_method::block_load_transpose, // flags
        ::rocprim::block_scan_algorithm::using_warp_scan
    >;
};
// select_config used for architecture case 900.
template<typename Key>
struct select_config_900
{
    // sizeof(Key) expressed in int-sized units (via ceiling_div).
    static constexpr unsigned int item_scale
        = ::rocprim::detail::ceiling_div<unsigned int>(sizeof(Key), sizeof(int));

    // 15 items per thread for keys of at most int size, scaled down for
    // larger keys but never below 1.
    using type = select_config<
        limit_block_size<256U, sizeof(Key), ROCPRIM_WARP_SIZE_64>::value,
        ::rocprim::max(1U, 15U / item_scale),
        ::rocprim::block_load_method::block_load_transpose, // keys
        ::rocprim::block_load_method::block_load_transpose, // values
        ::rocprim::block_load_method::block_load_transpose, // flags
        ::rocprim::block_scan_algorithm::using_warp_scan
    >;
};
// select_config used for architecture case ROCPRIM_ARCH_90a.
// default_select_config instantiates this template with the key type.
template<typename Key>
struct select_config_90a
{
    // sizeof(Key) expressed in int-sized units (via ceiling_div).
    static constexpr unsigned int item_scale
        = ::rocprim::detail::ceiling_div<unsigned int>(sizeof(Key), sizeof(int));

    // 15 items per thread for types of at most int size, scaled down for
    // larger types but never below 1.
    using type = select_config<
        limit_block_size<256U, sizeof(Key), ROCPRIM_WARP_SIZE_64>::value,
        ::rocprim::max(1U, 15U / item_scale),
        ::rocprim::block_load_method::block_load_transpose, // keys
        ::rocprim::block_load_method::block_load_transpose, // values
        ::rocprim::block_load_method::block_load_transpose, // flags
        ::rocprim::block_scan_algorithm::using_warp_scan
    >;
};
// select_config used for architecture case 1030. Unlike the other cases in
// this file, the block size is limited using ROCPRIM_WARP_SIZE_32.
// default_select_config instantiates this template with the key type.
template<typename Key>
struct select_config_1030
{
    // sizeof(Key) expressed in int-sized units (via ceiling_div).
    static constexpr unsigned int item_scale
        = ::rocprim::detail::ceiling_div<unsigned int>(sizeof(Key), sizeof(int));

    // 15 items per thread for types of at most int size, scaled down for
    // larger types but never below 1.
    using type = select_config<
        limit_block_size<256U, sizeof(Key), ROCPRIM_WARP_SIZE_32>::value,
        ::rocprim::max(1U, 15U / item_scale),
        ::rocprim::block_load_method::block_load_transpose, // keys
        ::rocprim::block_load_method::block_load_transpose, // values
        ::rocprim::block_load_method::block_load_transpose, // flags
        ::rocprim::block_scan_algorithm::using_warp_scan
    >;
};
// Maps TargetArch to one of the per-architecture select configurations above.
// Every case is parameterized on Key only (the value type is accepted but not
// used); select_config_803 is the trailing, non-case argument of select_arch.
template<unsigned int TargetArch, typename Key, typename /*Value*/>
struct default_select_config
    : select_arch<
        TargetArch,
        select_arch_case<803, select_config_803<Key>>,
        select_arch_case<900, select_config_900<Key>>,
        select_arch_case<ROCPRIM_ARCH_90a, select_config_90a<Key>>,
        select_arch_case<1030, select_config_1030<Key>>,
        select_config_803<Key>
    >
{
};
} // end namespace detail
END_ROCPRIM_NAMESPACE
/// @}
// end of group primitivesmodule_deviceconfigs
#endif // ROCPRIM_DEVICE_DEVICE_SELECT_CONFIG_HPP_
// Copyright (c) 2017-2021 Advanced Micro Devices, Inc. All rights reserved.
//
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to deal
// in the Software without restriction, including without limitation the rights
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
// copies of the Software, and to permit persons to whom the Software is
// furnished to do so, subject to the following conditions:
//
// The above copyright notice and this permission notice shall be included in
// all copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
// THE SOFTWARE.
#ifndef ROCPRIM_DEVICE_DEVICE_TRANSFORM_HPP_
#define ROCPRIM_DEVICE_DEVICE_TRANSFORM_HPP_
#include <algorithm>
#include <type_traits>
#include <iterator>
#include "../config.hpp"
#include "../detail/various.hpp"
#include "../detail/match_result_type.hpp"
#include "../types/tuple.hpp"
#include "../iterator/zip_iterator.hpp"
#include "device_transform_config.hpp"
#include "detail/device_transform.hpp"
#include <chrono>
BEGIN_ROCPRIM_NAMESPACE
/// \addtogroup devicemodule
/// @{
namespace detail
{
/// Kernel entry point of the device-level transform.
///
/// It only forwards its arguments to
/// \p transform_kernel_impl<BlockSize, ItemsPerThread, ResultType>; the launch bound is
/// \p BlockSize threads per block.
///
/// \param input - iterator to the first item handled by this launch.
/// \param size - number of items handled by this launch.
/// \param output - iterator to the first output item of this launch.
/// \param transform_op - unary operation forwarded to the implementation.
template<
    unsigned int BlockSize,
    unsigned int ItemsPerThread,
    class ResultType,
    class InputIterator,
    class OutputIterator,
    class UnaryFunction
>
ROCPRIM_KERNEL
__launch_bounds__(BlockSize)
void transform_kernel(InputIterator input,
                      const size_t size,
                      OutputIterator output,
                      UnaryFunction transform_op)
{
    transform_kernel_impl<BlockSize, ItemsPerThread, ResultType>(input,
                                                                 size,
                                                                 output,
                                                                 transform_op);
}
// Post-launch error check used by the host functions in this file.
//
// Expands to a braced block that:
//   1. reads cudaGetLastError() and returns it from the ENCLOSING function if it is
//      not cudaSuccess;
//   2. when the enclosing function's `debug_synchronous` is true, prints
//      `name(size)`, synchronizes the enclosing function's `stream`, returns on a
//      synchronization error, and prints the time elapsed since `start` in milliseconds.
//
// The macro relies on `debug_synchronous` and `stream` being visible at the expansion
// site, and it contains `return` statements, so it can only be used inside a function
// returning cudaError_t. Because the expansion is a plain `{ ... }` block (not
// do/while(0)), a trailing semicolon at the call site is optional.
#define ROCPRIM_DETAIL_HIP_SYNC_AND_RETURN_ON_ERROR(name, size, start) \
{ \
auto _error = cudaGetLastError(); \
if(_error != cudaSuccess) return _error; \
if(debug_synchronous) \
{ \
std::cout << name << "(" << size << ")"; \
_error = cudaStreamSynchronize(stream); \
if(_error != cudaSuccess) return _error; \
auto _end = std::chrono::high_resolution_clock::now(); \
auto _d = std::chrono::duration_cast<std::chrono::duration<double>>(_end - start); \
std::cout << " " << _d.count() * 1000 << " ms" << '\n'; \
} \
}
} // end of detail namespace
/// \brief Parallel transform primitive for device level.
///
/// transform function performs a device-wide transformation operation
/// using unary \p transform_op operator.
///
/// \par Overview
/// * Ranges specified by \p input and \p output must have at least \p size elements.
///
/// \tparam Config - [optional] configuration of the primitive. It can be \p transform_config or
/// a custom class with the same members.
/// \tparam InputIterator - random-access iterator type of the input range. Must meet the
/// requirements of a C++ InputIterator concept. It can be a simple pointer type.
/// \tparam OutputIterator - random-access iterator type of the output range. Must meet the
/// requirements of a C++ OutputIterator concept. It can be a simple pointer type.
/// \tparam UnaryFunction - type of unary function used for transform.
///
/// \param [in] input - iterator to the first element in the range to transform.
/// \param [out] output - iterator to the first element in the output range.
/// \param [in] size - number of elements in the input range.
/// \param [in] transform_op - unary operation function object that will be used for transform.
/// The signature of the function should be equivalent to the following:
/// <tt>U f(const T &a);</tt>. The signature does not need to have
/// <tt>const &</tt>, but function object must not modify the object passed to it.
/// \param [in] stream - [optional] HIP stream object. The default is \p 0 (default stream).
/// \param [in] debug_synchronous - [optional] If true, synchronization after every kernel
/// launch is forced in order to check for errors. The default value is \p false.
///
/// \par Example
/// \parblock
/// In this example a device-level transform operation is performed on an array of
/// integer values (<tt>short</tt>s are transformed into <tt>int</tt>s).
///
/// \code{.cpp}
/// #include <rocprim/rocprim.hpp>
///
/// // custom transform function
/// auto transform_op =
/// [] __device__ (int a) -> int
/// {
/// return a + 5;
/// };
///
/// // Prepare input and output (declare pointers, allocate device memory etc.)
/// size_t input_size; // e.g., 8
/// short * input; // e.g., [1, 2, 3, 4, 5, 6, 7, 8]
/// int * output; // empty array of 8 elements
///
/// // perform transform
/// rocprim::transform(
/// input, output, input_size, transform_op
/// );
/// // output: [6, 7, 8, 9, 10, 11, 12, 13]
/// \endcode
/// \endparblock
template<
    class Config = default_config,
    class InputIterator,
    class OutputIterator,
    class UnaryFunction
>
inline
cudaError_t transform(InputIterator input,
                      OutputIterator output,
                      const size_t size,
                      UnaryFunction transform_op,
                      const cudaStream_t stream = 0,
                      bool debug_synchronous = false)
{
    // An empty range needs no kernel launch at all.
    if(size == 0)
    {
        return cudaSuccess;
    }

    using input_type  = typename std::iterator_traits<InputIterator>::value_type;
    using result_type = typename ::rocprim::detail::invoke_result<UnaryFunction, input_type>::type;

    // Resolve the effective configuration: the caller's Config, or the default one for
    // the target architecture and result type when Config is default_config.
    using config = detail::default_or_custom_config<
        Config,
        detail::default_transform_config<ROCPRIM_TARGET_ARCH, result_type>
    >;

    // Compile-time launch geometry.
    static constexpr unsigned int block_size       = config::block_size;
    static constexpr unsigned int items_per_thread = config::items_per_thread;
    static constexpr auto         items_per_block  = block_size * items_per_thread;

    // A single launch covers at most number_of_blocks_limit blocks (never fewer than one),
    // i.e. aligned_size_limit items.
    static constexpr auto size_limit = config::size_limit;
    static constexpr auto number_of_blocks_limit
        = ::rocprim::max<size_t>(size_limit / items_per_block, 1);
    static constexpr auto aligned_size_limit = number_of_blocks_limit * items_per_block;

    if(debug_synchronous)
    {
        // Total block count over all launches; only needed for this report.
        const auto number_of_blocks = (size + items_per_block - 1) / items_per_block;
        std::cout << "block_size " << block_size << '\n';
        std::cout << "number of blocks " << number_of_blocks << '\n';
        std::cout << "number of blocks limit " << number_of_blocks_limit << '\n';
        std::cout << "items_per_block " << items_per_block << '\n';
    }

    // Timestamp taken right before each launch (debug mode only).
    std::chrono::high_resolution_clock::time_point launch_begin;

    // Split the range into chunks of aligned_size_limit items, one launch per chunk.
    const auto launch_count = (size + aligned_size_limit - 1) / aligned_size_limit;
    size_t     offset       = 0;
    for(size_t launch = 0; launch < launch_count; ++launch)
    {
        // The last chunk may be shorter than the limit.
        const auto chunk_size   = std::min(size - offset, aligned_size_limit);
        const auto chunk_blocks = (chunk_size + items_per_block - 1) / items_per_block;

        if(debug_synchronous)
        {
            launch_begin = std::chrono::high_resolution_clock::now();
        }

        detail::transform_kernel<
            block_size, items_per_thread, result_type,
            InputIterator, OutputIterator, UnaryFunction
        >
        <<<dim3(chunk_blocks), dim3(block_size), 0, stream>>>(
            input + offset, chunk_size, output + offset, transform_op
        );
        ROCPRIM_DETAIL_HIP_SYNC_AND_RETURN_ON_ERROR("transform_kernel", chunk_size, launch_begin);

        offset += aligned_size_limit;
    }

    return cudaSuccess;
}
/// \brief Parallel device-level transform primitive for two inputs.
///
/// transform function performs a device-wide transformation operation
/// on two input ranges using binary \p transform_op operator.
///
/// \par Overview
/// * Ranges specified by \p input1, \p input2, and \p output must have at least \p size elements.
///
/// \tparam Config - [optional] configuration of the primitive. It can be \p transform_config or
/// a custom class with the same members.
/// \tparam InputIterator1 - random-access iterator type of the input range. Must meet the
/// requirements of a C++ InputIterator concept. It can be a simple pointer type.
/// \tparam InputIterator2 - random-access iterator type of the input range. Must meet the
/// requirements of a C++ InputIterator concept. It can be a simple pointer type.
/// \tparam OutputIterator - random-access iterator type of the output range. Must meet the
/// requirements of a C++ OutputIterator concept. It can be a simple pointer type.
/// \tparam BinaryFunction - type of binary function used for transform.
///
/// \param [in] input1 - iterator to the first element in the 1st range to transform.
/// \param [in] input2 - iterator to the first element in the 2nd range to transform.
/// \param [out] output - iterator to the first element in the output range.
/// \param [in] size - number of elements in each input range.
/// \param [in] transform_op - binary operation function object that will be used for transform.
/// The signature of the function should be equivalent to the following:
/// <tt>U f(const T1& a, const T2& b);</tt>. The signature does not need to have
/// <tt>const &</tt>, but function object must not modify the object passed to it.
/// \param [in] stream - [optional] HIP stream object. The default is \p 0 (default stream).
/// \param [in] debug_synchronous - [optional] If true, synchronization after every kernel
/// launch is forced. Default value is \p false.
///
/// \par Example
/// \parblock
/// In this example a device-level transform operation is performed on two arrays of
/// integer values (element-wise sum is performed).
///
/// \code{.cpp}
/// #include <rocprim/rocprim.hpp>
///
/// // custom transform function
/// auto transform_op =
/// [] __device__ (int a, int b) -> int
/// {
/// return a + b;
/// };
///
/// // Prepare input and output (declare pointers, allocate device memory etc.)
/// size_t size; // e.g., 8
/// int* input1; // e.g., [1, 2, 3, 4, 5, 6, 7, 8]
/// int* input2; // e.g., [1, 2, 3, 4, 5, 6, 7, 8]
/// int* output; // empty array of 8 elements
///
/// // perform transform
/// rocprim::transform(
/// input1, input2, output, size, transform_op
/// );
/// // output: [2, 4, 6, 8, 10, 12, 14, 16]
/// \endcode
/// \endparblock
template<
    class Config = default_config,
    class InputIterator1,
    class InputIterator2,
    class OutputIterator,
    class BinaryFunction
>
inline
cudaError_t transform(InputIterator1 input1,
                      InputIterator2 input2,
                      OutputIterator output,
                      const size_t size,
                      BinaryFunction transform_op,
                      const cudaStream_t stream = 0,
                      bool debug_synchronous = false)
{
    using value_type1 = typename std::iterator_traits<InputIterator1>::value_type;
    using value_type2 = typename std::iterator_traits<InputIterator2>::value_type;

    // Present both inputs as one zipped range so the unary overload can be reused.
    auto zipped_input = ::rocprim::make_zip_iterator(::rocprim::make_tuple(input1, input2));

    // Wrap the binary operation so that it accepts the zipped element.
    auto unpacking_op
        = detail::unpack_binary_op<value_type1, value_type2, BinaryFunction>(transform_op);

    return transform<Config>(zipped_input, output, size, unpacking_op, stream, debug_synchronous);
}
#undef ROCPRIM_DETAIL_HIP_SYNC_AND_RETURN_ON_ERROR
/// @}
// end of group devicemodule
END_ROCPRIM_NAMESPACE
#endif // ROCPRIM_DEVICE_DEVICE_TRANSFORM_HPP_
// Copyright (c) 2018-2021 Advanced Micro Devices, Inc. All rights reserved.
//
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to deal
// in the Software without restriction, including without limitation the rights
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
// copies of the Software, and to permit persons to whom the Software is
// furnished to do so, subject to the following conditions:
//
// The above copyright notice and this permission notice shall be included in
// all copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
// THE SOFTWARE.
#ifndef ROCPRIM_DEVICE_DEVICE_TRANSFORM_CONFIG_HPP_
#define ROCPRIM_DEVICE_DEVICE_TRANSFORM_CONFIG_HPP_
#include <type_traits>
#include "../config.hpp"
#include "../functional.hpp"
#include "../detail/various.hpp"
#include "config_types.hpp"
/// \addtogroup primitivesmodule_deviceconfigs
/// @{
BEGIN_ROCPRIM_NAMESPACE
/// \brief Configuration of device-level transform primitives.
///
/// This is an alias of \p kernel_config with the same three arguments.
///
/// \tparam BlockSize - first argument forwarded to \p kernel_config.
/// \tparam ItemsPerThread - second argument forwarded to \p kernel_config.
/// \tparam SizeLimit - third argument forwarded to \p kernel_config; defaults to
/// \p ROCPRIM_GRID_SIZE_LIMIT.
///
/// NOTE(review): \p kernel_config presumably exposes these as \p block_size,
/// \p items_per_thread and \p size_limit (the names read by \p rocprim::transform) - confirm
/// in config_types.hpp.
template <unsigned int BlockSize,
unsigned int ItemsPerThread,
unsigned int SizeLimit = ROCPRIM_GRID_SIZE_LIMIT>
using transform_config = kernel_config<BlockSize, ItemsPerThread, SizeLimit>;
namespace detail
{
// Transform configuration used for the 803 architecture case.
// `Value` only influences the result through sizeof(Value).
template<class Value>
struct transform_config_803
{
// Size of Value measured in ints, rounded up (1 for types no larger than int).
static constexpr unsigned int item_scale =
::rocprim::detail::ceiling_div<unsigned int>(sizeof(Value), sizeof(int));
// 256 as the first (block size) argument; the second (items per thread) argument is
// 16 divided by item_scale, but never less than 1.
using type = transform_config<256, ::rocprim::max(1u, 16u / item_scale)>;
};
// Transform configuration used for the 900 architecture case; it is also passed as the
// trailing argument of select_arch in default_transform_config.
// `Value` only influences the result through sizeof(Value).
template<class Value>
struct transform_config_900
{
// Size of Value measured in ints, rounded up (1 for types no larger than int).
static constexpr unsigned int item_scale =
::rocprim::detail::ceiling_div<unsigned int>(sizeof(Value), sizeof(int));
// 256 as the first (block size) argument; the second (items per thread) argument is
// 16 divided by item_scale, but never less than 1.
using type = transform_config<256, ::rocprim::max(1u, 16u / item_scale)>;
};
// Transform configuration used for the ROCPRIM_ARCH_90a architecture case.
// `Value` only influences the result through sizeof(Value).
template<class Value>
struct transform_config_90a
{
// Size of Value measured in ints, rounded up (1 for types no larger than int).
static constexpr unsigned int item_scale =
::rocprim::detail::ceiling_div<unsigned int>(sizeof(Value), sizeof(int));
// 256 as the first (block size) argument; the second (items per thread) argument is
// 16 divided by item_scale, but never less than 1.
using type = transform_config<256, ::rocprim::max(1u, 16u / item_scale)>;
};
// Transform configuration used for the 1030 architecture case.
// `Value` only influences the result through sizeof(Value).
template<class Value>
struct transform_config_1030
{
// Size of Value measured in ints, rounded up (1 for types no larger than int).
static constexpr unsigned int item_scale =
::rocprim::detail::ceiling_div<unsigned int>(sizeof(Value), sizeof(int));
// 256 as the first (block size) argument; the second (items per thread) argument is
// 16 divided by item_scale, but never less than 1.
using type = transform_config<256, ::rocprim::max(1u, 16u / item_scale)>;
};
// Picks the transform configuration for TargetArch by deriving from select_arch.
// Explicit cases exist for 803, 900, ROCPRIM_ARCH_90a and 1030, each mapped to the
// matching transform_config_* instantiated with Value.
// NOTE(review): the trailing transform_config_900<Value> argument is presumably the
// fallback for architectures without an explicit case - confirm against select_arch.
template<unsigned int TargetArch, class Value>
struct default_transform_config
: select_arch<
TargetArch,
select_arch_case<803, transform_config_803<Value>>,
select_arch_case<900, transform_config_900<Value>>,
select_arch_case<ROCPRIM_ARCH_90a, transform_config_90a<Value>>,
select_arch_case<1030, transform_config_1030<Value>>,
transform_config_900<Value>
> { };
} // end namespace detail
END_ROCPRIM_NAMESPACE
/// @}
// end of group primitivesmodule_deviceconfigs
#endif // ROCPRIM_DEVICE_DEVICE_TRANSFORM_CONFIG_HPP_
// Copyright (c) 2021 Advanced Micro Devices, Inc. All rights reserved.
//
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to deal
// in the Software without restriction, including without limitation the rights
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
// copies of the Software, and to permit persons to whom the Software is
// furnished to do so, subject to the following conditions:
//
// The above copyright notice and this permission notice shall be included in
// all copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
// THE SOFTWARE.
#ifndef ROCPRIM_DEVICE_SPECIALIZATION_DEVICE_RADIX_MERGE_SORT_HPP_
#define ROCPRIM_DEVICE_SPECIALIZATION_DEVICE_RADIX_MERGE_SORT_HPP_
#include "../detail/device_radix_sort.hpp"
#include "../specialization/device_radix_single_sort.hpp"
BEGIN_ROCPRIM_NAMESPACE
namespace detail
{
/// Kernel entry point of one merge pass.
///
/// It only forwards its arguments to \p radix_block_merge_impl<BlockSize, ItemsPerThread>;
/// the launch bound is \p BlockSize threads per block.
///
/// \param keys_input, values_input - source ranges of this pass.
/// \param keys_output, values_output - destination ranges of this pass.
/// \param input_size - total number of items.
/// \param merge_items_per_block_size - run width passed by the host loop for this pass.
/// \param compare_function - comparison object forwarded to the implementation.
template<
    unsigned int BlockSize,
    unsigned int ItemsPerThread,
    class KeysInputIterator,
    class KeysOutputIterator,
    class ValuesInputIterator,
    class ValuesOutputIterator,
    class BinaryFunction
>
ROCPRIM_KERNEL
__launch_bounds__(BlockSize)
void radix_block_merge_kernel(KeysInputIterator keys_input,
                              KeysOutputIterator keys_output,
                              ValuesInputIterator values_input,
                              ValuesOutputIterator values_output,
                              const size_t input_size,
                              const unsigned int merge_items_per_block_size,
                              BinaryFunction compare_function)
{
    radix_block_merge_impl<BlockSize, ItemsPerThread>(keys_input,
                                                      keys_output,
                                                      values_input,
                                                      values_output,
                                                      input_size,
                                                      merge_items_per_block_size,
                                                      compare_function);
}
/// Sorts \p size items with a per-block sort followed by pairwise merge passes.
///
/// Steps visible in this function:
///  1. \p sort_single_kernel is launched over ceil(size / items_per_block) blocks, reading
///     \p keys_input / \p values_input and writing \p keys_buffer / \p values_buffer.
///  2. \p radix_block_merge_kernel is launched repeatedly with a run width that starts at
///     items_per_block and doubles each pass, alternating buffer -> output and
///     output -> buffer.
///  3. If the final data ended up in the buffers (including the case where no merge pass
///     ran), it is copied to the outputs with \p rocprim::transform and an identity op.
///
/// \tparam Config - provides \p sort_merge::block_size and \p sort_merge::items_per_thread.
/// \tparam Descending - forwarded to the sort kernel and the merge comparator.
///
/// \param bit, end_bit - bit range; the number of compared bits is <tt>end_bit - bit</tt>.
/// \param stream - stream used for all launches.
/// \param debug_synchronous - if true, prints the launch setup and synchronizes/timestamps
/// after every launch.
///
/// \return \p cudaSuccess, or the first error reported by a launch check, by a
/// synchronization (debug mode only) or by the final copy. Returns \p cudaSuccess without
/// launching anything when \p size is 0.
///
/// NOTE(review): \p keys_output and \p values_output are also read as merge inputs, so
/// they presumably must be readable iterators - confirm against callers.
template<
    class Config,
    bool Descending,
    class KeysInputIterator,
    class KeysOutputIterator,
    class ValuesInputIterator,
    class ValuesOutputIterator
>
inline
cudaError_t radix_sort_merge(KeysInputIterator keys_input,
                             typename std::iterator_traits<KeysInputIterator>::value_type * keys_buffer,
                             KeysOutputIterator keys_output,
                             ValuesInputIterator values_input,
                             typename std::iterator_traits<ValuesInputIterator>::value_type * values_buffer,
                             ValuesOutputIterator values_output,
                             unsigned int size,
                             unsigned int bit,
                             unsigned int end_bit,
                             cudaStream_t stream,
                             bool debug_synchronous)
{
    // An empty input would otherwise be launched with a grid of zero blocks, which is
    // not a valid launch configuration. rocprim::transform guards the same way.
    if(size == 0u)
    {
        return cudaSuccess;
    }

    using key_type = typename std::iterator_traits<KeysInputIterator>::value_type;
    using value_type = typename std::iterator_traits<ValuesInputIterator>::value_type;
    constexpr bool with_values = !std::is_same<value_type, ::rocprim::empty_type>::value;

    constexpr unsigned int items_per_thread = Config::sort_merge::items_per_thread;
    constexpr unsigned int block_size = Config::sort_merge::block_size;
    constexpr unsigned int items_per_block = block_size * items_per_thread;

    const unsigned int current_radix_bits = end_bit - bit;
    auto number_of_blocks = (size + items_per_block - 1) / items_per_block;

    // Start point for time measurements (only assigned in debug mode).
    std::chrono::high_resolution_clock::time_point start;

    if(debug_synchronous)
    {
        std::cout << "block size " << block_size << '\n';
        std::cout << "items per thread " << items_per_thread << '\n';
        std::cout << "number of blocks " << number_of_blocks << '\n';
        std::cout << "bit " << bit << '\n';
        std::cout << "current_radix_bits " << current_radix_bits << '\n';
    }

    // Step 1: per-block sort from the inputs into the temporary buffers.
    if(debug_synchronous) start = std::chrono::high_resolution_clock::now();
    sort_single_kernel<
        block_size, items_per_thread , Descending
    >
    <<<dim3(number_of_blocks), dim3(block_size), 0, stream>>>(
        keys_input, keys_buffer, values_input, values_buffer,
        size, bit, current_radix_bits
    );
    ROCPRIM_DETAIL_HIP_SYNC_AND_RETURN_ON_ERROR("radix_sort_single", size, start)

    // Step 2: merge passes. `temporary_store` is true while the most recent data lives in
    // keys_buffer/values_buffer and false while it lives in keys_output/values_output.
    // It is flipped at the top of each pass to name the DESTINATION of that pass.
    bool temporary_store = true;
    for(unsigned int block = items_per_block; block < size; block *= 2)
    {
        temporary_store = !temporary_store;
        if(temporary_store)
        {
            // Pass writes output -> buffer.
            if(debug_synchronous) start = std::chrono::high_resolution_clock::now();
            if( current_radix_bits == sizeof(key_type) * 8 )
            {
                // All key bits take part: comparator constructed without a bit range.
                radix_block_merge_kernel<block_size, items_per_thread>
                <<<dim3(number_of_blocks), dim3(block_size), 0, stream>>>(
                    keys_output, keys_buffer, values_output, values_buffer,
                    size, block, radix_merge_compare<Descending, false, key_type>()
                );
            }
            else
            {
                // Only [bit, bit + current_radix_bits) takes part in the comparison.
                radix_block_merge_kernel<block_size, items_per_thread>
                <<<dim3(number_of_blocks), dim3(block_size), 0, stream>>>(
                    keys_output, keys_buffer, values_output, values_buffer,
                    size, block, radix_merge_compare<Descending, true, key_type>(bit, current_radix_bits)
                );
            }
            ROCPRIM_DETAIL_HIP_SYNC_AND_RETURN_ON_ERROR("radix_block_merge_kernel", size, start);
        }
        else
        {
            // Pass writes buffer -> output.
            if(debug_synchronous) start = std::chrono::high_resolution_clock::now();
            if( current_radix_bits == sizeof(key_type) * 8 )
            {
                radix_block_merge_kernel<block_size, items_per_thread>
                <<<dim3(number_of_blocks), dim3(block_size), 0, stream>>>(
                    keys_buffer, keys_output, values_buffer, values_output,
                    size, block, radix_merge_compare<Descending, false, key_type>()
                );
            }
            else
            {
                radix_block_merge_kernel<block_size, items_per_thread>
                <<<dim3(number_of_blocks), dim3(block_size), 0, stream>>>(
                    keys_buffer, keys_output, values_buffer, values_output,
                    size, block, radix_merge_compare<Descending, true, key_type>(bit, current_radix_bits)
                );
            }
            ROCPRIM_DETAIL_HIP_SYNC_AND_RETURN_ON_ERROR("radix_block_merge_kernel", size, start);
        }

        // Stop here when doubling the run width would reach or exceed `size`. This is the
        // same exit the loop condition would take, but it avoids evaluating `block * 2`,
        // which wraps to 0 for block >= 2^31 and would make the loop run forever.
        // (`block < size` holds inside the loop, so `size - block` cannot underflow.)
        if(size - block <= block) break;
    }

    // Step 3: the result must end up in the output ranges.
    if(temporary_store)
    {
        cudaError_t keys_error = ::rocprim::transform(
            keys_buffer, keys_output, size,
            ::rocprim::identity<key_type>(), stream, debug_synchronous
        );
        if(keys_error != cudaSuccess) return keys_error;

        if(with_values)
        {
            cudaError_t values_error = ::rocprim::transform(
                values_buffer, values_output, size,
                ::rocprim::identity<value_type>(), stream, debug_synchronous
            );
            if(values_error != cudaSuccess) return values_error;
        }
    }
    return cudaSuccess;
}
} // end namespace detail
END_ROCPRIM_NAMESPACE
#endif // ROCPRIM_DEVICE_SPECIALIZATION_DEVICE_RADIX_MERGE_SORT_HPP_
// Copyright (c) 2021 Advanced Micro Devices, Inc. All rights reserved.
//
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to deal
// in the Software without restriction, including without limitation the rights
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
// copies of the Software, and to permit persons to whom the Software is
// furnished to do so, subject to the following conditions:
//
// The above copyright notice and this permission notice shall be included in
// all copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
// THE SOFTWARE.
#ifndef ROCPRIM_DEVICE_SPECIALIZATION_DEVICE_RADIX_SINGLE_SORT_HPP_
#define ROCPRIM_DEVICE_SPECIALIZATION_DEVICE_RADIX_SINGLE_SORT_HPP_
#include "../detail/device_radix_sort.hpp"
BEGIN_ROCPRIM_NAMESPACE
namespace detail
{
// Post-launch error check used by the host functions of the radix sort specializations.
//
// Expands to a braced block that:
//   1. reads cudaGetLastError() and returns it from the ENCLOSING function if it is
//      not cudaSuccess;
//   2. when the enclosing function's `debug_synchronous` is true, prints
//      `name(size)`, synchronizes the enclosing function's `stream`, returns on a
//      synchronization error, and prints the time elapsed since `start` in milliseconds.
//
// The macro relies on `debug_synchronous` and `stream` being visible at the expansion
// site and contains `return` statements, so it can only be used inside a function
// returning cudaError_t. The expansion is a plain `{ ... }` block, so call sites may
// omit the trailing semicolon (several in this library do).
//
// The synchronization result reuses `_error` instead of a separate `__error` local:
// identifiers containing a double underscore are reserved for the implementation, and
// this keeps the replacement list token-identical to the definition of the same macro in
// device_transform.hpp.
#define ROCPRIM_DETAIL_HIP_SYNC_AND_RETURN_ON_ERROR(name, size, start) \
    { \
        auto _error = cudaGetLastError(); \
        if(_error != cudaSuccess) return _error; \
        if(debug_synchronous) \
        { \
            std::cout << name << "(" << size << ")"; \
            _error = cudaStreamSynchronize(stream); \
            if(_error != cudaSuccess) return _error; \
            auto _end = std::chrono::high_resolution_clock::now(); \
            auto _d = std::chrono::duration_cast<std::chrono::duration<double>>(_end - start); \
            std::cout << " " << _d.count() * 1000 << " ms" << '\n'; \
        } \
    }
/// Kernel entry point of the block-level sort.
///
/// It only forwards its arguments to \p sort_single<BlockSize, ItemsPerThread, Descending>;
/// the launch bound is \p BlockSize threads per block.
///
/// \param keys_input, values_input - source ranges.
/// \param keys_output, values_output - destination ranges.
/// \param size - number of items.
/// \param bit - first bit of the range passed to the implementation.
/// \param current_radix_bits - number of bits passed to the implementation.
template<
    unsigned int BlockSize,
    unsigned int ItemsPerThread,
    bool Descending,
    class KeysInputIterator,
    class KeysOutputIterator,
    class ValuesInputIterator,
    class ValuesOutputIterator
>
ROCPRIM_KERNEL
__launch_bounds__(BlockSize)
void sort_single_kernel(KeysInputIterator keys_input,
                        KeysOutputIterator keys_output,
                        ValuesInputIterator values_input,
                        ValuesOutputIterator values_output,
                        unsigned int size,
                        unsigned int bit,
                        unsigned int current_radix_bits)
{
    sort_single<BlockSize, ItemsPerThread, Descending>(keys_input,
                                                       keys_output,
                                                       values_input,
                                                       values_output,
                                                       size,
                                                       bit,
                                                       current_radix_bits);
}
/// Launches \p sort_single_kernel<BlockSize, ItemsPerThread, Descending> with exactly one
/// block of \p BlockSize threads on \p stream.
///
/// \param bit, end_bit - bit range; <tt>end_bit - bit</tt> is passed to the kernel as the
/// number of radix bits.
/// \param debug_synchronous - if true, prints the launch setup, then synchronizes and
/// reports the elapsed time after the launch.
///
/// \return \p cudaSuccess, or the error found by the post-launch check.
template<
    unsigned int BlockSize,
    unsigned int ItemsPerThread,
    bool Descending,
    class KeysInputIterator,
    class KeysOutputIterator,
    class ValuesInputIterator,
    class ValuesOutputIterator
>
inline
cudaError_t radix_sort_single(KeysInputIterator keys_input,
                              KeysOutputIterator keys_output,
                              ValuesInputIterator values_input,
                              ValuesOutputIterator values_output,
                              unsigned int size,
                              unsigned int bit,
                              unsigned int end_bit,
                              cudaStream_t stream,
                              bool debug_synchronous)
{
    const unsigned int current_radix_bits = end_bit - bit;

    // Timestamp of the launch; only assigned (and only read) in debug mode.
    std::chrono::high_resolution_clock::time_point launch_begin;

    if(debug_synchronous)
    {
        std::cout << "BlockSize " << BlockSize << '\n';
        std::cout << "ItemsPerThread " << ItemsPerThread << '\n';
        std::cout << "bit " << bit << '\n';
        std::cout << "current_radix_bits " << current_radix_bits << '\n';
        launch_begin = std::chrono::high_resolution_clock::now();
    }

    // One block handles the whole input.
    sort_single_kernel<BlockSize, ItemsPerThread, Descending>
        <<<dim3(1), dim3(BlockSize), 0, stream>>>(
            keys_input, keys_output, values_input, values_output,
            size, bit, current_radix_bits
        );
    ROCPRIM_DETAIL_HIP_SYNC_AND_RETURN_ON_ERROR("radix_sort_single", size, launch_begin)

    return cudaSuccess;
}
/// Smallest single-block variant: always runs \p radix_sort_single with 64 threads and
/// 1 item per thread. \p Config is accepted for signature parity with the other
/// \p radix_sort_single_limit* functions but is not referenced in the body.
template<
    class Config,
    bool Descending,
    class KeysInputIterator,
    class KeysOutputIterator,
    class ValuesInputIterator,
    class ValuesOutputIterator
>
inline
cudaError_t radix_sort_single_limit64(KeysInputIterator keys_input,
                                      KeysOutputIterator keys_output,
                                      ValuesInputIterator values_input,
                                      ValuesOutputIterator values_output,
                                      unsigned int size,
                                      unsigned int bit,
                                      unsigned int end_bit,
                                      cudaStream_t stream,
                                      bool debug_synchronous)
{
    return radix_sort_single<64U, 1U, Descending>(keys_input, keys_output,
                                                  values_input, values_output,
                                                  size, bit, end_bit,
                                                  stream, debug_synchronous);
}
/// Runs \p radix_sort_single with 64 threads and 2 items per thread, unless \p size is at
/// most 64 and \p Config::force_single_kernel_config is false, in which case the call is
/// handed to \p radix_sort_single_limit64.
template<
    class Config,
    bool Descending,
    class KeysInputIterator,
    class KeysOutputIterator,
    class ValuesInputIterator,
    class ValuesOutputIterator
>
inline
cudaError_t radix_sort_single_limit128(KeysInputIterator keys_input,
                                       KeysOutputIterator keys_output,
                                       ValuesInputIterator values_input,
                                       ValuesOutputIterator values_output,
                                       unsigned int size,
                                       unsigned int bit,
                                       unsigned int end_bit,
                                       cudaStream_t stream,
                                       bool debug_synchronous)
{
    // Small inputs fall through to the next smaller variant unless the config pins this one.
    const bool use_smaller_variant = !Config::force_single_kernel_config && size <= 64U;
    if(use_smaller_variant)
    {
        return radix_sort_single_limit64<Config, Descending>(keys_input, keys_output,
                                                             values_input, values_output,
                                                             size, bit, end_bit,
                                                             stream, debug_synchronous);
    }
    return radix_sort_single<64U, 2U, Descending>(keys_input, keys_output,
                                                  values_input, values_output,
                                                  size, bit, end_bit,
                                                  stream, debug_synchronous);
}
/// Runs \p radix_sort_single with 64 threads and 3 items per thread, unless \p size is at
/// most 128 and \p Config::force_single_kernel_config is false, in which case the call is
/// handed to \p radix_sort_single_limit128.
template<
    class Config,
    bool Descending,
    class KeysInputIterator,
    class KeysOutputIterator,
    class ValuesInputIterator,
    class ValuesOutputIterator
>
inline
cudaError_t radix_sort_single_limit192(KeysInputIterator keys_input,
                                       KeysOutputIterator keys_output,
                                       ValuesInputIterator values_input,
                                       ValuesOutputIterator values_output,
                                       unsigned int size,
                                       unsigned int bit,
                                       unsigned int end_bit,
                                       cudaStream_t stream,
                                       bool debug_synchronous)
{
    // Small inputs fall through to the next smaller variant unless the config pins this one.
    const bool use_smaller_variant = !Config::force_single_kernel_config && size <= 128U;
    if(use_smaller_variant)
    {
        return radix_sort_single_limit128<Config, Descending>(keys_input, keys_output,
                                                              values_input, values_output,
                                                              size, bit, end_bit,
                                                              stream, debug_synchronous);
    }
    return radix_sort_single<64U, 3U, Descending>(keys_input, keys_output,
                                                  values_input, values_output,
                                                  size, bit, end_bit,
                                                  stream, debug_synchronous);
}
/// Runs \p radix_sort_single with 64 threads and 4 items per thread, unless \p size is at
/// most 192 and \p Config::force_single_kernel_config is false, in which case the call is
/// handed to \p radix_sort_single_limit192.
template<
    class Config,
    bool Descending,
    class KeysInputIterator,
    class KeysOutputIterator,
    class ValuesInputIterator,
    class ValuesOutputIterator
>
inline
cudaError_t radix_sort_single_limit256(KeysInputIterator keys_input,
                                       KeysOutputIterator keys_output,
                                       ValuesInputIterator values_input,
                                       ValuesOutputIterator values_output,
                                       unsigned int size,
                                       unsigned int bit,
                                       unsigned int end_bit,
                                       cudaStream_t stream,
                                       bool debug_synchronous)
{
    // Small inputs fall through to the next smaller variant unless the config pins this one.
    const bool use_smaller_variant = !Config::force_single_kernel_config && size <= 192U;
    if(use_smaller_variant)
    {
        return radix_sort_single_limit192<Config, Descending>(keys_input, keys_output,
                                                              values_input, values_output,
                                                              size, bit, end_bit,
                                                              stream, debug_synchronous);
    }
    return radix_sort_single<64U, 4U, Descending>(keys_input, keys_output,
                                                  values_input, values_output,
                                                  size, bit, end_bit,
                                                  stream, debug_synchronous);
}
/// Runs \p radix_sort_single with 64 threads and 5 items per thread, unless \p size is at
/// most 256 and \p Config::force_single_kernel_config is false, in which case the call is
/// handed to \p radix_sort_single_limit256.
template<
    class Config,
    bool Descending,
    class KeysInputIterator,
    class KeysOutputIterator,
    class ValuesInputIterator,
    class ValuesOutputIterator
>
inline
cudaError_t radix_sort_single_limit320(KeysInputIterator keys_input,
                                       KeysOutputIterator keys_output,
                                       ValuesInputIterator values_input,
                                       ValuesOutputIterator values_output,
                                       unsigned int size,
                                       unsigned int bit,
                                       unsigned int end_bit,
                                       cudaStream_t stream,
                                       bool debug_synchronous)
{
    // Small inputs fall through to the next smaller variant unless the config pins this one.
    const bool use_smaller_variant = !Config::force_single_kernel_config && size <= 256U;
    if(use_smaller_variant)
    {
        return radix_sort_single_limit256<Config, Descending>(keys_input, keys_output,
                                                              values_input, values_output,
                                                              size, bit, end_bit,
                                                              stream, debug_synchronous);
    }
    return radix_sort_single<64U, 5U, Descending>(keys_input, keys_output,
                                                  values_input, values_output,
                                                  size, bit, end_bit,
                                                  stream, debug_synchronous);
}
/// Runs \p radix_sort_single with 256 threads and 2 items per thread, unless \p size is at
/// most 320 and \p Config::force_single_kernel_config is false, in which case the call is
/// handed to \p radix_sort_single_limit320.
template<
    class Config,
    bool Descending,
    class KeysInputIterator,
    class KeysOutputIterator,
    class ValuesInputIterator,
    class ValuesOutputIterator
>
inline
cudaError_t radix_sort_single_limit512(KeysInputIterator keys_input,
                                       KeysOutputIterator keys_output,
                                       ValuesInputIterator values_input,
                                       ValuesOutputIterator values_output,
                                       unsigned int size,
                                       unsigned int bit,
                                       unsigned int end_bit,
                                       cudaStream_t stream,
                                       bool debug_synchronous)
{
    // Small inputs fall through to the next smaller variant unless the config pins this one.
    const bool use_smaller_variant = !Config::force_single_kernel_config && size <= 320U;
    if(use_smaller_variant)
    {
        return radix_sort_single_limit320<Config, Descending>(keys_input, keys_output,
                                                              values_input, values_output,
                                                              size, bit, end_bit,
                                                              stream, debug_synchronous);
    }
    return radix_sort_single<256U, 2U, Descending>(keys_input, keys_output,
                                                   values_input, values_output,
                                                   size, bit, end_bit,
                                                   stream, debug_synchronous);
}
/// Runs \p radix_sort_single with 256 threads and 3 items per thread, unless \p size is at
/// most 512 and \p Config::force_single_kernel_config is false, in which case the call is
/// handed to \p radix_sort_single_limit512.
template<
    class Config,
    bool Descending,
    class KeysInputIterator,
    class KeysOutputIterator,
    class ValuesInputIterator,
    class ValuesOutputIterator
>
inline
cudaError_t radix_sort_single_limit768(KeysInputIterator keys_input,
                                       KeysOutputIterator keys_output,
                                       ValuesInputIterator values_input,
                                       ValuesOutputIterator values_output,
                                       unsigned int size,
                                       unsigned int bit,
                                       unsigned int end_bit,
                                       cudaStream_t stream,
                                       bool debug_synchronous)
{
    // Small inputs fall through to the next smaller variant unless the config pins this one.
    const bool use_smaller_variant = !Config::force_single_kernel_config && size <= 512U;
    if(use_smaller_variant)
    {
        return radix_sort_single_limit512<Config, Descending>(keys_input, keys_output,
                                                              values_input, values_output,
                                                              size, bit, end_bit,
                                                              stream, debug_synchronous);
    }
    return radix_sort_single<256U, 3U, Descending>(keys_input, keys_output,
                                                   values_input, values_output,
                                                   size, bit, end_bit,
                                                   stream, debug_synchronous);
}
/// Runs \p radix_sort_single with 256 threads and 4 items per thread, unless \p size is at
/// most 768 and \p Config::force_single_kernel_config is false, in which case the call is
/// handed to \p radix_sort_single_limit768.
template<
    class Config,
    bool Descending,
    class KeysInputIterator,
    class KeysOutputIterator,
    class ValuesInputIterator,
    class ValuesOutputIterator
>
inline
cudaError_t radix_sort_single_limit1024(KeysInputIterator keys_input,
                                        KeysOutputIterator keys_output,
                                        ValuesInputIterator values_input,
                                        ValuesOutputIterator values_output,
                                        unsigned int size,
                                        unsigned int bit,
                                        unsigned int end_bit,
                                        cudaStream_t stream,
                                        bool debug_synchronous)
{
    // Small inputs fall through to the next smaller variant unless the config pins this one.
    const bool use_smaller_variant = !Config::force_single_kernel_config && size <= 768U;
    if(use_smaller_variant)
    {
        return radix_sort_single_limit768<Config, Descending>(keys_input, keys_output,
                                                              values_input, values_output,
                                                              size, bit, end_bit,
                                                              stream, debug_synchronous);
    }
    return radix_sort_single<256U, 4U, Descending>(keys_input, keys_output,
                                                   values_input, values_output,
                                                   size, bit, end_bit,
                                                   stream, debug_synchronous);
}
/// Runs \p radix_sort_single with 256 threads and 6 items per thread, unless \p size is at
/// most 1024 and \p Config::force_single_kernel_config is false, in which case the call is
/// handed to \p radix_sort_single_limit1024.
template<
    class Config,
    bool Descending,
    class KeysInputIterator,
    class KeysOutputIterator,
    class ValuesInputIterator,
    class ValuesOutputIterator
>
inline
cudaError_t radix_sort_single_limit1536(KeysInputIterator keys_input,
                                        KeysOutputIterator keys_output,
                                        ValuesInputIterator values_input,
                                        ValuesOutputIterator values_output,
                                        unsigned int size,
                                        unsigned int bit,
                                        unsigned int end_bit,
                                        cudaStream_t stream,
                                        bool debug_synchronous)
{
    // Small inputs fall through to the next smaller variant unless the config pins this one.
    const bool use_smaller_variant = !Config::force_single_kernel_config && size <= 1024U;
    if(use_smaller_variant)
    {
        return radix_sort_single_limit1024<Config, Descending>(keys_input, keys_output,
                                                               values_input, values_output,
                                                               size, bit, end_bit,
                                                               stream, debug_synchronous);
    }
    return radix_sort_single<256U, 6U, Descending>(keys_input, keys_output,
                                                   values_input, values_output,
                                                   size, bit, end_bit,
                                                   stream, debug_synchronous);
}
/// Runs \p radix_sort_single with 256 threads and 8 items per thread, unless \p size is at
/// most 1536 and \p Config::force_single_kernel_config is false, in which case the call is
/// handed to \p radix_sort_single_limit1536.
template<
    class Config,
    bool Descending,
    class KeysInputIterator,
    class KeysOutputIterator,
    class ValuesInputIterator,
    class ValuesOutputIterator
>
inline
cudaError_t radix_sort_single_limit2048(KeysInputIterator keys_input,
                                        KeysOutputIterator keys_output,
                                        ValuesInputIterator values_input,
                                        ValuesOutputIterator values_output,
                                        unsigned int size,
                                        unsigned int bit,
                                        unsigned int end_bit,
                                        cudaStream_t stream,
                                        bool debug_synchronous)
{
    // Small inputs fall through to the next smaller variant unless the config pins this one.
    const bool use_smaller_variant = !Config::force_single_kernel_config && size <= 1536U;
    if(use_smaller_variant)
    {
        return radix_sort_single_limit1536<Config, Descending>(keys_input, keys_output,
                                                               values_input, values_output,
                                                               size, bit, end_bit,
                                                               stream, debug_synchronous);
    }
    return radix_sort_single<256U, 8U, Descending>(keys_input, keys_output,
                                                   values_input, values_output,
                                                   size, bit, end_bit,
                                                   stream, debug_synchronous);
}
/// Runs \p radix_sort_single with 256 threads and 10 items per thread, unless \p size is at
/// most 2048 and \p Config::force_single_kernel_config is false, in which case the call is
/// handed to \p radix_sort_single_limit2048.
template<
    class Config,
    bool Descending,
    class KeysInputIterator,
    class KeysOutputIterator,
    class ValuesInputIterator,
    class ValuesOutputIterator
>
inline
cudaError_t radix_sort_single_limit2560(KeysInputIterator keys_input,
                                        KeysOutputIterator keys_output,
                                        ValuesInputIterator values_input,
                                        ValuesOutputIterator values_output,
                                        unsigned int size,
                                        unsigned int bit,
                                        unsigned int end_bit,
                                        cudaStream_t stream,
                                        bool debug_synchronous)
{
    // Small inputs fall through to the next smaller variant unless the config pins this one.
    const bool use_smaller_variant = !Config::force_single_kernel_config && size <= 2048U;
    if(use_smaller_variant)
    {
        return radix_sort_single_limit2048<Config, Descending>(keys_input, keys_output,
                                                               values_input, values_output,
                                                               size, bit, end_bit,
                                                               stream, debug_synchronous);
    }
    return radix_sort_single<256U, 10U, Descending>(keys_input, keys_output,
                                                    values_input, values_output,
                                                    size, bit, end_bit,
                                                    stream, debug_synchronous);
}
/// Runs \p radix_sort_single with 256 threads and 12 items per thread, unless \p size is at
/// most 2560 and \p Config::force_single_kernel_config is false, in which case the call is
/// handed to \p radix_sort_single_limit2560.
template<
    class Config,
    bool Descending,
    class KeysInputIterator,
    class KeysOutputIterator,
    class ValuesInputIterator,
    class ValuesOutputIterator
>
inline
cudaError_t radix_sort_single_limit3072(KeysInputIterator keys_input,
                                        KeysOutputIterator keys_output,
                                        ValuesInputIterator values_input,
                                        ValuesOutputIterator values_output,
                                        unsigned int size,
                                        unsigned int bit,
                                        unsigned int end_bit,
                                        cudaStream_t stream,
                                        bool debug_synchronous)
{
    // Small inputs fall through to the next smaller variant unless the config pins this one.
    const bool use_smaller_variant = !Config::force_single_kernel_config && size <= 2560U;
    if(use_smaller_variant)
    {
        return radix_sort_single_limit2560<Config, Descending>(keys_input, keys_output,
                                                               values_input, values_output,
                                                               size, bit, end_bit,
                                                               stream, debug_synchronous);
    }
    return radix_sort_single<256U, 12U, Descending>(keys_input, keys_output,
                                                    values_input, values_output,
                                                    size, bit, end_bit,
                                                    stream, debug_synchronous);
}
/// \brief Dispatch step of the single-block radix sort for the 3584-item tier.
///
/// When Config::force_single_kernel_config is not set and \p size is at most 3072,
/// the call is handed down to radix_sort_single_limit3072. Otherwise
/// radix_sort_single<256U, 14U, Descending> is used. All arguments are passed
/// through unchanged and the callee's status is returned.
template<
    class Config,
    bool Descending,
    class KeysInputIterator,
    class KeysOutputIterator,
    class ValuesInputIterator,
    class ValuesOutputIterator
>
inline
cudaError_t radix_sort_single_limit3584(KeysInputIterator keys_input,
                                        KeysOutputIterator keys_output,
                                        ValuesInputIterator values_input,
                                        ValuesOutputIterator values_output,
                                        unsigned int size,
                                        unsigned int bit,
                                        unsigned int end_bit,
                                        cudaStream_t stream,
                                        bool debug_synchronous)
{
    // The lower tier is only tried when the configuration does not pin the kernel.
    const bool use_lower_tier = !Config::force_single_kernel_config && size <= 3072U;
    if(use_lower_tier)
    {
        return radix_sort_single_limit3072<Config, Descending>(
            keys_input, keys_output, values_input, values_output,
            size, bit, end_bit, stream, debug_synchronous
        );
    }

    return radix_sort_single<256U, 14U, Descending>(
        keys_input, keys_output, values_input, values_output,
        size, bit, end_bit, stream, debug_synchronous
    );
}
/// \brief Dispatch step of the single-block radix sort for the 4096-item tier.
///
/// When Config::force_single_kernel_config is not set and \p size is at most 3584,
/// the call is handed down to radix_sort_single_limit3584. Otherwise
/// radix_sort_single<256U, 16U, Descending> is used. All arguments are passed
/// through unchanged and the callee's status is returned.
template<
    class Config,
    bool Descending,
    class KeysInputIterator,
    class KeysOutputIterator,
    class ValuesInputIterator,
    class ValuesOutputIterator
>
inline
cudaError_t radix_sort_single_limit4096(KeysInputIterator keys_input,
                                        KeysOutputIterator keys_output,
                                        ValuesInputIterator values_input,
                                        ValuesOutputIterator values_output,
                                        unsigned int size,
                                        unsigned int bit,
                                        unsigned int end_bit,
                                        cudaStream_t stream,
                                        bool debug_synchronous)
{
    // The lower tier is only tried when the configuration does not pin the kernel.
    const bool use_lower_tier = !Config::force_single_kernel_config && size <= 3584U;
    if(use_lower_tier)
    {
        return radix_sort_single_limit3584<Config, Descending>(
            keys_input, keys_output, values_input, values_output,
            size, bit, end_bit, stream, debug_synchronous
        );
    }

    return radix_sort_single<256U, 16U, Descending>(
        keys_input, keys_output, values_input, values_output,
        size, bit, end_bit, stream, debug_synchronous
    );
}
/// \brief Config-driven entry point, selected when
/// Config::sort_single::items_per_thread * Config::sort_single::block_size is at most 64.
///
/// Passes every argument on to radix_sort_single_limit64 and returns its status.
template<
    class Config,
    bool Descending,
    class KeysInputIterator,
    class KeysOutputIterator,
    class ValuesInputIterator,
    class ValuesOutputIterator
>
inline
auto radix_sort_single(KeysInputIterator keys_input,
                       KeysOutputIterator keys_output,
                       ValuesInputIterator values_input,
                       ValuesOutputIterator values_output,
                       unsigned int size,
                       unsigned int bit,
                       unsigned int end_bit,
                       cudaStream_t stream,
                       bool debug_synchronous)
    -> typename std::enable_if<
        Config::sort_single::items_per_thread * Config::sort_single::block_size <= 64U,
        cudaError_t
    >::type
{
    const cudaError_t status = radix_sort_single_limit64<Config, Descending>(
        keys_input, keys_output, values_input, values_output,
        size, bit, end_bit, stream, debug_synchronous
    );
    return status;
}
/// \brief Config-driven entry point, selected when
/// Config::sort_single::items_per_thread * Config::sort_single::block_size
/// is greater than 64 and at most 128.
///
/// Passes every argument on to radix_sort_single_limit128 and returns its status.
template<
    class Config,
    bool Descending,
    class KeysInputIterator,
    class KeysOutputIterator,
    class ValuesInputIterator,
    class ValuesOutputIterator
>
inline
auto radix_sort_single(KeysInputIterator keys_input,
                       KeysOutputIterator keys_output,
                       ValuesInputIterator values_input,
                       ValuesOutputIterator values_output,
                       unsigned int size,
                       unsigned int bit,
                       unsigned int end_bit,
                       cudaStream_t stream,
                       bool debug_synchronous)
    -> typename std::enable_if<
        (Config::sort_single::items_per_thread * Config::sort_single::block_size > 64U) &&
        Config::sort_single::items_per_thread * Config::sort_single::block_size <= 128U,
        cudaError_t
    >::type
{
    const cudaError_t status = radix_sort_single_limit128<Config, Descending>(
        keys_input, keys_output, values_input, values_output,
        size, bit, end_bit, stream, debug_synchronous
    );
    return status;
}
/// \brief Config-driven entry point, selected when
/// Config::sort_single::items_per_thread * Config::sort_single::block_size
/// is greater than 128 and at most 192.
///
/// Passes every argument on to radix_sort_single_limit192 and returns its status.
template<
    class Config,
    bool Descending,
    class KeysInputIterator,
    class KeysOutputIterator,
    class ValuesInputIterator,
    class ValuesOutputIterator
>
inline
auto radix_sort_single(KeysInputIterator keys_input,
                       KeysOutputIterator keys_output,
                       ValuesInputIterator values_input,
                       ValuesOutputIterator values_output,
                       unsigned int size,
                       unsigned int bit,
                       unsigned int end_bit,
                       cudaStream_t stream,
                       bool debug_synchronous)
    -> typename std::enable_if<
        (Config::sort_single::items_per_thread * Config::sort_single::block_size > 128U) &&
        Config::sort_single::items_per_thread * Config::sort_single::block_size <= 192U,
        cudaError_t
    >::type
{
    const cudaError_t status = radix_sort_single_limit192<Config, Descending>(
        keys_input, keys_output, values_input, values_output,
        size, bit, end_bit, stream, debug_synchronous
    );
    return status;
}
/// \brief Config-driven entry point, selected when
/// Config::sort_single::items_per_thread * Config::sort_single::block_size
/// is greater than 192 and at most 256.
///
/// Passes every argument on to radix_sort_single_limit256 and returns its status.
template<
    class Config,
    bool Descending,
    class KeysInputIterator,
    class KeysOutputIterator,
    class ValuesInputIterator,
    class ValuesOutputIterator
>
inline
auto radix_sort_single(KeysInputIterator keys_input,
                       KeysOutputIterator keys_output,
                       ValuesInputIterator values_input,
                       ValuesOutputIterator values_output,
                       unsigned int size,
                       unsigned int bit,
                       unsigned int end_bit,
                       cudaStream_t stream,
                       bool debug_synchronous)
    -> typename std::enable_if<
        (Config::sort_single::items_per_thread * Config::sort_single::block_size > 192U) &&
        Config::sort_single::items_per_thread * Config::sort_single::block_size <= 256U,
        cudaError_t
    >::type
{
    const cudaError_t status = radix_sort_single_limit256<Config, Descending>(
        keys_input, keys_output, values_input, values_output,
        size, bit, end_bit, stream, debug_synchronous
    );
    return status;
}
/// \brief Config-driven entry point, selected when
/// Config::sort_single::items_per_thread * Config::sort_single::block_size
/// is greater than 256 and at most 320.
///
/// Passes every argument on to radix_sort_single_limit320 and returns its status.
template<
    class Config,
    bool Descending,
    class KeysInputIterator,
    class KeysOutputIterator,
    class ValuesInputIterator,
    class ValuesOutputIterator
>
inline
auto radix_sort_single(KeysInputIterator keys_input,
                       KeysOutputIterator keys_output,
                       ValuesInputIterator values_input,
                       ValuesOutputIterator values_output,
                       unsigned int size,
                       unsigned int bit,
                       unsigned int end_bit,
                       cudaStream_t stream,
                       bool debug_synchronous)
    -> typename std::enable_if<
        (Config::sort_single::items_per_thread * Config::sort_single::block_size > 256U) &&
        Config::sort_single::items_per_thread * Config::sort_single::block_size <= 320U,
        cudaError_t
    >::type
{
    const cudaError_t status = radix_sort_single_limit320<Config, Descending>(
        keys_input, keys_output, values_input, values_output,
        size, bit, end_bit, stream, debug_synchronous
    );
    return status;
}
/// \brief Config-driven entry point, selected when
/// Config::sort_single::items_per_thread * Config::sort_single::block_size
/// is greater than 320 and at most 512.
///
/// Passes every argument on to radix_sort_single_limit512 and returns its status.
template<
    class Config,
    bool Descending,
    class KeysInputIterator,
    class KeysOutputIterator,
    class ValuesInputIterator,
    class ValuesOutputIterator
>
inline
auto radix_sort_single(KeysInputIterator keys_input,
                       KeysOutputIterator keys_output,
                       ValuesInputIterator values_input,
                       ValuesOutputIterator values_output,
                       unsigned int size,
                       unsigned int bit,
                       unsigned int end_bit,
                       cudaStream_t stream,
                       bool debug_synchronous)
    -> typename std::enable_if<
        (Config::sort_single::items_per_thread * Config::sort_single::block_size > 320U) &&
        Config::sort_single::items_per_thread * Config::sort_single::block_size <= 512U,
        cudaError_t
    >::type
{
    const cudaError_t status = radix_sort_single_limit512<Config, Descending>(
        keys_input, keys_output, values_input, values_output,
        size, bit, end_bit, stream, debug_synchronous
    );
    return status;
}
/// \brief Config-driven entry point, selected when
/// Config::sort_single::items_per_thread * Config::sort_single::block_size
/// is greater than 512 and at most 768.
///
/// Passes every argument on to radix_sort_single_limit768 and returns its status.
template<
    class Config,
    bool Descending,
    class KeysInputIterator,
    class KeysOutputIterator,
    class ValuesInputIterator,
    class ValuesOutputIterator
>
inline
auto radix_sort_single(KeysInputIterator keys_input,
                       KeysOutputIterator keys_output,
                       ValuesInputIterator values_input,
                       ValuesOutputIterator values_output,
                       unsigned int size,
                       unsigned int bit,
                       unsigned int end_bit,
                       cudaStream_t stream,
                       bool debug_synchronous)
    -> typename std::enable_if<
        (Config::sort_single::items_per_thread * Config::sort_single::block_size > 512U) &&
        Config::sort_single::items_per_thread * Config::sort_single::block_size <= 768U,
        cudaError_t
    >::type
{
    const cudaError_t status = radix_sort_single_limit768<Config, Descending>(
        keys_input, keys_output, values_input, values_output,
        size, bit, end_bit, stream, debug_synchronous
    );
    return status;
}
/// \brief Config-driven entry point, selected when
/// Config::sort_single::items_per_thread * Config::sort_single::block_size
/// is greater than 768 and at most 1024.
///
/// Passes every argument on to radix_sort_single_limit1024 and returns its status.
template<
    class Config,
    bool Descending,
    class KeysInputIterator,
    class KeysOutputIterator,
    class ValuesInputIterator,
    class ValuesOutputIterator
>
inline
auto radix_sort_single(KeysInputIterator keys_input,
                       KeysOutputIterator keys_output,
                       ValuesInputIterator values_input,
                       ValuesOutputIterator values_output,
                       unsigned int size,
                       unsigned int bit,
                       unsigned int end_bit,
                       cudaStream_t stream,
                       bool debug_synchronous)
    -> typename std::enable_if<
        (Config::sort_single::items_per_thread * Config::sort_single::block_size > 768U) &&
        Config::sort_single::items_per_thread * Config::sort_single::block_size <= 1024U,
        cudaError_t
    >::type
{
    const cudaError_t status = radix_sort_single_limit1024<Config, Descending>(
        keys_input, keys_output, values_input, values_output,
        size, bit, end_bit, stream, debug_synchronous
    );
    return status;
}
/// \brief Config-driven entry point, selected when
/// Config::sort_single::items_per_thread * Config::sort_single::block_size
/// is greater than 1024 and at most 1536.
///
/// Passes every argument on to radix_sort_single_limit1536 and returns its status.
template<
    class Config,
    bool Descending,
    class KeysInputIterator,
    class KeysOutputIterator,
    class ValuesInputIterator,
    class ValuesOutputIterator
>
inline
auto radix_sort_single(KeysInputIterator keys_input,
                       KeysOutputIterator keys_output,
                       ValuesInputIterator values_input,
                       ValuesOutputIterator values_output,
                       unsigned int size,
                       unsigned int bit,
                       unsigned int end_bit,
                       cudaStream_t stream,
                       bool debug_synchronous)
    -> typename std::enable_if<
        (Config::sort_single::items_per_thread * Config::sort_single::block_size > 1024U) &&
        Config::sort_single::items_per_thread * Config::sort_single::block_size <= 1536U,
        cudaError_t
    >::type
{
    const cudaError_t status = radix_sort_single_limit1536<Config, Descending>(
        keys_input, keys_output, values_input, values_output,
        size, bit, end_bit, stream, debug_synchronous
    );
    return status;
}
/// \brief Config-driven entry point, selected when
/// Config::sort_single::items_per_thread * Config::sort_single::block_size
/// is greater than 1536 and at most 2048.
///
/// Passes every argument on to radix_sort_single_limit2048 and returns its status.
template<
    class Config,
    bool Descending,
    class KeysInputIterator,
    class KeysOutputIterator,
    class ValuesInputIterator,
    class ValuesOutputIterator
>
inline
auto radix_sort_single(KeysInputIterator keys_input,
                       KeysOutputIterator keys_output,
                       ValuesInputIterator values_input,
                       ValuesOutputIterator values_output,
                       unsigned int size,
                       unsigned int bit,
                       unsigned int end_bit,
                       cudaStream_t stream,
                       bool debug_synchronous)
    -> typename std::enable_if<
        (Config::sort_single::items_per_thread * Config::sort_single::block_size > 1536U) &&
        Config::sort_single::items_per_thread * Config::sort_single::block_size <= 2048U,
        cudaError_t
    >::type
{
    const cudaError_t status = radix_sort_single_limit2048<Config, Descending>(
        keys_input, keys_output, values_input, values_output,
        size, bit, end_bit, stream, debug_synchronous
    );
    return status;
}
/// \brief Config-driven entry point, selected when
/// Config::sort_single::items_per_thread * Config::sort_single::block_size
/// is greater than 2048 and at most 2560.
///
/// Passes every argument on to radix_sort_single_limit2560 and returns its status.
template<
    class Config,
    bool Descending,
    class KeysInputIterator,
    class KeysOutputIterator,
    class ValuesInputIterator,
    class ValuesOutputIterator
>
inline
auto radix_sort_single(KeysInputIterator keys_input,
                       KeysOutputIterator keys_output,
                       ValuesInputIterator values_input,
                       ValuesOutputIterator values_output,
                       unsigned int size,
                       unsigned int bit,
                       unsigned int end_bit,
                       cudaStream_t stream,
                       bool debug_synchronous)
    -> typename std::enable_if<
        (Config::sort_single::items_per_thread * Config::sort_single::block_size > 2048U) &&
        Config::sort_single::items_per_thread * Config::sort_single::block_size <= 2560U,
        cudaError_t
    >::type
{
    const cudaError_t status = radix_sort_single_limit2560<Config, Descending>(
        keys_input, keys_output, values_input, values_output,
        size, bit, end_bit, stream, debug_synchronous
    );
    return status;
}
/// \brief Config-driven entry point, selected when
/// Config::sort_single::items_per_thread * Config::sort_single::block_size
/// is greater than 2560 and at most 3072.
///
/// Passes every argument on to radix_sort_single_limit3072 and returns its status.
template<
    class Config,
    bool Descending,
    class KeysInputIterator,
    class KeysOutputIterator,
    class ValuesInputIterator,
    class ValuesOutputIterator
>
inline
auto radix_sort_single(KeysInputIterator keys_input,
                       KeysOutputIterator keys_output,
                       ValuesInputIterator values_input,
                       ValuesOutputIterator values_output,
                       unsigned int size,
                       unsigned int bit,
                       unsigned int end_bit,
                       cudaStream_t stream,
                       bool debug_synchronous)
    -> typename std::enable_if<
        (Config::sort_single::items_per_thread * Config::sort_single::block_size > 2560) &&
        Config::sort_single::items_per_thread * Config::sort_single::block_size <= 3072,
        cudaError_t
    >::type
{
    const cudaError_t status = radix_sort_single_limit3072<Config, Descending>(
        keys_input, keys_output, values_input, values_output,
        size, bit, end_bit, stream, debug_synchronous
    );
    return status;
}
/// \brief Config-driven entry point, selected when
/// Config::sort_single::items_per_thread * Config::sort_single::block_size
/// is greater than 3072 and at most 3584.
///
/// Passes every argument on to radix_sort_single_limit3584 and returns its status.
template<
    class Config,
    bool Descending,
    class KeysInputIterator,
    class KeysOutputIterator,
    class ValuesInputIterator,
    class ValuesOutputIterator
>
inline
auto radix_sort_single(KeysInputIterator keys_input,
                       KeysOutputIterator keys_output,
                       ValuesInputIterator values_input,
                       ValuesOutputIterator values_output,
                       unsigned int size,
                       unsigned int bit,
                       unsigned int end_bit,
                       cudaStream_t stream,
                       bool debug_synchronous)
    -> typename std::enable_if<
        (Config::sort_single::items_per_thread * Config::sort_single::block_size > 3072) &&
        Config::sort_single::items_per_thread * Config::sort_single::block_size <= 3584,
        cudaError_t
    >::type
{
    const cudaError_t status = radix_sort_single_limit3584<Config, Descending>(
        keys_input, keys_output, values_input, values_output,
        size, bit, end_bit, stream, debug_synchronous
    );
    return status;
}
/// \brief Config-driven entry point, selected when
/// Config::sort_single::items_per_thread * Config::sort_single::block_size
/// is greater than 3584 and at most 4096.
///
/// Passes every argument on to radix_sort_single_limit4096 and returns its status.
template<
    class Config,
    bool Descending,
    class KeysInputIterator,
    class KeysOutputIterator,
    class ValuesInputIterator,
    class ValuesOutputIterator
>
inline
auto radix_sort_single(KeysInputIterator keys_input,
                       KeysOutputIterator keys_output,
                       ValuesInputIterator values_input,
                       ValuesOutputIterator values_output,
                       unsigned int size,
                       unsigned int bit,
                       unsigned int end_bit,
                       cudaStream_t stream,
                       bool debug_synchronous)
    -> typename std::enable_if<
        (Config::sort_single::items_per_thread * Config::sort_single::block_size > 3584) &&
        Config::sort_single::items_per_thread * Config::sort_single::block_size <= 4096,
        cudaError_t
    >::type
{
    const cudaError_t status = radix_sort_single_limit4096<Config, Descending>(
        keys_input, keys_output, values_input, values_output,
        size, bit, end_bit, stream, debug_synchronous
    );
    return status;
}
/// \brief Config-driven entry point, selected when
/// Config::sort_single::items_per_thread * Config::sort_single::block_size
/// is greater than 4096.
///
/// Inputs with \p size below 4096 are handed to radix_sort_single_limit4096.
/// Everything else is forwarded to radix_sort_single instantiated with
/// Config::sort_single::block_size and Config::sort_single::items_per_thread.
/// The callee's status is returned.
///
/// NOTE(review): unlike the radix_sort_single_limitN helpers, this comparison is
/// strict (size == 4096 takes the configured kernel) and
/// Config::force_single_kernel_config is not consulted here. Confirm that both
/// are intended.
template<
    class Config,
    bool Descending,
    class KeysInputIterator,
    class KeysOutputIterator,
    class ValuesInputIterator,
    class ValuesOutputIterator
>
inline
auto radix_sort_single(KeysInputIterator keys_input,
                       KeysOutputIterator keys_output,
                       ValuesInputIterator values_input,
                       ValuesOutputIterator values_output,
                       unsigned int size,
                       unsigned int bit,
                       unsigned int end_bit,
                       cudaStream_t stream,
                       bool debug_synchronous)
    -> typename std::enable_if<
        (Config::sort_single::items_per_thread * Config::sort_single::block_size > 4096),
        cudaError_t
    >::type
{
    // Large inputs go straight to the kernel described by Config.
    if( size >= 4096 )
    {
        return radix_sort_single<
            Config::sort_single::block_size,
            Config::sort_single::items_per_thread,
            Descending
        >(
            keys_input, keys_output, values_input, values_output,
            size, bit, end_bit, stream, debug_synchronous
        );
    }

    // Smaller inputs walk down the fixed-size tiers.
    return radix_sort_single_limit4096<Config, Descending>(
        keys_input, keys_output, values_input, values_output,
        size, bit, end_bit, stream, debug_synchronous
    );
}
} // end namespace detail
END_ROCPRIM_NAMESPACE
#endif // ROCPRIM_DEVICE_SPECIALIZATION_DEVICE_RADIX_SINGLE_SORT_HPP_
// Copyright (c) 2017-2022 Advanced Micro Devices, Inc. All rights reserved.
//
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to deal
// in the Software without restriction, including without limitation the rights
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
// copies of the Software, and to permit persons to whom the Software is
// furnished to do so, subject to the following conditions:
//
// The above copyright notice and this permission notice shall be included in
// all copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
// THE SOFTWARE.
#ifndef ROCPRIM_FUNCTIONAL_HPP_
#define ROCPRIM_FUNCTIONAL_HPP_
#include <functional>
// Meta configuration for rocPRIM
#include "config.hpp"
BEGIN_ROCPRIM_NAMESPACE
/// \addtogroup utilsmodule_functional
/// @{
/// \brief Device-side helper that prints \p message from a single thread of the grid.
///
/// Sums the global x, y and z thread indices (threadIdx + blockIdx * blockDim per
/// dimension) and calls printf only when that sum is 0.
/// The argument is stringified with the preprocessor `#` operator, so a string
/// literal argument is printed together with its surrounding quotes.
/// The expansion is a bare brace block, not a do/while(0) statement.
#define ROCPRIM_PRINT_ERROR_ONCE(message) \
{ \
unsigned int idx = threadIdx.x + (blockIdx.x * blockDim.x); \
idx += threadIdx.y + (blockIdx.y * blockDim.y); \
idx += threadIdx.z + (blockIdx.z * blockDim.z); \
if (idx == 0) \
printf("%s\n", #message); \
}
/// \brief Returns the larger of two values, compared with operator<.
///
/// \p b is returned only when `a < b` holds; otherwise (including when the two
/// compare equivalent) \p a is returned. The result is returned by value.
template<class T>
ROCPRIM_HOST_DEVICE inline
constexpr T max(const T& a, const T& b)
{
    return (a < b) ? b : a;
}
/// \brief Returns the smaller of two values, compared with operator<.
///
/// \p a is returned only when `a < b` holds; otherwise (including when the two
/// compare equivalent) \p b is returned. The result is returned by value.
template<class T>
ROCPRIM_HOST_DEVICE inline
constexpr T min(const T& a, const T& b)
{
    return (a < b) ? a : b;
}
/// \brief Exchanges the values of \p a and \p b.
///
/// Implemented with one copy construction and two copy assignments of T.
template<class T>
ROCPRIM_HOST_DEVICE inline
void swap(T& a, T& b)
{
    T previous_a = a; // keep the old value of a while it is overwritten
    a = b;
    b = previous_a;
}
/// \brief Function object that evaluates `a < b`.
///
/// The primary template compares two values of type T.
template<class T = void>
struct less
{
    /// \brief Returns the result of `a < b`.
    ROCPRIM_HOST_DEVICE inline
    constexpr bool operator()(const T& a, const T& b) const
    {
        return (a < b);
    }
};

/// \brief Specialization for `void`: operand types are deduced per call and may differ.
template<>
struct less<void>
{
    /// \brief Returns the result of `a < b`.
    template<class T, class U>
    ROCPRIM_HOST_DEVICE inline
    constexpr bool operator()(const T& a, const U& b) const
    {
        return (a < b);
    }
};
/// \brief Function object that evaluates `a <= b`.
///
/// The primary template compares two values of type T.
template<class T = void>
struct less_equal
{
    /// \brief Returns the result of `a <= b`.
    ROCPRIM_HOST_DEVICE inline
    constexpr bool operator()(const T& a, const T& b) const
    {
        return (a <= b);
    }
};

/// \brief Specialization for `void`: the (single) operand type is deduced per call.
template<>
struct less_equal<void>
{
    /// \brief Returns the result of `a <= b`.
    template <typename T>
    ROCPRIM_HOST_DEVICE inline
    constexpr bool operator()(const T& a, const T& b) const
    {
        return (a <= b);
    }
};
/// \brief Function object that evaluates `a > b`.
///
/// The primary template compares two values of type T.
template<class T = void>
struct greater
{
    /// \brief Returns the result of `a > b`.
    ROCPRIM_HOST_DEVICE inline
    constexpr bool operator()(const T& a, const T& b) const
    {
        return (a > b);
    }
};

/// \brief Specialization for `void`: the (single) operand type is deduced per call.
template<>
struct greater<void>
{
    /// \brief Returns the result of `a > b`.
    template <typename T>
    ROCPRIM_HOST_DEVICE inline
    constexpr bool operator()(const T& a, const T& b) const
    {
        return (a > b);
    }
};
/// \brief Function object that evaluates `a >= b`.
///
/// The primary template compares two values of type T.
template<class T = void>
struct greater_equal
{
    /// \brief Returns the result of `a >= b`.
    ROCPRIM_HOST_DEVICE inline
    constexpr bool operator()(const T& a, const T& b) const
    {
        return (a >= b);
    }
};

/// \brief Specialization for `void`: the (single) operand type is deduced per call.
template<>
struct greater_equal<void>
{
    /// \brief Returns the result of `a >= b`.
    template <typename T>
    ROCPRIM_HOST_DEVICE inline
    constexpr bool operator()(const T& a, const T& b) const
    {
        return (a >= b);
    }
};
/// \brief Function object that evaluates `a == b`.
///
/// The primary template compares two values of type T.
template<class T = void>
struct equal_to
{
    /// \brief Returns the result of `a == b`.
    ROCPRIM_HOST_DEVICE inline
    constexpr bool operator()(const T& a, const T& b) const
    {
        return (a == b);
    }
};

/// \brief Specialization for `void`: the (single) operand type is deduced per call.
template<>
struct equal_to<void>
{
    /// \brief Returns the result of `a == b`.
    template <typename T>
    ROCPRIM_HOST_DEVICE inline
    constexpr bool operator()(const T& a, const T& b) const
    {
        return (a == b);
    }
};
/// \brief Function object that evaluates `a != b`.
///
/// The primary template compares two values of type T.
template<class T = void>
struct not_equal_to
{
    /// \brief Returns the result of `a != b`.
    ROCPRIM_HOST_DEVICE inline
    constexpr bool operator()(const T& a, const T& b) const
    {
        return (a != b);
    }
};

/// \brief Specialization for `void`: the (single) operand type is deduced per call.
template<>
struct not_equal_to<void>
{
    /// \brief Returns the result of `a != b`.
    template <typename T>
    ROCPRIM_HOST_DEVICE inline
    constexpr bool operator()(const T& a, const T& b) const
    {
        return (a != b);
    }
};
/// \brief Function object that evaluates `a + b`.
///
/// The primary template adds two values of type T and returns a T.
template<class T = void>
struct plus
{
    /// \brief Returns `a + b`, converted to T.
    ROCPRIM_HOST_DEVICE inline
    constexpr T operator()(const T& a, const T& b) const
    {
        return (a + b);
    }
};

/// \brief Specialization for `void`: the (single) operand type is deduced per call.
template<>
struct plus<void>
{
    /// \brief Returns `a + b`, converted to the deduced type T.
    template <typename T>
    ROCPRIM_HOST_DEVICE inline
    constexpr T operator()(const T& a, const T& b) const
    {
        return (a + b);
    }
};
/// \brief Function object that evaluates `a - b`.
///
/// The primary template subtracts two values of type T and returns a T.
template<class T = void>
struct minus
{
    /// \brief Returns `a - b`, converted to T.
    ROCPRIM_HOST_DEVICE inline
    constexpr T operator()(const T& a, const T& b) const
    {
        return (a - b);
    }
};

/// \brief Specialization for `void`: the (single) operand type is deduced per call.
template<>
struct minus<void>
{
    /// \brief Returns `a - b`, converted to the deduced type T.
    template <typename T>
    ROCPRIM_HOST_DEVICE inline
    constexpr T operator()(const T& a, const T& b) const
    {
        return (a - b);
    }
};
/// \brief Function object that evaluates `a * b`.
///
/// The primary template multiplies two values of type T and returns a T.
template<class T = void>
struct multiplies
{
    /// \brief Returns `a * b`, converted to T.
    ROCPRIM_HOST_DEVICE inline
    constexpr T operator()(const T& a, const T& b) const
    {
        return (a * b);
    }
};

/// \brief Specialization for `void`: the (single) operand type is deduced per call.
template<>
struct multiplies<void>
{
    /// \brief Returns `a * b`, converted to the deduced type T.
    template <typename T>
    ROCPRIM_HOST_DEVICE inline
    constexpr T operator()(const T& a, const T& b) const
    {
        return (a * b);
    }
};
/// \brief Function object that selects the larger of two values using operator<.
///
/// \p b is chosen only when `a < b`; otherwise \p a is returned (by value).
template<class T = void>
struct maximum
{
    /// \brief Returns \p b if `a < b`, else \p a.
    ROCPRIM_HOST_DEVICE inline
    constexpr T operator()(const T& a, const T& b) const
    {
        return (a < b) ? b : a;
    }
};

/// \brief Specialization for `void`: the (single) operand type is deduced per call.
template<>
struct maximum<void>
{
    /// \brief Returns \p b if `a < b`, else \p a.
    template <typename T>
    ROCPRIM_HOST_DEVICE inline
    constexpr T operator()(const T& a, const T& b) const
    {
        return (a < b) ? b : a;
    }
};
/// \brief Function object that selects the smaller of two values using operator<.
///
/// \p a is chosen only when `a < b`; otherwise \p b is returned (by value).
template<class T = void>
struct minimum
{
    /// \brief Returns \p a if `a < b`, else \p b.
    ROCPRIM_HOST_DEVICE inline
    constexpr T operator()(const T& a, const T& b) const
    {
        return (a < b) ? a : b;
    }
};

/// \brief Specialization for `void`: the (single) operand type is deduced per call.
template<>
struct minimum<void>
{
    /// \brief Returns \p a if `a < b`, else \p b.
    template <typename T>
    ROCPRIM_HOST_DEVICE inline
    constexpr T operator()(const T& a, const T& b) const
    {
        return (a < b) ? a : b;
    }
};
/// \brief Function object that returns its argument unchanged.
///
/// The argument is taken by const reference and a copy is returned by value.
template<class T = void>
struct identity
{
    /// \brief Returns a copy of \p a.
    ROCPRIM_HOST_DEVICE inline
    constexpr T operator()(const T& a) const
    {
        return (a);
    }
};

/// \brief Specialization for `void`: the operand type is deduced per call.
template<>
struct identity<void>
{
    /// \brief Returns a copy of \p a.
    template <typename T>
    ROCPRIM_HOST_DEVICE inline
    constexpr T operator()(const T& a) const
    {
        return (a);
    }
};
/**
 * \brief Compile-time log2(N), rounded up to the next integer.
 *
 * The primary template halves CURRENT_VAL and increments COUNT on every step;
 * the partial specialization for CURRENT_VAL == 0 ends the recursion and
 * applies the rounding.
 *
 * For example:
 *     Log2<8>::VALUE   // 3
 *     Log2<3>::VALUE   // 2
 */
template <int N, int CURRENT_VAL = N, int COUNT = 0>
struct Log2
{
    /// Static logarithm value
    enum
    {
        // Inductive case: drop one bit of CURRENT_VAL, count one more step.
        VALUE = Log2<N, (CURRENT_VAL >> 1), COUNT + 1>::VALUE
    };
};

#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document
template <int N, int COUNT>
struct Log2<N, 0, COUNT>
{
    enum
    {
        // Base case: COUNT is kept when 2^(COUNT - 1) is still below N
        // (N is not a power of two), otherwise COUNT - 1 is the exact logarithm.
        VALUE = ((1 << (COUNT - 1)) < N) ? COUNT : COUNT - 1
    };
};
#endif // DOXYGEN_SHOULD_SKIP_THIS
/******************************************************************************
* Conditional types
******************************************************************************/
/**
 * \brief Compile-time type equality test.
 *
 * For two different types VALUE is 0 and NEGATE is 1; the partial
 * specialization for identical types sets VALUE to 1 and NEGATE to 0.
 */
template <typename A, typename B>
struct Equals
{
    enum
    {
        VALUE  = 0,
        NEGATE = 1
    };
};

#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document
template <typename A>
struct Equals <A, A>
{
    enum
    {
        VALUE  = 1,
        NEGATE = 0
    };
};
#endif // DOXYGEN_SHOULD_SKIP_THIS
/// \brief Wraps the compile-time integer \p A in a distinct type.
///
/// The wrapped value is exposed as the enumerator VALUE.
template <int A>
struct Int2Type
{
    enum
    {
        VALUE = A
    };
};
/// @}
// end of group utilsmodule_functional
END_ROCPRIM_NAMESPACE
#endif // ROCPRIM_FUNCTIONAL_HPP_
// Copyright (c) 2017-2020 Advanced Micro Devices, Inc. All rights reserved.
//
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to deal
// in the Software without restriction, including without limitation the rights
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
// copies of the Software, and to permit persons to whom the Software is
// furnished to do so, subject to the following conditions:
//
// The above copyright notice and this permission notice shall be included in
// all copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
// THE SOFTWARE.
#ifndef ROCPRIM_INTRINSICS_HPP_
#define ROCPRIM_INTRINSICS_HPP_
// Meta configuration for rocPRIM
#include "config.hpp"
#include "intrinsics/atomic.hpp"
#include "intrinsics/bit.hpp"
#include "intrinsics/thread.hpp"
#include "intrinsics/warp.hpp"
#include "intrinsics/warp_shuffle.hpp"
#endif // ROCPRIM_INTRINSICS_HPP_
// Copyright (c) 2017-2022 Advanced Micro Devices, Inc. All rights reserved.
//
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to deal
// in the Software without restriction, including without limitation the rights
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
// copies of the Software, and to permit persons to whom the Software is
// furnished to do so, subject to the following conditions:
//
// The above copyright notice and this permission notice shall be included in
// all copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
// THE SOFTWARE.
#ifndef ROCPRIM_INTRINSICS_ATOMIC_HPP_
#define ROCPRIM_INTRINSICS_ATOMIC_HPP_
#include "../config.hpp"
BEGIN_ROCPRIM_NAMESPACE
namespace detail
{

/// \brief Forwards to the global ::atomicAdd for `unsigned int` and returns its result.
ROCPRIM_DEVICE ROCPRIM_INLINE
unsigned int atomic_add(unsigned int * address, unsigned int value)
{
    const unsigned int result = ::atomicAdd(address, value);
    return result;
}

/// \brief Forwards to the global ::atomicAdd for `int` and returns its result.
ROCPRIM_DEVICE ROCPRIM_INLINE
int atomic_add(int * address, int value)
{
    const int result = ::atomicAdd(address, value);
    return result;
}

/// \brief Forwards to the global ::atomicAdd for `float` and returns its result.
ROCPRIM_DEVICE ROCPRIM_INLINE
float atomic_add(float * address, float value)
{
    const float result = ::atomicAdd(address, value);
    return result;
}

/// \brief Forwards to the global ::atomicAdd for `unsigned long long` and returns its result.
ROCPRIM_DEVICE ROCPRIM_INLINE
unsigned long long atomic_add(unsigned long long * address, unsigned long long value)
{
    const unsigned long long result = ::atomicAdd(address, value);
    return result;
}

/// \brief Forwards to the global ::atomicInc (wrapping increment bounded by \p value)
/// and returns its result.
ROCPRIM_DEVICE ROCPRIM_INLINE
unsigned int atomic_wrapinc(unsigned int * address, unsigned int value)
{
    const unsigned int result = ::atomicInc(address, value);
    return result;
}

/// \brief Forwards to the global ::atomicExch for `unsigned int` and returns its result.
ROCPRIM_DEVICE ROCPRIM_INLINE
unsigned int atomic_exch(unsigned int * address, unsigned int value)
{
    const unsigned int result = ::atomicExch(address, value);
    return result;
}

/// \brief Forwards to the global ::atomicExch for `unsigned long long` and returns its result.
ROCPRIM_DEVICE ROCPRIM_INLINE
unsigned long long atomic_exch(unsigned long long * address, unsigned long long value)
{
    const unsigned long long result = ::atomicExch(address, value);
    return result;
}

} // end namespace detail
END_ROCPRIM_NAMESPACE
#endif // ROCPRIM_INTRINSICS_ATOMIC_HPP_
// Copyright (c) 2017-2021 Advanced Micro Devices, Inc. All rights reserved.
//
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to deal
// in the Software without restriction, including without limitation the rights
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
// copies of the Software, and to permit persons to whom the Software is
// furnished to do so, subject to the following conditions:
//
// The above copyright notice and this permission notice shall be included in
// all copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
// THE SOFTWARE.
#ifndef ROCPRIM_INTRINSICS_BIT_HPP_
#define ROCPRIM_INTRINSICS_BIT_HPP_
#include "../config.hpp"
BEGIN_ROCPRIM_NAMESPACE
/// \addtogroup intrinsicsmodule
/// @{
/// \brief Extracts the bit at position \p i of \p x.
///
/// \return 1 if the bit is set, 0 otherwise (computed as `(x >> i) & 1`).
ROCPRIM_DEVICE ROCPRIM_INLINE
int get_bit(int x, int i)
{
    const int shifted = x >> i; // move the requested bit to position 0
    return shifted & 1;
}
/// \brief Bit count (32-bit).
///
/// Returns the number of set bits in \p x, as computed by the __popc intrinsic.
ROCPRIM_DEVICE ROCPRIM_INLINE
unsigned int bit_count(unsigned int x)
{
    const unsigned int set_bits = __popc(x);
    return set_bits;
}
/// \brief Bit count (64-bit).
///
/// Returns the number of set bits in \p x, as computed by the __popcll intrinsic.
ROCPRIM_DEVICE ROCPRIM_INLINE
unsigned int bit_count(unsigned long long x)
{
    const unsigned int set_bits = __popcll(x);
    return set_bits;
}
/// @}
// end of group intrinsicsmodule
END_ROCPRIM_NAMESPACE
#endif // ROCPRIM_INTRINSICS_BIT_HPP_
// Copyright (c) 2017-2021 Advanced Micro Devices, Inc. All rights reserved.
//
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to deal
// in the Software without restriction, including without limitation the rights
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
// copies of the Software, and to permit persons to whom the Software is
// furnished to do so, subject to the following conditions:
//
// The above copyright notice and this permission notice shall be included in
// all copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
// THE SOFTWARE.
#ifndef ROCPRIM_INTRINSICS_THREAD_HPP_
#define ROCPRIM_INTRINSICS_THREAD_HPP_
#include <atomic>
#include "../config.hpp"
#include "../detail/various.hpp"
BEGIN_ROCPRIM_NAMESPACE
/// \addtogroup intrinsicsmodule
/// @{
// Sizes
/// \brief [DEPRECATED] Returns the number of threads in a hardware warp.
///
/// The value is constant for a device and is taken from the built-in `warpSize`.
/// This function is not supported for the gfx1030 architecture and will be
/// removed in a future release; use host_warp_size() or device_warp_size() instead.
ROCPRIM_HOST_DEVICE inline
constexpr unsigned int warp_size()
{
    return (warpSize);
}
/// \brief Returns the number of threads in a hardware warp of the current device.
///
/// Host-only: the value is looked up at run time. The device reported by
/// cudaGetDevice() is queried with cudaGetDeviceProperties() and the `warpSize`
/// field of the returned properties is used. It is constant for a device.
///
/// \return The warp size of the current device, or `static_cast<unsigned int>(-1)`
/// if either runtime query does not return cudaSuccess.
ROCPRIM_HOST inline
unsigned int host_warp_size()
{
    int default_hip_device = 0;
    cudaError_t success = cudaGetDevice(&default_hip_device);
    if(success != cudaSuccess)
    {
        // Without a valid device id the property query below would be meaningless.
        return static_cast<unsigned int>(-1);
    }

    cudaDeviceProp device_prop;
    success = cudaGetDeviceProperties(&device_prop, default_hip_device);
    if(success != cudaSuccess)
    {
        return static_cast<unsigned int>(-1);
    }

    return static_cast<unsigned int>(device_prop.warpSize);
}
/// \brief Returns the number of threads in a hardware warp for the compilation target.
///
/// Device-only and constexpr: the value comes from the built-in `warpSize` and is
/// constant for a device.
ROCPRIM_DEVICE ROCPRIM_INLINE
constexpr unsigned int device_warp_size()
{
    return (warpSize);
}
/// \brief Returns the total number of threads in a multidimensional block (tile).
///
/// Computed as the product of blockDim.x, blockDim.y and blockDim.z.
ROCPRIM_DEVICE ROCPRIM_INLINE
unsigned int flat_block_size()
{
    const unsigned int threads_per_xy_slice = blockDim.y * blockDim.x;
    return blockDim.z * threads_per_xy_slice;
}
/// \brief Returns the total number of threads in a multidimensional tile (block).
///
/// Alias of flat_block_size().
ROCPRIM_DEVICE ROCPRIM_INLINE
unsigned int flat_tile_size()
{
    const unsigned int tile_threads = flat_block_size();
    return tile_threads;
}
// IDs
/// \brief Returns the identifier of the calling thread within its warp.
///
/// When __HIP_CPU_RT__ is defined the id of the current fiber modulo `warpSize`
/// is used; otherwise the value comes from the ::__lane_id() built-in.
ROCPRIM_DEVICE ROCPRIM_INLINE
unsigned int lane_id()
{
#ifdef __HIP_CPU_RT__
    using namespace hip::detail;
    return id(Fiber::this_fiber()) % warpSize;
#else
    return ::__lane_id();
#endif
}
/// \brief Returns the flat (linear, 1D) thread identifier within a multidimensional block (tile).
///
/// x varies fastest, then y, then z:
/// `threadIdx.z * blockDim.y * blockDim.x + threadIdx.y * blockDim.x + threadIdx.x`.
ROCPRIM_DEVICE ROCPRIM_INLINE
unsigned int flat_block_thread_id()
{
    const unsigned int slice_offset = threadIdx.z * blockDim.y * blockDim.x;
    const unsigned int row_offset   = threadIdx.y * blockDim.x;
    return slice_offset + row_offset + threadIdx.x;
}
/// \brief Returns the flat (linear, 1D) thread identifier within a block (tile),
/// using the compile-time block shape to skip unused dimensions.
///
/// This overload is enabled for 1D blocks (BlockSizeY == 1 and BlockSizeZ == 1),
/// where the flat id is simply threadIdx.x.
template<unsigned int BlockSizeX, unsigned int BlockSizeY, unsigned int BlockSizeZ>
ROCPRIM_DEVICE ROCPRIM_INLINE
auto flat_block_thread_id()
    -> typename std::enable_if<(BlockSizeY == 1 && BlockSizeZ == 1), unsigned int>::type
{
    const unsigned int flat_id = threadIdx.x;
    return flat_id;
}
/// \brief Flat (linear, 1D) thread identifier for 2D blocks.
///
/// This overload is enabled when BlockSizeY > 1 and BlockSizeZ == 1; the z
/// dimension is not read.
template<unsigned int BlockSizeX, unsigned int BlockSizeY, unsigned int BlockSizeZ>
ROCPRIM_DEVICE ROCPRIM_INLINE
auto flat_block_thread_id()
    -> typename std::enable_if<(BlockSizeY > 1 && BlockSizeZ == 1), unsigned int>::type
{
    const unsigned int row_offset = threadIdx.y * blockDim.x;
    return threadIdx.x + row_offset;
}
/// \brief Flat (linear, 1D) thread identifier for 3D blocks.
///
/// This overload is enabled whenever BlockSizeZ > 1, for any BlockSizeY. The
/// condition used to also require BlockSizeY > 1, which left shapes such as
/// <64, 1, 4> without any viable overload (a hard compile error). The full
/// three-dimensional formula below is valid for those shapes as well, and every
/// shape that compiled before still selects the same overload.
template<unsigned int BlockSizeX, unsigned int BlockSizeY, unsigned int BlockSizeZ>
ROCPRIM_DEVICE ROCPRIM_INLINE
auto flat_block_thread_id()
    -> typename std::enable_if<(BlockSizeZ > 1), unsigned int>::type
{
    return threadIdx.x + (threadIdx.y * blockDim.x) +
           (threadIdx.z * blockDim.y * blockDim.x);
}
/// \brief Returns the flat (linear, 1D) thread identifier within a multidimensional tile (block).
///
/// Alias of the non-template flat_block_thread_id().
ROCPRIM_DEVICE ROCPRIM_INLINE
unsigned int flat_tile_thread_id()
{
    const unsigned int flat_id = flat_block_thread_id();
    return flat_id;
}
/// \brief Returns the warp id of the calling thread within its block (tile).
///
/// Computed as the flat thread id divided by device_warp_size().
ROCPRIM_DEVICE ROCPRIM_INLINE
unsigned int warp_id()
{
    const unsigned int flat_id = flat_block_thread_id();
    return flat_id / device_warp_size();
}
/// \brief Returns the warp id that corresponds to the given flat thread id.
///
/// \param flat_id flat (linear) thread identifier; it is divided by device_warp_size().
ROCPRIM_DEVICE ROCPRIM_INLINE
unsigned int warp_id(unsigned int flat_id)
{
    const unsigned int threads_per_warp = device_warp_size();
    return flat_id / threads_per_warp;
}
/// \brief Returns the warp id within a block (tile), using the compile-time block
/// shape to pick the cheapest flat thread id computation for 1D or 2D kernels.
///
/// Computed as flat_block_thread_id<BlockSizeX, BlockSizeY, BlockSizeZ>()
/// divided by device_warp_size().
template<unsigned int BlockSizeX, unsigned int BlockSizeY, unsigned int BlockSizeZ>
ROCPRIM_DEVICE ROCPRIM_INLINE
unsigned int warp_id()
{
    const unsigned int flat_id = flat_block_thread_id<BlockSizeX, BlockSizeY, BlockSizeZ>();
    return flat_id / device_warp_size();
}
/// \brief Returns the flat (linear, 1D) block identifier within a multidimensional grid.
///
/// x varies fastest, then y, then z:
/// `blockIdx.z * gridDim.y * gridDim.x + blockIdx.y * gridDim.x + blockIdx.x`.
ROCPRIM_DEVICE ROCPRIM_INLINE
unsigned int flat_block_id()
{
    const unsigned int slice_offset = blockIdx.z * gridDim.y * gridDim.x;
    const unsigned int row_offset   = blockIdx.y * gridDim.x;
    return slice_offset + row_offset + blockIdx.x;
}
/// \brief Flat (linear, 1D) block identifier, 1D variant.
///
/// This overload is enabled when BlockSizeY == 1 and BlockSizeZ == 1 and
/// returns blockIdx.x only.
/// NOTE(review): the choice is keyed on the block shape, so this presumably
/// assumes the grid has the same dimensionality as the block; confirm with callers.
template<unsigned int BlockSizeX, unsigned int BlockSizeY, unsigned int BlockSizeZ>
ROCPRIM_DEVICE ROCPRIM_INLINE
auto flat_block_id()
    -> typename std::enable_if<(BlockSizeY == 1 && BlockSizeZ == 1), unsigned int>::type
{
    const unsigned int flat_id = blockIdx.x;
    return flat_id;
}
/// \brief Flat block identifier for the two-dimensional case.
///
/// Selected when \p BlockSizeY is greater than 1 and \p BlockSizeZ is 1.
template<unsigned int BlockSizeX, unsigned int BlockSizeY, unsigned int BlockSizeZ>
ROCPRIM_DEVICE ROCPRIM_INLINE
auto flat_block_id()
    -> typename std::enable_if<(BlockSizeY > 1 && BlockSizeZ == 1), unsigned int>::type
{
    const unsigned int row_offset = blockIdx.y * gridDim.x;
    return blockIdx.x + row_offset;
}
/// \brief Flat block identifier for the three-dimensional case.
///
/// Selected whenever \p BlockSizeZ is greater than 1, regardless of
/// \p BlockSizeY. The previous condition additionally required
/// <tt>BlockSizeY > 1</tt>, so a shape such as <tt>(X, 1, Z)</tt> matched no
/// overload at all. The full linearization below covers that shape as well.
template<unsigned int BlockSizeX, unsigned int BlockSizeY, unsigned int BlockSizeZ>
ROCPRIM_DEVICE ROCPRIM_INLINE
auto flat_block_id()
    -> typename std::enable_if<(BlockSizeZ > 1), unsigned int>::type
{
    return blockIdx.x + (blockIdx.y * gridDim.x) +
           (blockIdx.z * gridDim.y * gridDim.x);
}
// Sync
/// \brief Synchronize all threads in a block (tile).
///
/// Thin wrapper around the \p __syncthreads() intrinsic.
ROCPRIM_DEVICE ROCPRIM_INLINE void syncthreads()
{
    __syncthreads();
}
/// \brief Wave-level barrier; calls \p __builtin_amdgcn_wave_barrier().
///
/// Original author's note: all lanes in a wave come to the convergence point
/// simultaneously with SIMT, thus no special instruction is needed in the ISA.
ROCPRIM_DEVICE ROCPRIM_INLINE void wave_barrier()
{
    __builtin_amdgcn_wave_barrier();
}
namespace detail
{
/// \brief Returns thread identifier in a multidimensional block (tile) by dimension.
///
/// Primary template. Valid dimensions (0, 1 and 2) are served by explicit
/// specializations, so this body is only instantiated for an out-of-range
/// \p Dim and must reject it at compile time.
///
/// \tparam Dim - dimension index; must be 0, 1 or 2.
template<unsigned int Dim>
ROCPRIM_DEVICE ROCPRIM_INLINE
unsigned int block_thread_id()
{
    // The former condition (Dim > 2) was always true for the values that reach
    // this primary template, so invalid dimensions silently returned 0.
    static_assert(Dim < 3, "Dim must be 0, 1 or 2");
    // dummy return, correct values handled by specializations
    return 0;
}
/// \brief Returns block identifier in a multidimensional grid by dimension.
///
/// Primary template. Valid dimensions (0, 1 and 2) are served by explicit
/// specializations, so this body is only instantiated for an out-of-range
/// \p Dim and must reject it at compile time.
///
/// \tparam Dim - dimension index; must be 0, 1 or 2.
template<unsigned int Dim>
ROCPRIM_DEVICE ROCPRIM_INLINE
unsigned int block_id()
{
    // The former condition (Dim > 2) was always true for the values that reach
    // this primary template, so invalid dimensions silently returned 0.
    static_assert(Dim < 3, "Dim must be 0, 1 or 2");
    // dummy return, correct values handled by specializations
    return 0;
}
/// \brief Returns block size in a multidimensional grid by dimension.
///
/// Primary template. Valid dimensions (0, 1 and 2) are served by explicit
/// specializations, so this body is only instantiated for an out-of-range
/// \p Dim and must reject it at compile time.
///
/// \tparam Dim - dimension index; must be 0, 1 or 2.
template<unsigned int Dim>
ROCPRIM_DEVICE ROCPRIM_INLINE
unsigned int block_size()
{
    // The former condition (Dim > 2) was always true for the values that reach
    // this primary template, so invalid dimensions silently returned 0.
    static_assert(Dim < 3, "Dim must be 0, 1 or 2");
    // dummy return, correct values handled by specializations
    return 0;
}
/// \brief Returns grid size by dimension.
///
/// Primary template. Valid dimensions (0, 1 and 2) are served by explicit
/// specializations, so this body is only instantiated for an out-of-range
/// \p Dim and must reject it at compile time.
///
/// \tparam Dim - dimension index; must be 0, 1 or 2.
template<unsigned int Dim>
ROCPRIM_DEVICE ROCPRIM_INLINE
unsigned int grid_size()
{
    // The former condition (Dim > 2) was always true for the values that reach
    // this primary template, so invalid dimensions silently returned 0.
    static_assert(Dim < 3, "Dim must be 0, 1 or 2");
    // dummy return, correct values handled by specializations
    return 0;
}
// Helper macros that generate the explicit specializations of block_thread_id,
// block_id, block_size and grid_size for dimensions 0, 1 and 2.
//
// ROCPRIM_DETAIL_CONCAT places its two arguments next to each other (separated
// by whitespace, not token-pasted), so a prefix such as `threadIdx.` followed
// by the suffix `x` expands to the member access `threadIdx. x`.
#define ROCPRIM_DETAIL_CONCAT(A, B) A B
// Defines one specialization `name<dim>()` that returns `prefix suffix`.
#define ROCPRIM_DETAIL_DEFINE_HIP_API_ID_FUNC(name, prefix, dim, suffix) \
template<> \
ROCPRIM_DEVICE ROCPRIM_INLINE \
unsigned int name<dim>() \
{ \
return ROCPRIM_DETAIL_CONCAT(prefix, suffix); \
}
// Defines the three specializations of `name`: 0 -> x, 1 -> y, 2 -> z.
#define ROCPRIM_DETAIL_DEFINE_HIP_API_ID_FUNCS(name, prefix) \
ROCPRIM_DETAIL_DEFINE_HIP_API_ID_FUNC(name, prefix, 0, x) \
ROCPRIM_DETAIL_DEFINE_HIP_API_ID_FUNC(name, prefix, 1, y) \
ROCPRIM_DETAIL_DEFINE_HIP_API_ID_FUNC(name, prefix, 2, z)
// Instantiate the specializations for each of the four built-in index/size objects.
ROCPRIM_DETAIL_DEFINE_HIP_API_ID_FUNCS(block_thread_id, threadIdx.)
ROCPRIM_DETAIL_DEFINE_HIP_API_ID_FUNCS(block_id, blockIdx.)
ROCPRIM_DETAIL_DEFINE_HIP_API_ID_FUNCS(block_size, blockDim.)
ROCPRIM_DETAIL_DEFINE_HIP_API_ID_FUNCS(grid_size, gridDim.)
// The helper macros are implementation details; remove them again.
#undef ROCPRIM_DETAIL_DEFINE_HIP_API_ID_FUNCS
#undef ROCPRIM_DETAIL_DEFINE_HIP_API_ID_FUNC
#undef ROCPRIM_DETAIL_CONCAT
// Return thread id in a "logical warp", which can be smaller than a hardware warp size.
// Overload used when LogicalWarpSize is a power of two.
template<unsigned int LogicalWarpSize>
ROCPRIM_DEVICE ROCPRIM_INLINE
auto logical_lane_id()
    -> typename std::enable_if<detail::is_power_of_two(LogicalWarpSize), unsigned int>::type
{
    // For a power-of-two size, keeping the low bits is the same as
    // lane_id() % LogicalWarpSize.
    constexpr unsigned int low_bits = LogicalWarpSize - 1;
    return lane_id() & low_bits;
}
// Thread id in a "logical warp" whose size is not a power of two; computed
// with a plain remainder.
template<unsigned int LogicalWarpSize>
ROCPRIM_DEVICE ROCPRIM_INLINE
auto logical_lane_id()
    -> typename std::enable_if<!detail::is_power_of_two(LogicalWarpSize), unsigned int>::type
{
    const auto hw_lane = lane_id();
    return hw_lane % LogicalWarpSize;
}
// Specialization for a logical warp as wide as device_warp_size(): the
// hardware lane id is returned unchanged.
template<>
ROCPRIM_DEVICE ROCPRIM_INLINE
unsigned int logical_lane_id<device_warp_size()>()
{
    const auto hw_lane = lane_id();
    return hw_lane;
}
// Return id of "logical warp" in a block: the flat thread id divided by the
// logical warp size.
template<unsigned int LogicalWarpSize>
ROCPRIM_DEVICE ROCPRIM_INLINE
unsigned int logical_warp_id()
{
    const auto flat_id = flat_block_thread_id();
    return flat_id / LogicalWarpSize;
}
// Specialization for a logical warp as wide as device_warp_size(): forwards
// to warp_id().
template<>
ROCPRIM_DEVICE ROCPRIM_INLINE
unsigned int logical_warp_id<device_warp_size()>()
{
    const auto hw_warp = warp_id();
    return hw_warp;
}
// Issues a memory fence by calling the global-namespace __threadfence_system().
ROCPRIM_DEVICE ROCPRIM_INLINE void memory_fence_system()
{
    ::__threadfence_system();
}
// Issues a memory fence by calling the global-namespace __threadfence_block().
ROCPRIM_DEVICE ROCPRIM_INLINE void memory_fence_block()
{
    ::__threadfence_block();
}
// Issues a memory fence by calling the global-namespace __threadfence().
ROCPRIM_DEVICE ROCPRIM_INLINE void memory_fence_device()
{
    ::__threadfence();
}
}
/// @}
// end of group intrinsicsmodule
END_ROCPRIM_NAMESPACE
#endif // ROCPRIM_INTRINSICS_THREAD_HPP_
// Copyright (c) 2017-2021 Advanced Micro Devices, Inc. All rights reserved.
//
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to deal
// in the Software without restriction, including without limitation the rights
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
// copies of the Software, and to permit persons to whom the Software is
// furnished to do so, subject to the following conditions:
//
// The above copyright notice and this permission notice shall be included in
// all copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
// THE SOFTWARE.
#ifndef ROCPRIM_INTRINSICS_WARP_HPP_
#define ROCPRIM_INTRINSICS_WARP_HPP_
#include "../config.hpp"
BEGIN_ROCPRIM_NAMESPACE
/// \addtogroup intrinsicsmodule
/// @{
/// Evaluates \p predicate for all active work-items in the warp and returns a
/// mask whose <tt>i</tt>-th bit is set if and only if \p predicate is
/// <tt>true</tt> for the <tt>i</tt>-th thread of the warp and that thread is active.
///
/// \param predicate - input to be evaluated for all active lanes
ROCPRIM_DEVICE ROCPRIM_INLINE
lane_mask_type ballot(int predicate)
{
    const lane_mask_type votes = ::__ballot(predicate);
    return votes;
}
/// \brief Masked bit count
///
/// For each thread, this function returns the number of active threads which
/// have <tt>i</tt>-th bit of \p x set and come before the current thread.
///
/// \param x - lane mask whose bits are counted
/// \param add - base value for the count; it is passed as the accumulator
/// operand of the mbcnt builtins and added explicitly on the HIP-CPU path
/// \return the count, computed in an \p int and converted to <tt>unsigned int</tt>
ROCPRIM_DEVICE ROCPRIM_INLINE
unsigned int masked_bit_count(lane_mask_type x, unsigned int add = 0)
{
int c;
#ifndef __HIP_CPU_RT__
#if __AMDGCN_WAVEFRONT_SIZE == 32
// Wavefront size 32: a single mbcnt_lo call handles the whole mask.
#ifdef __CUDACC__
c = ::__builtin_amdgcn_mbcnt_lo(x, add);
#else
c = ::__mbcnt_lo(x, add);
#endif
#else
// Otherwise the mask is processed as two 32-bit halves: the low half first,
// then the high half with the low-half result as its accumulator.
#ifdef __CUDACC__
c = ::__builtin_amdgcn_mbcnt_lo(static_cast<int>(x), add);
c = ::__builtin_amdgcn_mbcnt_hi(static_cast<int>(x >> 32), c);
#else
c = ::__mbcnt_lo(static_cast<int>(x), add);
c = ::__mbcnt_hi(static_cast<int>(x >> 32), c);
#endif
#endif
#else
// HIP-CPU runtime path: emulate the count with std::bitset.
// NOTE(review): the mask is shifted right by (warpSize - tidx) before the
// bits are counted; confirm that this selects the lanes *before* the current
// one as the documentation above promises.
using namespace hip::detail;
const auto tidx{id(Fiber::this_fiber()) % warpSize};
std::bitset<warpSize> bits{x >> (warpSize - tidx)};
c = static_cast<unsigned int>(bits.count()) + add;
#endif
return c;
}
namespace detail
{
/// \brief Warp-wide "any" vote on \p predicate.
///
/// On device builds this forwards to \p ::__any(). Under the HIP-CPU runtime
/// each fiber records its predicate in a shared bitset, waits on the tile
/// barrier and then queries the bitset.
ROCPRIM_DEVICE ROCPRIM_INLINE
int warp_any(int predicate)
{
#ifdef __HIP_CPU_RT__
    using namespace hip::detail;
    const auto lane{id(Fiber::this_fiber()) % warpSize};
    auto& votes{Tile::scratchpad<std::bitset<warpSize>, 1>()[0]};
    votes[lane] = static_cast<bool>(predicate);
    // All fibers must have written their vote before the bitset is read.
    barrier(Tile::this_tile());
    return votes.any();
#else
    return ::__any(predicate);
#endif
}
/// \brief Warp-wide "all" vote on \p predicate.
///
/// On device builds this forwards to \p ::__all(). Under the HIP-CPU runtime
/// each fiber records its predicate in a shared bitset, waits on the tile
/// barrier and then queries the bitset.
ROCPRIM_DEVICE ROCPRIM_INLINE
int warp_all(int predicate)
{
#ifdef __HIP_CPU_RT__
    using namespace hip::detail;
    const auto lane{id(Fiber::this_fiber()) % warpSize};
    auto& votes{Tile::scratchpad<std::bitset<warpSize>, 1>()[0]};
    votes[lane] = static_cast<bool>(predicate);
    // All fibers must have written their vote before the bitset is read.
    barrier(Tile::this_tile());
    return votes.all();
#else
    return ::__all(predicate);
#endif
}
} // end detail namespace
/// @}
// end of group intrinsicsmodule
/**
 * Compute a 32b mask of threads having the same least-significant
 * LABEL_BITS of \p label as the calling thread.
 *
 * For every label bit the warp is balloted on "this bit is set". Threads whose
 * own bit is clear take the bitwise complement of the ballot, so each thread
 * ends up with the set of lanes that agree with it on that bit. The per-bit
 * sets are intersected.
 *
 * Fixes relative to the previous revision:
 *  - the complement used logical NOT (`!mask`), which collapsed the ballot to
 *    0 or 1 instead of inverting every lane bit;
 *  - the bit selector was computed as an `int` shift (`1 << BIT`), which
 *    sign-extends for BIT == 31 and then never compares equal;
 *  - `retval` is now initialized, so LABEL_BITS <= 0 returns 0 rather than an
 *    indeterminate value.
 *
 * NOTE(review): the result is truncated to `unsigned int`; if lane_mask_type is
 * wider than 32 bits only the low 32 lanes are reported. The complement also
 * sets the bits of lanes that did not take part in the ballot — confirm that
 * callers only invoke this with all lanes active.
 *
 * \tparam LABEL_BITS number of least-significant bits of \p label to compare
 * \param label value to match against the other threads in the warp
 */
template <int LABEL_BITS>
ROCPRIM_DEVICE ROCPRIM_INLINE
unsigned int MatchAny(unsigned int label)
{
    unsigned int retval = 0;
    // Extract masks of common threads for each bit
    ROCPRIM_UNROLL
    for (int BIT = 0; BIT < LABEL_BITS; ++BIT)
    {
        // Unsigned 64-bit shift: well defined for every bit of `label`.
        const unsigned long long current_bit = 1ull << BIT;
        const bool bit_match = (label & current_bit) == current_bit;
        unsigned long long mask = ballot(bit_match);
        if(!bit_match)
        {
            // Lanes that agree with this thread are those whose bit is clear.
            mask = ~mask;
        }
        // Remove peers who differ
        retval = (BIT == 0) ? static_cast<unsigned int>(mask)
                            : (retval & static_cast<unsigned int>(mask));
    }
    return retval;
}
END_ROCPRIM_NAMESPACE
#endif // ROCPRIM_INTRINSICS_WARP_HPP_
// Copyright (c) 2017-2022 Advanced Micro Devices, Inc. All rights reserved.
//
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to deal
// in the Software without restriction, including without limitation the rights
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
// copies of the Software, and to permit persons to whom the Software is
// furnished to do so, subject to the following conditions:
//
// The above copyright notice and this permission notice shall be included in
// all copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
// THE SOFTWARE.
#ifndef ROCPRIM_INTRINSICS_WARP_SHUFFLE_HPP_
#define ROCPRIM_INTRINSICS_WARP_SHUFFLE_HPP_
#include <type_traits>
#include "../config.hpp"
#include "thread.hpp"
/// \addtogroup warpmodule
/// @{
BEGIN_ROCPRIM_NAMESPACE
namespace detail
{
#ifdef __HIP_CPU_RT__
// TODO: consider adding macro checks relaying to std::bit_cast when compiled
// using C++20.
// Reinterprets the object representation of `src` as a value of type `To`.
// Only participates in overload resolution when both types are trivially
// copyable and have the same size. Implemented with std::memcpy.
template <class To, class From>
std::enable_if_t<
    sizeof(To) == sizeof(From) &&
    std::is_trivially_copyable_v<From> &&
    std::is_trivially_copyable_v<To>,
    To>
// constexpr support needs compiler magic
bit_cast(const From& src) noexcept
{
    To result;
    std::memcpy(&result, &src, sizeof(To));
    return result;
}
#endif
// Applies `op` to every int-sized word of `input` and reassembles the result.
// Overload for trivially copyable types whose size is a multiple of
// sizeof(int): the value is bit-cast to an array of ints, each word is passed
// through `op`, and the array is bit-cast back to T.
template<class T, class ShuffleOp>
ROCPRIM_DEVICE ROCPRIM_INLINE
typename std::enable_if<std::is_trivially_copyable<T>::value && (sizeof(T) % sizeof(int) == 0), T>::type
warp_shuffle_op(const T& input, ShuffleOp&& op)
{
    // sizeof(T) is a multiple of sizeof(int) here, so the division is exact.
    constexpr int word_count = sizeof(T) / sizeof(int);
    struct word_pack { int data[word_count]; };
#ifdef __HIP_CPU_RT__
    word_pack packed = bit_cast<word_pack>(input);
#else
    word_pack packed = __builtin_bit_cast(word_pack, input);
#endif
    ROCPRIM_UNROLL
    for(int w = 0; w < word_count; ++w)
    {
        packed.data[w] = op(packed.data[w]);
    }
#ifdef __HIP_CPU_RT__
    return bit_cast<T>(packed);
#else
    return __builtin_bit_cast(T, packed);
#endif
}
// Applies `op` to every int-sized word of `input` and reassembles the result.
// Fallback overload for types that are not trivially copyable or whose size is
// not a multiple of sizeof(int): the object is processed through memcpy in
// chunks of at most sizeof(int) bytes; the last chunk may be shorter.
//
// `word` is zero-initialized so that the short tail chunk never hands
// indeterminate bytes to `op` (the previous revision read an uninitialized
// int in that case). Only the first `s` bytes are copied back, so the output
// is unchanged.
template<class T, class ShuffleOp>
ROCPRIM_DEVICE ROCPRIM_INLINE
typename std::enable_if<!(std::is_trivially_copyable<T>::value && (sizeof(T) % sizeof(int) == 0)), T>::type
warp_shuffle_op(const T& input, ShuffleOp&& op)
{
    constexpr int words_no = (sizeof(T) + sizeof(int) - 1) / sizeof(int);
    T output;
    ROCPRIM_UNROLL
    for(int i = 0; i < words_no; i++)
    {
        // Number of valid bytes in this chunk (smaller than sizeof(int) only for the tail).
        const size_t s = std::min(sizeof(int), sizeof(T) - i * sizeof(int));
        int word = 0;
#ifdef __HIP_CPU_RT__
        std::memcpy(&word, reinterpret_cast<const char*>(&input) + i * sizeof(int), s);
#else
        __builtin_memcpy(&word, reinterpret_cast<const char*>(&input) + i * sizeof(int), s);
#endif
        word = op(word);
#ifdef __HIP_CPU_RT__
        std::memcpy(reinterpret_cast<char*>(&output) + i * sizeof(int), &word, s);
#else
        __builtin_memcpy(reinterpret_cast<char*>(&output) + i * sizeof(int), &word, s);
#endif
    }
    return output;
}
// Moves `input` word by word through __builtin_amdgcn_mov_dpp using the DPP
// control values given as template arguments. Under the HIP-CPU runtime each
// word is returned unchanged.
template<class T, int dpp_ctrl, int row_mask = 0xf, int bank_mask = 0xf, bool bound_ctrl = false>
ROCPRIM_DEVICE ROCPRIM_INLINE
T warp_move_dpp(const T& input)
{
    const auto move_word = [](int v) -> int
    {
        // TODO: clean-up, this function activates based ROCPRIM_DETAIL_USE_DPP, however inclusion and
        // parsing of the template happens unconditionally. The condition causing compilation to
        // fail is ordinary host-compilers looking at the headers. Non-hipcc compilers don't define
        // __builtin_amdgcn_update_dpp, hence fail to parse the template altogether. (Except MSVC
        // because even using /permissive- they somehow still do delayed parsing of the body of
        // function templates, even though they pinky-swear they don't.)
#if defined(__HIP_CPU_RT__)
        return v;
#else
        return ::__builtin_amdgcn_mov_dpp(v, dpp_ctrl, row_mask, bank_mask, bound_ctrl);
#endif
    };
    return detail::warp_shuffle_op(input, move_word);
}
/// \brief Swizzle for any data type.
///
/// Each thread in warp obtains \p input from <tt>src_lane</tt>-th thread
/// in warp, where <tt>src_lane</tt> is current lane with a <tt>mask</tt> applied.
/// The value is processed word by word through
/// \p __builtin_amdgcn_ds_swizzle.
///
/// \param input - input to pass to other threads
template<class T, int mask>
ROCPRIM_DEVICE ROCPRIM_INLINE
T warp_swizzle(const T& input)
{
    const auto swizzle_word = [](int v) -> int
    {
        return ::__builtin_amdgcn_ds_swizzle(v, mask);
    };
    return detail::warp_shuffle_op(input, swizzle_word);
}
} // end namespace detail
/// \brief Shuffle for any data type.
///
/// Each thread in warp obtains \p input from <tt>src_lane</tt>-th thread
/// in warp. If \p width is less than device_warp_size() then each subsection of the
/// warp behaves as a separate entity with a starting logical lane id of 0.
/// If \p src_lane is not in [0; \p width) range, the returned value is
/// equal to \p input passed by the <tt>src_lane modulo width</tt> thread.
///
/// Note: The optional \p width parameter must be a power of 2; results are
/// undefined if it is not a power of 2, or it is greater than device_warp_size().
///
/// \param input - input to pass to other threads
/// \param src_lane - lane id of the thread whose \p input should be returned
/// \param width - logical warp width
template<class T>
ROCPRIM_DEVICE ROCPRIM_INLINE
T warp_shuffle(const T& input, const int src_lane, const int width = device_warp_size())
{
    // The value is shuffled one int-sized word at a time.
    const auto shuffle_word = [src_lane, width](int v) -> int
    {
        return __shfl(v, src_lane, width);
    };
    return detail::warp_shuffle_op(input, shuffle_word);
}
/// \brief Shuffle up for any data type.
///
/// <tt>i</tt>-th thread in warp obtains \p input from <tt>i-delta</tt>-th
/// thread in warp. If <tt>i-delta</tt> is not in [0; \p width) range,
/// thread's own \p input is returned.
///
/// Note: The optional \p width parameter must be a power of 2; results are
/// undefined if it is not a power of 2, or it is greater than device_warp_size().
///
/// \param input - input to pass to other threads
/// \param delta - offset for calculating source lane id
/// \param width - logical warp width
template<class T>
ROCPRIM_DEVICE ROCPRIM_INLINE
T warp_shuffle_up(const T& input, const unsigned int delta, const int width = device_warp_size())
{
    // The value is shuffled one int-sized word at a time.
    const auto shuffle_word = [delta, width](int v) -> int
    {
        return __shfl_up(v, delta, width);
    };
    return detail::warp_shuffle_op(input, shuffle_word);
}
/// \brief Shuffle down for any data type.
///
/// <tt>i</tt>-th thread in warp obtains \p input from <tt>i+delta</tt>-th
/// thread in warp. If <tt>i+delta</tt> is not in [0; \p width) range,
/// thread's own \p input is returned.
///
/// Note: The optional \p width parameter must be a power of 2; results are
/// undefined if it is not a power of 2, or it is greater than device_warp_size().
///
/// \param input - input to pass to other threads
/// \param delta - offset for calculating source lane id
/// \param width - logical warp width
template<class T>
ROCPRIM_DEVICE ROCPRIM_INLINE
T warp_shuffle_down(const T& input, const unsigned int delta, const int width = device_warp_size())
{
    // The value is shuffled one int-sized word at a time.
    const auto shuffle_word = [delta, width](int v) -> int
    {
        return __shfl_down(v, delta, width);
    };
    return detail::warp_shuffle_op(input, shuffle_word);
}
/// \brief Shuffle XOR for any data type.
///
/// <tt>i</tt>-th thread in warp obtains \p input from <tt>i^lane_mask</tt>-th
/// thread in warp.
///
/// Note: The optional \p width parameter must be a power of 2; results are
/// undefined if it is not a power of 2, or it is greater than device_warp_size().
///
/// \param input - input to pass to other threads
/// \param lane_mask - mask used for calculating source lane id
/// \param width - logical warp width
template<class T>
ROCPRIM_DEVICE ROCPRIM_INLINE
T warp_shuffle_xor(const T& input, const int lane_mask, const int width = device_warp_size())
{
    // The value is shuffled one int-sized word at a time.
    const auto shuffle_word = [lane_mask, width](int v) -> int
    {
        return __shfl_xor(v, lane_mask, width);
    };
    return detail::warp_shuffle_op(input, shuffle_word);
}
END_ROCPRIM_NAMESPACE
#endif // ROCPRIM_INTRINSICS_WARP_SHUFFLE_HPP_
/// @}
// end of group warpmodule
// Copyright (c) 2017-2021 Advanced Micro Devices, Inc. All rights reserved.
//
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to deal
// in the Software without restriction, including without limitation the rights
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
// copies of the Software, and to permit persons to whom the Software is
// furnished to do so, subject to the following conditions:
//
// The above copyright notice and this permission notice shall be included in
// all copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
// THE SOFTWARE.
#ifndef ROCPRIM_ITERATOR_HPP_
#define ROCPRIM_ITERATOR_HPP_
// Meta configuration for rocPRIM
#include "config.hpp"
#include "iterator/arg_index_iterator.hpp"
#include "iterator/constant_iterator.hpp"
#include "iterator/counting_iterator.hpp"
#include "iterator/discard_iterator.hpp"
#ifndef __HIP_CPU_RT__
#include "iterator/texture_cache_iterator.hpp"
#endif
#include "iterator/transform_iterator.hpp"
#include "iterator/zip_iterator.hpp"
#endif // ROCPRIM_ITERATOR_HPP_
// Copyright (c) 2017-2021 Advanced Micro Devices, Inc. All rights reserved.
//
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to deal
// in the Software without restriction, including without limitation the rights
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
// copies of the Software, and to permit persons to whom the Software is
// furnished to do so, subject to the following conditions:
//
// The above copyright notice and this permission notice shall be included in
// all copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
// THE SOFTWARE.
#ifndef ROCPRIM_ITERATOR_ARG_INDEX_ITERATOR_HPP_
#define ROCPRIM_ITERATOR_ARG_INDEX_ITERATOR_HPP_
#include <iterator>
#include <iostream>
#include <cstddef>
#include <type_traits>
#include "../config.hpp"
#include "../types/key_value_pair.hpp"
/// \addtogroup iteratormodule
/// @{
BEGIN_ROCPRIM_NAMESPACE
/// \class arg_index_iterator
/// \brief A random-access input (read-only) iterator adaptor for pairing dereferenced values
/// with their indices.
///
/// \par Overview
/// * Dereferencing arg_index_iterator returns a value of \p key_value_pair<Difference, InputValueType>
/// type, which includes value from the underlying range and its index in that range.
/// * \p std::iterator_traits<InputIterator>::value_type should be convertible to \p InputValueType.
///
/// \tparam InputIterator - type of the underlying random-access input iterator. Must be
/// a random-access iterator.
/// \tparam Difference - type used for identify distance between iterators and as the index type
/// in the output pair type (see \p value_type).
/// \tparam InputValueType - value type used in the output pair type (see \p value_type).
template<
    class InputIterator,
    class Difference = std::ptrdiff_t,
    class InputValueType = typename std::iterator_traits<InputIterator>::value_type
>
class arg_index_iterator
{
private:
    using input_category = typename std::iterator_traits<InputIterator>::iterator_category;

public:
    /// The type of the value that can be obtained by dereferencing the iterator.
    using value_type = ::rocprim::key_value_pair<Difference, InputValueType>;
    /// \brief A reference type of the type iterated over (\p value_type).
    /// It's `const` since arg_index_iterator is a read-only iterator.
    using reference = const value_type&;
    /// \brief A pointer type of the type iterated over (\p value_type).
    /// It's `const` since arg_index_iterator is a read-only iterator.
    using pointer = const value_type*;
    /// A type used for identify distance between iterators.
    using difference_type = Difference;
    /// The category of the iterator.
    using iterator_category = std::random_access_iterator_tag;

#ifndef DOXYGEN_SHOULD_SKIP_THIS
    using self_type = arg_index_iterator;
#endif

    static_assert(
        std::is_same<input_category, iterator_category>::value,
        "InputIterator must be a random-access iterator"
    );

    ROCPRIM_HOST_DEVICE inline
    ~arg_index_iterator() = default;

    /// \brief Creates a new arg_index_iterator.
    ///
    /// \param iterator input iterator pointing to the input range.
    /// \param offset index of the \p iterator in the input range.
    ROCPRIM_HOST_DEVICE inline
    arg_index_iterator(InputIterator iterator, difference_type offset = 0)
        : iterator_(iterator), offset_(offset)
    {
    }

    /// Pre-increment: advances both the wrapped iterator and the index.
    ROCPRIM_HOST_DEVICE inline
    arg_index_iterator& operator++()
    {
        iterator_++;
        offset_++;
        return *this;
    }

    //! \skip_doxy_start
    /// Post-increment: advances both members and returns the previous state.
    ROCPRIM_HOST_DEVICE inline
    arg_index_iterator operator++(int)
    {
        arg_index_iterator old_ai = *this;
        iterator_++;
        offset_++;
        return old_ai;
    }

    /// Pre-decrement, mirroring operator++(). Added so the class provides the
    /// decrement operations expected of a random-access iterator.
    ROCPRIM_HOST_DEVICE inline
    arg_index_iterator& operator--()
    {
        iterator_--;
        offset_--;
        return *this;
    }

    /// Post-decrement, mirroring operator++(int).
    ROCPRIM_HOST_DEVICE inline
    arg_index_iterator operator--(int)
    {
        arg_index_iterator old_ai = *this;
        iterator_--;
        offset_--;
        return old_ai;
    }

    /// Returns the (index, value) pair by value; nothing is stored in the iterator.
    ROCPRIM_HOST_DEVICE inline
    value_type operator*() const
    {
        value_type ret(offset_, *iterator_);
        return ret;
    }

    // NOTE(review): operator*() returns by value, so this takes the address of
    // a temporary. It only matters if the member is actually instantiated;
    // left unchanged because a fix needs a proxy object or cached value.
    ROCPRIM_HOST_DEVICE inline
    pointer operator->() const
    {
        return &(*(*this));
    }

    ROCPRIM_HOST_DEVICE inline
    arg_index_iterator operator+(difference_type distance) const
    {
        return arg_index_iterator(iterator_ + distance, offset_ + distance);
    }

    ROCPRIM_HOST_DEVICE inline
    arg_index_iterator& operator+=(difference_type distance)
    {
        iterator_ += distance;
        offset_ += distance;
        return *this;
    }

    ROCPRIM_HOST_DEVICE inline
    arg_index_iterator operator-(difference_type distance) const
    {
        return arg_index_iterator(iterator_ - distance, offset_ - distance);
    }

    ROCPRIM_HOST_DEVICE inline
    arg_index_iterator& operator-=(difference_type distance)
    {
        iterator_ -= distance;
        offset_ -= distance;
        return *this;
    }

    /// Distance between the wrapped iterators (the stored indices are not used).
    ROCPRIM_HOST_DEVICE inline
    difference_type operator-(arg_index_iterator other) const
    {
        return iterator_ - other.iterator_;
    }

    ROCPRIM_HOST_DEVICE inline
    value_type operator[](difference_type distance) const
    {
        arg_index_iterator i = (*this) + distance;
        return *i;
    }

    /// Equal only when both the wrapped iterator and the index are equal.
    ROCPRIM_HOST_DEVICE inline
    bool operator==(arg_index_iterator other) const
    {
        return (iterator_ == other.iterator_) && (offset_ == other.offset_);
    }

    ROCPRIM_HOST_DEVICE inline
    bool operator!=(arg_index_iterator other) const
    {
        return (iterator_ != other.iterator_) || (offset_ != other.offset_);
    }

    // Relational operators order by the position of the wrapped iterator:
    // `a < b` holds when `a - b` is negative. The previous revision had all
    // four comparisons inverted.
    ROCPRIM_HOST_DEVICE inline
    bool operator<(arg_index_iterator other) const
    {
        return (iterator_ - other.iterator_) < 0;
    }

    ROCPRIM_HOST_DEVICE inline
    bool operator<=(arg_index_iterator other) const
    {
        return (iterator_ - other.iterator_) <= 0;
    }

    ROCPRIM_HOST_DEVICE inline
    bool operator>(arg_index_iterator other) const
    {
        return (iterator_ - other.iterator_) > 0;
    }

    ROCPRIM_HOST_DEVICE inline
    bool operator>=(arg_index_iterator other) const
    {
        return (iterator_ - other.iterator_) >= 0;
    }

    /// Resets the stored index to 0 without moving the wrapped iterator.
    ROCPRIM_HOST_DEVICE inline
    void normalize()
    {
        offset_ = 0;
    }

    /// Stream insertion is accepted but writes nothing.
    friend std::ostream& operator<<(std::ostream& os, const arg_index_iterator& /* iter */)
    {
        return os;
    }
    //! \skip_doxy_end

private:
    InputIterator iterator_;  // current position in the underlying range
    difference_type offset_;  // index reported as the key of the dereferenced pair
};
/// Commutative form of iterator advance: <tt>distance + iterator</tt>.
/// Delegates to the member <tt>operator+</tt> of arg_index_iterator.
template<
    class InputIterator,
    class Difference,
    class InputValueType
>
ROCPRIM_HOST_DEVICE inline
arg_index_iterator<InputIterator, Difference, InputValueType>
operator+(typename arg_index_iterator<InputIterator, Difference, InputValueType>::difference_type distance,
          const arg_index_iterator<InputIterator, Difference, InputValueType>& iterator)
{
    return iterator.operator+(distance);
}
/// make_arg_index_iterator creates an arg_index_iterator using \p iterator as
/// the underlying iterator and \p offset as the position (index) of \p iterator
/// in the input range.
///
/// \tparam InputIterator - type of the underlying random-access input iterator. Must be
/// a random-access iterator.
/// \tparam Difference - type used for identify distance between iterators and as the index type
/// in the output pair type (see \p value_type in arg_index_iterator).
/// \tparam InputValueType - value type used in the output pair type (see \p value_type
/// in arg_index_iterator).
///
/// \param iterator input iterator pointing to the input range.
/// \param offset index of the \p iterator in the input range.
template<
    class InputIterator,
    class Difference = std::ptrdiff_t,
    class InputValueType = typename std::iterator_traits<InputIterator>::value_type
>
ROCPRIM_HOST_DEVICE inline
arg_index_iterator<InputIterator, Difference, InputValueType>
make_arg_index_iterator(InputIterator iterator, Difference offset = 0)
{
    using result_type = arg_index_iterator<InputIterator, Difference, InputValueType>;
    return result_type(iterator, offset);
}
END_ROCPRIM_NAMESPACE
/// @}
// end of group iteratormodule
#endif // ROCPRIM_ITERATOR_ARG_INDEX_ITERATOR_HPP_
// Copyright (c) 2017-2021 Advanced Micro Devices, Inc. All rights reserved.
//
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to deal
// in the Software without restriction, including without limitation the rights
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
// copies of the Software, and to permit persons to whom the Software is
// furnished to do so, subject to the following conditions:
//
// The above copyright notice and this permission notice shall be included in
// all copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
// THE SOFTWARE.
#ifndef ROCPRIM_ITERATOR_CONSTANT_ITERATOR_HPP_
#define ROCPRIM_ITERATOR_CONSTANT_ITERATOR_HPP_
#include <iterator>
#include <iostream>
#include <cstddef>
#include <type_traits>
#include "../config.hpp"
/// \addtogroup iteratormodule
/// @{
BEGIN_ROCPRIM_NAMESPACE
/// \class constant_iterator
/// \brief A random-access input (read-only) iterator which generates a sequence
/// of homogeneous values.
///
/// \par Overview
/// * A constant_iterator represents a pointer into a range of same values.
/// * Using it for simulating a range filled with a sequence of same values saves
/// memory capacity and bandwidth.
///
/// \tparam ValueType - type of value that can be obtained by dereferencing the iterator.
/// \tparam Difference - a type used for identify distance between iterators
template<
    class ValueType,
    class Difference = std::ptrdiff_t
>
class constant_iterator
{
public:
    /// The type of the value that can be obtained by dereferencing the iterator.
    using value_type = typename std::remove_const<ValueType>::type;
    /// \brief A reference type of the type iterated over (\p value_type).
    /// It's same as `value_type` since constant_iterator is a read-only
    /// iterator and does not have underlying buffer.
    using reference = value_type; // constant_iterator is not writable
    /// \brief A pointer type of the type iterated over (\p value_type).
    /// It's `const` since constant_iterator is a read-only iterator.
    using pointer = const value_type*; // constant_iterator is not writable
    /// A type used for identify distance between iterators.
    using difference_type = Difference;
    /// The category of the iterator.
    using iterator_category = std::random_access_iterator_tag;

#ifndef DOXYGEN_SHOULD_SKIP_THIS
    using self_type = constant_iterator;
#endif

    /// \brief Creates constant_iterator and sets its initial value to \p value.
    ///
    /// \param value initial value
    /// \param index optional index for constant_iterator
    ROCPRIM_HOST_DEVICE inline
    explicit constant_iterator(const value_type value, const size_t index = 0)
        : value_(value), index_(index)
    {
    }

    ROCPRIM_HOST_DEVICE inline
    ~constant_iterator() = default;

    //! \skip_doxy_start
    /// Returns the stored value; the index does not influence the result.
    ROCPRIM_HOST_DEVICE inline
    value_type operator*() const
    {
        return value_;
    }

    ROCPRIM_HOST_DEVICE inline
    pointer operator->() const
    {
        return &value_;
    }

    ROCPRIM_HOST_DEVICE inline
    constant_iterator& operator++()
    {
        index_++;
        return *this;
    }

    ROCPRIM_HOST_DEVICE inline
    constant_iterator operator++(int)
    {
        constant_iterator old_ci = *this;
        index_++;
        return old_ci;
    }

    ROCPRIM_HOST_DEVICE inline
    constant_iterator& operator--()
    {
        index_--;
        return *this;
    }

    ROCPRIM_HOST_DEVICE inline
    constant_iterator operator--(int)
    {
        constant_iterator old_ci = *this;
        index_--;
        return old_ci;
    }

    ROCPRIM_HOST_DEVICE inline
    constant_iterator operator+(difference_type distance) const
    {
        return constant_iterator(value_, index_ + distance);
    }

    ROCPRIM_HOST_DEVICE inline
    constant_iterator& operator+=(difference_type distance)
    {
        index_ += distance;
        return *this;
    }

    ROCPRIM_HOST_DEVICE inline
    constant_iterator operator-(difference_type distance) const
    {
        return constant_iterator(value_, index_ - distance);
    }

    ROCPRIM_HOST_DEVICE inline
    constant_iterator& operator-=(difference_type distance)
    {
        index_ -= distance;
        return *this;
    }

    /// Distance between two iterators, computed from their indices.
    ROCPRIM_HOST_DEVICE inline
    difference_type operator-(constant_iterator other) const
    {
        return static_cast<difference_type>(index_ - other.index_);
    }
    //! \skip_doxy_end

    /// Constant_iterator is not writable, so we don't return reference,
    /// just something convertible to reference. That matches requirement
    /// of RandomAccessIterator concept
    ROCPRIM_HOST_DEVICE inline
    value_type operator[](difference_type) const
    {
        return value_;
    }

    //! \skip_doxy_start
    /// Equal only when both the stored value and the index are equal.
    ROCPRIM_HOST_DEVICE inline
    bool operator==(constant_iterator other) const
    {
        return value_ == other.value_ && index_ == other.index_;
    }

    ROCPRIM_HOST_DEVICE inline
    bool operator!=(constant_iterator other) const
    {
        return !(*this == other);
    }

    // Relational operators order by index: `a < b` holds when the signed
    // distance from a to b is positive.
    ROCPRIM_HOST_DEVICE inline
    bool operator<(constant_iterator other) const
    {
        return distance_to(other) > 0;
    }

    ROCPRIM_HOST_DEVICE inline
    bool operator<=(constant_iterator other) const
    {
        return distance_to(other) >= 0;
    }

    ROCPRIM_HOST_DEVICE inline
    bool operator>(constant_iterator other) const
    {
        return distance_to(other) < 0;
    }

    ROCPRIM_HOST_DEVICE inline
    bool operator>=(constant_iterator other) const
    {
        return distance_to(other) <= 0;
    }

    /// Prints the stored value in square brackets.
    friend std::ostream& operator<<(std::ostream& os, const constant_iterator& iter)
    {
        os << "[" << iter.value_ << "]";
        return os;
    }
    //! \skip_doxy_end

private:
    // Signed distance from this iterator to `other` (other.index_ - index_).
    // Marked ROCPRIM_HOST_DEVICE because the host/device relational operators
    // above call it; it previously carried no such annotation.
    ROCPRIM_HOST_DEVICE inline
    difference_type distance_to(const constant_iterator& other) const
    {
        return difference_type(other.index_) - difference_type(index_);
    }

    value_type value_;  // value returned by every dereference
    size_t index_;      // logical position used for arithmetic and comparison
};
/// Commutative form of iterator advance: <tt>distance + iter</tt>.
/// Delegates to the member <tt>operator+</tt> of constant_iterator.
template<
    class ValueType,
    class Difference
>
ROCPRIM_HOST_DEVICE inline
constant_iterator<ValueType, Difference>
operator+(typename constant_iterator<ValueType, Difference>::difference_type distance,
          const constant_iterator<ValueType, Difference>& iter)
{
    return iter.operator+(distance);
}
/// make_constant_iterator is a convenience factory that builds a
/// constant_iterator holding \p value, so the iterator type does not have
/// to be spelled out at the call site.
///
/// \tparam ValueType - type of value that can be obtained by dereferencing created iterator.
/// \tparam Difference - a type used to express the distance between constant_iterator iterators.
///
/// \param value - initial value for constant_iterator.
/// \param index - optional starting index for constant_iterator (defaults to 0).
///
/// \return constant_iterator constructed from \p value and \p index.
template<class ValueType, class Difference = std::ptrdiff_t>
ROCPRIM_HOST_DEVICE inline
constant_iterator<ValueType, Difference>
make_constant_iterator(ValueType value, size_t index = 0)
{
    using iterator_type = constant_iterator<ValueType, Difference>;
    return iterator_type(value, index);
}
END_ROCPRIM_NAMESPACE
/// @}
// end of group iteratormodule
#endif // ROCPRIM_ITERATOR_CONSTANT_ITERATOR_HPP_
// Copyright (c) 2017-2021 Advanced Micro Devices, Inc. All rights reserved.
//
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to deal
// in the Software without restriction, including without limitation the rights
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
// copies of the Software, and to permit persons to whom the Software is
// furnished to do so, subject to the following conditions:
//
// The above copyright notice and this permission notice shall be included in
// all copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
// THE SOFTWARE.
#ifndef ROCPRIM_ITERATOR_COUNTING_ITERATOR_HPP_
#define ROCPRIM_ITERATOR_COUNTING_ITERATOR_HPP_
#include <iterator>
#include <iostream>
#include <cstddef>
#include <type_traits>
#include "../config.hpp"
#include "../type_traits.hpp"
/// \addtogroup iteratormodule
/// @{
BEGIN_ROCPRIM_NAMESPACE
/// \class counting_iterator
/// \brief A random-access input (read-only) iterator over a sequence of consecutive integer values.
///
/// \par Overview
/// * A counting_iterator represents a pointer into a range of sequentially increasing values.
/// * Using it to simulate a range filled with a sequence of consecutive values saves
/// memory capacity and bandwidth.
///
/// \tparam Incrementable - type of value that can be obtained by dereferencing the iterator.
/// Must be an integral type.
/// \tparam Difference - a signed integral type used to express the distance between iterators.
template<
    class Incrementable,
    class Difference = std::ptrdiff_t
>
class counting_iterator
{
public:
    /// The type of the value that can be obtained by dereferencing the iterator.
    using value_type = typename std::remove_const<Incrementable>::type;
    /// \brief A reference type of the type iterated over (\p value_type).
    /// It's the same as `value_type` since counting_iterator is a read-only
    /// iterator and does not have an underlying buffer.
    using reference = value_type; // counting_iterator is not writable
    /// \brief A pointer type of the type iterated over (\p value_type).
    /// It's `const` since counting_iterator is a read-only iterator.
    using pointer = const value_type*; // counting_iterator is not writable
    /// A type used to express the distance between iterators.
    using difference_type = Difference;
    /// The category of the iterator.
    using iterator_category = std::random_access_iterator_tag;

    static_assert(std::is_integral<value_type>::value, "Incrementable must be integral type");

#ifndef DOXYGEN_SHOULD_SKIP_THIS
    using self_type = counting_iterator;
#endif

    /// \brief Creates counting_iterator without an explicit initial value.
    ///
    /// The constructor is defaulted and the stored value has no member
    /// initializer, so the value is zero only when the iterator is
    /// value-initialized (e.g. `counting_iterator<int>()`); a
    /// default-initialized local iterator holds an indeterminate value.
    ROCPRIM_HOST_DEVICE inline
    counting_iterator() = default;

    /// \brief Destroys the iterator (defaulted).
    ROCPRIM_HOST_DEVICE inline
    ~counting_iterator() = default;

    /// \brief Creates counting_iterator and sets its initial value to \p value.
    ///
    /// \param value initial value
    ROCPRIM_HOST_DEVICE inline
    explicit counting_iterator(const value_type value) : value_(value)
    {
    }

    //! \skip_doxy_start
    // Pre-increment: advances to the next integer and returns *this.
    ROCPRIM_HOST_DEVICE inline
    counting_iterator& operator++()
    {
        value_++;
        return *this;
    }

    // Post-increment: advances to the next integer, returns the old iterator.
    ROCPRIM_HOST_DEVICE inline
    counting_iterator operator++(int)
    {
        counting_iterator old_ci = *this;
        value_++;
        return old_ci;
    }

    // Pre-decrement: steps back to the previous integer and returns *this.
    ROCPRIM_HOST_DEVICE inline
    counting_iterator& operator--()
    {
        value_--;
        return *this;
    }

    // Post-decrement: steps back to the previous integer, returns the old iterator.
    ROCPRIM_HOST_DEVICE inline
    counting_iterator operator--(int)
    {
        counting_iterator old_ci = *this;
        value_--;
        return old_ci;
    }

    // Dereference: yields the current integer by value.
    ROCPRIM_HOST_DEVICE inline
    value_type operator*() const
    {
        return value_;
    }

    // Member access: address of the stored integer (valid while *this lives).
    ROCPRIM_HOST_DEVICE inline
    pointer operator->() const
    {
        return &value_;
    }

    // Returns an iterator advanced by `distance`; the distance is converted
    // to value_type before being added.
    ROCPRIM_HOST_DEVICE inline
    counting_iterator operator+(difference_type distance) const
    {
        return counting_iterator(value_ + static_cast<value_type>(distance));
    }

    ROCPRIM_HOST_DEVICE inline
    counting_iterator& operator+=(difference_type distance)
    {
        value_ += static_cast<value_type>(distance);
        return *this;
    }

    // Returns an iterator moved back by `distance`; the distance is converted
    // to value_type before being subtracted.
    ROCPRIM_HOST_DEVICE inline
    counting_iterator operator-(difference_type distance) const
    {
        return counting_iterator(value_ - static_cast<value_type>(distance));
    }

    ROCPRIM_HOST_DEVICE inline
    counting_iterator& operator-=(difference_type distance)
    {
        value_ -= static_cast<value_type>(distance);
        return *this;
    }

    // Signed distance from `other` to this iterator.
    // The subtraction is performed at the width of difference_type (see
    // signed_distance) rather than in value_type: subtracting in an unsigned
    // value_type narrower than difference_type would wrap and turn a negative
    // distance into a large positive one, contradicting operator<.
    ROCPRIM_HOST_DEVICE inline
    difference_type operator-(counting_iterator other) const
    {
        return signed_distance(other.value_, value_);
    }

    // counting_iterator is not writable, so we don't return reference,
    // just something convertible to reference. That matches requirement
    // of RandomAccessIterator concept
    ROCPRIM_HOST_DEVICE inline
    value_type operator[](difference_type distance) const
    {
        return value_ + static_cast<value_type>(distance);
    }

    ROCPRIM_HOST_DEVICE inline
    bool operator==(counting_iterator other) const
    {
        return this->equal_value(value_, other.value_);
    }

    ROCPRIM_HOST_DEVICE inline
    bool operator!=(counting_iterator other) const
    {
        return !(*this == other);
    }

    // The relational operators compare the signed distance from this
    // iterator to `other` against zero.
    ROCPRIM_HOST_DEVICE inline
    bool operator<(counting_iterator other) const
    {
        return distance_to(other) > 0;
    }

    ROCPRIM_HOST_DEVICE inline
    bool operator<=(counting_iterator other) const
    {
        return distance_to(other) >= 0;
    }

    ROCPRIM_HOST_DEVICE inline
    bool operator>(counting_iterator other) const
    {
        return distance_to(other) < 0;
    }

    ROCPRIM_HOST_DEVICE inline
    bool operator>=(counting_iterator other) const
    {
        return distance_to(other) <= 0;
    }

    // Streams the iterator as its current value in square brackets (host only).
    friend std::ostream& operator<<(std::ostream& os, const counting_iterator& iter)
    {
        os << "[" << iter.value_ << "]";
        return os;
    }
    //! \skip_doxy_end

private:
    // Returns (to - from) as a difference_type.
    // Both operands are first converted to the unsigned counterpart of
    // difference_type and subtracted there, so the arithmetic is modular and
    // never overflows a signed type; the result is then converted back to
    // difference_type. For values whose true difference fits in
    // difference_type this yields that difference, including negative ones.
    ROCPRIM_HOST_DEVICE static inline
    difference_type signed_distance(value_type from, value_type to)
    {
        using unsigned_difference = typename std::make_unsigned<difference_type>::type;
        return static_cast<difference_type>(
            static_cast<unsigned_difference>(to) - static_cast<unsigned_difference>(from)
        );
    }

    // Value comparison used by operator==. ROCPRIM_HOST_DEVICE because
    // operator== is ROCPRIM_HOST_DEVICE and calls it.
    template<class T>
    ROCPRIM_HOST_DEVICE inline
    bool equal_value(const T& x, const T& y) const
    {
        return (x == y);
    }

    // Signed distance from this iterator to `other`. ROCPRIM_HOST_DEVICE
    // because the ROCPRIM_HOST_DEVICE relational operators call it.
    ROCPRIM_HOST_DEVICE inline
    difference_type distance_to(const counting_iterator& other) const
    {
        return signed_distance(value_, other.value_);
    }

    value_type value_;
};
/// Commutative counterpart of counting_iterator::operator+ that allows the
/// distance to be written on the left-hand side: `distance + iter`.
/// The result is an iterator advanced by \p distance from \p iter.
template<class Incrementable, class Difference>
ROCPRIM_HOST_DEVICE inline
counting_iterator<Incrementable, Difference>
operator+(typename counting_iterator<Incrementable, Difference>::difference_type distance,
          const counting_iterator<Incrementable, Difference>& iter)
{
    counting_iterator<Incrementable, Difference> advanced(iter);
    advanced += distance;
    return advanced;
}
/// make_counting_iterator is a convenience factory that builds a
/// counting_iterator starting at \p value, so the iterator type does not
/// have to be spelled out at the call site.
///
/// \tparam Incrementable - type of value that can be obtained by dereferencing created iterator.
/// \tparam Difference - a type used to express the distance between counting_iterator iterators.
///
/// \param value - initial value for counting_iterator.
///
/// \return counting_iterator constructed from \p value.
template<class Incrementable, class Difference = std::ptrdiff_t>
ROCPRIM_HOST_DEVICE inline
counting_iterator<Incrementable, Difference>
make_counting_iterator(Incrementable value)
{
    using iterator_type = counting_iterator<Incrementable, Difference>;
    return iterator_type(value);
}
END_ROCPRIM_NAMESPACE
/// @}
// end of group iteratormodule
#endif // ROCPRIM_ITERATOR_COUNTING_ITERATOR_HPP_
// Copyright (c) 2017-2021 Advanced Micro Devices, Inc. All rights reserved.
//
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to deal
// in the Software without restriction, including without limitation the rights
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
// copies of the Software, and to permit persons to whom the Software is
// furnished to do so, subject to the following conditions:
//
// The above copyright notice and this permission notice shall be included in
// all copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
// THE SOFTWARE.
#ifndef ROCPRIM_ITERATOR_REPLACE_FIRST_ITERATOR_HPP_
#define ROCPRIM_ITERATOR_REPLACE_FIRST_ITERATOR_HPP_
#include <iterator>
#include <cstddef>
#include <type_traits>
#include "../../config.hpp"
BEGIN_ROCPRIM_NAMESPACE
namespace detail
{
// Replaces first value of given range with given value. Used in exclusive scan-by-key
// and exclusive segmented scan to avoid allocating additional memory and/or running
// additional kernels.
//
// Important: it does not dereference the first item in given range, so it does not matter
// if it's an invalid pointer.
//
// Usage:
// * input - start of your input range
// * value - value that should be used as first element of new range.
//
// replace_first_iterator<InputIterator>(input - 1, value);
//
// (input - 1) will never be dereferenced.
template<class InputIterator>
class replace_first_iterator
{
private:
    using input_category = typename std::iterator_traits<InputIterator>::iterator_category;
    // Accept every category tag that is convertible to
    // std::random_access_iterator_tag, not only that exact type, so that
    // iterators whose tag derives from it are not rejected.
    static_assert(
        std::is_convertible<input_category, std::random_access_iterator_tag>::value,
        "InputIterator must be a random-access iterator"
    );

public:
    // Values are returned by copy: the iterator is read-only, and the
    // replaced first element does not live in the underlying range.
    using value_type = typename std::iterator_traits<InputIterator>::value_type;
    using reference = value_type;
    using pointer = const value_type*;
    using difference_type = typename std::iterator_traits<InputIterator>::difference_type;
    using iterator_category = std::random_access_iterator_tag;

    ROCPRIM_HOST_DEVICE inline
    ~replace_first_iterator() = default;

    // iterator - underlying iterator for the current position
    // value    - value returned instead of *iterator while index is 0
    // index    - position of this iterator within the new range (0 = first)
    ROCPRIM_HOST_DEVICE inline
    replace_first_iterator(InputIterator iterator, value_type value, size_t index = 0)
        : iterator_(iterator), value_(value), index_(index)
    {
    }

    // Pre-increment: advances the underlying iterator and the index together.
    ROCPRIM_HOST_DEVICE inline
    replace_first_iterator& operator++()
    {
        iterator_++;
        index_++;
        return *this;
    }

    // Post-increment: advances both, returns the iterator as it was before.
    ROCPRIM_HOST_DEVICE inline
    replace_first_iterator operator++(int)
    {
        replace_first_iterator old = *this;
        iterator_++;
        index_++;
        return old;
    }

    // At index 0 the replacement value is returned and the underlying
    // iterator is NOT dereferenced; at any other index the underlying
    // element is read.
    ROCPRIM_HOST_DEVICE inline
    value_type operator*() const
    {
        if(index_ == 0)
        {
            return value_;
        }
        return *iterator_;
    }

    // Subscript: dereferences a copy advanced by `distance`, so the
    // index-0 replacement rule of operator* applies here as well.
    ROCPRIM_HOST_DEVICE inline
    value_type operator[](difference_type distance) const
    {
        replace_first_iterator i = (*this) + distance;
        return *i;
    }

    // Returns a copy with both the underlying iterator and the index
    // advanced by `distance`.
    ROCPRIM_HOST_DEVICE inline
    replace_first_iterator operator+(difference_type distance) const
    {
        return replace_first_iterator(iterator_ + distance, value_, index_ + distance);
    }

    // Advances the underlying iterator and the index by `distance` in place.
    ROCPRIM_HOST_DEVICE inline
    replace_first_iterator& operator+=(difference_type distance)
    {
        iterator_ += distance;
        index_ += distance;
        return *this;
    }

private:
    InputIterator iterator_; // underlying iterator at the current position
    value_type value_;       // replacement for the element at index 0
    size_t index_;           // current position within the new range
};
} // end of detail namespace
END_ROCPRIM_NAMESPACE
#endif // ROCPRIM_ITERATOR_REPLACE_FIRST_ITERATOR_HPP_
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment