Commit f8a481f8 authored by zhouxiang's avatar zhouxiang
Browse files

添加dtk中的cub头文件

parent 7b7c64c5
// Copyright (c) 2018-2022 Advanced Micro Devices, Inc. All rights reserved.
//
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to deal
// in the Software without restriction, including without limitation the rights
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
// copies of the Software, and to permit persons to whom the Software is
// furnished to do so, subject to the following conditions:
//
// The above copyright notice and this permission notice shall be included in
// all copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
// THE SOFTWARE.
#ifndef ROCPRIM_DEVICE_DEVICE_HISTOGRAM_CONFIG_HPP_
#define ROCPRIM_DEVICE_DEVICE_HISTOGRAM_CONFIG_HPP_
#include <type_traits>
#include "../config.hpp"
#include "../detail/various.hpp"
#include "config_types.hpp"
/// \addtogroup primitivesmodule_deviceconfigs
/// @{
BEGIN_ROCPRIM_NAMESPACE
/// \brief Tuning parameters of the device-level histogram operation.
///
/// \tparam HistogramConfig - configuration of the histogram kernel. Must be \p kernel_config.
/// \tparam MaxGridSize - maximum number of blocks to launch.
/// \tparam SharedImplMaxBins - maximum total number of bins (summed over all active channels)
/// for which the shared memory implementation is used
/// (samples -> shared memory bins -> global memory bins). When this limit is exceeded the
/// global memory implementation is used instead (samples -> global memory bins).
template<typename HistogramConfig,
         unsigned int MaxGridSize       = 1024,
         unsigned int SharedImplMaxBins = 2048>
struct histogram_config
{
#ifndef DOXYGEN_SHOULD_SKIP_THIS
    /// Kernel configuration, exposed exactly as passed in \p HistogramConfig.
    using histogram = HistogramConfig;

    /// Value of the \p SharedImplMaxBins template argument.
    static constexpr unsigned int shared_impl_max_bins = SharedImplMaxBins;

    /// Value of the \p MaxGridSize template argument.
    static constexpr unsigned int max_grid_size = MaxGridSize;
#endif
};

#ifndef DOXYGEN_SHOULD_SKIP_THIS
// Namespace-scope definitions of the static data members declared above, so that
// they may be odr-used (e.g. bound to a reference) under pre-C++17 rules as well.
template<typename HistogramConfig, unsigned int MaxGridSize, unsigned int SharedImplMaxBins>
constexpr unsigned int
    histogram_config<HistogramConfig, MaxGridSize, SharedImplMaxBins>::shared_impl_max_bins;

template<typename HistogramConfig, unsigned int MaxGridSize, unsigned int SharedImplMaxBins>
constexpr unsigned int
    histogram_config<HistogramConfig, MaxGridSize, SharedImplMaxBins>::max_grid_size;
#endif
namespace detail
{
/// Histogram defaults that \p default_histogram_config selects for target arch 803.
/// \p ActiveChannels is accepted for interface symmetry but does not affect the result.
template<typename Sample, unsigned int Channels, unsigned int ActiveChannels>
struct histogram_config_803
{
    /// Ratio of sizeof(Sample) to sizeof(int), as computed by \p ceiling_div.
    static constexpr unsigned int item_scale
        = ::rocprim::detail::ceiling_div(sizeof(Sample), sizeof(int));

    /// Block size 256; an items-per-thread budget of 10 is divided by the channel
    /// count and by \p item_scale, with a lower bound of 1.
    using type = histogram_config<
        kernel_config<256, ::rocprim::max(10u / Channels / item_scale, 1u)>
    >;
};
/// Histogram defaults that \p default_histogram_config selects for target arch 900
/// and also uses as its fallback entry.
/// \p ActiveChannels is accepted for interface symmetry but does not affect the result.
template<typename Sample, unsigned int Channels, unsigned int ActiveChannels>
struct histogram_config_900
{
    /// Ratio of sizeof(Sample) to sizeof(int), as computed by \p ceiling_div.
    static constexpr unsigned int item_scale
        = ::rocprim::detail::ceiling_div(sizeof(Sample), sizeof(int));

    /// Block size 256; an items-per-thread budget of 8 is divided by the channel
    /// count and by \p item_scale, with a lower bound of 1.
    using type = histogram_config<
        kernel_config<256, ::rocprim::max(8u / Channels / item_scale, 1u)>
    >;
};
// TODO: We need to update these parameters
/// Histogram defaults that \p default_histogram_config selects for \p ROCPRIM_ARCH_90a.
/// The values currently mirror \p histogram_config_900.
/// \p ActiveChannels is accepted for interface symmetry but does not affect the result.
template<typename Sample, unsigned int Channels, unsigned int ActiveChannels>
struct histogram_config_90a
{
    /// Ratio of sizeof(Sample) to sizeof(int), as computed by \p ceiling_div.
    static constexpr unsigned int item_scale
        = ::rocprim::detail::ceiling_div(sizeof(Sample), sizeof(int));

    /// Block size 256; an items-per-thread budget of 8 is divided by the channel
    /// count and by \p item_scale, with a lower bound of 1.
    using type = histogram_config<
        kernel_config<256, ::rocprim::max(8u / Channels / item_scale, 1u)>
    >;
};
// TODO: We need to update these parameters
/// Histogram defaults that \p default_histogram_config selects for target arch 1030.
/// The values currently mirror \p histogram_config_900.
/// \p ActiveChannels is accepted for interface symmetry but does not affect the result.
template<typename Sample, unsigned int Channels, unsigned int ActiveChannels>
struct histogram_config_1030
{
    /// Ratio of sizeof(Sample) to sizeof(int), as computed by \p ceiling_div.
    static constexpr unsigned int item_scale
        = ::rocprim::detail::ceiling_div(sizeof(Sample), sizeof(int));

    /// Block size 256; an items-per-thread budget of 8 is divided by the channel
    /// count and by \p item_scale, with a lower bound of 1.
    using type = histogram_config<
        kernel_config<256, ::rocprim::max(8u / Channels / item_scale, 1u)>
    >;
};
/// Picks the per-architecture histogram defaults for \p TargetArch via \p select_arch.
/// Listed cases: 803, 900, \p ROCPRIM_ARCH_90a and 1030; the trailing
/// \p histogram_config_900 entry is the one supplied after all explicit cases.
template<unsigned int TargetArch,
         typename Sample,
         unsigned int Channels,
         unsigned int ActiveChannels>
struct default_histogram_config
    : select_arch<
          TargetArch,
          select_arch_case<803, histogram_config_803<Sample, Channels, ActiveChannels>>,
          select_arch_case<900, histogram_config_900<Sample, Channels, ActiveChannels>>,
          select_arch_case<ROCPRIM_ARCH_90a, histogram_config_90a<Sample, Channels, ActiveChannels>>,
          select_arch_case<1030, histogram_config_1030<Sample, Channels, ActiveChannels>>,
          histogram_config_900<Sample, Channels, ActiveChannels>>
{
};
} // end namespace detail
END_ROCPRIM_NAMESPACE
/// @}
// end of group primitivesmodule_deviceconfigs
#endif // ROCPRIM_DEVICE_DEVICE_HISTOGRAM_CONFIG_HPP_
// Copyright (c) 2017-2022 Advanced Micro Devices, Inc. All rights reserved.
//
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to deal
// in the Software without restriction, including without limitation the rights
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
// copies of the Software, and to permit persons to whom the Software is
// furnished to do so, subject to the following conditions:
//
// The above copyright notice and this permission notice shall be included in
// all copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
// THE SOFTWARE.
#ifndef ROCPRIM_DEVICE_DEVICE_MERGE_HPP_
#define ROCPRIM_DEVICE_DEVICE_MERGE_HPP_
#include <type_traits>
#include <iterator>
#include "../config.hpp"
#include "../detail/various.hpp"
#include "device_merge_config.hpp"
#include "detail/device_merge.hpp"
BEGIN_ROCPRIM_NAMESPACE
/// \addtogroup devicemodule
/// @{
namespace detail
{
/// \brief Kernel entry point for the partitioning step of the device-level merge.
///
/// The kernel body only forwards its arguments, unchanged and in the same order,
/// to \p partition_kernel_impl. Launch bounds are \p ROCPRIM_DEFAULT_MAX_BLOCK_SIZE.
///
/// \param index - iterator receiving the partition results.
/// NOTE(review): \p merge_impl passes its temporary storage viewed as
/// <tt>unsigned int *</tt>; confirm the layout against \p partition_kernel_impl.
/// \param keys_input1 - iterator to the first key of the first input range.
/// \param keys_input2 - iterator to the first key of the second input range.
/// \param input1_size - number of keys in the first input range.
/// \param input2_size - number of keys in the second input range.
/// \param spacing - forwarded as-is; \p merge_impl passes its items-per-block value here.
/// \param compare_function - comparison function object forwarded to the implementation.
template<
class IndexIterator,
class KeysInputIterator1,
class KeysInputIterator2,
class BinaryFunction
>
ROCPRIM_KERNEL
__launch_bounds__(ROCPRIM_DEFAULT_MAX_BLOCK_SIZE)
void partition_kernel(IndexIterator index,
KeysInputIterator1 keys_input1,
KeysInputIterator2 keys_input2,
const size_t input1_size,
const size_t input2_size,
const unsigned int spacing,
BinaryFunction compare_function)
{
// All work happens in the implementation function.
partition_kernel_impl(
index, keys_input1, keys_input2, input1_size, input2_size,
spacing, compare_function
);
}
/// \brief Kernel entry point for the merging step of the device-level merge.
///
/// The kernel body only forwards its arguments to
/// \p merge_kernel_impl<BlockSize, ItemsPerThread>. Launch bounds are \p BlockSize.
///
/// \tparam BlockSize - launch bound of the kernel; also forwarded to the implementation.
/// \tparam ItemsPerThread - forwarded to the implementation.
///
/// \param index - iterator to the partition results written by \p partition_kernel
/// (the same storage is passed to both kernels by \p merge_impl).
/// \param keys_input1 - iterator to the first key of the first input range.
/// \param keys_input2 - iterator to the first key of the second input range.
/// \param keys_output - iterator to the first key of the output range.
/// \param values_input1 - iterator to the first value of the first input range.
/// \param values_input2 - iterator to the first value of the second input range.
/// \param values_output - iterator to the first value of the output range.
/// \param input1_size - number of elements in the first input range.
/// \param input2_size - number of elements in the second input range.
/// \param compare_function - comparison function object forwarded to the implementation.
template<
unsigned int BlockSize,
unsigned int ItemsPerThread,
class IndexIterator,
class KeysInputIterator1,
class KeysInputIterator2,
class KeysOutputIterator,
class ValuesInputIterator1,
class ValuesInputIterator2,
class ValuesOutputIterator,
class BinaryFunction
>
ROCPRIM_KERNEL
__launch_bounds__(BlockSize)
void merge_kernel(IndexIterator index,
KeysInputIterator1 keys_input1,
KeysInputIterator2 keys_input2,
KeysOutputIterator keys_output,
ValuesInputIterator1 values_input1,
ValuesInputIterator2 values_input2,
ValuesOutputIterator values_output,
const size_t input1_size,
const size_t input2_size,
BinaryFunction compare_function)
{
// All work happens in the implementation function.
merge_kernel_impl<BlockSize, ItemsPerThread>(
index, keys_input1, keys_input2, keys_output,
values_input1, values_input2, values_output,
input1_size, input2_size, compare_function
);
}
// Post-launch check used by the host-side driver functions in this header.
//
// Expands to a braced block that:
//   1. returns the value of cudaGetLastError() from the enclosing function if it
//      differs from cudaSuccess;
//   2. when `debug_synchronous` is true, prints `name(size)`, synchronizes `stream`
//      (returning any synchronization error), and prints the time elapsed since
//      `start` in milliseconds.
//
// The expansion site must have `debug_synchronous` and `stream` in scope and must
// be inside a function whose return type accepts the runtime error value.
//
// name  - label streamed to std::cout.
// size  - value streamed to std::cout inside the parentheses.
// start - std::chrono time point taken before the launch.
//
// Locals use a single leading underscore: identifiers containing a double
// underscore are reserved for the implementation.
#define ROCPRIM_DETAIL_HIP_SYNC_AND_RETURN_ON_ERROR(name, size, start) \
    { \
        auto _error = cudaGetLastError(); \
        if(_error != cudaSuccess) return _error; \
        if(debug_synchronous) \
        { \
            std::cout << name << "(" << size << ")"; \
            auto _sync_error = cudaStreamSynchronize(stream); \
            if(_sync_error != cudaSuccess) return _sync_error; \
            auto _end = std::chrono::high_resolution_clock::now(); \
            auto _d = std::chrono::duration_cast<std::chrono::duration<double>>(_end - start); \
            std::cout << " " << _d.count() * 1000 << " ms" << '\n'; \
        } \
    }
/// \brief Host-side driver shared by both public \p merge overloads.
///
/// With a null \p temporary_storage the function only reports the required number
/// of bytes through \p storage_size. Otherwise it launches \p partition_kernel
/// followed by \p merge_kernel on \p stream, checking for errors after each launch.
///
/// \tparam Config - \p default_config or a custom configuration providing
/// \p block_size and \p items_per_thread.
///
/// \param [in] temporary_storage - device-accessible scratch buffer, or null to query its size.
/// \param [in,out] storage_size - receives the required size in bytes when querying.
/// \param [in] keys_input1, keys_input2 - iterators to the two key input ranges.
/// \param [out] keys_output - iterator to the key output range.
/// \param [in] values_input1, values_input2 - iterators to the two value input ranges.
/// \param [out] values_output - iterator to the value output range.
/// \param [in] input1_size, input2_size - element counts of the two input ranges.
/// \param [in] compare_function - comparison function object handed to both kernels.
/// \param [in] stream - stream on which both kernels are launched.
/// \param [in] debug_synchronous - when true, prints launch parameters and timings
/// and synchronizes after every launch.
///
/// \returns \p cudaSuccess, or the first runtime error that was detected.
template<
    class Config,
    class KeysInputIterator1,
    class KeysInputIterator2,
    class KeysOutputIterator,
    class ValuesInputIterator1,
    class ValuesInputIterator2,
    class ValuesOutputIterator,
    class BinaryFunction
>
inline
cudaError_t merge_impl(void * temporary_storage,
                       size_t& storage_size,
                       KeysInputIterator1 keys_input1,
                       KeysInputIterator2 keys_input2,
                       KeysOutputIterator keys_output,
                       ValuesInputIterator1 values_input1,
                       ValuesInputIterator2 values_input2,
                       ValuesOutputIterator values_output,
                       const size_t input1_size,
                       const size_t input2_size,
                       BinaryFunction compare_function,
                       const cudaStream_t stream,
                       bool debug_synchronous)
{
    using key_type   = typename std::iterator_traits<KeysInputIterator1>::value_type;
    using value_type = typename std::iterator_traits<ValuesInputIterator1>::value_type;

    // A caller-supplied Config wins; default_config resolves to the per-arch defaults.
    using config = detail::default_or_custom_config<
        Config,
        detail::default_merge_config<ROCPRIM_TARGET_ARCH, key_type, value_type>
    >;

    static constexpr unsigned int block_size       = config::block_size;
    static constexpr unsigned int items_per_thread = config::items_per_thread;
    static constexpr unsigned int half_block       = block_size / 2;
    static constexpr auto         items_per_block  = block_size * items_per_thread;

    // Number of output tiles: total element count divided by items_per_block, rounded up.
    const unsigned int tile_count
        = ((input1_size + input2_size) + items_per_block - 1) / items_per_block;
    // The scratch buffer holds one unsigned int per tile plus one extra entry.
    const unsigned int boundary_count  = tile_count + 1;
    const size_t       partition_bytes = boundary_count * sizeof(unsigned int);

    // Size query: report the requirement (always non-zero) and do nothing else.
    if(temporary_storage == nullptr)
    {
        storage_size = partition_bytes;
        return cudaSuccess;
    }

    // Both inputs are empty: nothing to launch.
    if(tile_count == 0u)
    {
        return cudaSuccess;
    }

    // Reference point for the timings printed in debug mode.
    std::chrono::high_resolution_clock::time_point launch_time;

    if(debug_synchronous)
    {
        std::cout << "block_size " << block_size << '\n';
        std::cout << "number of blocks " << tile_count << '\n';
        std::cout << "items_per_block " << items_per_block << '\n';
    }

    unsigned int * index = reinterpret_cast<unsigned int *>(temporary_storage);

    // Step 1: partition kernel, launched with half_block threads per block and
    // enough blocks to provide one thread per scratch-buffer entry.
    const unsigned int partition_grid = (boundary_count + half_block - 1) / half_block;
    if(debug_synchronous)
    {
        launch_time = std::chrono::high_resolution_clock::now();
    }
    detail::partition_kernel
        <<<dim3(partition_grid), dim3(half_block), 0, stream>>>(
            index, keys_input1, keys_input2, input1_size, input2_size,
            items_per_block, compare_function
        );
    ROCPRIM_DETAIL_HIP_SYNC_AND_RETURN_ON_ERROR("partition_kernel", input1_size, launch_time);

    // Step 2: merge kernel, one block per tile.
    if(debug_synchronous)
    {
        launch_time = std::chrono::high_resolution_clock::now();
    }
    detail::merge_kernel<block_size, items_per_thread>
        <<<dim3(tile_count), dim3(block_size), 0, stream>>>(
            index, keys_input1, keys_input2, keys_output,
            values_input1, values_input2, values_output,
            input1_size, input2_size, compare_function
        );
    ROCPRIM_DETAIL_HIP_SYNC_AND_RETURN_ON_ERROR("merge_kernel", input1_size, launch_time);

    return cudaSuccess;
}
#undef ROCPRIM_DETAIL_HIP_SYNC_AND_RETURN_ON_ERROR
} // end of detail namespace
/// \brief Parallel merge primitive for device level.
///
/// \p merge function performs a device-wide merge.
/// Function merges two ordered sets of input values based on comparison function.
///
/// \par Overview
/// * The contents of the inputs are not altered by the merging function.
/// * Returns the required size of \p temporary_storage in \p storage_size
/// if \p temporary_storage is a null pointer.
/// * Accepts custom compare_functions for merging across the device.
///
/// \tparam Config - [optional] configuration of the primitive. It can be \p merge_config or
/// a custom class with the same members.
/// \tparam InputIterator1 - random-access iterator type of the first input range. Must meet the
/// requirements of a C++ InputIterator concept. It can be a simple pointer type.
/// \tparam InputIterator2 - random-access iterator type of the second input range. Must meet the
/// requirements of a C++ InputIterator concept. It can be a simple pointer type.
/// \tparam OutputIterator - random-access iterator type of the output range. Must meet the
/// requirements of a C++ OutputIterator concept. It can be a simple pointer type.
///
/// \param [in] temporary_storage - pointer to a device-accessible temporary storage. When
/// a null pointer is passed, the required allocation size (in bytes) is written to
/// \p storage_size and function returns without performing the merge operation.
/// \param [in,out] storage_size - reference to a size (in bytes) of \p temporary_storage.
/// \param [in] input1 - iterator to the first element in the first range to merge.
/// \param [in] input2 - iterator to the first element in the second range to merge.
/// \param [out] output - iterator to the first element in the output range.
/// \param [in] input1_size - number of elements in the first input range.
/// \param [in] input2_size - number of elements in the second input range.
/// \param [in] compare_function - binary operation function object that will be used for comparison.
/// The signature of the function should be equivalent to the following:
/// <tt>bool f(const T &a, const T &b);</tt>. The signature does not need to have
/// <tt>const &</tt>, but function object must not modify the objects passed to it.
/// The default value is \p BinaryFunction().
/// \param [in] stream - [optional] HIP stream object. Default is \p 0 (default stream).
/// \param [in] debug_synchronous - [optional] If true, synchronization after every kernel
/// launch is forced in order to check for errors. Default value is \p false.
///
/// \returns \p cudaSuccess (\p 0) after successful merge; otherwise a HIP runtime error of
/// type \p cudaError_t.
///
/// \par Example
/// \parblock
/// In this example a device-level ascending merge is performed on two arrays of
/// \p int values.
///
/// \code{.cpp}
/// #include <rocprim/rocprim.hpp>
///
/// // Prepare input and output (declare pointers, allocate device memory etc.)
/// size_t input_size1; // e.g., 4
/// size_t input_size2; // e.g., 4
/// int * input1; // e.g., [0, 1, 2, 3]
/// int * input2; // e.g., [0, 1, 2, 3]
/// int * output; // empty array of 8 elements
///
/// size_t temporary_storage_size_bytes;
/// void * temporary_storage_ptr = nullptr;
/// // Get required size of the temporary storage
/// rocprim::merge(
/// temporary_storage_ptr, temporary_storage_size_bytes,
/// input1, input2, output, input_size1, input_size2
/// );
///
/// // allocate temporary storage
/// cudaMalloc(&temporary_storage_ptr, temporary_storage_size_bytes);
///
/// // perform merge
/// rocprim::merge(
/// temporary_storage_ptr, temporary_storage_size_bytes,
/// input1, input2, output, input_size1, input_size2
/// );
/// // output: [0, 0, 1, 1, 2, 2, 3, 3]
/// \endcode
/// \endparblock
template<
class Config = default_config,
class InputIterator1,
class InputIterator2,
class OutputIterator,
class BinaryFunction = ::rocprim::less<typename std::iterator_traits<InputIterator1>::value_type>
>
inline
cudaError_t merge(void * temporary_storage,
size_t& storage_size,
InputIterator1 input1,
InputIterator2 input2,
OutputIterator output,
const size_t input1_size,
const size_t input2_size,
BinaryFunction compare_function = BinaryFunction(),
const cudaStream_t stream = 0,
bool debug_synchronous = false)
{
// Keys-only merge: a null empty_type pointer is passed for all three value iterators.
empty_type * values = nullptr;
return detail::merge_impl<Config>(
temporary_storage, storage_size,
input1, input2, output,
values, values, values,
input1_size, input2_size, compare_function,
stream, debug_synchronous
);
}
/// \brief Parallel merge primitive for device level.
///
/// \p merge function performs a device-wide merge of (key, value) pairs.
/// Function merges two ordered sets of input keys and corresponding values
/// based on key comparison function.
///
/// \par Overview
/// * The contents of the inputs are not altered by the merging function.
/// * Returns the required size of \p temporary_storage in \p storage_size
/// if \p temporary_storage is a null pointer.
/// * Accepts custom compare_functions for merging across the device.
///
/// \tparam Config - [optional] configuration of the primitive. It can be \p merge_config or
/// a custom class with the same members.
/// \tparam KeysInputIterator1 - random-access iterator type of the first keys input range. Must meet the
/// requirements of a C++ InputIterator concept. It can be a simple pointer type.
/// \tparam KeysInputIterator2 - random-access iterator type of the second keys input range. Must meet the
/// requirements of a C++ InputIterator concept. It can be a simple pointer type.
/// \tparam KeysOutputIterator - random-access iterator type of the keys output range. Must meet the
/// requirements of a C++ OutputIterator concept. It can be a simple pointer type.
/// \tparam ValuesInputIterator1 - random-access iterator type of the first values input range. Must meet the
/// requirements of a C++ InputIterator concept. It can be a simple pointer type.
/// \tparam ValuesInputIterator2 - random-access iterator type of the second values input range. Must meet the
/// requirements of a C++ InputIterator concept. It can be a simple pointer type.
/// \tparam ValuesOutputIterator - random-access iterator type of the values output range. Must meet the
/// requirements of a C++ OutputIterator concept. It can be a simple pointer type.
///
/// \param [in] temporary_storage - pointer to a device-accessible temporary storage. When
/// a null pointer is passed, the required allocation size (in bytes) is written to
/// \p storage_size and function returns without performing the merge operation.
/// \param [in,out] storage_size - reference to a size (in bytes) of \p temporary_storage.
/// \param [in] keys_input1 - iterator to the first key in the first range to merge.
/// \param [in] keys_input2 - iterator to the first key in the second range to merge.
/// \param [out] keys_output - iterator to the first key in the output range.
/// \param [in] values_input1 - iterator to the first value in the first range to merge.
/// \param [in] values_input2 - iterator to the first value in the second range to merge.
/// \param [out] values_output - iterator to the first value in the output range.
/// \param [in] input1_size - number of elements in the first input range.
/// \param [in] input2_size - number of elements in the second input range.
/// \param [in] compare_function - binary operation function object that will be used for key comparison.
/// The signature of the function should be equivalent to the following:
/// <tt>bool f(const T &a, const T &b);</tt>. The signature does not need to have
/// <tt>const &</tt>, but function object must not modify the objects passed to it.
/// The default value is \p BinaryFunction().
/// \param [in] stream - [optional] HIP stream object. Default is \p 0 (default stream).
/// \param [in] debug_synchronous - [optional] If true, synchronization after every kernel
/// launch is forced in order to check for errors. Default value is \p false.
///
/// \returns \p cudaSuccess (\p 0) after successful merge; otherwise a HIP runtime error of
/// type \p cudaError_t.
///
/// \par Example
/// \parblock
/// In this example a device-level ascending merge is performed on two arrays of
/// \p int keys with associated \p int values.
///
/// \code{.cpp}
/// #include <rocprim/rocprim.hpp>
///
/// // Prepare input and output (declare pointers, allocate device memory etc.)
/// size_t input_size1; // e.g., 4
/// size_t input_size2; // e.g., 4
/// int * keys_input1; // e.g., [0, 1, 2, 3]
/// int * keys_input2; // e.g., [0, 1, 2, 3]
/// int * keys_output; // empty array of 8 elements
/// int * values_input1; // e.g., [10, 11, 12, 13]
/// int * values_input2; // e.g., [20, 21, 22, 23]
/// int * values_output; // empty array of 8 elements
///
/// size_t temporary_storage_size_bytes;
/// void * temporary_storage_ptr = nullptr;
/// // Get required size of the temporary storage
/// rocprim::merge(
/// temporary_storage_ptr, temporary_storage_size_bytes,
/// keys_input1, keys_input2, keys_output,
/// values_input1, values_input2, values_output,
/// input_size1, input_size2
/// );
///
/// // allocate temporary storage
/// cudaMalloc(&temporary_storage_ptr, temporary_storage_size_bytes);
///
/// // perform merge
/// rocprim::merge(
/// temporary_storage_ptr, temporary_storage_size_bytes,
/// keys_input1, keys_input2, keys_output,
/// values_input1, values_input2, values_output,
/// input_size1, input_size2
/// );
/// // keys_output: [0, 0, 1, 1, 2, 2, 3, 3]
/// // values_output: [10, 20, 11, 21, 12, 22, 13, 23]
/// \endcode
/// \endparblock
template<
class Config = default_config,
class KeysInputIterator1,
class KeysInputIterator2,
class KeysOutputIterator,
class ValuesInputIterator1,
class ValuesInputIterator2,
class ValuesOutputIterator,
class BinaryFunction = ::rocprim::less<typename std::iterator_traits<KeysInputIterator1>::value_type>
>
inline
cudaError_t merge(void * temporary_storage,
size_t& storage_size,
KeysInputIterator1 keys_input1,
KeysInputIterator2 keys_input2,
KeysOutputIterator keys_output,
ValuesInputIterator1 values_input1,
ValuesInputIterator2 values_input2,
ValuesOutputIterator values_output,
const size_t input1_size,
const size_t input2_size,
BinaryFunction compare_function = BinaryFunction(),
const cudaStream_t stream = 0,
bool debug_synchronous = false)
{
// Key/value merge: every argument is forwarded unchanged to the shared driver.
return detail::merge_impl<Config>(
temporary_storage, storage_size,
keys_input1, keys_input2, keys_output,
values_input1, values_input2, values_output,
input1_size, input2_size, compare_function,
stream, debug_synchronous
);
}
/// @}
// end of group devicemodule
END_ROCPRIM_NAMESPACE
#endif // ROCPRIM_DEVICE_DEVICE_MERGE_HPP_
// Copyright (c) 2018-2019 Advanced Micro Devices, Inc. All rights reserved.
//
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to deal
// in the Software without restriction, including without limitation the rights
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
// copies of the Software, and to permit persons to whom the Software is
// furnished to do so, subject to the following conditions:
//
// The above copyright notice and this permission notice shall be included in
// all copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
// THE SOFTWARE.
#ifndef ROCPRIM_DEVICE_DEVICE_MERGE_CONFIG_HPP_
#define ROCPRIM_DEVICE_DEVICE_MERGE_CONFIG_HPP_
#include <type_traits>
#include "../config.hpp"
#include "../detail/various.hpp"
#include "config_types.hpp"
/// \addtogroup primitivesmodule_deviceconfigs
/// @{
BEGIN_ROCPRIM_NAMESPACE
/// \brief Configuration of device-level merge primitives.
///
/// Alias of \p kernel_config with the same two template arguments.
///
/// \tparam BlockSize - first argument forwarded to \p kernel_config.
/// \tparam ItemsPerThread - second argument forwarded to \p kernel_config.
template<unsigned int BlockSize, unsigned int ItemsPerThread>
using merge_config = kernel_config<BlockSize, ItemsPerThread>;
namespace detail
{
/// Merge defaults that \p default_merge_config selects for target arch 803
/// when keys are accompanied by values.
template<typename Key, typename Value>
struct merge_config_803
{
    /// Larger of sizeof(Key) and sizeof(Value) relative to sizeof(int),
    /// as computed by \p ceiling_div.
    static constexpr unsigned int item_scale
        = ::rocprim::detail::ceiling_div<unsigned int>(
            ::rocprim::max(sizeof(Key), sizeof(Value)), sizeof(int));

    // TODO Tune when merge-by-key is ready
    /// Block size 256; items per thread is 10 divided by \p item_scale, at least 1.
    using type = merge_config<256, ::rocprim::max(1u, 10u / item_scale)>;
};

/// Keys-only specialization (\p Value is \p empty_type) for target arch 803.
template<typename Key>
struct merge_config_803<Key, empty_type>
{
    /// sizeof(Key) relative to sizeof(int), as computed by \p ceiling_div.
    static constexpr unsigned int item_scale
        = ::rocprim::detail::ceiling_div<unsigned int>(sizeof(Key), sizeof(int));

    /// Fixed entries for keys of up to 2, 4 and 8 bytes; the scaled formula is the
    /// last entry handed to \p select_type.
    using type = select_type<
        select_type_case<(sizeof(Key) <= 2), merge_config<256, 11>>,
        select_type_case<(sizeof(Key) <= 4), merge_config<256, 10>>,
        select_type_case<(sizeof(Key) <= 8), merge_config<256, 7>>,
        merge_config<256, ::rocprim::max(1u, 10u / item_scale)>
    >;
};
/// Merge defaults that \p default_merge_config selects for target arch 900 (and
/// lists as its trailing fallback entry) when keys are accompanied by values.
template<typename Key, typename Value>
struct merge_config_900
{
    /// Larger of sizeof(Key) and sizeof(Value) relative to sizeof(int),
    /// as computed by \p ceiling_div.
    static constexpr unsigned int item_scale
        = ::rocprim::detail::ceiling_div<unsigned int>(
            ::rocprim::max(sizeof(Key), sizeof(Value)), sizeof(int));

    // TODO Tune when merge-by-key is ready
    /// Block size 256; items per thread is 10 divided by \p item_scale, at least 1.
    using type = merge_config<256, ::rocprim::max(1u, 10u / item_scale)>;
};

/// Keys-only specialization (\p Value is \p empty_type) for target arch 900.
template<typename Key>
struct merge_config_900<Key, empty_type>
{
    /// sizeof(Key) relative to sizeof(int), as computed by \p ceiling_div.
    static constexpr unsigned int item_scale
        = ::rocprim::detail::ceiling_div<unsigned int>(sizeof(Key), sizeof(int));

    /// Fixed entries for keys of up to 2, 4 and 8 bytes; the scaled formula is the
    /// last entry handed to \p select_type.
    using type = select_type<
        select_type_case<(sizeof(Key) <= 2), merge_config<256, 11>>,
        select_type_case<(sizeof(Key) <= 4), merge_config<256, 10>>,
        select_type_case<(sizeof(Key) <= 8), merge_config<256, 7>>,
        merge_config<256, ::rocprim::max(1u, 10u / item_scale)>
    >;
};
// TODO: We need to update these parameters
/// Merge defaults that \p default_merge_config selects for \p ROCPRIM_ARCH_90a
/// when keys are accompanied by values. The values currently mirror \p merge_config_900.
template<typename Key, typename Value>
struct merge_config_90a
{
    /// Larger of sizeof(Key) and sizeof(Value) relative to sizeof(int),
    /// as computed by \p ceiling_div.
    static constexpr unsigned int item_scale
        = ::rocprim::detail::ceiling_div<unsigned int>(
            ::rocprim::max(sizeof(Key), sizeof(Value)), sizeof(int));

    // TODO Tune when merge-by-key is ready
    /// Block size 256; items per thread is 10 divided by \p item_scale, at least 1.
    using type = merge_config<256, ::rocprim::max(1u, 10u / item_scale)>;
};

/// Keys-only specialization (\p Value is \p empty_type) for \p ROCPRIM_ARCH_90a.
template<typename Key>
struct merge_config_90a<Key, empty_type>
{
    /// sizeof(Key) relative to sizeof(int), as computed by \p ceiling_div.
    static constexpr unsigned int item_scale
        = ::rocprim::detail::ceiling_div<unsigned int>(sizeof(Key), sizeof(int));

    /// Fixed entries for keys of up to 2, 4 and 8 bytes; the scaled formula is the
    /// last entry handed to \p select_type.
    using type = select_type<
        select_type_case<(sizeof(Key) <= 2), merge_config<256, 11>>,
        select_type_case<(sizeof(Key) <= 4), merge_config<256, 10>>,
        select_type_case<(sizeof(Key) <= 8), merge_config<256, 7>>,
        merge_config<256, ::rocprim::max(1u, 10u / item_scale)>
    >;
};
// TODO: We need to update these parameters
/// Merge defaults that \p default_merge_config selects for target arch 1030
/// when keys are accompanied by values. The values currently mirror \p merge_config_900.
template<typename Key, typename Value>
struct merge_config_1030
{
    /// Larger of sizeof(Key) and sizeof(Value) relative to sizeof(int),
    /// as computed by \p ceiling_div.
    static constexpr unsigned int item_scale
        = ::rocprim::detail::ceiling_div<unsigned int>(
            ::rocprim::max(sizeof(Key), sizeof(Value)), sizeof(int));

    // TODO Tune when merge-by-key is ready
    /// Block size 256; items per thread is 10 divided by \p item_scale, at least 1.
    using type = merge_config<256, ::rocprim::max(1u, 10u / item_scale)>;
};

/// Keys-only specialization (\p Value is \p empty_type) for target arch 1030.
template<typename Key>
struct merge_config_1030<Key, empty_type>
{
    /// sizeof(Key) relative to sizeof(int), as computed by \p ceiling_div.
    static constexpr unsigned int item_scale
        = ::rocprim::detail::ceiling_div<unsigned int>(sizeof(Key), sizeof(int));

    /// Fixed entries for keys of up to 2, 4 and 8 bytes; the scaled formula is the
    /// last entry handed to \p select_type.
    using type = select_type<
        select_type_case<(sizeof(Key) <= 2), merge_config<256, 11>>,
        select_type_case<(sizeof(Key) <= 4), merge_config<256, 10>>,
        select_type_case<(sizeof(Key) <= 8), merge_config<256, 7>>,
        merge_config<256, ::rocprim::max(1u, 10u / item_scale)>
    >;
};
/// Picks the per-architecture merge defaults for \p TargetArch via \p select_arch.
/// Listed cases: 803, 900, \p ROCPRIM_ARCH_90a and 1030; the trailing
/// \p merge_config_900 entry is the one supplied after all explicit cases.
template<unsigned int TargetArch, typename Key, typename Value>
struct default_merge_config
    : select_arch<
          TargetArch,
          select_arch_case<803, merge_config_803<Key, Value>>,
          select_arch_case<900, merge_config_900<Key, Value>>,
          select_arch_case<ROCPRIM_ARCH_90a, merge_config_90a<Key, Value>>,
          select_arch_case<1030, merge_config_1030<Key, Value>>,
          merge_config_900<Key, Value>>
{
};
} // end namespace detail
END_ROCPRIM_NAMESPACE
/// @}
// end of group primitivesmodule_deviceconfigs
#endif // ROCPRIM_DEVICE_DEVICE_MERGE_CONFIG_HPP_
// Copyright (c) 2017-2022 Advanced Micro Devices, Inc. All rights reserved.
//
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to deal
// in the Software without restriction, including without limitation the rights
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
// copies of the Software, and to permit persons to whom the Software is
// furnished to do so, subject to the following conditions:
//
// The above copyright notice and this permission notice shall be included in
// all copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
// THE SOFTWARE.
#ifndef ROCPRIM_DEVICE_DEVICE_SORT_HPP_
#define ROCPRIM_DEVICE_DEVICE_SORT_HPP_
#include <type_traits>
#include <iterator>
#include "../config.hpp"
#include "../detail/various.hpp"
#include "detail/device_merge.hpp"
#include "detail/device_merge_sort.hpp"
#include "detail/device_merge_sort_mergepath.hpp"
#include "device_transform.hpp"
#include "device_merge_sort_config.hpp"
BEGIN_ROCPRIM_NAMESPACE
/// \addtogroup devicemodule
/// @{
namespace detail
{
/// \brief Kernel entry point for the block-level sorting step of the device merge sort.
///
/// The kernel body only forwards its arguments to
/// \p block_sort_kernel_impl<BlockSize, ItemsPerThread>. Launch bounds are \p BlockSize.
///
/// \tparam BlockSize - launch bound of the kernel; also forwarded to the implementation.
/// \tparam ItemsPerThread - forwarded to the implementation.
/// \tparam OffsetT - integer type of \p size.
///
/// \param keys_input - iterator to the first input key.
/// \param keys_output - iterator to the first output key.
/// \param values_input - iterator to the first input value.
/// \param values_output - iterator to the first output value.
/// \param size - forwarded as-is; presumably the number of input elements (confirm
/// against \p block_sort_kernel_impl).
/// \param compare_function - comparison function object forwarded to the implementation.
template<
unsigned int BlockSize,
unsigned int ItemsPerThread,
class KeysInputIterator,
class KeysOutputIterator,
class ValuesInputIterator,
class ValuesOutputIterator,
class OffsetT,
class BinaryFunction
>
ROCPRIM_KERNEL
__launch_bounds__(BlockSize)
void block_sort_kernel(KeysInputIterator keys_input,
KeysOutputIterator keys_output,
ValuesInputIterator values_input,
ValuesOutputIterator values_output,
const OffsetT size,
BinaryFunction compare_function)
{
// All work happens in the implementation function.
block_sort_kernel_impl<BlockSize, ItemsPerThread>(
keys_input, keys_output, values_input, values_output,
size, compare_function
);
}
/// \brief Kernel entry point for a merge pass of the device merge sort
/// (overload without precomputed merge partitions).
///
/// The kernel body only forwards its arguments to \p block_merge_kernel_impl<BlockSize>.
/// Launch bounds are \p BlockSize.
///
/// \tparam BlockSize - launch bound of the kernel; also forwarded to the implementation.
/// \tparam OffsetT - integer type of \p input_size and \p sorted_block_size.
///
/// \param keys_input - iterator to the first input key.
/// \param keys_output - iterator to the first output key.
/// \param values_input - iterator to the first input value.
/// \param values_output - iterator to the first output value.
/// \param input_size - forwarded as-is; presumably the total number of elements (confirm
/// against \p block_merge_kernel_impl).
/// \param sorted_block_size - forwarded as-is; presumably the length of each already
/// sorted run (confirm against \p block_merge_kernel_impl).
/// \param compare_function - comparison function object forwarded to the implementation.
template<
unsigned int BlockSize,
class KeysInputIterator,
class KeysOutputIterator,
class ValuesInputIterator,
class ValuesOutputIterator,
class OffsetT,
class BinaryFunction
>
ROCPRIM_KERNEL
__launch_bounds__(BlockSize)
void block_merge_kernel(KeysInputIterator keys_input,
KeysOutputIterator keys_output,
ValuesInputIterator values_input,
ValuesOutputIterator values_output,
const OffsetT input_size,
const OffsetT sorted_block_size,
BinaryFunction compare_function)
{
// All work happens in the implementation function.
block_merge_kernel_impl<BlockSize>(keys_input,
keys_output,
values_input,
values_output,
input_size,
sorted_block_size,
compare_function);
}
/// \brief Kernel entry point for a merge pass of the device merge sort
/// (overload taking precomputed merge partitions).
///
/// The kernel body only forwards its arguments to
/// \p block_merge_kernel_impl<BlockSize, ItemsPerThread>. Launch bounds are \p BlockSize.
///
/// \tparam BlockSize - launch bound of the kernel; also forwarded to the implementation.
/// \tparam ItemsPerThread - forwarded to the implementation.
/// \tparam OffsetT - integer type of the sizes and of the partition entries.
///
/// \param keys_input - iterator to the first input key.
/// \param keys_output - iterator to the first output key.
/// \param values_input - iterator to the first input value.
/// \param values_output - iterator to the first output value.
/// \param input_size - forwarded as-is; presumably the total number of elements (confirm
/// against \p block_merge_kernel_impl).
/// \param sorted_block_size - forwarded as-is; presumably the length of each already
/// sorted run (confirm against \p block_merge_kernel_impl).
/// \param compare_function - comparison function object forwarded to the implementation.
/// \param merge_partitions - pointer to partition offsets; presumably those written by
/// \p device_mergepath_partition_kernel (confirm against the caller).
template<
unsigned int BlockSize,
unsigned int ItemsPerThread,
class KeysInputIterator,
class KeysOutputIterator,
class ValuesInputIterator,
class ValuesOutputIterator,
class OffsetT,
class BinaryFunction
>
ROCPRIM_KERNEL
__launch_bounds__(BlockSize)
void block_merge_kernel(KeysInputIterator keys_input,
KeysOutputIterator keys_output,
ValuesInputIterator values_input,
ValuesOutputIterator values_output,
const OffsetT input_size,
const OffsetT sorted_block_size,
BinaryFunction compare_function,
const OffsetT* merge_partitions)
{
// All work happens in the implementation function.
block_merge_kernel_impl<BlockSize, ItemsPerThread>(keys_input,
keys_output,
values_input,
values_output,
input_size,
sorted_block_size,
compare_function,
merge_partitions);
}
// Debug-only synchronization helper for the host-side driver functions.
//
// When `debug_synchronous` is true, prints `name(size)`, synchronizes `stream`
// (returning any synchronization error from the enclosing function) and prints
// the time elapsed since `start` in milliseconds. Does nothing otherwise.
//
// The expansion site must have `debug_synchronous` and `stream` in scope and must
// be inside a function whose return type accepts the runtime error value.
//
// name  - label streamed to std::cout.
// size  - value streamed to std::cout inside the parentheses.
// start - std::chrono time point taken before the launch.
//
// The expansion is wrapped in its own braces so that a following `else` can never
// attach to the macro's internal `if`, and the locals are underscore-prefixed
// (as in ROCPRIM_DETAIL_HIP_SYNC_AND_RETURN_ON_ERROR) so they cannot hide a
// caller variable named `error`, `end` or `d` that is passed as an argument.
#define ROCPRIM_DETAIL_HIP_SYNC(name, size, start) \
    { \
        if(debug_synchronous) \
        { \
            std::cout << name << "(" << size << ")"; \
            auto _error = cudaStreamSynchronize(stream); \
            if(_error != cudaSuccess) return _error; \
            auto _end = std::chrono::high_resolution_clock::now(); \
            auto _d = std::chrono::duration_cast<std::chrono::duration<double>>(_end - start); \
            std::cout << " " << _d.count() * 1000 << " ms" << '\n'; \
        } \
    }
// Post-launch check used by the host-side driver functions in this header.
//
// Expands to a braced block that:
//   1. returns the value of cudaGetLastError() from the enclosing function if it
//      differs from cudaSuccess;
//   2. when `debug_synchronous` is true, prints `name(size)`, synchronizes `stream`
//      (returning any synchronization error), and prints the time elapsed since
//      `start` in milliseconds.
//
// The expansion site must have `debug_synchronous` and `stream` in scope and must
// be inside a function whose return type accepts the runtime error value.
//
// name  - label streamed to std::cout.
// size  - value streamed to std::cout inside the parentheses.
// start - std::chrono time point taken before the launch.
//
// Locals use a single leading underscore: identifiers containing a double
// underscore are reserved for the implementation.
#define ROCPRIM_DETAIL_HIP_SYNC_AND_RETURN_ON_ERROR(name, size, start) \
    { \
        auto _error = cudaGetLastError(); \
        if(_error != cudaSuccess) return _error; \
        if(debug_synchronous) \
        { \
            std::cout << name << "(" << size << ")"; \
            auto _sync_error = cudaStreamSynchronize(stream); \
            if(_sync_error != cudaSuccess) return _sync_error; \
            auto _end = std::chrono::high_resolution_clock::now(); \
            auto _d = std::chrono::duration_cast<std::chrono::duration<double>>(_end - start); \
            std::cout << " " << _d.count() * 1000 << " ms" << '\n'; \
        } \
    }
/// \brief Computes one merge-path split point per partition for a merge pass.
///
/// Each thread handles the partition whose id is
/// <tt>blockIdx.x * BlockSize + threadIdx.x</tt>; threads whose id is not below
/// \p num_partitions exit immediately. The thread locates the pair of adjacent
/// sorted runs its tile belongs to, calls \p merge_path on that pair, and stores
/// <tt>keys1_beg + partition_diag</tt> in <tt>merge_partitions[partition_id]</tt>.
///
/// \param keys - iterator to the first key of the input.
/// \param input_size - upper limit applied to every computed run boundary.
/// \param num_partitions - number of entries to write to \p merge_partitions.
/// \param merge_partitions - output array, indexed by partition id.
/// \param compare_op - comparison function object handed to \p merge_path.
/// \param sorted_block_size - length of one already sorted run.
/// NOTE(review): the mask arithmetic below only acts as a "round down to group
/// start" if <tt>2 * sorted_block_size / ItemsPerTile</tt> is a power of two -
/// confirm against the caller.
template <unsigned int BlockSize, // BlockSize of the partition kernel
unsigned int ItemsPerTile, // ItemsPerTile of the block merge kernel
typename KeysInputIterator,
typename OffsetT,
typename CompareOpT>
ROCPRIM_KERNEL
__launch_bounds__(BlockSize)
void device_mergepath_partition_kernel(KeysInputIterator keys,
const OffsetT input_size,
const unsigned int num_partitions,
OffsetT *merge_partitions,
const CompareOpT compare_op,
const OffsetT sorted_block_size)
{
// One thread per partition entry.
const OffsetT partition_id = blockIdx.x * BlockSize + threadIdx.x;
// Surplus threads of the last block have nothing to write.
if (partition_id >= num_partitions)
{
return;
}
// Number of tiles in one sorted run, and in the pair of runs merged together.
const unsigned int merged_tiles = sorted_block_size / ItemsPerTile;
const unsigned int target_merged_tiles = merged_tiles * 2;
// Splits the partition id into a group-start part (~mask) and a local part (mask).
const unsigned int mask = target_merged_tiles - 1;
const unsigned int tilegroup_start_id = ~mask & partition_id; // id of the first tile in the current tile-group
const OffsetT tilegroup_start = ItemsPerTile * tilegroup_start_id; // index of the first item in the current tile-group
const unsigned int local_tile_id = mask & partition_id; // id of the current tile in the current tile-group
// Bounds of the two adjacent runs, each clamped to input_size; the second run
// starts where the first one ends.
const OffsetT keys1_beg = rocprim::min(input_size, tilegroup_start);
const OffsetT keys1_end = rocprim::min(input_size, tilegroup_start + sorted_block_size);
const OffsetT keys2_beg = keys1_end;
const OffsetT keys2_end = rocprim::min(input_size, keys2_beg + sorted_block_size);
// Offset of this tile within the pair of runs, limited to their combined length.
const OffsetT partition_at = rocprim::min<OffsetT>(keys2_end - keys1_beg, ItemsPerTile * local_tile_id);
// merge_path receives both runs (start iterator and length) and the offset above.
const OffsetT partition_diag = ::rocprim::detail::merge_path(keys + keys1_beg,
keys + keys2_beg,
keys1_end - keys1_beg,
keys2_end - keys2_beg,
partition_at,
compare_op);
// Store the result relative to the start of the whole input.
merge_partitions[partition_id] = keys1_beg + partition_diag;
}
// Shared implementation behind the public merge_sort overloads.
//
// Steps (all device work is enqueued on `stream`):
//   1. block_sort_kernel is launched with one block per sort_items_per_block items,
//      reading the inputs and writing keys_buffer / values_buffer.
//   2. Runs are merged pass by pass; the run length `block` doubles every pass and the
//      data ping-pongs between the temporary buffers and the output ranges. The
//      merge-path variant (partition kernel + block_merge_kernel) is used when
//      size > config::min_input_size_mergepath, otherwise the single-kernel variant.
//   3. If the last pass left the result in the temporary buffers, it is copied to the
//      output ranges with ::rocprim::transform and an identity functor.
//
// Temporary storage layout: [ merge-path partitions | keys | values ].
// Every region size is rounded up with align_size so that the regions following the
// partition array start on an aligned address (the partition array holds 4-byte
// offsets, which on its own would leave wider key/value types misaligned).
//
// \param temporary_storage  device-accessible scratch memory; when null only
//                           storage_size is written and cudaSuccess is returned.
// \param storage_size       receives the required scratch size in bytes (never 0).
// \param keys_input         first key of the input range.
// \param keys_output        first key of the output range.
// \param values_input       first value of the input range (empty_type* for keys only).
// \param values_output      first value of the output range (empty_type* for keys only).
// \param size               number of items to sort.
// \param compare_function   comparison functor forwarded to the kernels.
// \param stream             stream on which all kernels are launched.
// \param debug_synchronous  when true, prints configuration and per-kernel timings and
//                           synchronizes the stream after every launch.
// \returns cudaSuccess, or the first error reported by a launch, synchronization or
//          transform call.
template<
    class Config,
    class KeysInputIterator,
    class KeysOutputIterator,
    class ValuesInputIterator,
    class ValuesOutputIterator,
    class BinaryFunction
>
inline
cudaError_t merge_sort_impl(void * temporary_storage,
                            size_t& storage_size,
                            KeysInputIterator keys_input,
                            KeysOutputIterator keys_output,
                            ValuesInputIterator values_input,
                            ValuesOutputIterator values_output,
                            const unsigned int size,
                            BinaryFunction compare_function,
                            const cudaStream_t stream,
                            bool debug_synchronous)
{
    using OffsetT = unsigned int;
    using key_type = typename std::iterator_traits<KeysInputIterator>::value_type;
    using value_type = typename std::iterator_traits<ValuesInputIterator>::value_type;
    constexpr bool with_values = !std::is_same<value_type, ::rocprim::empty_type>::value;

    // Get default config if Config is default_config
    using config = default_or_custom_config<
        Config,
        default_merge_sort_config<ROCPRIM_TARGET_ARCH, key_type, value_type>
    >;

    static constexpr unsigned int sort_block_size = config::sort_config::block_size;
    static constexpr unsigned int sort_items_per_thread = config::sort_config::items_per_thread;
    static constexpr unsigned int sort_items_per_block = sort_block_size * sort_items_per_thread;

    static constexpr unsigned int merge_impl1_block_size = config::merge_impl1_config::block_size;
    static constexpr unsigned int merge_impl1_items_per_thread = config::merge_impl1_config::items_per_thread;
    static constexpr unsigned int merge_impl1_items_per_block = merge_impl1_block_size * merge_impl1_items_per_thread;

    static constexpr unsigned int merge_partition_block_size = config::merge_mergepath_partition_config::block_size;
    static constexpr unsigned int merge_mergepath_block_size = config::merge_mergepath_config::block_size;
    static constexpr unsigned int merge_mergepath_items_per_thread = config::merge_mergepath_config::items_per_thread;
    static constexpr unsigned int merge_mergepath_items_per_block = merge_mergepath_block_size * merge_mergepath_items_per_thread;

    static_assert(merge_mergepath_items_per_block >= sort_items_per_block,
                  "merge_mergepath_items_per_block must be greater than or equal to sort_items_per_block");
    static_assert(sort_items_per_block % config::merge_impl1_config::block_size == 0,
                  "Merge block size must be a divisor of the items per block of the sort step");

    const size_t keys_bytes = ::rocprim::detail::align_size(size * sizeof(key_type));
    const size_t values_bytes = with_values ? ::rocprim::detail::align_size(size * sizeof(value_type)) : 0;

    const unsigned int sort_number_of_blocks = ceiling_div(size, sort_items_per_block);
    const unsigned int merge_impl1_number_of_blocks = ceiling_div(size, merge_impl1_items_per_block);
    const unsigned int merge_mergepath_number_of_blocks = ceiling_div(size, merge_mergepath_items_per_block);

    const bool use_mergepath = size > config::min_input_size_mergepath;

    // variables below used for mergepath
    const unsigned int merge_num_partitions = merge_mergepath_number_of_blocks + 1;
    const unsigned int merge_partition_number_of_blocks = ceiling_div(merge_num_partitions, merge_partition_block_size);
    // Rounded up like keys_bytes / values_bytes: the key and value buffers are placed
    // right after the partition array, so an unaligned byte count here would misalign
    // them for key/value types wider than OffsetT.
    const size_t d_merge_partitions_bytes
        = use_mergepath ? ::rocprim::detail::align_size(merge_num_partitions * sizeof(OffsetT)) : 0;

    if(temporary_storage == nullptr)
    {
        storage_size = d_merge_partitions_bytes + keys_bytes + values_bytes;
        // Make sure user won't try to allocate 0 bytes memory
        storage_size = storage_size == 0 ? 4 : storage_size;
        return cudaSuccess;
    }

    if(size == 0u)
    {
        return cudaSuccess;
    }

    if(debug_synchronous)
    {
        std::cout << "-----" << '\n';
        std::cout << "size: " << size << '\n';
        std::cout << "sort_block_size: " << sort_block_size << '\n';
        std::cout << "sort_items_per_thread: " << sort_items_per_thread << '\n';
        std::cout << "sort_items_per_block: " << sort_items_per_block << '\n';
        std::cout << "sort_number_of_blocks: " << sort_number_of_blocks << '\n';
        std::cout << "merge_impl1_block_size: " << merge_impl1_block_size << '\n';
        std::cout << "merge_impl1_number_of_blocks: " << merge_impl1_number_of_blocks << '\n';
        std::cout << "merge_impl1_items_per_thread: " << merge_impl1_items_per_thread << '\n';
        std::cout << "merge_impl1_items_per_block: " << merge_impl1_items_per_block << '\n';
        std::cout << "merge_mergepath_block_size: " << merge_mergepath_block_size << '\n';
        std::cout << "merge_mergepath_number_of_blocks: " << merge_mergepath_number_of_blocks << '\n';
        std::cout << "merge_mergepath_items_per_thread: " << merge_mergepath_items_per_thread << '\n';
        std::cout << "merge_mergepath_items_per_block: " << merge_mergepath_items_per_block << '\n';
        std::cout << "num_partitions: " << merge_num_partitions << '\n';
        std::cout << "merge_mergepath_partition_block_size: " << merge_partition_block_size << '\n';
        std::cout << "merge_mergepath_partition_number_of_blocks: " << merge_partition_number_of_blocks << '\n';
    }

    // Carve the three regions out of the temporary storage.
    char* ptr = reinterpret_cast<char*>(temporary_storage);
    OffsetT* d_merge_partitions = reinterpret_cast<OffsetT*>(ptr);
    ptr += d_merge_partitions_bytes;
    key_type * keys_buffer = reinterpret_cast<key_type*>(ptr);
    ptr += keys_bytes;
    value_type * values_buffer = with_values ? reinterpret_cast<value_type*>(ptr) : nullptr;

    // Start point for time measurements
    std::chrono::high_resolution_clock::time_point start;

    if(debug_synchronous) start = std::chrono::high_resolution_clock::now();
    block_sort_kernel<sort_block_size, sort_items_per_thread>
        <<<dim3(sort_number_of_blocks), dim3(sort_block_size), 0, stream>>>(
            keys_input, keys_buffer, values_input, values_buffer,
            size, compare_function
        );
    ROCPRIM_DETAIL_HIP_SYNC_AND_RETURN_ON_ERROR("block_sort_kernel", size, start);

    // true  -> the most recent result lives in the temporary buffers,
    // false -> the most recent result lives in the output ranges.
    bool temporary_store = true;
    // NOTE(review): `block` is a 32-bit OffsetT, so `block *= 2` wraps once the run
    // length would exceed the range of unsigned int - confirm the supported upper
    // bound on `size`.
    for(OffsetT block = sort_items_per_block; block < size; block *= 2)
    {
        temporary_store = !temporary_store;

        // One merge pass from (keys_input_, values_input_) to (keys_output_, values_output_).
        const auto merge_step = [&](auto keys_input_,
                                    auto keys_output_,
                                    auto values_input_,
                                    auto values_output_) -> cudaError_t {
            if(use_mergepath)
            {
                if(debug_synchronous) start = std::chrono::high_resolution_clock::now();
                device_mergepath_partition_kernel<merge_partition_block_size, merge_mergepath_items_per_block>
                    <<<dim3(merge_partition_number_of_blocks), dim3(merge_partition_block_size), 0, stream>>>(
                        keys_input_, size, merge_num_partitions, d_merge_partitions,
                        compare_function, block);
                ROCPRIM_DETAIL_HIP_SYNC_AND_RETURN_ON_ERROR("device_mergepath_partition_kernel", size, start);

                if(debug_synchronous) start = std::chrono::high_resolution_clock::now();
                block_merge_kernel<merge_mergepath_block_size, merge_mergepath_items_per_thread>
                    <<<dim3(merge_mergepath_number_of_blocks), dim3(merge_mergepath_block_size), 0, stream>>>(
                        keys_input_, keys_output_, values_input_, values_output_,
                        size, block, compare_function, d_merge_partitions
                    );
                ROCPRIM_DETAIL_HIP_SYNC_AND_RETURN_ON_ERROR("block_merge_kernel", size, start);
            }
            else
            {
                if(debug_synchronous) start = std::chrono::high_resolution_clock::now();
                block_merge_kernel<merge_impl1_block_size>
                    <<<dim3(merge_impl1_number_of_blocks), dim3(merge_impl1_block_size), 0, stream>>>(
                        keys_input_, keys_output_, values_input_, values_output_,
                        size, block, compare_function
                    );
                ROCPRIM_DETAIL_HIP_SYNC_AND_RETURN_ON_ERROR("block_merge_kernel", size, start);
            }
            return cudaSuccess;
        };

        cudaError_t error;
        if(temporary_store)
        {
            error = merge_step(keys_output, keys_buffer, values_output, values_buffer);
        }
        else
        {
            error = merge_step(keys_buffer, keys_output, values_buffer, values_output);
        }
        if(error != cudaSuccess) return error;
    }

    // The final result ended up in the temporary buffers: copy it to the outputs.
    if(temporary_store)
    {
        cudaError_t error = ::rocprim::transform(
            keys_buffer, keys_output, size,
            ::rocprim::identity<key_type>(), stream, debug_synchronous
        );
        if(error != cudaSuccess) return error;

        if(with_values)
        {
            error = ::rocprim::transform(
                values_buffer, values_output, size,
                ::rocprim::identity<value_type>(), stream, debug_synchronous
            );
            if(error != cudaSuccess) return error;
        }
    }

    return cudaSuccess;
}
#undef ROCPRIM_DETAIL_HIP_SYNC_AND_RETURN_ON_ERROR
#undef ROCPRIM_DETAIL_HIP_SYNC
} // end of detail namespace
/// \brief Parallel merge sort primitive for device level.
///
/// \p merge_sort function performs a device-wide merge sort
/// of keys. Function sorts input keys based on comparison function.
///
/// \par Overview
/// * The contents of the inputs are not altered by the sorting function.
/// * Returns the required size of \p temporary_storage in \p storage_size
/// if \p temporary_storage is a null pointer.
/// * Accepts custom compare_functions for sorting across the device.
///
/// \tparam KeysInputIterator - random-access iterator type of the input range. Must meet the
/// requirements of a C++ InputIterator concept. It can be a simple pointer type.
/// \tparam KeysOutputIterator - random-access iterator type of the output range. Must meet the
/// requirements of a C++ OutputIterator concept. It can be a simple pointer type.
///
/// \param [in] temporary_storage - pointer to a device-accessible temporary storage. When
/// a null pointer is passed, the required allocation size (in bytes) is written to
/// \p storage_size and function returns without performing the sort operation.
/// \param [in,out] storage_size - reference to a size (in bytes) of \p temporary_storage.
/// \param [in] keys_input - pointer to the first element in the range to sort.
/// \param [out] keys_output - pointer to the first element in the output range.
/// \param [in] size - number of elements in the input range.
/// \param [in] compare_function - binary operation function object that will be used for comparison.
/// The signature of the function should be equivalent to the following:
/// <tt>bool f(const T &a, const T &b);</tt>. The signature does not need to have
/// <tt>const &</tt>, but function object must not modify the objects passed to it.
/// The default value is \p BinaryFunction().
/// \param [in] stream - [optional] HIP stream object. Default is \p 0 (default stream).
/// \param [in] debug_synchronous - [optional] If true, synchronization after every kernel
/// launch is forced in order to check for errors. Default value is \p false.
///
/// \returns \p cudaSuccess (\p 0) after successful sort; otherwise a HIP runtime error of
/// type \p cudaError_t.
///
/// \par Example
/// \parblock
/// In this example a device-level ascending merge sort is performed on an array of
/// \p float values.
///
/// \code{.cpp}
/// #include <rocprim/rocprim.hpp>
///
/// // Prepare input and output (declare pointers, allocate device memory etc.)
/// size_t input_size; // e.g., 8
/// float * input; // e.g., [0.6, 0.3, 0.65, 0.4, 0.2, 0.08, 1, 0.7]
/// float * output; // empty array of 8 elements
///
/// size_t temporary_storage_size_bytes;
/// void * temporary_storage_ptr = nullptr;
/// // Get required size of the temporary storage
/// rocprim::merge_sort(
/// temporary_storage_ptr, temporary_storage_size_bytes,
/// input, output, input_size
/// );
///
/// // allocate temporary storage
/// cudaMalloc(&temporary_storage_ptr, temporary_storage_size_bytes);
///
/// // perform sort
/// rocprim::merge_sort(
/// temporary_storage_ptr, temporary_storage_size_bytes,
/// input, output, input_size
/// );
/// // output: [0.08, 0.2, 0.3, 0.4, 0.6, 0.65, 0.7, 1]
/// \endcode
/// \endparblock
template<
    class Config = default_config,
    class KeysInputIterator,
    class KeysOutputIterator,
    class BinaryFunction = ::rocprim::less<typename std::iterator_traits<KeysInputIterator>::value_type>
>
inline
cudaError_t merge_sort(void * temporary_storage,
                       size_t& storage_size,
                       KeysInputIterator keys_input,
                       KeysOutputIterator keys_output,
                       const size_t size,
                       BinaryFunction compare_function = BinaryFunction(),
                       const cudaStream_t stream = 0,
                       bool debug_synchronous = false)
{
    // detail::merge_sort_impl takes the element count as unsigned int. Reject counts
    // that do not survive the narrowing instead of silently sorting a truncated range.
    const unsigned int narrowed_size = static_cast<unsigned int>(size);
    if(static_cast<size_t>(narrowed_size) != size)
    {
        return cudaErrorInvalidValue;
    }

    // Keys-only sort: the value iterators are null empty_type pointers, which the
    // implementation recognises by their value_type being empty_type.
    empty_type * values = nullptr;
    return detail::merge_sort_impl<Config>(
        temporary_storage, storage_size,
        keys_input, keys_output, values, values, narrowed_size,
        compare_function, stream, debug_synchronous
    );
}
/// \brief Parallel ascending merge sort-by-key primitive for device level.
///
/// \p merge_sort function performs a device-wide merge sort
/// of (key, value) pairs. Function sorts input pairs based on comparison function.
///
/// \par Overview
/// * The contents of the inputs are not altered by the sorting function.
/// * Returns the required size of \p temporary_storage in \p storage_size
/// if \p temporary_storage is a null pointer.
/// * Accepts custom compare_functions for sorting across the device.
///
/// \tparam KeysInputIterator - random-access iterator type of the input range. Must meet the
/// requirements of a C++ InputIterator concept. It can be a simple pointer type.
/// \tparam KeysOutputIterator - random-access iterator type of the output range. Must meet the
/// requirements of a C++ OutputIterator concept. It can be a simple pointer type.
/// \tparam ValuesInputIterator - random-access iterator type of the input range. Must meet the
/// requirements of a C++ InputIterator concept. It can be a simple pointer type.
/// \tparam ValuesOutputIterator - random-access iterator type of the output range. Must meet the
/// requirements of a C++ OutputIterator concept. It can be a simple pointer type.
///
/// \param [in] temporary_storage - pointer to a device-accessible temporary storage. When
/// a null pointer is passed, the required allocation size (in bytes) is written to
/// \p storage_size and function returns without performing the sort operation.
/// \param [in,out] storage_size - reference to a size (in bytes) of \p temporary_storage.
/// \param [in] keys_input - pointer to the first element in the range to sort.
/// \param [out] keys_output - pointer to the first element in the output range.
/// \param [in] values_input - pointer to the first element in the range to sort.
/// \param [out] values_output - pointer to the first element in the output range.
/// \param [in] size - number of elements in the input range.
/// \param [in] compare_function - binary operation function object that will be used for comparison.
/// The signature of the function should be equivalent to the following:
/// <tt>bool f(const T &a, const T &b);</tt>. The signature does not need to have
/// <tt>const &</tt>, but function object must not modify the objects passed to it.
/// The default value is \p BinaryFunction().
/// \param [in] stream - [optional] HIP stream object. Default is \p 0 (default stream).
/// \param [in] debug_synchronous - [optional] If true, synchronization after every kernel
/// launch is forced in order to check for errors. Default value is \p false.
///
/// \returns \p cudaSuccess (\p 0) after successful sort; otherwise a HIP runtime error of
/// type \p cudaError_t.
///
/// \par Example
/// \parblock
/// In this example a device-level ascending merge sort is performed where input keys are
/// represented by an array of unsigned integers and input values by an array of <tt>double</tt>s.
///
/// \code{.cpp}
/// #include <rocprim/rocprim.hpp>
///
/// // Prepare input and output (declare pointers, allocate device memory etc.)
/// size_t input_size; // e.g., 8
/// unsigned int * keys_input; // e.g., [ 6, 3, 5, 4, 1, 8, 2, 7]
/// double * values_input; // e.g., [-5, 2, -4, 3, -1, -8, -2, 7]
/// unsigned int * keys_output; // empty array of 8 elements
/// double * values_output; // empty array of 8 elements
///
/// size_t temporary_storage_size_bytes;
/// void * temporary_storage_ptr = nullptr;
/// // Get required size of the temporary storage
/// rocprim::merge_sort(
/// temporary_storage_ptr, temporary_storage_size_bytes,
/// keys_input, keys_output, values_input, values_output,
/// input_size
/// );
///
/// // allocate temporary storage
/// cudaMalloc(&temporary_storage_ptr, temporary_storage_size_bytes);
///
/// // perform sort
/// rocprim::merge_sort(
/// temporary_storage_ptr, temporary_storage_size_bytes,
/// keys_input, keys_output, values_input, values_output,
/// input_size
/// );
/// // keys_output: [ 1, 2, 3, 4, 5, 6, 7, 8]
/// // values_output: [-1, -2, 2, 3, -4, -5, 7, -8]
/// \endcode
/// \endparblock
template<
    class Config = default_config,
    class KeysInputIterator,
    class KeysOutputIterator,
    class ValuesInputIterator,
    class ValuesOutputIterator,
    class BinaryFunction = ::rocprim::less<typename std::iterator_traits<KeysInputIterator>::value_type>
>
inline
cudaError_t merge_sort(void * temporary_storage,
                       size_t& storage_size,
                       KeysInputIterator keys_input,
                       KeysOutputIterator keys_output,
                       ValuesInputIterator values_input,
                       ValuesOutputIterator values_output,
                       const size_t size,
                       BinaryFunction compare_function = BinaryFunction(),
                       const cudaStream_t stream = 0,
                       bool debug_synchronous = false)
{
    // detail::merge_sort_impl takes the element count as unsigned int. Reject counts
    // that do not survive the narrowing instead of silently sorting a truncated range.
    const unsigned int narrowed_size = static_cast<unsigned int>(size);
    if(static_cast<size_t>(narrowed_size) != size)
    {
        return cudaErrorInvalidValue;
    }

    return detail::merge_sort_impl<Config>(
        temporary_storage, storage_size,
        keys_input, keys_output, values_input, values_output, narrowed_size,
        compare_function, stream, debug_synchronous
    );
}
/// @}
// end of group devicemodule
END_ROCPRIM_NAMESPACE
#endif // ROCPRIM_DEVICE_DEVICE_SORT_HPP_
// Copyright (c) 2018-2022 Advanced Micro Devices, Inc. All rights reserved.
//
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to deal
// in the Software without restriction, including without limitation the rights
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
// copies of the Software, and to permit persons to whom the Software is
// furnished to do so, subject to the following conditions:
//
// The above copyright notice and this permission notice shall be included in
// all copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
// THE SOFTWARE.
#ifndef ROCPRIM_DEVICE_DEVICE_MERGE_SORT_CONFIG_HPP_
#define ROCPRIM_DEVICE_DEVICE_MERGE_SORT_CONFIG_HPP_
#include <type_traits>
#include "../config.hpp"
#include "../detail/various.hpp"
#include "../functional.hpp"
#include "config_types.hpp"
/// \addtogroup primitivesmodule_deviceconfigs
/// @{
BEGIN_ROCPRIM_NAMESPACE
namespace detail
{
// Bundles the compile-time launch parameters of every kernel involved in merge sort
// into one type. Each nested alias is a kernel_config<block size, items per thread>;
// min_input_size_mergepath is the input-size threshold compared against when choosing
// between the two merge implementations.
template <unsigned int SortBlockSize,
unsigned int SortItemsPerThread,
unsigned int MergeImpl1BlockSize,
unsigned int MergeImplMPPartitionBlockSize,
unsigned int MergeImplMPBlockSize,
unsigned int MergeImplMPItemsPerThread,
unsigned int MinInputSizeMergepath>
struct merge_sort_config_impl
{
// Block-sort step.
using sort_config = kernel_config<SortBlockSize, SortItemsPerThread>;
// Merge step, "impl1" variant: items per thread is fixed to 1.
using merge_impl1_config = kernel_config<MergeImpl1BlockSize, 1>;
// Merge step, merge-path variant: partition kernel, items per thread fixed to 1.
using merge_mergepath_partition_config = kernel_config<MergeImplMPPartitionBlockSize, 1>;
// Merge step, merge-path variant: merge kernel.
using merge_mergepath_config
= kernel_config<MergeImplMPBlockSize, MergeImplMPItemsPerThread>;
// Input-size threshold for the merge-path variant.
static constexpr unsigned int min_input_size_mergepath = MinInputSizeMergepath;
};
}
/// \brief Configuration of device-level merge primitives.
///
/// \tparam SortBlockSize - block size in the block-sort step
/// \tparam SortItemsPerThread - ItemsPerThread in the block-sort step
/// \tparam MergeImpl1BlockSize - block size in the block merge step using impl1 (used when input_size < MinInputSizeMergepath)
/// \tparam MergeImplMPPartitionBlockSize - block size of the partition kernel in the block merge step using mergepath impl
/// \tparam MergeImplMPBlockSize - block size in the block merge step using mergepath impl
/// \tparam MergeImplMPItemsPerThread - ItemsPerThread in the block merge step using mergepath impl
/// \tparam MinInputSizeMergepath - breakpoint of input-size to use mergepath impl for block merge step
template<unsigned int MergeImpl1BlockSize = 512,
unsigned int SortBlockSize = MergeImpl1BlockSize,
unsigned int SortItemsPerThread = 1,
unsigned int MergeImplMPPartitionBlockSize = 128,
unsigned int MergeImplMPBlockSize = std::min(SortBlockSize, 128u),
unsigned int MergeImplMPItemsPerThread
= SortBlockSize* SortItemsPerThread / MergeImplMPBlockSize,
unsigned int MinInputSizeMergepath = 200000>
using merge_sort_config = detail::merge_sort_config_impl<SortBlockSize,
SortItemsPerThread,
MergeImpl1BlockSize,
MergeImplMPPartitionBlockSize,
MergeImplMPBlockSize,
MergeImplMPItemsPerThread,
MinInputSizeMergepath>;
namespace detail
{
// Merge sort tuning table referenced by default_merge_sort_config for architecture
// id 803. The configuration is picked from sizeof(Key) / sizeof(Value); the last
// entry of each select_type list carries no condition and is presumably the fallback
// (select_type is defined elsewhere - confirm).
template<class Key, class Value>
struct merge_sort_config_803
{
    using type = select_type<
        select_type_case<(sizeof(Key) == 1 && sizeof(Value) <= 8), merge_sort_config<64U>>,
        select_type_case<(sizeof(Key) == 2 && sizeof(Value) <= 8), merge_sort_config<256U>>,
        select_type_case<(sizeof(Key) == 4 && sizeof(Value) <= 8), merge_sort_config<512U>>,
        select_type_case<(sizeof(Key) == 8 && sizeof(Value) <= 8), merge_sort_config<1024U>>,
        merge_sort_config<
            limit_block_size<1024U, sizeof(Key) + sizeof(Value), ROCPRIM_WARP_SIZE_64>::value
        >
    >;
};

// rocprim::half keys with values: block size derived from 256 via limit_block_size.
template<class Value>
struct merge_sort_config_803<rocprim::half, Value>
{
    using type = merge_sort_config<
        limit_block_size<256U, sizeof(rocprim::half) + sizeof(Value), ROCPRIM_WARP_SIZE_64>::value
    >;
};

// Keys only (Value is empty_type): selection depends on sizeof(Key) alone.
// Unlike the other variants this one derives from the selected type instead of
// exposing it as a nested `type`.
template<class Key>
struct merge_sort_config_803<Key, empty_type>
    : select_type<
        select_type_case<sizeof(Key) == 1, merge_sort_config<64U>>,
        select_type_case<sizeof(Key) == 2, merge_sort_config<256U>>,
        select_type_case<sizeof(Key) == 4, merge_sort_config<256U>>,
        select_type_case<
            sizeof(Key) >= 8,
            merge_sort_config<limit_block_size<512U, sizeof(Key), ROCPRIM_WARP_SIZE_64>::value>
        >
    >
{ };

// rocprim::half keys without values.
template<>
struct merge_sort_config_803<rocprim::half, empty_type>
{
    using type = merge_sort_config<256U>;
};
// Merge sort tuning table referenced by default_merge_sort_config for architecture
// id 900 (and as its trailing entry). The primary template is used when
// is_scalar<Key>::value is true. The merge_sort_config arguments are positional:
// MergeImpl1BlockSize, SortBlockSize, SortItemsPerThread.
template<class Key, class Value, bool = is_scalar<Key>::value>
struct merge_sort_config_900
{
    using type = select_type<
        select_type_case<
            (sizeof(Key) == 1 && sizeof(Value) <= 16),
            merge_sort_config<512U, 512U, 2U>
        >,
        select_type_case<
            (sizeof(Key) == 2 && sizeof(Value) <= 16),
            merge_sort_config<512U, 256U, 4U>
        >,
        select_type_case<
            (sizeof(Key) == 4 && sizeof(Value) <= 16),
            merge_sort_config<512U, 256U, 4U>
        >,
        select_type_case<
            (sizeof(Key) == 8 && sizeof(Value) <= 16),
            merge_sort_config<256U, 256U, 4U>
        >,
        // No size-specific entry matched: derive the block size from 1024.
        merge_sort_config<
            limit_block_size<
                1024U,
                ::rocprim::max(sizeof(Key) + sizeof(unsigned int), sizeof(Value)),
                ROCPRIM_WARP_SIZE_64
            >::value
        >
    >;
};

// Variant for keys for which is_scalar<Key>::value is false.
template<class Key, class Value>
struct merge_sort_config_900<Key, Value, false>
{
    using type = select_type<
        select_type_case<
            (sizeof(Key) == 8 && sizeof(Value) <= 16),
            merge_sort_config<512U, 512U, 2U>
        >,
        select_type_case<
            (sizeof(Key) == 16 && sizeof(Value) <= 16),
            merge_sort_config<512U, 512U, 2U>
        >,
        // No size-specific entry matched: derive the block size from 512.
        merge_sort_config<
            limit_block_size<
                512U,
                ::rocprim::max(sizeof(Key) + sizeof(unsigned int), sizeof(Value)),
                ROCPRIM_WARP_SIZE_64
            >::value
        >
    >;
};
// TODO: We need to update these parameters
//
// Merge sort tuning table referenced by default_merge_sort_config for architecture
// id 1030. The configuration is picked from sizeof(Key) / sizeof(Value).
// NOTE(review): the fallback entry of the primary template passes
// ROCPRIM_WARP_SIZE_64 to limit_block_size while the specializations below pass
// ROCPRIM_WARP_SIZE_32 - confirm which one is intended.
template<class Key, class Value>
struct merge_sort_config_1030
{
    using type = select_type<
        select_type_case<(sizeof(Key) == 1 && sizeof(Value) <= 8), merge_sort_config<64U>>,
        select_type_case<(sizeof(Key) == 2 && sizeof(Value) <= 8), merge_sort_config<256U>>,
        select_type_case<(sizeof(Key) == 4 && sizeof(Value) <= 8), merge_sort_config<512U>>,
        select_type_case<(sizeof(Key) == 8 && sizeof(Value) <= 8), merge_sort_config<1024U>>,
        merge_sort_config<
            limit_block_size<1024U, sizeof(Key) + sizeof(Value), ROCPRIM_WARP_SIZE_64>::value
        >
    >;
};

// rocprim::half keys with values: block size derived from 256 via limit_block_size.
template<class Value>
struct merge_sort_config_1030<rocprim::half, Value>
{
    using type = merge_sort_config<
        limit_block_size<256U, sizeof(rocprim::half) + sizeof(Value), ROCPRIM_WARP_SIZE_32>::value
    >;
};

// Keys only (Value is empty_type): selection depends on sizeof(Key) alone.
// This variant derives from the selected type instead of exposing a nested `type`.
template<class Key>
struct merge_sort_config_1030<Key, empty_type>
    : select_type<
        select_type_case<sizeof(Key) == 1, merge_sort_config<64U>>,
        select_type_case<sizeof(Key) == 2, merge_sort_config<256U>>,
        select_type_case<sizeof(Key) == 4, merge_sort_config<256U>>,
        select_type_case<
            sizeof(Key) >= 8,
            merge_sort_config<limit_block_size<512U, sizeof(Key), ROCPRIM_WARP_SIZE_32>::value>
        >
    >
{ };

// rocprim::half keys without values.
template<>
struct merge_sort_config_1030<rocprim::half, empty_type>
{
    using type = merge_sort_config<256U>;
};
// Default merge sort configuration for a given target architecture and key/value
// types. Maps the architecture ids 803, 900 and 1030 to their tuning tables; the
// trailing merge_sort_config_900 entry has no architecture id attached and is
// presumably what select_arch uses for any other TargetArch (select_arch is defined
// elsewhere - confirm).
template<unsigned int TargetArch, class Key, class Value>
struct default_merge_sort_config
: select_arch<
TargetArch,
select_arch_case<803, merge_sort_config_803<Key, Value>>,
select_arch_case<900, merge_sort_config_900<Key, Value>>,
select_arch_case<1030, merge_sort_config_1030<Key, Value>>,
merge_sort_config_900<Key, Value>
> { };
} // end namespace detail
END_ROCPRIM_NAMESPACE
/// @}
// end of group primitivesmodule_deviceconfigs
#endif // ROCPRIM_DEVICE_DEVICE_MERGE_SORT_CONFIG_HPP_
// Copyright (c) 2017-2022 Advanced Micro Devices, Inc. All rights reserved.
//
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to deal
// in the Software without restriction, including without limitation the rights
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
// copies of the Software, and to permit persons to whom the Software is
// furnished to do so, subject to the following conditions:
//
// The above copyright notice and this permission notice shall be included in
// all copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
// THE SOFTWARE.
#ifndef ROCPRIM_DEVICE_DEVICE_PARTITION_HPP_
#define ROCPRIM_DEVICE_DEVICE_PARTITION_HPP_
#include <algorithm>
#include <type_traits>
#include <iterator>
#include "../config.hpp"
#include "../functional.hpp"
#include "../types.hpp"
#include "../type_traits.hpp"
#include "../detail/various.hpp"
#include "device_select_config.hpp"
#include "detail/device_scan_common.hpp"
#include "detail/device_partition.hpp"
#include "device_transform.hpp"
BEGIN_ROCPRIM_NAMESPACE
/// \addtogroup devicemodule
/// @{
namespace detail
{
// Kernel entry point for the partition / select primitives.
// It carries the launch attributes (ROCPRIM_KERNEL, launch bounds taken from
// Config::block_size) and forwards every argument unchanged to partition_kernel_impl,
// passing SelectMethod, OnlySelected and Config on as explicit template arguments.
template<
    select_method SelectMethod,
    bool OnlySelected,
    class Config,
    class KeyIterator,
    class ValueIterator,
    class FlagIterator,
    class OutputKeyIterator,
    class OutputValueIterator,
    class InequalityOp,
    class OffsetLookbackScanState,
    class... UnaryPredicates
>
ROCPRIM_KERNEL
__launch_bounds__(Config::block_size)
void partition_kernel(KeyIterator keys_input,
                      ValueIterator values_input,
                      FlagIterator flags,
                      OutputKeyIterator keys_output,
                      OutputValueIterator values_output,
                      size_t* selected_count,
                      size_t* prev_selected_count,
                      const size_t size,
                      InequalityOp inequality_op,
                      OffsetLookbackScanState offset_scan_state,
                      const unsigned int number_of_blocks,
                      ordered_block_id<unsigned int> ordered_bid,
                      UnaryPredicates... predicates)
{
    partition_kernel_impl<SelectMethod, OnlySelected, Config>(keys_input,
                                                              values_input,
                                                              flags,
                                                              keys_output,
                                                              values_output,
                                                              selected_count,
                                                              prev_selected_count,
                                                              size,
                                                              inequality_op,
                                                              offset_scan_state,
                                                              number_of_blocks,
                                                              ordered_bid,
                                                              predicates...);
}
// Debug helper used after a kernel launch.
// When `debug_synchronous` is true it prints `name(size)`, synchronizes `stream`,
// returns the synchronization error from the enclosing function on failure, and
// otherwise prints the wall-clock time elapsed since `start` in milliseconds.
// The expansion refers to `debug_synchronous` and `stream` by name, so both must be
// visible at the point of use.
// Note: this expands to a bare `if` statement (no do/while wrapper), so avoid placing
// it directly before an `else`.
#define ROCPRIM_DETAIL_HIP_SYNC(name, size, start) \
if(debug_synchronous) \
{ \
std::cout << name << "(" << size << ")"; \
auto error = cudaStreamSynchronize(stream); \
if(error != cudaSuccess) return error; \
auto end = std::chrono::high_resolution_clock::now(); \
auto d = std::chrono::duration_cast<std::chrono::duration<double>>(end - start); \
std::cout << " " << d.count() * 1000 << " ms" << '\n'; \
}
// Post-launch check: always queries cudaGetLastError() and returns that error from
// the enclosing function if it is not cudaSuccess. Additionally, when
// `debug_synchronous` is true, it prints `name(size)`, synchronizes `stream`
// (returning on failure) and prints the time elapsed since `start` in milliseconds.
// The expansion refers to `debug_synchronous` and `stream` by name, so both must be
// visible at the point of use. The whole expansion is a braced block.
#define ROCPRIM_DETAIL_HIP_SYNC_AND_RETURN_ON_ERROR(name, size, start) \
{ \
auto _error = cudaGetLastError(); \
if(_error != cudaSuccess) return _error; \
if(debug_synchronous) \
{ \
std::cout << name << "(" << size << ")"; \
auto __error = cudaStreamSynchronize(stream); \
if(__error != cudaSuccess) return __error; \
auto _end = std::chrono::high_resolution_clock::now(); \
auto _d = std::chrono::duration_cast<std::chrono::duration<double>>(_end - start); \
std::cout << " " << _d.count() * 1000 << " ms" << '\n'; \
} \
}
template<
// Method of selection: flag, predicate, unique
select_method SelectMethod,
// if true, it doesn't copy rejected values to output
bool OnlySelected,
class Config,
class OffsetT,
class KeyIterator,
class ValueIterator, // can be rocprim::empty_type* for key only
class FlagIterator,
class OutputKeyIterator,
class OutputValueIterator, // can be rocprim::empty_type* for key only
class InequalityOp,
class SelectedCountOutputIterator,
class... UnaryPredicates
>
inline
cudaError_t partition_impl(void * temporary_storage,
size_t& storage_size,
KeyIterator keys_input,
ValueIterator values_input,
FlagIterator flags,
OutputKeyIterator keys_output,
OutputValueIterator values_output,
SelectedCountOutputIterator selected_count_output,
const size_t size,
InequalityOp inequality_op,
const cudaStream_t stream,
bool debug_synchronous,
UnaryPredicates... predicates)
{
using offset_type = OffsetT;
using key_type = typename std::iterator_traits<KeyIterator>::value_type;
using value_type = typename std::iterator_traits<ValueIterator>::value_type;
// Get default config if Config is default_config
using config = default_or_custom_config<
Config,
default_select_config<ROCPRIM_TARGET_ARCH, key_type, value_type>
>;
using offset_scan_state_type = detail::lookback_scan_state<offset_type>;
using offset_scan_state_with_sleep_type = detail::lookback_scan_state<offset_type, true>;
using ordered_block_id_type = detail::ordered_block_id<unsigned int>;
static constexpr unsigned int block_size = config::block_size;
static constexpr unsigned int items_per_thread = config::items_per_thread;
static constexpr auto items_per_block = block_size * items_per_thread;
static constexpr bool is_three_way = sizeof...(UnaryPredicates) == 2;
static constexpr size_t size_limit = config::size_limit;
static constexpr size_t aligned_size_limit = ::rocprim::max<size_t>(size_limit - (size_limit % items_per_block), items_per_block);
const size_t limited_size = std::min<size_t>(size, aligned_size_limit);
const bool use_limited_size = limited_size == aligned_size_limit;
const unsigned int number_of_blocks =
static_cast<unsigned int>(::rocprim::detail::ceiling_div(limited_size, items_per_block));
// Calculate required temporary storage
size_t offset_scan_state_bytes = ::rocprim::detail::align_size(
// This is valid even with offset_scan_state_with_sleep_type
offset_scan_state_type::get_storage_size(number_of_blocks)
);
size_t ordered_block_id_bytes = ::rocprim::detail::align_size(
ordered_block_id_type::get_storage_size(),
alignof(size_t)
);
if(temporary_storage == nullptr)
{
// storage_size is never zero
storage_size = offset_scan_state_bytes + ordered_block_id_bytes + (sizeof(size_t) * 2 * (is_three_way ? 2 : 1));
return cudaSuccess;
}
// Start point for time measurements
std::chrono::high_resolution_clock::time_point start;
// Create and initialize lookback_scan_state obj
auto offset_scan_state = offset_scan_state_type::create(
temporary_storage, number_of_blocks
);
auto offset_scan_state_with_sleep = offset_scan_state_with_sleep_type::create(
temporary_storage, number_of_blocks
);
// Create ad initialize ordered_block_id obj
auto ptr = reinterpret_cast<char*>(temporary_storage);
auto ordered_bid = ordered_block_id_type::create(
reinterpret_cast<ordered_block_id_type::id_type*>(ptr + offset_scan_state_bytes)
);
size_t* selected_count = reinterpret_cast<size_t*>(ptr + offset_scan_state_bytes
+ ordered_block_id_bytes);
size_t* prev_selected_count
= reinterpret_cast<size_t*>(ptr + offset_scan_state_bytes + ordered_block_id_bytes
+ (is_three_way ? 2 : 1) * sizeof(size_t));
cudaError_t error;
// Memset selected_count and prev_selected_count at once
error = cudaMemsetAsync(selected_count,
0,
sizeof(*selected_count) * 2 * (is_three_way ? 2 : 1),
stream);
if (error != cudaSuccess) return error;
cudaDeviceProp prop;
int deviceId;
static_cast<void>(cudaGetDevice(&deviceId));
static_cast<void>(cudaGetDeviceProperties(&prop, deviceId));
int asicRevision = 0;
const size_t number_of_launches = ::rocprim::detail::ceiling_div(size, aligned_size_limit);
if(debug_synchronous)
{
std::cout << "use_limited_size " << use_limited_size << '\n';
std::cout << "aligned_size_limit " << aligned_size_limit << '\n';
std::cout << "number_of_launches " << number_of_launches << '\n';
std::cout << "size " << size << '\n';
std::cout << "block_size " << block_size << '\n';
std::cout << "number of blocks " << number_of_blocks << '\n';
std::cout << "items_per_block " << items_per_block << '\n';
}
for (size_t i = 0, offset = 0; i < number_of_launches; i++, offset+=limited_size)
{
const unsigned int current_size = static_cast<unsigned int>(std::min<size_t>(size - offset, limited_size));
const unsigned int current_number_of_blocks = ::rocprim::detail::ceiling_div(current_size, items_per_block);
auto grid_size = ::rocprim::detail::ceiling_div(number_of_blocks, block_size);
if(debug_synchronous)
{
std::cout << "current size " << current_size << '\n';
std::cout << "current number of blocks " << current_number_of_blocks << '\n';
start = std::chrono::high_resolution_clock::now();
}
init_lookback_scan_state_kernel<offset_scan_state_type>
<<<dim3(grid_size), dim3(block_size), 0, stream>>>(
offset_scan_state, current_number_of_blocks, ordered_bid
);
ROCPRIM_DETAIL_HIP_SYNC_AND_RETURN_ON_ERROR("init_offset_scan_state_kernel", current_number_of_blocks, start)
if(debug_synchronous) start = std::chrono::high_resolution_clock::now();
grid_size = current_number_of_blocks;
partition_kernel<
SelectMethod, OnlySelected, config
>
<<<dim3(grid_size), dim3(block_size), 0, stream>>>(
keys_input + offset, values_input + offset, flags + offset, keys_output, values_output, selected_count, prev_selected_count,
current_size, inequality_op, offset_scan_state, current_number_of_blocks, ordered_bid, predicates...
);
ROCPRIM_DETAIL_HIP_SYNC_AND_RETURN_ON_ERROR("partition_kernel", size, start)
std::swap(selected_count, prev_selected_count);
}
error = ::rocprim::transform(
prev_selected_count, selected_count_output, (is_three_way ? 2 : 1),
::rocprim::identity<>{},
stream, debug_synchronous
);
if (error != cudaSuccess) return error;
return cudaSuccess;
}
#undef ROCPRIM_DETAIL_HIP_SYNC_AND_RETURN_ON_ERROR
#undef ROCPRIM_DETAIL_HIP_SYNC
} // end of detail namespace
/// \brief Parallel select primitive for device level using range of flags.
///
/// Performs a device-wide partition based on input \p flags. Partition copies
/// the values from \p input to \p output in such a way that all values for which the corresponding
/// items from \p flags are \p true (or can be implicitly converted to \p true) precede
/// the elements for which the corresponding items from \p flags are \p false.
///
/// \par Overview
/// * Returns the required size of \p temporary_storage in \p storage_size
/// if \p temporary_storage is a null pointer.
/// * Ranges specified by \p input, \p flags and \p output must have at least \p size elements.
/// * Range specified by \p selected_count_output must have at least 1 element.
/// * Values of \p flag range should be implicitly convertible to `bool` type.
/// * Relative order is preserved for the elements for which the corresponding values from \p flags
/// are \p true. Other elements are copied in reverse order.
///
/// \tparam Config - [optional] configuration of the primitive. It can be \p select_config or
/// a custom class with the same members.
/// \tparam InputIterator - random-access iterator type of the input range. It can be
/// a simple pointer type.
/// \tparam FlagIterator - random-access iterator type of the flag range. It can be
/// a simple pointer type.
/// \tparam OutputIterator - random-access iterator type of the output range. It can be
/// a simple pointer type.
/// \tparam SelectedCountOutputIterator - random-access iterator type of the selected_count_output
/// value. It can be a simple pointer type.
///
/// \param [in] temporary_storage - pointer to a device-accessible temporary storage. When
/// a null pointer is passed, the required allocation size (in bytes) is written to
/// \p storage_size and function returns without performing the select operation.
/// \param [in,out] storage_size - reference to a size (in bytes) of \p temporary_storage.
/// \param [in] input - iterator to the first element in the range to select values from.
/// \param [in] flags - iterator to the selection flag corresponding to the first element from \p input range.
/// \param [out] output - iterator to the first element in the output range.
/// \param [out] selected_count_output - iterator to the total number of selected values (length of \p output).
/// \param [in] size - number of elements in the input range.
/// \param [in] stream - [optional] HIP stream object. The default is \p 0 (default stream).
/// \param [in] debug_synchronous - [optional] If true, synchronization after every kernel
/// launch is forced in order to check for errors. The default value is \p false.
///
/// \par Example
/// \parblock
/// In this example a device-level partition operation is performed on an array of
/// integer values with array of <tt>char</tt>s used as flags.
///
/// \code{.cpp}
/// #include <rocprim/rocprim.hpp>
///
/// // Prepare input and output (declare pointers, allocate device memory etc.)
/// size_t input_size; // e.g., 8
/// int * input; // e.g., [1, 2, 3, 4, 5, 6, 7, 8]
/// char * flags; // e.g., [0, 1, 1, 0, 0, 1, 0, 1]
/// int * output; // empty array of 8 elements
/// size_t * output_count; // empty array of 1 element
///
/// size_t temporary_storage_size_bytes;
/// void * temporary_storage_ptr = nullptr;
/// // Get required size of the temporary storage
/// rocprim::partition(
/// temporary_storage_ptr, temporary_storage_size_bytes,
/// input, flags,
/// output, output_count,
/// input_size
/// );
///
/// // allocate temporary storage
/// cudaMalloc(&temporary_storage_ptr, temporary_storage_size_bytes);
///
/// // perform partition
/// rocprim::partition(
/// temporary_storage_ptr, temporary_storage_size_bytes,
/// input, flags,
/// output, output_count,
/// input_size
/// );
/// // output: [2, 3, 6, 8, 7, 5, 4, 1]
/// // output_count: 4
/// \endcode
/// \endparblock
template<class Config = default_config,
         class InputIterator,
         class FlagIterator,
         class OutputIterator,
         class SelectedCountOutputIterator>
inline cudaError_t partition(void * temporary_storage,
                             size_t& storage_size,
                             InputIterator input,
                             FlagIterator flags,
                             OutputIterator output,
                             SelectedCountOutputIterator selected_count_output,
                             const size_t size,
                             const cudaStream_t stream = 0,
                             const bool debug_synchronous = false)
{
    // Flag-driven selection uses neither a predicate nor an inequality operator, and this
    // overload is keys-only; empty_type stands in for all of those unused slots.
    using placeholder_type = ::rocprim::empty_type;

    // Null value iterator shared by the (absent) value input and value output.
    placeholder_type* const null_values = nullptr;

    // Second template argument of partition_impl (OnlySelected).
    constexpr bool only_selected = false;

    return detail::partition_impl<detail::select_method::flag, only_selected, Config, unsigned int>(
        temporary_storage,
        storage_size,
        input,
        null_values,
        flags,
        output,
        null_values,
        selected_count_output,
        size,
        placeholder_type(), // dummy inequality operation
        stream,
        debug_synchronous,
        placeholder_type()  // dummy unary predicate
    );
}
/// \brief Parallel select primitive for device level using selection predicate.
///
/// Performs a device-wide partition using selection predicate. Partition copies
/// the values from \p input to \p output in such a way that all values for which
/// the \p predicate returns \p true precede the elements for which it returns \p false.
///
/// \par Overview
/// * Returns the required size of \p temporary_storage in \p storage_size
/// if \p temporary_storage is a null pointer.
/// * Ranges specified by \p input and \p output must have at least \p size elements.
/// * Range specified by \p selected_count_output must have at least 1 element.
/// * Relative order is preserved for the elements for which the \p predicate returns \p true. Other
/// elements are copied in reverse order.
///
/// \tparam Config - [optional] configuration of the primitive. It can be \p select_config or
/// a custom class with the same members.
/// \tparam InputIterator - random-access iterator type of the input range. It can be
/// a simple pointer type.
/// \tparam OutputIterator - random-access iterator type of the output range. It can be
/// a simple pointer type.
/// \tparam SelectedCountOutputIterator - random-access iterator type of the selected_count_output
/// value. It can be a simple pointer type.
/// \tparam UnaryPredicate - type of a unary selection predicate.
///
/// \param [in] temporary_storage - pointer to a device-accessible temporary storage. When
/// a null pointer is passed, the required allocation size (in bytes) is written to
/// \p storage_size and function returns without performing the select operation.
/// \param [in,out] storage_size - reference to a size (in bytes) of \p temporary_storage.
/// \param [in] input - iterator to the first element in the range to select values from.
/// \param [out] output - iterator to the first element in the output range.
/// \param [out] selected_count_output - iterator to the total number of selected values (length of \p output).
/// \param [in] size - number of elements in the input range.
/// \param [in] predicate - unary function object which returns \p true if the element should be
/// ordered before other elements.
/// The signature of the function should be equivalent to the following:
/// <tt>bool f(const T &a);</tt>. The signature does not need to have
/// <tt>const &</tt>, but function object must not modify the object passed to it.
/// \param [in] stream - [optional] HIP stream object. The default is \p 0 (default stream).
/// \param [in] debug_synchronous - [optional] If true, synchronization after every kernel
/// launch is forced in order to check for errors. The default value is \p false.
///
/// \par Example
/// \parblock
/// In this example a device-level partition operation is performed on an array of
/// integer values, even values are copied before odd values.
///
/// \code{.cpp}
/// #include <rocprim/rocprim.hpp>
///
/// auto predicate =
/// [] __device__ (int a) -> bool
/// {
/// return (a%2) == 0;
/// };
///
/// // Prepare input and output (declare pointers, allocate device memory etc.)
/// size_t input_size; // e.g., 8
/// int * input; // e.g., [1, 2, 3, 4, 5, 6, 7, 8]
/// int * output; // empty array of 8 elements
/// size_t * output_count; // empty array of 1 element
///
/// size_t temporary_storage_size_bytes;
/// void * temporary_storage_ptr = nullptr;
/// // Get required size of the temporary storage
/// rocprim::partition(
/// temporary_storage_ptr, temporary_storage_size_bytes,
/// input,
/// output, output_count,
/// input_size,
/// predicate
/// );
///
/// // allocate temporary storage
/// cudaMalloc(&temporary_storage_ptr, temporary_storage_size_bytes);
///
/// // perform partition
/// rocprim::partition(
/// temporary_storage_ptr, temporary_storage_size_bytes,
/// input,
/// output, output_count,
/// input_size,
/// predicate
/// );
/// // output: [2, 4, 6, 8, 7, 5, 3, 1]
/// // output_count: 4
/// \endcode
/// \endparblock
template<class Config = default_config,
         class InputIterator,
         class OutputIterator,
         class SelectedCountOutputIterator,
         class UnaryPredicate>
inline cudaError_t partition(void * temporary_storage,
                             size_t& storage_size,
                             InputIterator input,
                             OutputIterator output,
                             SelectedCountOutputIterator selected_count_output,
                             const size_t size,
                             UnaryPredicate predicate,
                             const cudaStream_t stream = 0,
                             const bool debug_synchronous = false)
{
    // Predicate-driven selection has no flag range, no inequality operator and no values;
    // empty_type stands in for each of those unused slots.
    using placeholder_type = ::rocprim::empty_type;

    // Null iterator passed where partition_impl expects the flag range.
    placeholder_type* null_flags = nullptr;
    // Null value iterator shared by the (absent) value input and value output.
    placeholder_type* const null_values = nullptr;

    // Second template argument of partition_impl (OnlySelected).
    constexpr bool only_selected = false;

    return detail::partition_impl<detail::select_method::predicate,
                                  only_selected,
                                  Config,
                                  unsigned int>(
        temporary_storage,
        storage_size,
        input,
        null_values,
        null_flags,
        output,
        null_values,
        selected_count_output,
        size,
        placeholder_type(), // dummy inequality operation
        stream,
        debug_synchronous,
        predicate
    );
}
/// \brief Parallel select primitive for device level using two selection predicates.
///
/// Performs a device-wide three-way partition using two selection predicates. Partition copies
/// the values from \p input to either \p output_first_part or \p output_second_part or
/// \p output_unselected according to the following criteria:
/// The value is copied to \p output_first_part if the predicate \p select_first_part_op invoked
/// with the value returns \p true. It is copied to \p output_second_part if \p select_first_part_op
/// returns \p false and \p select_second_part_op returns \p true, and it is copied to
/// \p output_unselected otherwise.
///
/// \par Overview
/// * Returns the required size of \p temporary_storage in \p storage_size
/// if \p temporary_storage is a null pointer.
/// * Range specified by \p selected_count_output must have at least 2 elements.
/// * Relative order is preserved for the elements.
/// * The number of elements written to \p output_first_part is equal to the number of elements
/// in the input for which \p select_first_part_op returned \p true.
/// * The number of elements written to \p output_second_part is equal to the number of elements
/// in the input for which \p select_first_part_op returned \p false and \p select_second_part_op
/// returned \p true.
/// * The number of elements written to \p output_unselected is equal to the number of input elements
/// minus the number of elements written to \p output_first_part minus the number of elements written
/// to \p output_second_part.
///
/// \tparam Config - [optional] configuration of the primitive. It can be \p select_config or
/// a custom class with the same members.
/// \tparam InputIterator - random-access iterator type of the input range. It can be
/// a simple pointer type.
/// \tparam FirstOutputIterator - random-access iterator type of the first output range. It can be
/// a simple pointer type.
/// \tparam SecondOutputIterator - random-access iterator type of the second output range. It can be
/// a simple pointer type.
/// \tparam UnselectedOutputIterator - random-access iterator type of the unselected output range.
/// It can be a simple pointer type.
/// \tparam SelectedCountOutputIterator - random-access iterator type of the selected_count_output
/// value. It can be a simple pointer type.
/// \tparam FirstUnaryPredicate - type of the first unary selection predicate.
/// \tparam SecondUnaryPredicate - type of the second unary selection predicate.
///
/// \param [in] temporary_storage - pointer to a device-accessible temporary storage. When
/// a null pointer is passed, the required allocation size (in bytes) is written to
/// \p storage_size and function returns without performing the select operation.
/// \param [in,out] storage_size - reference to a size (in bytes) of \p temporary_storage.
/// \param [in] input - iterator to the first element in the range to select values from.
/// \param [out] output_first_part - iterator to the first element in the first output range.
/// \param [out] output_second_part - iterator to the first element in the second output range.
/// \param [out] output_unselected - iterator to the first element in the unselected output range.
/// \param [out] selected_count_output - iterator to the total number of selected values in
/// \p output_first_part and \p output_second_part respectively.
/// \param [in] size - number of elements in the input range.
/// \param [in] select_first_part_op - unary function object which returns \p true if the element
/// should be in \p output_first_part range
/// The signature of the function should be equivalent to the following:
/// <tt>bool f(const T &a);</tt>. The signature does not need to have
/// <tt>const &</tt>, but function object must not modify the object passed to it.
/// \param [in] select_second_part_op - unary function object which returns \p true if the element
/// should be in \p output_second_part range (given that \p select_first_part_op returned \p false)
/// The signature of the function should be equivalent to the following:
/// <tt>bool f(const T &a);</tt>. The signature does not need to have
/// <tt>const &</tt>, but function object must not modify the object passed to it.
/// \param [in] stream - [optional] HIP stream object. The default is \p 0 (default stream).
/// \param [in] debug_synchronous - [optional] If true, synchronization after every kernel
/// launch is forced in order to check for errors. The default value is \p false.
///
/// \par Example
/// \parblock
/// In this example a device-level three-way partition operation is performed on an array of
/// integer values, even values are copied to the first partition, odd and 3-divisible values
/// are copied to the second partition, and the rest of the values are copied to the
/// unselected partition
///
/// \code{.cpp}
/// #include <rocprim/rocprim.hpp>
///
/// auto first_predicate =
/// [] __device__ (int a) -> bool
/// {
/// return (a%2) == 0;
/// };
/// auto second_predicate =
/// [] __device__ (int a) -> bool
/// {
/// return (a%3) == 0;
/// };
///
/// // Prepare input and output (declare pointers, allocate device memory etc.)
/// size_t input_size; // e.g., 8
/// int * input; // e.g., [1, 2, 3, 4, 5, 6, 7, 8]
/// int * output_first_part; // array of 8 elements
/// int * output_second_part; // array of 8 elements
/// int * output_unselected; // array of 8 elements
/// size_t * output_count; // array of 2 elements
///
/// size_t temporary_storage_size_bytes;
/// void * temporary_storage_ptr = nullptr;
/// // Get required size of the temporary storage
/// rocprim::partition_three_way(
/// temporary_storage_ptr, temporary_storage_size_bytes,
/// input,
/// output_first_part, output_second_part, output_unselected,
/// output_count,
/// input_size,
/// first_predicate,
/// second_predicate
/// );
///
/// // allocate temporary storage
/// cudaMalloc(&temporary_storage_ptr, temporary_storage_size_bytes);
///
/// // perform partition
/// rocprim::partition_three_way(
/// temporary_storage_ptr, temporary_storage_size_bytes,
/// input,
/// output_first_part, output_second_part, output_unselected,
/// output_count,
/// input_size,
/// first_predicate,
/// second_predicate
/// );
/// // elements denoted by '*' were not modified
/// // output_first_part: [2, 4, 6, 8, *, *, *, *]
/// // output_second_part: [3, *, *, *, *, *, *, *]
/// // output_unselected: [1, 5, 7, *, *, *, *, *]
/// // output_count: [4, 1]
/// \endcode
/// \endparblock
template<class Config = default_config,
         typename InputIterator,
         typename FirstOutputIterator,
         typename SecondOutputIterator,
         typename UnselectedOutputIterator,
         typename SelectedCountOutputIterator,
         typename FirstUnaryPredicate,
         typename SecondUnaryPredicate>
inline cudaError_t partition_three_way(void * temporary_storage,
                                       size_t& storage_size,
                                       InputIterator input,
                                       FirstOutputIterator output_first_part,
                                       SecondOutputIterator output_second_part,
                                       UnselectedOutputIterator output_unselected,
                                       SelectedCountOutputIterator selected_count_output,
                                       const size_t size,
                                       FirstUnaryPredicate select_first_part_op,
                                       SecondUnaryPredicate select_second_part_op,
                                       const cudaStream_t stream = 0,
                                       const bool debug_synchronous = false)
{
    // Three-way partition is keys-only and predicate-driven: flags, values and the
    // inequality operator are all replaced by empty_type placeholders.
    using placeholder_type = ::rocprim::empty_type;

    // NOTE(review): uint2 presumably carries one running offset per selected part - confirm
    // against partition_impl / lookback_scan_state.
    using offset_type = uint2;

    // The three destinations are bundled into one tuple so they travel through the single
    // "keys_output" parameter of partition_impl; the value outputs mirror that shape.
    using key_outputs_type
        = tuple<FirstOutputIterator, SecondOutputIterator, UnselectedOutputIterator>;
    using value_outputs_type = tuple<placeholder_type*, placeholder_type*, placeholder_type*>;

    placeholder_type*        null_flags       = nullptr;
    placeholder_type* const  null_value_input = nullptr;
    const value_outputs_type null_value_outputs{nullptr, nullptr, nullptr};
    key_outputs_type         key_outputs{output_first_part, output_second_part, output_unselected};

    // Second template argument of partition_impl (OnlySelected).
    constexpr bool only_selected = false;

    return detail::partition_impl<detail::select_method::predicate,
                                  only_selected,
                                  Config,
                                  offset_type>(
        temporary_storage,
        storage_size,
        input,
        null_value_input,
        null_flags,
        key_outputs,
        null_value_outputs,
        selected_count_output,
        size,
        placeholder_type(), // dummy inequality operation
        stream,
        debug_synchronous,
        select_first_part_op,
        select_second_part_op
    );
}
/// @}
// end of group devicemodule
END_ROCPRIM_NAMESPACE
#endif // ROCPRIM_DEVICE_DEVICE_PARTITION_HPP_
// Copyright (c) 2017-2022 Advanced Micro Devices, Inc. All rights reserved.
//
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to deal
// in the Software without restriction, including without limitation the rights
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
// copies of the Software, and to permit persons to whom the Software is
// furnished to do so, subject to the following conditions:
//
// The above copyright notice and this permission notice shall be included in
// all copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
// THE SOFTWARE.
#ifndef ROCPRIM_DEVICE_DEVICE_RADIX_SORT_HPP_
#define ROCPRIM_DEVICE_DEVICE_RADIX_SORT_HPP_
#include <iostream>
#include <iterator>
#include <type_traits>
#include <utility>
#include "../config.hpp"
#include "../detail/various.hpp"
#include "../detail/radix_sort.hpp"
#include "../intrinsics.hpp"
#include "../functional.hpp"
#include "../types.hpp"
#include "device_radix_sort_config.hpp"
#include "device_transform.hpp"
#include "detail/device_radix_sort.hpp"
#include "specialization/device_radix_single_sort.hpp"
#include "specialization/device_radix_merge_sort.hpp"
/// \addtogroup devicemodule
/// @{
BEGIN_ROCPRIM_NAMESPACE
namespace detail
{
// Kernel entry point for the digit-counting step of a radix sort iteration.
//
// Forwards every argument unchanged to detail::fill_digit_counts, instantiated with the
// same BlockSize, ItemsPerThread, RadixBits and Descending parameters. The kernel is
// compiled with launch bounds of BlockSize threads per block; radix_sort_iteration
// launches it with `batches` blocks of Config::sort::block_size threads.
//
// keys_input             - iterator to the keys read by this pass.
// size                   - element count, forwarded as-is.
// batch_digit_counts     - buffer handed to fill_digit_counts
//                          (presumably per-batch digit histograms - TODO confirm in
//                          detail/device_radix_sort.hpp).
// bit                    - index of the first key bit of the current digit.
// current_radix_bits     - width in bits of the current digit (may be below RadixBits
//                          on the last pass, see radix_sort_iteration).
// blocks_per_full_batch,
// full_batches           - batching parameters, forwarded as-is.
template<
unsigned int BlockSize,
unsigned int ItemsPerThread,
unsigned int RadixBits,
bool Descending,
class KeysInputIterator,
class Offset
>
ROCPRIM_KERNEL
__launch_bounds__(BlockSize)
void fill_digit_counts_kernel(KeysInputIterator keys_input,
Offset size,
Offset * batch_digit_counts,
unsigned int bit,
unsigned int current_radix_bits,
unsigned int blocks_per_full_batch,
unsigned int full_batches)
{
fill_digit_counts<BlockSize, ItemsPerThread, RadixBits, Descending>(
keys_input, size,
batch_digit_counts,
bit, current_radix_bits,
blocks_per_full_batch, full_batches
);
}
// Kernel entry point for the per-batch scan step of a radix sort iteration.
//
// Forwards its arguments unchanged to detail::scan_batches<BlockSize, ItemsPerThread,
// RadixBits>. Compiled with launch bounds of BlockSize threads per block;
// radix_sort_iteration launches it with (1 << RadixBits) blocks of
// Config::scan::block_size threads.
//
// batch_digit_counts - buffer previously passed to fill_digit_counts_kernel.
// digit_counts       - second buffer handed to scan_batches
//                      (presumably receives per-digit totals - TODO confirm).
// batches            - number of batches, forwarded as-is.
template<
unsigned int BlockSize,
unsigned int ItemsPerThread,
unsigned int RadixBits,
class Offset
>
ROCPRIM_KERNEL
__launch_bounds__(BlockSize)
void scan_batches_kernel(Offset * batch_digit_counts,
Offset * digit_counts,
unsigned int batches)
{
scan_batches<BlockSize, ItemsPerThread, RadixBits>(batch_digit_counts, digit_counts, batches);
}
// Kernel entry point for the digit scan step of a radix sort iteration.
//
// Forwards digit_counts unchanged to detail::scan_digits<RadixBits>. Compiled with launch
// bounds of ROCPRIM_DEFAULT_MAX_BLOCK_SIZE; radix_sort_iteration launches it as a single
// block of (1 << RadixBits) threads.
//
// digit_counts - buffer processed by scan_digits
//                (presumably scanned in place over the digit values - TODO confirm).
template<
unsigned int RadixBits,
class Offset
>
ROCPRIM_KERNEL
__launch_bounds__(ROCPRIM_DEFAULT_MAX_BLOCK_SIZE)
void scan_digits_kernel(Offset * digit_counts)
{
scan_digits<RadixBits>(digit_counts);
}
// Kernel entry point for the final sort-and-scatter step of a radix sort iteration.
//
// Forwards every argument unchanged to detail::sort_and_scatter, instantiated with the
// same BlockSize, ItemsPerThread, RadixBits and Descending parameters. Compiled with
// launch bounds of BlockSize threads per block; radix_sort_iteration launches it with
// `batches` blocks of Config::sort::block_size threads.
//
// keys_input / keys_output     - source and destination key iterators for this pass.
// values_input / values_output - source and destination value iterators for this pass.
// size                         - element count, forwarded as-is.
// batch_digit_starts           - read-only view of the buffer radix_sort_iteration calls
//                                batch_digit_counts (after scan_batches_kernel ran on it).
// digit_starts                 - read-only view of the buffer radix_sort_iteration calls
//                                digit_counts (after scan_digits_kernel ran on it).
// bit                          - index of the first key bit of the current digit.
// current_radix_bits           - width in bits of the current digit.
// blocks_per_full_batch,
// full_batches                 - batching parameters, forwarded as-is.
template<
unsigned int BlockSize,
unsigned int ItemsPerThread,
unsigned int RadixBits,
bool Descending,
class KeysInputIterator,
class KeysOutputIterator,
class ValuesInputIterator,
class ValuesOutputIterator,
class Offset
>
ROCPRIM_KERNEL
__launch_bounds__(BlockSize)
void sort_and_scatter_kernel(KeysInputIterator keys_input,
KeysOutputIterator keys_output,
ValuesInputIterator values_input,
ValuesOutputIterator values_output,
Offset size,
const Offset * batch_digit_starts,
const Offset * digit_starts,
unsigned int bit,
unsigned int current_radix_bits,
unsigned int blocks_per_full_batch,
unsigned int full_batches)
{
sort_and_scatter<BlockSize, ItemsPerThread, RadixBits, Descending>(
keys_input, keys_output, values_input, values_output, size,
batch_digit_starts, digit_starts,
bit, current_radix_bits,
blocks_per_full_batch, full_batches
);
}
// Post-launch error check used after every kernel launch in this file.
//
// Expands to a braced block (call sites use it without a trailing semicolon) that:
//   1. reads cudaGetLastError() and returns it from the ENCLOSING function on failure;
//   2. if `debug_synchronous` is true, prints `name(size)`, synchronizes `stream`
//      (returning the synchronization error from the enclosing function on failure), and
//      prints the elapsed time in milliseconds measured from `start`.
//
// Requirements at the expansion site: variables named `debug_synchronous` and `stream`
// must be in scope, the enclosing function must return cudaError_t, and `start` must be a
// std::chrono::high_resolution_clock time point.
//
// The #ifndef guard keeps an already visible definition of the macro instead of
// redefining it.
#ifndef ROCPRIM_DETAIL_HIP_SYNC_AND_RETURN_ON_ERROR
#define ROCPRIM_DETAIL_HIP_SYNC_AND_RETURN_ON_ERROR(name, size, start) \
{ \
auto _error = cudaGetLastError(); \
if(_error != cudaSuccess) return _error; \
if(debug_synchronous) \
{ \
std::cout << name << "(" << size << ")"; \
auto __error = cudaStreamSynchronize(stream); \
if(__error != cudaSuccess) return __error; \
auto _end = std::chrono::high_resolution_clock::now(); \
auto _d = std::chrono::duration_cast<std::chrono::duration<double>>(_end - start); \
std::cout << " " << _d.count() * 1000 << " ms" << '\n'; \
} \
}
#endif
// Runs one radix sort pass over the digit that starts at `bit`.
//
// The pass is a fixed sequence of four launches on `stream`, each followed by
// ROCPRIM_DETAIL_HIP_SYNC_AND_RETURN_ON_ERROR (which returns early on any error):
//   fill_digit_counts_kernel -> scan_batches_kernel -> scan_digits_kernel
//   -> sort_and_scatter_kernel
//
// Source / destination buffers are chosen from the two flags:
//   from_input &&  to_output : keys_input  -> keys_output
//   from_input && !to_output : keys_input  -> keys_tmp
//  !from_input &&  to_output : keys_tmp    -> keys_output
//  !from_input && !to_output : keys_output -> keys_tmp
// (values follow the same routing as keys).
//
// Returns cudaSuccess when all four launches succeed, otherwise the first error seen.
template<class Config,
         unsigned int RadixBits,
         bool Descending,
         class KeysInputIterator,
         class KeysOutputIterator,
         class ValuesInputIterator,
         class ValuesOutputIterator,
         class Offset>
inline cudaError_t
    radix_sort_iteration(KeysInputIterator keys_input,
                         typename std::iterator_traits<KeysInputIterator>::value_type * keys_tmp,
                         KeysOutputIterator keys_output,
                         ValuesInputIterator values_input,
                         typename std::iterator_traits<ValuesInputIterator>::value_type * values_tmp,
                         ValuesOutputIterator values_output,
                         Offset size,
                         Offset * batch_digit_counts,
                         Offset * digit_counts,
                         bool from_input,
                         bool to_output,
                         unsigned int bit,
                         unsigned int end_bit,
                         unsigned int blocks_per_full_batch,
                         unsigned int full_batches,
                         unsigned int batches,
                         cudaStream_t stream,
                         bool debug_synchronous)
{
    // Number of distinct values of a full RadixBits-wide digit.
    constexpr unsigned int radix_size = 1 << RadixBits;

    // Launch parameters of the counting and scattering kernels.
    constexpr unsigned int sort_block_size       = Config::sort::block_size;
    constexpr unsigned int sort_items_per_thread = Config::sort::items_per_thread;

    // When (end_bit - bit) is not a multiple of RadixBits the last pass works on a
    // narrower digit, so clamp the digit width to the bits that remain.
    const unsigned int current_radix_bits = ::rocprim::min(RadixBits, end_bit - bit);

    // Reference point for the per-kernel timings printed in debug mode.
    std::chrono::high_resolution_clock::time_point timer_start;

    if(debug_synchronous)
    {
        std::cout << "RadixBits " << RadixBits << '\n';
        std::cout << "bit " << bit << '\n';
        std::cout << "current_radix_bits " << current_radix_bits << '\n';
        timer_start = std::chrono::high_resolution_clock::now();
    }

    // Step 1: count digits, reading from whichever buffer currently holds the keys.
    if(from_input)
    {
        fill_digit_counts_kernel<sort_block_size, sort_items_per_thread, RadixBits, Descending>
            <<<dim3(batches), dim3(sort_block_size), 0, stream>>>(
                keys_input, size,
                batch_digit_counts,
                bit, current_radix_bits,
                blocks_per_full_batch, full_batches
            );
    }
    else if(to_output)
    {
        fill_digit_counts_kernel<sort_block_size, sort_items_per_thread, RadixBits, Descending>
            <<<dim3(batches), dim3(sort_block_size), 0, stream>>>(
                keys_tmp, size,
                batch_digit_counts,
                bit, current_radix_bits,
                blocks_per_full_batch, full_batches
            );
    }
    else
    {
        fill_digit_counts_kernel<sort_block_size, sort_items_per_thread, RadixBits, Descending>
            <<<dim3(batches), dim3(sort_block_size), 0, stream>>>(
                keys_output, size,
                batch_digit_counts,
                bit, current_radix_bits,
                blocks_per_full_batch, full_batches
            );
    }
    ROCPRIM_DETAIL_HIP_SYNC_AND_RETURN_ON_ERROR("fill_digit_counts", size, timer_start)

    // Step 2: scan across batches, one block per digit value.
    if(debug_synchronous)
    {
        timer_start = std::chrono::high_resolution_clock::now();
    }
    scan_batches_kernel<Config::scan::block_size, Config::scan::items_per_thread, RadixBits>
        <<<dim3(radix_size), dim3(Config::scan::block_size), 0, stream>>>(
            batch_digit_counts, digit_counts, batches
        );
    ROCPRIM_DETAIL_HIP_SYNC_AND_RETURN_ON_ERROR("scan_batches", radix_size * Config::scan::block_size, timer_start)

    // Step 3: scan across digit values in a single block of radix_size threads.
    if(debug_synchronous)
    {
        timer_start = std::chrono::high_resolution_clock::now();
    }
    scan_digits_kernel<RadixBits>
        <<<dim3(1), dim3(radix_size), 0, stream>>>(
            digit_counts
        );
    ROCPRIM_DETAIL_HIP_SYNC_AND_RETURN_ON_ERROR("scan_digits", radix_size, timer_start)

    // Step 4: scatter. The two count buffers are handed over read-only, as the kernel's
    // `batch_digit_starts` / `digit_starts` arguments.
    if(debug_synchronous)
    {
        timer_start = std::chrono::high_resolution_clock::now();
    }
    const Offset * const batch_digit_starts = batch_digit_counts;
    const Offset * const digit_starts       = digit_counts;

    if(from_input && to_output)
    {
        sort_and_scatter_kernel<sort_block_size, sort_items_per_thread, RadixBits, Descending>
            <<<dim3(batches), dim3(sort_block_size), 0, stream>>>(
                keys_input, keys_output, values_input, values_output, size,
                batch_digit_starts,
                digit_starts,
                bit, current_radix_bits,
                blocks_per_full_batch, full_batches
            );
    }
    else if(from_input)
    {
        sort_and_scatter_kernel<sort_block_size, sort_items_per_thread, RadixBits, Descending>
            <<<dim3(batches), dim3(sort_block_size), 0, stream>>>(
                keys_input, keys_tmp, values_input, values_tmp, size,
                batch_digit_starts,
                digit_starts,
                bit, current_radix_bits,
                blocks_per_full_batch, full_batches
            );
    }
    else if(to_output)
    {
        sort_and_scatter_kernel<sort_block_size, sort_items_per_thread, RadixBits, Descending>
            <<<dim3(batches), dim3(sort_block_size), 0, stream>>>(
                keys_tmp, keys_output, values_tmp, values_output, size,
                batch_digit_starts,
                digit_starts,
                bit, current_radix_bits,
                blocks_per_full_batch, full_batches
            );
    }
    else
    {
        sort_and_scatter_kernel<sort_block_size, sort_items_per_thread, RadixBits, Descending>
            <<<dim3(batches), dim3(sort_block_size), 0, stream>>>(
                keys_output, keys_tmp, values_output, values_tmp, size,
                batch_digit_starts,
                digit_starts,
                bit, current_radix_bits,
                blocks_per_full_batch, full_batches
            );
    }
    ROCPRIM_DETAIL_HIP_SYNC_AND_RETURN_ON_ERROR("sort_and_scatter", size, timer_start)

    return cudaSuccess;
}
// Sorts `size` elements with the single-launch radix sort (radix_sort_single).
//
// temporary_storage   - when null, only the required storage size is reported through
//                       storage_size and nothing is sorted.
// storage_size        - written only in the size-query case (temporary_storage == nullptr).
// is_result_in_output - set to true once radix_sort_single succeeded; left untouched for
//                       an empty input or on failure.
// begin_bit, end_bit  - bit range forwarded to radix_sort_single.
//
// Returns cudaSuccess, or the first error reported by the stream synchronization (debug
// mode only) or by radix_sort_single.
template<class Config,
         bool Descending,
         class KeysInputIterator,
         class KeysOutputIterator,
         class ValuesInputIterator,
         class ValuesOutputIterator>
inline cudaError_t radix_sort_single_impl(void * temporary_storage,
                                          size_t& storage_size,
                                          KeysInputIterator keys_input,
                                          KeysOutputIterator keys_output,
                                          ValuesInputIterator values_input,
                                          ValuesOutputIterator values_output,
                                          unsigned int size,
                                          bool& is_result_in_output,
                                          unsigned int begin_bit,
                                          unsigned int end_bit,
                                          cudaStream_t stream,
                                          bool debug_synchronous)
{
    using key_type   = typename std::iterator_traits<KeysInputIterator>::value_type;
    using value_type = typename std::iterator_traits<ValuesInputIterator>::value_type;

    // Fall back to the architecture/type specific default when Config is default_config.
    using config = default_or_custom_config<
        Config,
        default_radix_sort_config<ROCPRIM_TARGET_ARCH, key_type, value_type>>;

    // This path needs no scratch memory, but a non-zero size is still reported so that
    // callers always have something to allocate.
    const size_t required_bytes = ::rocprim::detail::align_size(1);

    // Size query: report the requirement and stop.
    if(temporary_storage == nullptr)
    {
        storage_size = required_bytes;
        return cudaSuccess;
    }

    // Nothing to sort.
    if(size == 0u)
    {
        return cudaSuccess;
    }

    if(debug_synchronous)
    {
        std::cout << "temporary_storage " << temporary_storage << '\n';
        const cudaError_t sync_status = cudaStreamSynchronize(stream);
        if(sync_status != cudaSuccess)
        {
            return sync_status;
        }
    }

    const cudaError_t sort_status = radix_sort_single<config, Descending>(
        keys_input, keys_output, values_input, values_output, size,
        begin_bit, end_bit,
        stream, debug_synchronous
    );

    // The single-launch sort writes straight to the output ranges.
    if(sort_status == cudaSuccess)
    {
        is_result_in_output = true;
    }
    return sort_status;
}
// Medium-input path of the radix sort. radix_sort_impl dispatches here for sizes
// above its single-sort limit and up to
// config::sort_merge::block_size * items_per_thread * config::merge_size_limit_blocks;
// the sorting itself is delegated to radix_sort_merge.
//
// keys_tmp / values_tmp - when keys_tmp is non-null the caller's temporaries are
//                         used as-is; otherwise both are carved out of
//                         temporary_storage (keys first, then values).
// temporary_storage     - when null, only storage_size is written.
// is_result_in_output   - set to true after a successful sort.
template<
    class Config,
    bool Descending,
    class KeysInputIterator,
    class KeysOutputIterator,
    class ValuesInputIterator,
    class ValuesOutputIterator
>
inline
cudaError_t radix_sort_merge_impl(void * temporary_storage,
                                  size_t& storage_size,
                                  KeysInputIterator keys_input,
                                  typename std::iterator_traits<KeysInputIterator>::value_type * keys_tmp,
                                  KeysOutputIterator keys_output,
                                  ValuesInputIterator values_input,
                                  typename std::iterator_traits<ValuesInputIterator>::value_type * values_tmp,
                                  ValuesOutputIterator values_output,
                                  unsigned int size,
                                  bool& is_result_in_output,
                                  unsigned int begin_bit,
                                  unsigned int end_bit,
                                  cudaStream_t stream,
                                  bool debug_synchronous)
{
    using key_type = typename std::iterator_traits<KeysInputIterator>::value_type;
    using value_type = typename std::iterator_traits<ValuesInputIterator>::value_type;
    using config = default_or_custom_config<
        Config,
        default_radix_sort_config<ROCPRIM_TARGET_ARCH, key_type, value_type>
    >;

    // A value type of empty_type denotes a keys-only sort.
    constexpr bool has_values = !std::is_same<value_type, ::rocprim::empty_type>::value;
    const bool caller_supplied_tmp = (keys_tmp != nullptr);

    const size_t key_buffer_bytes = ::rocprim::detail::align_size(size * sizeof(key_type));
    const size_t value_buffer_bytes = has_values ? ::rocprim::detail::align_size(size * sizeof(value_type)) : 0;
    const size_t placeholder_bytes = ::rocprim::detail::align_size(1);

    // Size query: room for the temporaries is requested only when the caller
    // did not bring its own.
    if(temporary_storage == nullptr)
    {
        storage_size = caller_supplied_tmp
            ? placeholder_bytes
            : key_buffer_bytes + value_buffer_bytes;
        return cudaSuccess;
    }

    if(debug_synchronous)
    {
        std::cout << "temporary_storage " << temporary_storage << '\n';
        const cudaError_t sync_status = cudaStreamSynchronize(stream);
        if(sync_status != cudaSuccess)
        {
            return sync_status;
        }
    }

    if(!caller_supplied_tmp)
    {
        // Layout of temporary_storage: [ keys | values ].
        char * const storage_bytes = reinterpret_cast<char *>(temporary_storage);
        keys_tmp = reinterpret_cast<key_type *>(storage_bytes);
        values_tmp = has_values
            ? reinterpret_cast<value_type *>(storage_bytes + key_buffer_bytes)
            : nullptr;
    }

    const cudaError_t sort_status = radix_sort_merge<config, Descending>(
        keys_input, keys_tmp, keys_output, values_input, values_tmp, values_output, size,
        begin_bit, end_bit,
        stream, debug_synchronous
    );
    if(sort_status == cudaSuccess)
    {
        is_result_in_output = true;
    }
    return sort_status;
}
// Counter/offset type matched to the width of the problem-size type:
// size types wider than 4 bytes use size_t, everything else uses unsigned int.
template<class Size>
using offset_type_t = typename std::conditional<
    (sizeof(Size) > 4),
    size_t,
    unsigned int
>::type;
// Multi-pass path of the radix sort. radix_sort_impl dispatches here for sizes
// above its merge-sort limit. The key bits [begin_bit, end_bit) are processed by
// `long_iterations` passes of config::long_radix_bits bits followed by
// `short_iterations` passes of config::short_radix_bits bits, each pass being one
// call to radix_sort_iteration.
//
// temporary_storage   - when null, only storage_size is written. Otherwise it is
//                       split into: batch digit counts, digit counts and, when
//                       keys_tmp is null, temporary keys followed by temporary values.
// keys_tmp/values_tmp - when keys_tmp is non-null the caller's buffers are used
//                       ("double buffer" mode); otherwise they are carved out of
//                       temporary_storage.
// is_result_in_output - after every pass set to whether that pass wrote to the
//                       output ranges. Not written when no pass runs (size query,
//                       size == 0, or zero iterations).
// Returns cudaSuccess or the first error reported by a synchronization,
// ::rocprim::transform or radix_sort_iteration.
template<
class Config,
bool Descending,
class KeysInputIterator,
class KeysOutputIterator,
class ValuesInputIterator,
class ValuesOutputIterator,
class Size
>
inline
cudaError_t radix_sort_iterations_impl(void * temporary_storage,
size_t& storage_size,
KeysInputIterator keys_input,
typename std::iterator_traits<KeysInputIterator>::value_type * keys_tmp,
KeysOutputIterator keys_output,
ValuesInputIterator values_input,
typename std::iterator_traits<ValuesInputIterator>::value_type * values_tmp,
ValuesOutputIterator values_output,
Size size,
bool& is_result_in_output,
unsigned int begin_bit,
unsigned int end_bit,
cudaStream_t stream,
bool debug_synchronous)
{
using key_type = typename std::iterator_traits<KeysInputIterator>::value_type;
using value_type = typename std::iterator_traits<ValuesInputIterator>::value_type;
// Counters are 32-bit unless Size itself is wider than 4 bytes.
using offset_type = offset_type_t<Size>;
using config = default_or_custom_config<
Config,
default_radix_sort_config<ROCPRIM_TARGET_ARCH, key_type, value_type>
>;
// A value type of empty_type denotes a keys-only sort.
constexpr bool with_values = !std::is_same<value_type, ::rocprim::empty_type>::value;
// Number of distinct digits in a long pass; sizes the digit-count buffers below.
constexpr unsigned int max_radix_size = 1 << config::long_radix_bits;
constexpr unsigned int scan_size = config::scan::block_size * config::scan::items_per_thread;
constexpr unsigned int sort_size = config::sort::block_size * config::sort::items_per_thread;
// Number of sort_size-element blocks needed to cover the input.
const unsigned int blocks = static_cast<unsigned int>(::rocprim::detail::ceiling_div(size, sort_size));
// Blocks are grouped into at most scan_size batches.
// NOTE(review): full_batches looks like the number of batches that receive
// blocks_per_full_batch blocks (the remaining ones getting one fewer) - confirm
// against radix_sort_iteration.
const unsigned int blocks_per_full_batch = ::rocprim::detail::ceiling_div(blocks, scan_size);
const unsigned int full_batches = blocks % scan_size != 0
? blocks % scan_size
: scan_size;
const unsigned int batches = (blocks_per_full_batch == 1 ? full_batches : scan_size);
const bool with_double_buffer = keys_tmp != nullptr;
const unsigned int bits = end_bit - begin_bit;
// Pass count if every pass handled long_radix_bits bits.
const unsigned int iterations = ::rocprim::detail::ceiling_div(bits, config::long_radix_bits);
const unsigned int radix_bits_diff = config::long_radix_bits - config::short_radix_bits;
// `iterations` long passes would cover more bits than requested; each pass that is
// shortened to short_radix_bits gives back radix_bits_diff bits, so surplus / diff
// passes (at most all of them) can be short. std::max keeps the divisor non-zero.
const unsigned int short_iterations = radix_bits_diff != 0
? ::rocprim::min(iterations, (config::long_radix_bits * iterations - bits) / std::max(1u, radix_bits_diff))
: 0;
const unsigned int long_iterations = iterations - short_iterations;
const size_t batch_digit_counts_bytes =
::rocprim::detail::align_size(batches * max_radix_size * sizeof(offset_type));
const size_t digit_counts_bytes = ::rocprim::detail::align_size(max_radix_size * sizeof(offset_type));
const size_t keys_bytes = ::rocprim::detail::align_size(size * sizeof(key_type));
const size_t values_bytes = with_values ? ::rocprim::detail::align_size(size * sizeof(value_type)) : 0;
// Size query: counters always; key/value temporaries only if the caller supplied none.
if(temporary_storage == nullptr)
{
storage_size = batch_digit_counts_bytes + digit_counts_bytes;
if(!with_double_buffer)
{
storage_size += keys_bytes + values_bytes;
}
return cudaSuccess;
}
if( size == 0u )
return cudaSuccess;
if(debug_synchronous)
{
std::cout << "scan_size " << scan_size << '\n';
std::cout << "sort_size " << sort_size << '\n';
std::cout << "blocks " << blocks << '\n';
std::cout << "blocks_per_full_batch " << blocks_per_full_batch << '\n';
std::cout << "full_batches " << full_batches << '\n';
std::cout << "batches " << batches << '\n';
std::cout << "iterations " << iterations << '\n';
std::cout << "long_iterations " << long_iterations << '\n';
std::cout << "short_iterations " << short_iterations << '\n';
cudaError_t error = cudaStreamSynchronize(stream);
if(error != cudaSuccess) return error;
}
// Carve temporary_storage in the same order the size query summed the parts.
char * ptr = reinterpret_cast<char *>(temporary_storage);
offset_type * batch_digit_counts = reinterpret_cast<offset_type *>(ptr);
ptr += batch_digit_counts_bytes;
offset_type * digit_counts = reinterpret_cast<offset_type *>(ptr);
ptr += digit_counts_bytes;
if(!with_double_buffer)
{
keys_tmp = reinterpret_cast<key_type *>(ptr);
ptr += keys_bytes;
values_tmp = with_values ? reinterpret_cast<value_type *>(ptr) : nullptr;
}
// Destination of the first pass. The destination flips after every pass; without
// caller buffers the start is picked from the parity of `iterations` so that the
// last pass writes to the output ranges. With caller buffers the first pass
// always writes to the output ranges.
bool to_output = with_double_buffer || (iterations - 1) % 2 == 0;
// The first pass reads the input ranges unless they are copied to the temporaries below.
bool from_input = true;
if(!with_double_buffer && to_output)
{
// Copy input keys and values if necessary (in-place sorting: input and output iterators are equal)
const bool keys_equal = ::rocprim::detail::are_iterators_equal(keys_input, keys_output);
const bool values_equal = with_values && ::rocprim::detail::are_iterators_equal(values_input, values_output);
if(keys_equal || values_equal)
{
// Either range aliasing is enough to copy both keys and values.
cudaError_t error = ::rocprim::transform(
keys_input, keys_tmp, size,
::rocprim::identity<key_type>(), stream, debug_synchronous
);
if(error != cudaSuccess) return error;
if(with_values)
{
// NOTE(review): this `error` shadows the one declared a few lines above.
cudaError_t error = ::rocprim::transform(
values_input, values_tmp, size,
::rocprim::identity<value_type>(), stream, debug_synchronous
);
if(error != cudaSuccess) return error;
}
// The first pass now reads the copies instead of the input ranges.
from_input = false;
}
}
// Long passes first, starting at begin_bit.
unsigned int bit = begin_bit;
for(unsigned int i = 0; i < long_iterations; i++)
{
cudaError_t error = radix_sort_iteration<config, config::long_radix_bits, Descending>(
keys_input, keys_tmp, keys_output, values_input, values_tmp, values_output,
static_cast<offset_type>(size), batch_digit_counts, digit_counts,
from_input, to_output,
bit, end_bit,
blocks_per_full_batch, full_batches, batches,
stream, debug_synchronous
);
if(error != cudaSuccess) return error;
// Record where this pass wrote, then flip the destination for the next one.
is_result_in_output = to_output;
from_input = false;
to_output = !to_output;
bit += config::long_radix_bits;
}
// Remaining bits are handled by the short passes.
for(unsigned int i = 0; i < short_iterations; i++)
{
cudaError_t error = radix_sort_iteration<config, config::short_radix_bits, Descending>(
keys_input, keys_tmp, keys_output, values_input, values_tmp, values_output,
static_cast<offset_type>(size), batch_digit_counts, digit_counts,
from_input, to_output,
bit, end_bit,
blocks_per_full_batch, full_batches, batches,
stream, debug_synchronous
);
if(error != cudaSuccess) return error;
is_result_in_output = to_output;
from_input = false;
to_output = !to_output;
bit += config::short_radix_bits;
}
return cudaSuccess;
}
// Common entry point behind the public radix sort functions: checks that input
// and output iterators agree on their value types, then picks one of three
// implementations by problem size:
//   size <= single-sort capacity -> radix_sort_single_impl
//   size <= merge-sort capacity  -> radix_sort_merge_impl
//   anything larger              -> radix_sort_iterations_impl
// All arguments are forwarded unchanged; for the first two paths size is
// narrowed to unsigned int.
template<
    class Config,
    bool Descending,
    class KeysInputIterator,
    class KeysOutputIterator,
    class ValuesInputIterator,
    class ValuesOutputIterator,
    class Size
>
inline
cudaError_t radix_sort_impl(void * temporary_storage,
                            size_t& storage_size,
                            KeysInputIterator keys_input,
                            typename std::iterator_traits<KeysInputIterator>::value_type * keys_tmp,
                            KeysOutputIterator keys_output,
                            ValuesInputIterator values_input,
                            typename std::iterator_traits<ValuesInputIterator>::value_type * values_tmp,
                            ValuesOutputIterator values_output,
                            Size size,
                            bool& is_result_in_output,
                            unsigned int begin_bit,
                            unsigned int end_bit,
                            cudaStream_t stream,
                            bool debug_synchronous)
{
    using key_type = typename std::iterator_traits<KeysInputIterator>::value_type;
    using value_type = typename std::iterator_traits<ValuesInputIterator>::value_type;

    static_assert(
        std::is_same<key_type, typename std::iterator_traits<KeysOutputIterator>::value_type>::value,
        "KeysInputIterator and KeysOutputIterator must have the same value_type"
    );
    static_assert(
        std::is_same<value_type, typename std::iterator_traits<ValuesOutputIterator>::value_type>::value,
        "ValuesInputIterator and ValuesOutputIterator must have the same value_type"
    );

    using config = default_or_custom_config<
        Config,
        default_radix_sort_config<ROCPRIM_TARGET_ARCH, key_type, value_type>
    >;

    // Largest input handled by the single-sort path.
    constexpr unsigned int single_path_capacity =
        config::sort_single::block_size * config::sort_single::items_per_thread;
    // Largest input handled by the merge-sort path.
    constexpr unsigned int merge_path_capacity =
        config::sort_merge::block_size * config::sort_merge::items_per_thread * config::merge_size_limit_blocks;

    if( size <= single_path_capacity )
    {
        return radix_sort_single_impl<Config, Descending>(
            temporary_storage, storage_size,
            keys_input, keys_output,
            values_input, values_output,
            static_cast<unsigned int>(size),
            is_result_in_output,
            begin_bit, end_bit,
            stream, debug_synchronous
        );
    }

    if( size <= merge_path_capacity )
    {
        return radix_sort_merge_impl<Config, Descending>(
            temporary_storage, storage_size,
            keys_input, keys_tmp, keys_output,
            values_input, values_tmp, values_output,
            static_cast<unsigned int>(size),
            is_result_in_output,
            begin_bit, end_bit,
            stream, debug_synchronous
        );
    }

    return radix_sort_iterations_impl<Config, Descending>(
        temporary_storage, storage_size,
        keys_input, keys_tmp, keys_output,
        values_input, values_tmp, values_output,
        size,
        is_result_in_output,
        begin_bit, end_bit,
        stream, debug_synchronous
    );
}
#undef ROCPRIM_DETAIL_HIP_SYNC_AND_RETURN_ON_ERROR
} // end namespace detail
/// \brief Parallel ascending radix sort primitive for device level.
///
/// \p radix_sort_keys function performs a device-wide radix sort
/// of keys. Function sorts input keys in ascending order.
///
/// \par Overview
/// * The contents of the inputs are not altered by the sorting function.
/// * Returns the required size of \p temporary_storage in \p storage_size
/// if \p temporary_storage is a null pointer.
/// * \p Key type (a \p value_type of \p KeysInputIterator and \p KeysOutputIterator) must be
/// an arithmetic type (that is, an integral type or a floating-point type).
/// * Ranges specified by \p keys_input and \p keys_output must have at least \p size elements.
/// * If \p Key is an integer type and the range of keys is known in advance, the performance
/// can be improved by setting \p begin_bit and \p end_bit, for example if all keys are in range
/// [100, 10000], <tt>begin_bit = 0</tt> and <tt>end_bit = 14</tt> will cover the whole range.
///
/// \tparam Config - [optional] configuration of the primitive. It can be \p radix_sort_config or
/// a custom class with the same members.
/// \tparam KeysInputIterator - random-access iterator type of the input range. Must meet the
/// requirements of a C++ InputIterator concept. It can be a simple pointer type.
/// \tparam KeysOutputIterator - random-access iterator type of the output range. Must meet the
/// requirements of a C++ OutputIterator concept. It can be a simple pointer type.
/// \tparam Size - integral type that represents the problem size.
///
/// \param [in] temporary_storage - pointer to a device-accessible temporary storage. When
/// a null pointer is passed, the required allocation size (in bytes) is written to
/// \p storage_size and function returns without performing the sort operation.
/// \param [in,out] storage_size - reference to a size (in bytes) of \p temporary_storage.
/// \param [in] keys_input - pointer to the first element in the range to sort.
/// \param [out] keys_output - pointer to the first element in the output range.
/// \param [in] size - number of elements in the input range.
/// \param [in] begin_bit - [optional] index of the first (least significant) bit used in
/// key comparison. Must be in range <tt>[0; 8 * sizeof(Key))</tt>. Default value: \p 0.
/// Non-default value not supported for floating-point key-types.
/// \param [in] end_bit - [optional] past-the-end index (most significant) bit used in
/// key comparison. Must be in range <tt>(begin_bit; 8 * sizeof(Key)]</tt>. Default
/// value: \p <tt>8 * sizeof(Key)</tt>. Non-default value not supported for floating-point key-types.
/// \param [in] stream - [optional] HIP stream object. Default is \p 0 (default stream).
/// \param [in] debug_synchronous - [optional] If true, synchronization after every kernel
/// launch is forced in order to check for errors. Default value is \p false.
///
/// \returns \p cudaSuccess (\p 0) after successful sort; otherwise a HIP runtime error of
/// type \p cudaError_t.
///
/// \par Example
/// \parblock
/// In this example a device-level ascending radix sort is performed on an array of
/// \p float values.
///
/// \code{.cpp}
/// #include <rocprim/rocprim.hpp>
///
/// // Prepare input and output (declare pointers, allocate device memory etc.)
/// size_t input_size; // e.g., 8
/// float * input; // e.g., [0.6, 0.3, 0.65, 0.4, 0.2, 0.08, 1, 0.7]
/// float * output; // empty array of 8 elements
///
/// size_t temporary_storage_size_bytes;
/// void * temporary_storage_ptr = nullptr;
/// // Get required size of the temporary storage
/// rocprim::radix_sort_keys(
/// temporary_storage_ptr, temporary_storage_size_bytes,
/// input, output, input_size
/// );
///
/// // allocate temporary storage
/// cudaMalloc(&temporary_storage_ptr, temporary_storage_size_bytes);
///
/// // perform sort
/// rocprim::radix_sort_keys(
/// temporary_storage_ptr, temporary_storage_size_bytes,
/// input, output, input_size
/// );
/// // output: [0.08, 0.2, 0.3, 0.4, 0.6, 0.65, 0.7, 1]
/// \endcode
/// \endparblock
template<
    class Config = default_config,
    class KeysInputIterator,
    class KeysOutputIterator,
    class Size,
    class Key = typename std::iterator_traits<KeysInputIterator>::value_type
>
inline
cudaError_t radix_sort_keys(void * temporary_storage,
                            size_t& storage_size,
                            KeysInputIterator keys_input,
                            KeysOutputIterator keys_output,
                            Size size,
                            unsigned int begin_bit = 0,
                            unsigned int end_bit = 8 * sizeof(Key),
                            cudaStream_t stream = 0,
                            bool debug_synchronous = false)
{
    static_assert(std::is_integral<Size>::value, "Size must be an integral type.");

    // Keys-only sort: a null empty_type pointer stands in for both value ranges.
    empty_type * no_values = nullptr;
    // Out-parameter required by the shared implementation; not inspected here.
    bool result_in_output_unused;

    // Ascending order; no caller-provided temporaries (both tmp pointers are null).
    constexpr bool descending = false;
    const cudaError_t status = detail::radix_sort_impl<Config, descending>(
        temporary_storage, storage_size,
        keys_input, nullptr, keys_output,
        no_values, nullptr, no_values,
        size, result_in_output_unused,
        begin_bit, end_bit,
        stream, debug_synchronous
    );
    return status;
}
/// \brief Parallel descending radix sort primitive for device level.
///
/// \p radix_sort_keys_desc function performs a device-wide radix sort
/// of keys. Function sorts input keys in descending order.
///
/// \par Overview
/// * The contents of the inputs are not altered by the sorting function.
/// * Returns the required size of \p temporary_storage in \p storage_size
/// if \p temporary_storage is a null pointer.
/// * \p Key type (a \p value_type of \p KeysInputIterator and \p KeysOutputIterator) must be
/// an arithmetic type (that is, an integral type or a floating-point type).
/// * Ranges specified by \p keys_input and \p keys_output must have at least \p size elements.
/// * If \p Key is an integer type and the range of keys is known in advance, the performance
/// can be improved by setting \p begin_bit and \p end_bit, for example if all keys are in range
/// [100, 10000], <tt>begin_bit = 0</tt> and <tt>end_bit = 14</tt> will cover the whole range.
///
/// \tparam Config - [optional] configuration of the primitive. It can be \p radix_sort_config or
/// a custom class with the same members.
/// \tparam KeysInputIterator - random-access iterator type of the input range. Must meet the
/// requirements of a C++ InputIterator concept. It can be a simple pointer type.
/// \tparam KeysOutputIterator - random-access iterator type of the output range. Must meet the
/// requirements of a C++ OutputIterator concept. It can be a simple pointer type.
/// \tparam Size - integral type that represents the problem size.
///
/// \param [in] temporary_storage - pointer to a device-accessible temporary storage. When
/// a null pointer is passed, the required allocation size (in bytes) is written to
/// \p storage_size and function returns without performing the sort operation.
/// \param [in,out] storage_size - reference to a size (in bytes) of \p temporary_storage.
/// \param [in] keys_input - pointer to the first element in the range to sort.
/// \param [out] keys_output - pointer to the first element in the output range.
/// \param [in] size - number of elements in the input range.
/// \param [in] begin_bit - [optional] index of the first (least significant) bit used in
/// key comparison. Must be in range <tt>[0; 8 * sizeof(Key))</tt>. Default value: \p 0.
/// Non-default value not supported for floating-point key-types.
/// \param [in] end_bit - [optional] past-the-end index (most significant) bit used in
/// key comparison. Must be in range <tt>(begin_bit; 8 * sizeof(Key)]</tt>. Default
/// value: \p <tt>8 * sizeof(Key)</tt>. Non-default value not supported for floating-point key-types.
/// \param [in] stream - [optional] HIP stream object. Default is \p 0 (default stream).
/// \param [in] debug_synchronous - [optional] If true, synchronization after every kernel
/// launch is forced in order to check for errors. Default value is \p false.
///
/// \returns \p cudaSuccess (\p 0) after successful sort; otherwise a HIP runtime error of
/// type \p cudaError_t.
///
/// \par Example
/// \parblock
/// In this example a device-level descending radix sort is performed on an array of
/// integer values.
///
/// \code{.cpp}
/// #include <rocprim/rocprim.hpp>
///
/// // Prepare input and output (declare pointers, allocate device memory etc.)
/// size_t input_size; // e.g., 8
/// int * input; // e.g., [6, 3, 5, 4, 2, 8, 1, 7]
/// int * output; // empty array of 8 elements
///
/// size_t temporary_storage_size_bytes;
/// void * temporary_storage_ptr = nullptr;
/// // Get required size of the temporary storage
/// rocprim::radix_sort_keys_desc(
/// temporary_storage_ptr, temporary_storage_size_bytes,
/// input, output, input_size
/// );
///
/// // allocate temporary storage
/// cudaMalloc(&temporary_storage_ptr, temporary_storage_size_bytes);
///
/// // perform sort
/// rocprim::radix_sort_keys_desc(
/// temporary_storage_ptr, temporary_storage_size_bytes,
/// input, output, input_size
/// );
/// // output: [8, 7, 6, 5, 4, 3, 2, 1]
/// \endcode
/// \endparblock
template<
    class Config = default_config,
    class KeysInputIterator,
    class KeysOutputIterator,
    class Size,
    class Key = typename std::iterator_traits<KeysInputIterator>::value_type
>
inline
cudaError_t radix_sort_keys_desc(void * temporary_storage,
                                 size_t& storage_size,
                                 KeysInputIterator keys_input,
                                 KeysOutputIterator keys_output,
                                 Size size,
                                 unsigned int begin_bit = 0,
                                 unsigned int end_bit = 8 * sizeof(Key),
                                 cudaStream_t stream = 0,
                                 bool debug_synchronous = false)
{
    static_assert(std::is_integral<Size>::value, "Size must be an integral type.");

    // Keys-only sort: a null empty_type pointer stands in for both value ranges.
    empty_type * no_values = nullptr;
    // Out-parameter required by the shared implementation; not inspected here.
    bool result_in_output_unused;

    // Descending order; no caller-provided temporaries (both tmp pointers are null).
    constexpr bool descending = true;
    const cudaError_t status = detail::radix_sort_impl<Config, descending>(
        temporary_storage, storage_size,
        keys_input, nullptr, keys_output,
        no_values, nullptr, no_values,
        size, result_in_output_unused,
        begin_bit, end_bit,
        stream, debug_synchronous
    );
    return status;
}
/// \brief Parallel ascending radix sort-by-key primitive for device level.
///
/// \p radix_sort_pairs function performs a device-wide radix sort
/// of (key, value) pairs. Function sorts input pairs in ascending order of keys.
///
/// \par Overview
/// * The contents of the inputs are not altered by the sorting function.
/// * Returns the required size of \p temporary_storage in \p storage_size
/// if \p temporary_storage is a null pointer.
/// * \p Key type (a \p value_type of \p KeysInputIterator and \p KeysOutputIterator) must be
/// an arithmetic type (that is, an integral type or a floating-point type).
/// * Ranges specified by \p keys_input, \p keys_output, \p values_input and \p values_output must
/// have at least \p size elements.
/// * If \p Key is an integer type and the range of keys is known in advance, the performance
/// can be improved by setting \p begin_bit and \p end_bit, for example if all keys are in range
/// [100, 10000], <tt>begin_bit = 0</tt> and <tt>end_bit = 14</tt> will cover the whole range.
///
/// \tparam Config - [optional] configuration of the primitive. It can be \p radix_sort_config or
/// a custom class with the same members.
/// \tparam KeysInputIterator - random-access iterator type of the input range. Must meet the
/// requirements of a C++ InputIterator concept. It can be a simple pointer type.
/// \tparam KeysOutputIterator - random-access iterator type of the output range. Must meet the
/// requirements of a C++ OutputIterator concept. It can be a simple pointer type.
/// \tparam ValuesInputIterator - random-access iterator type of the input range. Must meet the
/// requirements of a C++ InputIterator concept. It can be a simple pointer type.
/// \tparam ValuesOutputIterator - random-access iterator type of the output range. Must meet the
/// requirements of a C++ OutputIterator concept. It can be a simple pointer type.
/// \tparam Size - integral type that represents the problem size.
///
/// \param [in] temporary_storage - pointer to a device-accessible temporary storage. When
/// a null pointer is passed, the required allocation size (in bytes) is written to
/// \p storage_size and function returns without performing the sort operation.
/// \param [in,out] storage_size - reference to a size (in bytes) of \p temporary_storage.
/// \param [in] keys_input - pointer to the first element in the range to sort.
/// \param [out] keys_output - pointer to the first element in the output range.
/// \param [in] values_input - pointer to the first element in the range to sort.
/// \param [out] values_output - pointer to the first element in the output range.
/// \param [in] size - number of elements in the input range.
/// \param [in] begin_bit - [optional] index of the first (least significant) bit used in
/// key comparison. Must be in range <tt>[0; 8 * sizeof(Key))</tt>. Default value: \p 0.
/// Non-default value not supported for floating-point key-types.
/// \param [in] end_bit - [optional] past-the-end index (most significant) bit used in
/// key comparison. Must be in range <tt>(begin_bit; 8 * sizeof(Key)]</tt>. Default
/// value: \p <tt>8 * sizeof(Key)</tt>. Non-default value not supported for floating-point key-types.
/// \param [in] stream - [optional] HIP stream object. Default is \p 0 (default stream).
/// \param [in] debug_synchronous - [optional] If true, synchronization after every kernel
/// launch is forced in order to check for errors. Default value is \p false.
///
/// \returns \p cudaSuccess (\p 0) after successful sort; otherwise a HIP runtime error of
/// type \p cudaError_t.
///
/// \par Example
/// \parblock
/// In this example a device-level ascending radix sort is performed where input keys are
/// represented by an array of unsigned integers and input values by an array of <tt>double</tt>s.
///
/// \code{.cpp}
/// #include <rocprim/rocprim.hpp>
///
/// // Prepare input and output (declare pointers, allocate device memory etc.)
/// size_t input_size; // e.g., 8
/// unsigned int * keys_input; // e.g., [ 6, 3, 5, 4, 1, 8, 1, 7]
/// double * values_input; // e.g., [-5, 2, -4, 3, -1, -8, -2, 7]
/// unsigned int * keys_output; // empty array of 8 elements
/// double * values_output; // empty array of 8 elements
///
/// // Keys are in range [0; 8], so we can limit compared bit to bits on indexes
/// // 0, 1, 2, 3, and 4. In order to do this begin_bit is set to 0 and end_bit
/// // is set to 5.
///
/// size_t temporary_storage_size_bytes;
/// void * temporary_storage_ptr = nullptr;
/// // Get required size of the temporary storage
/// rocprim::radix_sort_pairs(
/// temporary_storage_ptr, temporary_storage_size_bytes,
/// keys_input, keys_output, values_input, values_output,
/// input_size, 0, 5
/// );
///
/// // allocate temporary storage
/// cudaMalloc(&temporary_storage_ptr, temporary_storage_size_bytes);
///
/// // perform sort
/// rocprim::radix_sort_pairs(
/// temporary_storage_ptr, temporary_storage_size_bytes,
/// keys_input, keys_output, values_input, values_output,
/// input_size, 0, 5
/// );
/// // keys_output: [ 1, 1, 3, 4, 5, 6, 7, 8]
/// // values_output: [-1, -2, 2, 3, -4, -5, 7, -8]
/// \endcode
/// \endparblock
template<
    class Config = default_config,
    class KeysInputIterator,
    class KeysOutputIterator,
    class ValuesInputIterator,
    class ValuesOutputIterator,
    class Size,
    class Key = typename std::iterator_traits<KeysInputIterator>::value_type
>
inline
cudaError_t radix_sort_pairs(void * temporary_storage,
                             size_t& storage_size,
                             KeysInputIterator keys_input,
                             KeysOutputIterator keys_output,
                             ValuesInputIterator values_input,
                             ValuesOutputIterator values_output,
                             Size size,
                             unsigned int begin_bit = 0,
                             unsigned int end_bit = 8 * sizeof(Key),
                             cudaStream_t stream = 0,
                             bool debug_synchronous = false)
{
    static_assert(std::is_integral<Size>::value, "Size must be an integral type.");

    // Out-parameter required by the shared implementation; not inspected here.
    bool result_in_output_unused;

    // Ascending order; no caller-provided temporaries (both tmp pointers are null).
    constexpr bool descending = false;
    const cudaError_t status = detail::radix_sort_impl<Config, descending>(
        temporary_storage, storage_size,
        keys_input, nullptr, keys_output,
        values_input, nullptr, values_output,
        size, result_in_output_unused,
        begin_bit, end_bit,
        stream, debug_synchronous
    );
    return status;
}
/// \brief Parallel descending radix sort-by-key primitive for device level.
///
/// \p radix_sort_pairs_desc function performs a device-wide radix sort
/// of (key, value) pairs. Function sorts input pairs in descending order of keys.
///
/// \par Overview
/// * The contents of the inputs are not altered by the sorting function.
/// * Returns the required size of \p temporary_storage in \p storage_size
/// if \p temporary_storage is a null pointer.
/// * \p Key type (a \p value_type of \p KeysInputIterator and \p KeysOutputIterator) must be
/// an arithmetic type (that is, an integral type or a floating-point type).
/// * Ranges specified by \p keys_input, \p keys_output, \p values_input and \p values_output must
/// have at least \p size elements.
/// * If \p Key is an integer type and the range of keys is known in advance, the performance
/// can be improved by setting \p begin_bit and \p end_bit, for example if all keys are in range
/// [100, 10000], <tt>begin_bit = 0</tt> and <tt>end_bit = 14</tt> will cover the whole range.
///
/// \tparam Config - [optional] configuration of the primitive. It can be \p radix_sort_config or
/// a custom class with the same members.
/// \tparam KeysInputIterator - random-access iterator type of the input range. Must meet the
/// requirements of a C++ InputIterator concept. It can be a simple pointer type.
/// \tparam KeysOutputIterator - random-access iterator type of the output range. Must meet the
/// requirements of a C++ OutputIterator concept. It can be a simple pointer type.
/// \tparam ValuesInputIterator - random-access iterator type of the input range. Must meet the
/// requirements of a C++ InputIterator concept. It can be a simple pointer type.
/// \tparam ValuesOutputIterator - random-access iterator type of the output range. Must meet the
/// requirements of a C++ OutputIterator concept. It can be a simple pointer type.
/// \tparam Size - integral type that represents the problem size.
///
/// \param [in] temporary_storage - pointer to a device-accessible temporary storage. When
/// a null pointer is passed, the required allocation size (in bytes) is written to
/// \p storage_size and function returns without performing the sort operation.
/// \param [in,out] storage_size - reference to a size (in bytes) of \p temporary_storage.
/// \param [in] keys_input - pointer to the first element in the range to sort.
/// \param [out] keys_output - pointer to the first element in the output range.
/// \param [in] values_input - pointer to the first element in the range to sort.
/// \param [out] values_output - pointer to the first element in the output range.
/// \param [in] size - number of elements in the input range.
/// \param [in] begin_bit - [optional] index of the first (least significant) bit used in
/// key comparison. Must be in range <tt>[0; 8 * sizeof(Key))</tt>. Default value: \p 0.
/// Non-default value not supported for floating-point key-types.
/// \param [in] end_bit - [optional] past-the-end index (most significant) bit used in
/// key comparison. Must be in range <tt>(begin_bit; 8 * sizeof(Key)]</tt>. Default
/// value: \p <tt>8 * sizeof(Key)</tt>. Non-default value not supported for floating-point key-types.
/// \param [in] stream - [optional] HIP stream object. Default is \p 0 (default stream).
/// \param [in] debug_synchronous - [optional] If true, synchronization after every kernel
/// launch is forced in order to check for errors. Default value is \p false.
///
/// \returns \p cudaSuccess (\p 0) after successful sort; otherwise a HIP runtime error of
/// type \p cudaError_t.
///
/// \par Example
/// \parblock
/// In this example a device-level descending radix sort is performed where input keys are
/// represented by an array of integers and input values by an array of <tt>double</tt>s.
///
/// \code{.cpp}
/// #include <rocprim/rocprim.hpp>
///
/// // Prepare input and output (declare pointers, allocate device memory etc.)
/// size_t input_size; // e.g., 8
/// int * keys_input; // e.g., [ 6, 3, 5, 4, 1, 8, 1, 7]
/// double * values_input; // e.g., [-5, 2, -4, 3, -1, -8, -2, 7]
/// int * keys_output; // empty array of 8 elements
/// double * values_output; // empty array of 8 elements
///
/// size_t temporary_storage_size_bytes;
/// void * temporary_storage_ptr = nullptr;
/// // Get required size of the temporary storage
/// rocprim::radix_sort_pairs_desc(
/// temporary_storage_ptr, temporary_storage_size_bytes,
/// keys_input, keys_output, values_input, values_output,
/// input_size
/// );
///
/// // allocate temporary storage
/// cudaMalloc(&temporary_storage_ptr, temporary_storage_size_bytes);
///
/// // perform sort
/// rocprim::radix_sort_pairs_desc(
/// temporary_storage_ptr, temporary_storage_size_bytes,
/// keys_input, keys_output, values_input, values_output,
/// input_size
/// );
/// // keys_output: [ 8, 7, 6, 5, 4, 3, 1, 1]
/// // values_output: [-8, 7, -5, -4, 3, 2, -1, -2]
/// \endcode
/// \endparblock
template<
    class Config = default_config,
    class KeysInputIterator,
    class KeysOutputIterator,
    class ValuesInputIterator,
    class ValuesOutputIterator,
    class Size,
    class Key = typename std::iterator_traits<KeysInputIterator>::value_type
>
inline
cudaError_t radix_sort_pairs_desc(void * temporary_storage,
                                  size_t& storage_size,
                                  KeysInputIterator keys_input,
                                  KeysOutputIterator keys_output,
                                  ValuesInputIterator values_input,
                                  ValuesOutputIterator values_output,
                                  Size size,
                                  unsigned int begin_bit = 0,
                                  unsigned int end_bit = 8 * sizeof(Key),
                                  cudaStream_t stream = 0,
                                  bool debug_synchronous = false)
{
    static_assert(std::is_integral<Size>::value, "Size must be an integral type.");

    // Out-parameter required by the shared implementation; not inspected here.
    bool result_in_output_unused;

    // Descending order; no caller-provided temporaries (both tmp pointers are null).
    constexpr bool descending = true;
    const cudaError_t status = detail::radix_sort_impl<Config, descending>(
        temporary_storage, storage_size,
        keys_input, nullptr, keys_output,
        values_input, nullptr, values_output,
        size, result_in_output_unused,
        begin_bit, end_bit,
        stream, debug_synchronous
    );
    return status;
}
/// \brief Parallel ascending radix sort primitive for device level.
///
/// \p radix_sort_keys function performs a device-wide radix sort
/// of keys. Function sorts input keys in ascending order.
///
/// \par Overview
/// * The contents of both buffers of \p keys may be altered by the sorting function.
/// * \p current() of \p keys is used as the input.
/// * The function will update \p current() of \p keys to point to the buffer
/// that contains the output range.
/// * Returns the required size of \p temporary_storage in \p storage_size
/// if \p temporary_storage is a null pointer.
/// * The function requires small \p temporary_storage as it does not need
/// a temporary buffer of \p size elements.
/// * \p Key type must be an arithmetic type (that is, an integral type or a floating-point
/// type).
/// * Buffers of \p keys must have at least \p size elements.
/// * If \p Key is an integer type and the range of keys is known in advance, the performance
/// can be improved by setting \p begin_bit and \p end_bit, for example if all keys are in range
/// [100, 10000], <tt>begin_bit = 0</tt> and <tt>end_bit = 14</tt> will cover the whole range.
///
/// \tparam Config - [optional] configuration of the primitive. It can be \p radix_sort_config or
/// a custom class with the same members.
/// \tparam Key - key type. Must be an integral type or a floating-point type.
/// \tparam Size - integral type that represents the problem size.
///
/// \param [in] temporary_storage - pointer to a device-accessible temporary storage. When
/// a null pointer is passed, the required allocation size (in bytes) is written to
/// \p storage_size and function returns without performing the sort operation.
/// \param [in,out] storage_size - reference to a size (in bytes) of \p temporary_storage.
/// \param [in,out] keys - reference to the double-buffer of keys, its \p current()
/// contains the input range and will be updated to point to the output range.
/// \param [in] size - number of element in the input range.
/// \param [in] begin_bit - [optional] index of the first (least significant) bit used in
/// key comparison. Must be in range <tt>[0; 8 * sizeof(Key))</tt>. Default value: \p 0.
/// Non-default value not supported for floating-point key-types.
/// \param [in] end_bit - [optional] past-the-end index (most significant) bit used in
/// key comparison. Must be in range <tt>(begin_bit; 8 * sizeof(Key)]</tt>. Default
/// value: \p <tt>8 * sizeof(Key)</tt>. Non-default value not supported for floating-point key-types.
/// \param [in] stream - [optional] HIP stream object. Default is \p 0 (default stream).
/// \param [in] debug_synchronous - [optional] If true, synchronization after every kernel
/// launch is forced in order to check for errors. Default value is \p false.
///
/// \returns \p cudaSuccess (\p 0) after successful sort; otherwise a HIP runtime error of
/// type \p cudaError_t.
///
/// \par Example
/// \parblock
/// In this example a device-level ascending radix sort is performed on an array of
/// \p float values.
///
/// \code{.cpp}
/// #include <rocprim/rocprim.hpp>
///
/// // Prepare input and tmp (declare pointers, allocate device memory etc.)
/// size_t input_size; // e.g., 8
/// float * input; // e.g., [0.6, 0.3, 0.65, 0.4, 0.2, 0.08, 1, 0.7]
/// float * tmp; // empty array of 8 elements
/// // Create double-buffer
/// rocprim::double_buffer<float> keys(input, tmp);
///
/// size_t temporary_storage_size_bytes;
/// void * temporary_storage_ptr = nullptr;
/// // Get required size of the temporary storage
/// rocprim::radix_sort_keys(
/// temporary_storage_ptr, temporary_storage_size_bytes,
/// keys, input_size
/// );
///
/// // allocate temporary storage
/// cudaMalloc(&temporary_storage_ptr, temporary_storage_size_bytes);
///
/// // perform sort
/// rocprim::radix_sort_keys(
/// temporary_storage_ptr, temporary_storage_size_bytes,
/// keys, input_size
/// );
/// // keys.current(): [0.08, 0.2, 0.3, 0.4, 0.6, 0.65, 0.7, 1]
/// \endcode
/// \endparblock
template<
    class Config = default_config,
    class Key,
    class Size
>
inline
cudaError_t radix_sort_keys(void * temporary_storage,
                            size_t& storage_size,
                            double_buffer<Key>& keys,
                            Size size,
                            unsigned int begin_bit = 0,
                            unsigned int end_bit = 8 * sizeof(Key),
                            cudaStream_t stream = 0,
                            bool debug_synchronous = false)
{
    static_assert(std::is_integral<Size>::value, "Size must be an integral type.");

    // Keys-only sort: a null empty_type pointer stands in for all three value ranges.
    empty_type * values = nullptr;

    // Initialised to false so the swap decision below never reads an indeterminate value
    // if radix_sort_impl returns without writing the flag (nothing visible here guarantees
    // it is written on every return path, e.g. on an early error return).
    bool is_result_in_output = false;

    // `false` selects the ascending variant. current() is passed as both the input and the
    // temporary range, alternate() as the output range.
    cudaError_t error = detail::radix_sort_impl<Config, false>(
        temporary_storage, storage_size,
        keys.current(), keys.current(), keys.alternate(),
        values, values, values,
        size, is_result_in_output,
        begin_bit, end_bit,
        stream, debug_synchronous
    );

    // Only a real sort call (non-null temporary storage) may flip the buffer; when the
    // result was reported to be in the output range (alternate()), make it current().
    if(temporary_storage != nullptr && is_result_in_output)
    {
        keys.swap();
    }
    return error;
}
/// \brief Parallel descending radix sort primitive for device level.
///
/// \p radix_sort_keys_desc function performs a device-wide radix sort
/// of keys. Function sorts input keys in descending order.
///
/// \par Overview
/// * The contents of both buffers of \p keys may be altered by the sorting function.
/// * \p current() of \p keys is used as the input.
/// * The function will update \p current() of \p keys to point to the buffer
/// that contains the output range.
/// * Returns the required size of \p temporary_storage in \p storage_size
/// if \p temporary_storage is a null pointer.
/// * The function requires small \p temporary_storage as it does not need
/// a temporary buffer of \p size elements.
/// * \p Key type must be an arithmetic type (that is, an integral type or a floating-point
/// type).
/// * Buffers of \p keys must have at least \p size elements.
/// * If \p Key is an integer type and the range of keys is known in advance, the performance
/// can be improved by setting \p begin_bit and \p end_bit, for example if all keys are in range
/// [100, 10000], <tt>begin_bit = 0</tt> and <tt>end_bit = 14</tt> will cover the whole range.
///
/// \tparam Config - [optional] configuration of the primitive. It can be \p radix_sort_config or
/// a custom class with the same members.
/// \tparam Key - key type. Must be an integral type or a floating-point type.
/// \tparam Size - integral type that represents the problem size.
///
/// \param [in] temporary_storage - pointer to a device-accessible temporary storage. When
/// a null pointer is passed, the required allocation size (in bytes) is written to
/// \p storage_size and function returns without performing the sort operation.
/// \param [in,out] storage_size - reference to a size (in bytes) of \p temporary_storage.
/// \param [in,out] keys - reference to the double-buffer of keys, its \p current()
/// contains the input range and will be updated to point to the output range.
/// \param [in] size - number of element in the input range.
/// \param [in] begin_bit - [optional] index of the first (least significant) bit used in
/// key comparison. Must be in range <tt>[0; 8 * sizeof(Key))</tt>. Default value: \p 0.
/// Non-default value not supported for floating-point key-types.
/// \param [in] end_bit - [optional] past-the-end index (most significant) bit used in
/// key comparison. Must be in range <tt>(begin_bit; 8 * sizeof(Key)]</tt>. Default
/// value: \p <tt>8 * sizeof(Key)</tt>. Non-default value not supported for floating-point key-types.
/// \param [in] stream - [optional] HIP stream object. Default is \p 0 (default stream).
/// \param [in] debug_synchronous - [optional] If true, synchronization after every kernel
/// launch is forced in order to check for errors. Default value is \p false.
///
/// \returns \p cudaSuccess (\p 0) after successful sort; otherwise a HIP runtime error of
/// type \p cudaError_t.
///
/// \par Example
/// \parblock
/// In this example a device-level descending radix sort is performed on an array of
/// integer values.
///
/// \code{.cpp}
/// #include <rocprim/rocprim.hpp>
///
/// // Prepare input and tmp (declare pointers, allocate device memory etc.)
/// size_t input_size; // e.g., 8
/// int * input; // e.g., [6, 3, 5, 4, 2, 8, 1, 7]
/// int * tmp; // empty array of 8 elements
/// // Create double-buffer
/// rocprim::double_buffer<int> keys(input, tmp);
///
/// size_t temporary_storage_size_bytes;
/// void * temporary_storage_ptr = nullptr;
/// // Get required size of the temporary storage
/// rocprim::radix_sort_keys_desc(
/// temporary_storage_ptr, temporary_storage_size_bytes,
/// keys, input_size
/// );
///
/// // allocate temporary storage
/// cudaMalloc(&temporary_storage_ptr, temporary_storage_size_bytes);
///
/// // perform sort
/// rocprim::radix_sort_keys_desc(
/// temporary_storage_ptr, temporary_storage_size_bytes,
/// keys, input_size
/// );
/// // keys.current(): [8, 7, 6, 5, 4, 3, 2, 1]
/// \endcode
/// \endparblock
template<
    class Config = default_config,
    class Key,
    class Size
>
inline
cudaError_t radix_sort_keys_desc(void * temporary_storage,
                                 size_t& storage_size,
                                 double_buffer<Key>& keys,
                                 Size size,
                                 unsigned int begin_bit = 0,
                                 unsigned int end_bit = 8 * sizeof(Key),
                                 cudaStream_t stream = 0,
                                 bool debug_synchronous = false)
{
    static_assert(std::is_integral<Size>::value, "Size must be an integral type.");

    // Keys-only sort: a null empty_type pointer stands in for all three value ranges.
    empty_type * values = nullptr;

    // Initialised to false so the swap decision below never reads an indeterminate value
    // if radix_sort_impl returns without writing the flag (nothing visible here guarantees
    // it is written on every return path, e.g. on an early error return).
    bool is_result_in_output = false;

    // `true` selects the descending variant. current() is passed as both the input and the
    // temporary range, alternate() as the output range.
    cudaError_t error = detail::radix_sort_impl<Config, true>(
        temporary_storage, storage_size,
        keys.current(), keys.current(), keys.alternate(),
        values, values, values,
        size, is_result_in_output,
        begin_bit, end_bit,
        stream, debug_synchronous
    );

    // Only a real sort call (non-null temporary storage) may flip the buffer; when the
    // result was reported to be in the output range (alternate()), make it current().
    if(temporary_storage != nullptr && is_result_in_output)
    {
        keys.swap();
    }
    return error;
}
/// \brief Parallel ascending radix sort-by-key primitive for device level.
///
/// \p radix_sort_pairs function performs a device-wide radix sort
/// of (key, value) pairs. Function sorts input pairs in ascending order of keys.
///
/// \par Overview
/// * The contents of both buffers of \p keys and \p values may be altered by the sorting function.
/// * \p current() of \p keys and \p values are used as the input.
/// * The function will update \p current() of \p keys and \p values to point to buffers
/// that contains the output range.
/// * Returns the required size of \p temporary_storage in \p storage_size
/// if \p temporary_storage is a null pointer.
/// * The function requires small \p temporary_storage as it does not need
/// a temporary buffer of \p size elements.
/// * \p Key type must be an arithmetic type (that is, an integral type or a floating-point
/// type).
/// * Buffers of \p keys must have at least \p size elements.
/// * If \p Key is an integer type and the range of keys is known in advance, the performance
/// can be improved by setting \p begin_bit and \p end_bit, for example if all keys are in range
/// [100, 10000], <tt>begin_bit = 0</tt> and <tt>end_bit = 14</tt> will cover the whole range.
///
/// \tparam Config - [optional] configuration of the primitive. It can be \p radix_sort_config or
/// a custom class with the same members.
/// \tparam Key - key type. Must be an integral type or a floating-point type.
/// \tparam Value - value type.
/// \tparam Size - integral type that represents the problem size.
///
/// \param [in] temporary_storage - pointer to a device-accessible temporary storage. When
/// a null pointer is passed, the required allocation size (in bytes) is written to
/// \p storage_size and function returns without performing the sort operation.
/// \param [in,out] storage_size - reference to a size (in bytes) of \p temporary_storage.
/// \param [in,out] keys - reference to the double-buffer of keys, its \p current()
/// contains the input range and will be updated to point to the output range.
/// \param [in,out] values - reference to the double-buffer of values, its \p current()
/// contains the input range and will be updated to point to the output range.
/// \param [in] size - number of element in the input range.
/// \param [in] begin_bit - [optional] index of the first (least significant) bit used in
/// key comparison. Must be in range <tt>[0; 8 * sizeof(Key))</tt>. Default value: \p 0.
/// Non-default value not supported for floating-point key-types.
/// \param [in] end_bit - [optional] past-the-end index (most significant) bit used in
/// key comparison. Must be in range <tt>(begin_bit; 8 * sizeof(Key)]</tt>. Default
/// value: \p <tt>8 * sizeof(Key)</tt>. Non-default value not supported for floating-point key-types.
/// \param [in] stream - [optional] HIP stream object. Default is \p 0 (default stream).
/// \param [in] debug_synchronous - [optional] If true, synchronization after every kernel
/// launch is forced in order to check for errors. Default value is \p false.
///
/// \returns \p cudaSuccess (\p 0) after successful sort; otherwise a HIP runtime error of
/// type \p cudaError_t.
///
/// \par Example
/// \parblock
/// In this example a device-level ascending radix sort is performed where input keys are
/// represented by an array of unsigned integers and input values by an array of <tt>double</tt>s.
///
/// \code{.cpp}
/// #include <rocprim/rocprim.hpp>
///
/// // Prepare input and tmp (declare pointers, allocate device memory etc.)
/// size_t input_size; // e.g., 8
/// unsigned int * keys_input; // e.g., [ 6, 3, 5, 4, 1, 8, 1, 7]
/// double * values_input; // e.g., [-5, 2, -4, 3, -1, -8, -2, 7]
/// unsigned int * keys_tmp; // empty array of 8 elements
/// double* values_tmp; // empty array of 8 elements
/// // Create double-buffers
/// rocprim::double_buffer<unsigned int> keys(keys_input, keys_tmp);
/// rocprim::double_buffer<double> values(values_input, values_tmp);
///
/// // Keys are in range [0; 8], so we can limit compared bit to bits on indexes
/// // 0, 1, 2, 3, and 4. In order to do this begin_bit is set to 0 and end_bit
/// // is set to 5.
///
/// size_t temporary_storage_size_bytes;
/// void * temporary_storage_ptr = nullptr;
/// // Get required size of the temporary storage
/// rocprim::radix_sort_pairs(
/// temporary_storage_ptr, temporary_storage_size_bytes,
/// keys, values, input_size,
/// 0, 5
/// );
///
/// // allocate temporary storage
/// cudaMalloc(&temporary_storage_ptr, temporary_storage_size_bytes);
///
/// // perform sort
/// rocprim::radix_sort_pairs(
/// temporary_storage_ptr, temporary_storage_size_bytes,
/// keys, values, input_size,
/// 0, 5
/// );
/// // keys.current(): [ 1, 1, 3, 4, 5, 6, 7, 8]
/// // values.current(): [-1, -2, 2, 3, -4, -5, 7, -8]
/// \endcode
/// \endparblock
template<
    class Config = default_config,
    class Key,
    class Value,
    class Size
>
inline
cudaError_t radix_sort_pairs(void * temporary_storage,
                             size_t& storage_size,
                             double_buffer<Key>& keys,
                             double_buffer<Value>& values,
                             Size size,
                             unsigned int begin_bit = 0,
                             unsigned int end_bit = 8 * sizeof(Key),
                             cudaStream_t stream = 0,
                             bool debug_synchronous = false)
{
    static_assert(std::is_integral<Size>::value, "Size must be an integral type.");

    // Initialised to false so the swap decision below never reads an indeterminate value
    // if radix_sort_impl returns without writing the flag (nothing visible here guarantees
    // it is written on every return path, e.g. on an early error return).
    bool is_result_in_output = false;

    // `false` selects the ascending variant. For both keys and values, current() is passed
    // as the input and the temporary range, alternate() as the output range.
    cudaError_t error = detail::radix_sort_impl<Config, false>(
        temporary_storage, storage_size,
        keys.current(), keys.current(), keys.alternate(),
        values.current(), values.current(), values.alternate(),
        size, is_result_in_output,
        begin_bit, end_bit,
        stream, debug_synchronous
    );

    // Only a real sort call (non-null temporary storage) may flip the buffers; keys and
    // values are swapped together so their current() buffers stay paired.
    if(temporary_storage != nullptr && is_result_in_output)
    {
        keys.swap();
        values.swap();
    }
    return error;
}
/// \brief Parallel descending radix sort-by-key primitive for device level.
///
/// \p radix_sort_pairs_desc function performs a device-wide radix sort
/// of (key, value) pairs. Function sorts input pairs in descending order of keys.
///
/// \par Overview
/// * The contents of both buffers of \p keys and \p values may be altered by the sorting function.
/// * \p current() of \p keys and \p values are used as the input.
/// * The function will update \p current() of \p keys and \p values to point to buffers
/// that contains the output range.
/// * Returns the required size of \p temporary_storage in \p storage_size
/// if \p temporary_storage is a null pointer.
/// * The function requires small \p temporary_storage as it does not need
/// a temporary buffer of \p size elements.
/// * \p Key type must be an arithmetic type (that is, an integral type or a floating-point
/// type).
/// * Buffers of \p keys must have at least \p size elements.
/// * If \p Key is an integer type and the range of keys is known in advance, the performance
/// can be improved by setting \p begin_bit and \p end_bit, for example if all keys are in range
/// [100, 10000], <tt>begin_bit = 0</tt> and <tt>end_bit = 14</tt> will cover the whole range.
///
/// \tparam Config - [optional] configuration of the primitive. It can be \p radix_sort_config or
/// a custom class with the same members.
/// \tparam Key - key type. Must be an integral type or a floating-point type.
/// \tparam Value - value type.
/// \tparam Size - integral type that represents the problem size.
///
/// \param [in] temporary_storage - pointer to a device-accessible temporary storage. When
/// a null pointer is passed, the required allocation size (in bytes) is written to
/// \p storage_size and function returns without performing the sort operation.
/// \param [in,out] storage_size - reference to a size (in bytes) of \p temporary_storage.
/// \param [in,out] keys - reference to the double-buffer of keys, its \p current()
/// contains the input range and will be updated to point to the output range.
/// \param [in,out] values - reference to the double-buffer of values, its \p current()
/// contains the input range and will be updated to point to the output range.
/// \param [in] size - number of element in the input range.
/// \param [in] begin_bit - [optional] index of the first (least significant) bit used in
/// key comparison. Must be in range <tt>[0; 8 * sizeof(Key))</tt>. Default value: \p 0.
/// Non-default value not supported for floating-point key-types.
/// \param [in] end_bit - [optional] past-the-end index (most significant) bit used in
/// key comparison. Must be in range <tt>(begin_bit; 8 * sizeof(Key)]</tt>. Default
/// value: \p <tt>8 * sizeof(Key)</tt>. Non-default value not supported for floating-point key-types.
/// \param [in] stream - [optional] HIP stream object. Default is \p 0 (default stream).
/// \param [in] debug_synchronous - [optional] If true, synchronization after every kernel
/// launch is forced in order to check for errors. Default value is \p false.
///
/// \returns \p cudaSuccess (\p 0) after successful sort; otherwise a HIP runtime error of
/// type \p cudaError_t.
///
/// \par Example
/// \parblock
/// In this example a device-level descending radix sort is performed where input keys are
/// represented by an array of integers and input values by an array of <tt>double</tt>s.
///
/// \code{.cpp}
/// #include <rocprim/rocprim.hpp>
///
/// // Prepare input and tmp (declare pointers, allocate device memory etc.)
/// size_t input_size; // e.g., 8
/// int * keys_input; // e.g., [ 6, 3, 5, 4, 1, 8, 1, 7]
/// double * values_input; // e.g., [-5, 2, -4, 3, -1, -8, -2, 7]
/// int * keys_tmp; // empty array of 8 elements
/// double * values_tmp; // empty array of 8 elements
/// // Create double-buffers
/// rocprim::double_buffer<int> keys(keys_input, keys_tmp);
/// rocprim::double_buffer<double> values(values_input, values_tmp);
///
/// size_t temporary_storage_size_bytes;
/// void * temporary_storage_ptr = nullptr;
/// // Get required size of the temporary storage
/// rocprim::radix_sort_pairs_desc(
/// temporary_storage_ptr, temporary_storage_size_bytes,
/// keys, values, input_size
/// );
///
/// // allocate temporary storage
/// cudaMalloc(&temporary_storage_ptr, temporary_storage_size_bytes);
///
/// // perform sort
/// rocprim::radix_sort_pairs_desc(
/// temporary_storage_ptr, temporary_storage_size_bytes,
/// keys, values, input_size
/// );
/// // keys.current(): [ 8, 7, 6, 5, 4, 3, 1, 1]
/// // values.current(): [-8, 7, -5, -4, 3, 2, -1, -2]
/// \endcode
/// \endparblock
template<
    class Config = default_config,
    class Key,
    class Value,
    class Size
>
inline
cudaError_t radix_sort_pairs_desc(void * temporary_storage,
                                  size_t& storage_size,
                                  double_buffer<Key>& keys,
                                  double_buffer<Value>& values,
                                  Size size,
                                  unsigned int begin_bit = 0,
                                  unsigned int end_bit = 8 * sizeof(Key),
                                  cudaStream_t stream = 0,
                                  bool debug_synchronous = false)
{
    static_assert(std::is_integral<Size>::value, "Size must be an integral type.");

    // Initialised to false so the swap decision below never reads an indeterminate value
    // if radix_sort_impl returns without writing the flag (nothing visible here guarantees
    // it is written on every return path, e.g. on an early error return).
    bool is_result_in_output = false;

    // `true` selects the descending variant. For both keys and values, current() is passed
    // as the input and the temporary range, alternate() as the output range.
    cudaError_t error = detail::radix_sort_impl<Config, true>(
        temporary_storage, storage_size,
        keys.current(), keys.current(), keys.alternate(),
        values.current(), values.current(), values.alternate(),
        size, is_result_in_output,
        begin_bit, end_bit,
        stream, debug_synchronous
    );

    // Only a real sort call (non-null temporary storage) may flip the buffers; keys and
    // values are swapped together so their current() buffers stay paired.
    if(temporary_storage != nullptr && is_result_in_output)
    {
        keys.swap();
        values.swap();
    }
    return error;
}
END_ROCPRIM_NAMESPACE
/// @}
// end of group devicemodule
#endif // ROCPRIM_DEVICE_DEVICE_RADIX_SORT_HPP_
// Copyright (c) 2018-2021 Advanced Micro Devices, Inc. All rights reserved.
//
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to deal
// in the Software without restriction, including without limitation the rights
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
// copies of the Software, and to permit persons to whom the Software is
// furnished to do so, subject to the following conditions:
//
// The above copyright notice and this permission notice shall be included in
// all copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
// THE SOFTWARE.
#ifndef ROCPRIM_DEVICE_DEVICE_RADIX_SORT_CONFIG_HPP_
#define ROCPRIM_DEVICE_DEVICE_RADIX_SORT_CONFIG_HPP_
#include <type_traits>
#include "../config.hpp"
#include "../detail/various.hpp"
#include "config_types.hpp"
/// \addtogroup primitivesmodule_deviceconfigs
/// @{
BEGIN_ROCPRIM_NAMESPACE
/// \brief Configuration of device-level radix sort operation.
///
/// Radix sort is executed in a single tile (at size < BlocksPerItem) or in a
/// few iterations (passes) depending on the total number of bits to be sorted
/// (\p begin_bit and \p end_bit). Each iteration sorts either \p LongRadixBits or
/// \p ShortRadixBits bits, chosen to cover the whole bit range in an optimal way.
///
/// For example, if \p LongRadixBits is 7, \p ShortRadixBits is 6, \p begin_bit is 0 and \p end_bit is 32
/// there will be 5 iterations: 7 + 7 + 6 + 6 + 6 = 32 bits.
///
/// \tparam LongRadixBits - number of bits in long iterations.
/// \tparam ShortRadixBits - number of bits in short iterations, must be equal to or less than \p LongRadixBits.
/// \tparam ScanConfig - configuration of digits scan kernel. Must be \p kernel_config.
/// \tparam SortConfig - configuration of radix sort kernel. Must be \p kernel_config.
/// \tparam SortSingleConfig - configuration of radix sort single kernel.
/// Default: <tt>kernel_config<256, 10></tt>.
/// \tparam SortMergeConfig - configuration of radix sort merge kernel.
/// Default: <tt>kernel_config<1024, 1></tt>.
/// \tparam MergeSizeLimitBlocks - limit number of blocks to use merge kernel. Default: \p 1024.
/// \tparam ForceSingleKernelConfig - force use of the radix sort single kernel configuration.
/// Default: \p false.
template<
    unsigned int LongRadixBits,
    unsigned int ShortRadixBits,
    class ScanConfig,
    class SortConfig,
    class SortSingleConfig = kernel_config<256, 10>,
    class SortMergeConfig = kernel_config<1024, 1>,
    unsigned int MergeSizeLimitBlocks = 1024U,
    bool ForceSingleKernelConfig = false
>
struct radix_sort_config
{
    // --- Numeric parameters ---

    /// \brief Number of bits in long iterations.
    static constexpr unsigned int long_radix_bits = LongRadixBits;
    /// \brief Number of bits in short iterations.
    static constexpr unsigned int short_radix_bits = ShortRadixBits;
    /// \brief Limit number of blocks to use merge kernel.
    static constexpr unsigned int merge_size_limit_blocks = MergeSizeLimitBlocks;
    /// \brief Force use radix sort single kernel configuration.
    static constexpr bool force_single_kernel_config = ForceSingleKernelConfig;

    // --- Per-kernel configurations ---

    /// \brief Configuration of digits scan kernel.
    using scan = ScanConfig;
    /// \brief Configuration of radix sort kernel.
    using sort = SortConfig;
    /// \brief Configuration of radix sort single kernel.
    using sort_single = SortSingleConfig;
    /// \brief Configuration of radix sort merge kernel.
    using sort_merge = SortMergeConfig;
};
namespace detail
{
// Radix sort parameters for sorting (Key, Value) pairs. The "803" suffix presumably
// names the gfx803 target -- TODO confirm against the code that selects this struct.
// The nested `type` lists one radix_sort_config per key size (1, 2, 4, 8 bytes) for values
// of at most 8 bytes, followed by a general configuration scaled by item_scale
// (presumably what select_type yields when no case matches -- verify in select_type).
template<class Key, class Value>
struct radix_sort_config_803
{
    // ceiling_div of the larger of sizeof(Key) and sizeof(Value) by sizeof(int);
    // divides the items-per-thread counts of the general configuration.
    static constexpr unsigned int item_scale =
        ::rocprim::detail::ceiling_div<unsigned int>(::rocprim::max(sizeof(Key), sizeof(Value)), sizeof(int));

    // Digits scan kernel configuration shared by every alternative below.
    using scan = kernel_config<256, 2>;

private:
    // True when the value type occupies at most 8 bytes.
    static constexpr bool small_value = sizeof(Value) <= 8;

    // Sort kernel configuration of the general case.
    using general_sort = kernel_config<
        limit_block_size<256U, sizeof(Value), ROCPRIM_WARP_SIZE_64>::value,
        ::rocprim::max(1u, 15u / item_scale)
    >;
    // Single-kernel and merge-kernel configuration of the general case (both identical).
    using general_single_and_merge = kernel_config<
        limit_block_size<256U, sizeof(Value), ROCPRIM_WARP_SIZE_64>::value,
        ::rocprim::max(1u, 10u / item_scale)
    >;

public:
    using type = select_type<
        select_type_case<
            (sizeof(Key) == 1 && small_value),
            radix_sort_config<8, 7, scan, kernel_config<256, 10>, kernel_config<256, 19> >
        >,
        select_type_case<
            (sizeof(Key) == 2 && small_value),
            radix_sort_config<8, 7, scan, kernel_config<256, 10>, kernel_config<256, 17> >
        >,
        select_type_case<
            (sizeof(Key) == 4 && small_value),
            radix_sort_config<7, 6, scan, kernel_config<256, 15>, kernel_config<256, 13> >
        >,
        select_type_case<
            (sizeof(Key) == 8 && small_value),
            radix_sort_config<7, 6, scan, kernel_config<256, 13>, kernel_config<256, 10> >
        >,
        radix_sort_config<
            6, 4, scan,
            general_sort,
            general_single_and_merge,
            general_single_and_merge
        >
    >;
};
// Keys-only variant of radix_sort_config_803 (Value is empty_type): inherits from the
// select_type result, with one radix_sort_config per key size of 1, 2, 4 or 8 bytes.
// No trailing default is listed, so other key sizes depend on how select_type behaves
// when nothing matches -- verify there.
template<class Key>
struct radix_sort_config_803<Key, empty_type>
    : select_type<
        select_type_case<
            sizeof(Key) == 1,
            radix_sort_config<8, 7, kernel_config<256, 2>, kernel_config<256, 10>, kernel_config<256, 19> >
        >,
        select_type_case<
            sizeof(Key) == 2,
            radix_sort_config<8, 7, kernel_config<256, 2>, kernel_config<256, 10>, kernel_config<256, 16> >
        >,
        select_type_case<
            sizeof(Key) == 4,
            radix_sort_config<7, 6, kernel_config<256, 2>, kernel_config<256, 9>, kernel_config<256, 15> >
        >,
        select_type_case<
            sizeof(Key) == 8,
            radix_sort_config<7, 6, kernel_config<256, 2>, kernel_config<256, 7>, kernel_config<256, 12> >
        >
    >
{
};
// Radix sort parameters for sorting (Key, Value) pairs. The "900" suffix presumably
// names the gfx900 target -- TODO confirm against the code that selects this struct.
// The nested `type` lists one radix_sort_config per key size (1, 2, 4, 8 bytes) for values
// of at most 8 bytes, followed by a general configuration scaled by item_scale
// (presumably what select_type yields when no case matches -- verify in select_type).
template<class Key, class Value>
struct radix_sort_config_900
{
    // ceiling_div of the larger of sizeof(Key) and sizeof(Value) by sizeof(int);
    // divides the items-per-thread counts of the general configuration.
    static constexpr unsigned int item_scale =
        ::rocprim::detail::ceiling_div<unsigned int>(::rocprim::max(sizeof(Key), sizeof(Value)), sizeof(int));

    // Digits scan kernel configuration shared by every alternative below.
    using scan = kernel_config<256, 2>;

private:
    // True when the value type occupies at most 8 bytes.
    static constexpr bool small_value = sizeof(Value) <= 8;

    // Sort kernel configuration of the general case.
    using general_sort = kernel_config<
        limit_block_size<256U, sizeof(Value), ROCPRIM_WARP_SIZE_64>::value,
        ::rocprim::max(1u, 15u / item_scale)
    >;
    // Single-kernel and merge-kernel configuration of the general case (both identical).
    using general_single_and_merge = kernel_config<
        limit_block_size<256U, sizeof(Value), ROCPRIM_WARP_SIZE_64>::value,
        ::rocprim::max(1u, 10u / item_scale)
    >;

public:
    using type = select_type<
        select_type_case<
            (sizeof(Key) == 1 && small_value),
            radix_sort_config<4, 4, scan, kernel_config<256, 10>, kernel_config<256, 19> >
        >,
        select_type_case<
            (sizeof(Key) == 2 && small_value),
            radix_sort_config<6, 5, scan, kernel_config<256, 10>, kernel_config<256, 17> >
        >,
        select_type_case<
            (sizeof(Key) == 4 && small_value),
            radix_sort_config<7, 6, scan, kernel_config<256, 15>, kernel_config<256, 15> >
        >,
        select_type_case<
            (sizeof(Key) == 8 && small_value),
            radix_sort_config<7, 6, scan, kernel_config<256, 15>, kernel_config<256, 12> >
        >,
        radix_sort_config<
            6, 4, scan,
            general_sort,
            general_single_and_merge,
            general_single_and_merge
        >
    >;
};
// Keys-only variant of radix_sort_config_900 (Value is empty_type): inherits from the
// select_type result, with one radix_sort_config per key size of 1, 2, 4 or 8 bytes.
// No trailing default is listed, so other key sizes depend on how select_type behaves
// when nothing matches -- verify there.
template<class Key>
struct radix_sort_config_900<Key, empty_type>
    : select_type<
        select_type_case<
            sizeof(Key) == 1,
            radix_sort_config<4, 3, kernel_config<256, 2>, kernel_config<256, 10>, kernel_config<256, 19> >
        >,
        select_type_case<
            sizeof(Key) == 2,
            radix_sort_config<6, 5, kernel_config<256, 2>, kernel_config<256, 10>, kernel_config<256, 16> >
        >,
        select_type_case<
            sizeof(Key) == 4,
            radix_sort_config<7, 6, kernel_config<256, 2>, kernel_config<256, 17>, kernel_config<256, 15> >
        >,
        select_type_case<
            sizeof(Key) == 8,
            radix_sort_config<7, 6, kernel_config<256, 2>, kernel_config<256, 15>, kernel_config<256, 12> >
        >
    >
{
};
// Radix sort parameters for sorting (Key, Value) pairs. The "908" suffix presumably
// names the gfx908 target -- TODO confirm against the code that selects this struct.
// The nested `type` lists one radix_sort_config per key size (1, 2, 4, 8 bytes) for values
// of at most 8 bytes, followed by a general configuration scaled by item_scale
// (presumably what select_type yields when no case matches -- verify in select_type).
template<class Key, class Value>
struct radix_sort_config_908
{
    // ceiling_div of the larger of sizeof(Key) and sizeof(Value) by sizeof(int);
    // divides the items-per-thread counts of the general configuration.
    static constexpr unsigned int item_scale =
        ::rocprim::detail::ceiling_div<unsigned int>(::rocprim::max(sizeof(Key), sizeof(Value)), sizeof(int));

    // Digits scan kernel configuration used by the 1- and 2-byte key cases and by the
    // general case; the 4- and 8-byte key cases spell out their own scan configuration.
    using scan = kernel_config<256, 2>;

private:
    // True when the value type occupies at most 8 bytes.
    static constexpr bool small_value = sizeof(Value) <= 8;

    // Sort kernel configuration of the general case.
    using general_sort = kernel_config<
        limit_block_size<256U, sizeof(Value), ROCPRIM_WARP_SIZE_64>::value,
        ::rocprim::max(1u, 15u / item_scale)
    >;
    // Single-kernel and merge-kernel configuration of the general case (both identical).
    using general_single_and_merge = kernel_config<
        limit_block_size<256U, sizeof(Value), ROCPRIM_WARP_SIZE_64>::value,
        ::rocprim::max(1u, 10u / item_scale)
    >;

public:
    using type = select_type<
        select_type_case<
            (sizeof(Key) == 1 && small_value),
            radix_sort_config<4, 4, scan, kernel_config<256, 10>, kernel_config<256, 19> >
        >,
        select_type_case<
            (sizeof(Key) == 2 && small_value),
            radix_sort_config<6, 5, scan, kernel_config<256, 10>, kernel_config<256, 17> >
        >,
        select_type_case<
            (sizeof(Key) == 4 && small_value),
            radix_sort_config<7, 6, kernel_config<256, 4>, kernel_config<256, 15>, kernel_config<256, 15> >
        >,
        select_type_case<
            (sizeof(Key) == 8 && small_value),
            radix_sort_config<7, 6, kernel_config<256, 4>, kernel_config<256, 14>, kernel_config<256, 12> >
        >,
        radix_sort_config<
            6, 4, scan,
            general_sort,
            general_single_and_merge,
            general_single_and_merge
        >
    >;
};
// Keys-only variant of radix_sort_config_908 (Value is empty_type): inherits from the
// select_type result, with one radix_sort_config per key size of 1, 2, 4 or 8 bytes.
// No trailing default is listed, so other key sizes depend on how select_type behaves
// when nothing matches -- verify there.
template<class Key>
struct radix_sort_config_908<Key, empty_type>
    : select_type<
        select_type_case<
            sizeof(Key) == 1,
            radix_sort_config<4, 3, kernel_config<256, 2>, kernel_config<256, 10>, kernel_config<256, 19> >
        >,
        select_type_case<
            sizeof(Key) == 2,
            radix_sort_config<6, 5, kernel_config<256, 2>, kernel_config<256, 10>, kernel_config<256, 17> >
        >,
        select_type_case<
            sizeof(Key) == 4,
            radix_sort_config<7, 6, kernel_config<256, 4>, kernel_config<256, 17>, kernel_config<256, 15> >
        >,
        select_type_case<
            sizeof(Key) == 8,
            radix_sort_config<7, 6, kernel_config<256, 4>, kernel_config<256, 15>, kernel_config<256, 12> >
        >
    >
{
};
// TODO: We need to update these parameters
// Radix sort parameters for sorting (Key, Value) pairs. The "90a" suffix presumably
// names the gfx90a target -- TODO confirm against the code that selects this struct.
// The nested `type` lists one radix_sort_config per key size (1, 2, 4, 8 bytes) for values
// of at most 8 bytes, followed by a general configuration scaled by item_scale
// (presumably what select_type yields when no case matches -- verify in select_type).
template<class Key, class Value>
struct radix_sort_config_90a
{
    // ceiling_div of the larger of sizeof(Key) and sizeof(Value) by sizeof(int);
    // divides the items-per-thread counts of the general configuration.
    static constexpr unsigned int item_scale =
        ::rocprim::detail::ceiling_div<unsigned int>(::rocprim::max(sizeof(Key), sizeof(Value)), sizeof(int));

    // Digits scan kernel configuration shared by every alternative below.
    using scan = kernel_config<256, 1>;

private:
    // True when the value type occupies at most 8 bytes.
    static constexpr bool small_value = sizeof(Value) <= 8;

    // Sort kernel configuration of the general case.
    using general_sort = kernel_config<
        limit_block_size<256U, sizeof(Value), ROCPRIM_WARP_SIZE_64>::value,
        ::rocprim::max(1u, 15u / item_scale)
    >;
    // Single-kernel configuration of the general case.
    using general_single = kernel_config<
        limit_block_size<256U, sizeof(Value), ROCPRIM_WARP_SIZE_64>::value,
        ::rocprim::max(1u, 10u / item_scale)
    >;

public:
    using type = select_type<
        select_type_case<
            (sizeof(Key) == 1 && small_value),
            radix_sort_config<4, 4, scan, kernel_config<256, 5>, kernel_config<256, 19> >
        >,
        select_type_case<
            (sizeof(Key) == 2 && small_value),
            radix_sort_config<6, 5, scan, kernel_config<256, 5>, kernel_config<256, 17> >
        >,
        select_type_case<
            (sizeof(Key) == 4 && small_value),
            radix_sort_config<7, 6, scan, kernel_config<256, 7>, kernel_config<256, 15> >
        >,
        select_type_case<
            (sizeof(Key) == 8 && small_value),
            radix_sort_config<7, 6, scan, kernel_config<256, 7>, kernel_config<256, 14> >
        >,
        // Only the sort and single-kernel configurations are given here, so the merge
        // kernel configuration comes from radix_sort_config's default template argument.
        radix_sort_config<
            6, 4, scan,
            general_sort,
            general_single
        >
    >;
};
// Keys-only variant of radix_sort_config_90a (Value is empty_type): inherits from the
// select_type result, with one radix_sort_config per key size of 1, 2, 4 or 8 bytes.
// No trailing default is listed, so other key sizes depend on how select_type behaves
// when nothing matches -- verify there.
template<class Key>
struct radix_sort_config_90a<Key, empty_type>
    : select_type<
        select_type_case<
            sizeof(Key) == 1,
            radix_sort_config<4, 3, kernel_config<256, 1>, kernel_config<256, 5>, kernel_config<256, 19> >
        >,
        select_type_case<
            sizeof(Key) == 2,
            radix_sort_config<6, 5, kernel_config<256, 1>, kernel_config<256, 5>, kernel_config<256, 17> >
        >,
        select_type_case<
            sizeof(Key) == 4,
            radix_sort_config<7, 6, kernel_config<256, 1>, kernel_config<256, 8>, kernel_config<256, 15> >
        >,
        select_type_case<
            sizeof(Key) == 8,
            radix_sort_config<7, 6, kernel_config<256, 1>, kernel_config<256, 7>, kernel_config<256, 14> >
        >
    >
{
};
// TODO: We need to update these parameters
/// \brief Radix sort tuning table referenced by \p default_radix_sort_config for
/// architecture 1030; this primary template covers every \p Value other than \p empty_type.
///
/// The nested \p type picks a \p radix_sort_config from the sizes of \p Key and \p Value.
template<class Key, class Value>
struct radix_sort_config_1030
{
// Size of the larger of Key and Value expressed in ints, rounded up.
static constexpr unsigned int item_scale =
::rocprim::detail::ceiling_div<unsigned int>(::rocprim::max(sizeof(Key), sizeof(Value)), sizeof(int));
// Kernel configuration passed as the third radix_sort_config argument of every entry below.
using scan = kernel_config<256, 2>;
// One entry per key size (1, 2, 4 and 8 bytes), each restricted to values of at most 8 bytes.
// NOTE(review): select_type is defined elsewhere; presumably the first case whose condition
// holds wins and the trailing radix_sort_config is the default when none does - confirm.
using type = select_type<
select_type_case<
(sizeof(Key) == 1 && sizeof(Value) <= 8),
radix_sort_config<4, 4, scan,
kernel_config<256, 10>, kernel_config<256, 19> >
>,
select_type_case<
(sizeof(Key) == 2 && sizeof(Value) <= 8),
radix_sort_config<6, 5, scan,
kernel_config<256, 10>, kernel_config<256, 17> >
>,
select_type_case<
(sizeof(Key) == 4 && sizeof(Value) <= 8),
radix_sort_config<7, 6, scan,
kernel_config<256, 15>, kernel_config<256, 15> >
>,
select_type_case<
(sizeof(Key) == 8 && sizeof(Value) <= 8),
radix_sort_config<7, 6, scan,
kernel_config<256, 15>, kernel_config<256, 14> >
>,
// Entry without a condition: block sizes go through limit_block_size with sizeof(Value),
// and the per-thread counts shrink as item_scale grows but are never smaller than 1.
// NOTE(review): this entry passes three kernel_configs after scan while every other entry
// in this table passes two, the last two are identical, and the first uses
// ROCPRIM_WARP_SIZE_32 while the others use ROCPRIM_WARP_SIZE_64. Confirm against the
// radix_sort_config parameter list that all of this is intended.
radix_sort_config<
6, 4, scan,
kernel_config<
limit_block_size<256U, sizeof(Value), ROCPRIM_WARP_SIZE_32>::value,
::rocprim::max(1u, 15u / item_scale)
>,
kernel_config<
limit_block_size<256U, sizeof(Value), ROCPRIM_WARP_SIZE_64>::value,
::rocprim::max(1u, 10u / item_scale)
>,
kernel_config<
limit_block_size<256U, sizeof(Value), ROCPRIM_WARP_SIZE_64>::value,
::rocprim::max(1u, 10u / item_scale)
>
>
>;
};
/// \brief Specialization of the architecture 1030 table for \p Value = \p empty_type
/// (presumably the keys-only sort - confirm against the callers).
///
/// The configuration depends on the key size only. Every argument of \p select_type is a
/// conditional case for key sizes 1, 2, 4 and 8 bytes; no unconditional entry is listed.
template<class Key>
struct radix_sort_config_1030<Key, empty_type>
: select_type<
select_type_case<sizeof(Key) == 1, radix_sort_config<4, 3, kernel_config<256, 2>, kernel_config<256, 10>, kernel_config<256, 19> > >,
select_type_case<sizeof(Key) == 2, radix_sort_config<6, 5, kernel_config<256, 2>, kernel_config<256, 10>, kernel_config<256, 19> > >,
select_type_case<sizeof(Key) == 4, radix_sort_config<7, 6, kernel_config<256, 2>, kernel_config<256, 17>, kernel_config<256, 17> > >,
select_type_case<sizeof(Key) == 8, radix_sort_config<7, 6, kernel_config<256, 2>, kernel_config<256, 15>, kernel_config<256, 15> > >
> { };
/// \brief Maps \p TargetArch to the per-architecture radix sort tuning table for the given
/// \p Key and \p Value types.
///
/// Tables are listed for 803, 900, 908, \p ROCPRIM_ARCH_90a and 1030.
/// NOTE(review): select_arch is defined elsewhere; the trailing radix_sort_config_900 entry is
/// presumably what any other \p TargetArch resolves to - confirm.
template<unsigned int TargetArch, class Key, class Value>
struct default_radix_sort_config
: select_arch<
TargetArch,
select_arch_case<803, radix_sort_config_803<Key, Value> >,
select_arch_case<900, radix_sort_config_900<Key, Value> >,
select_arch_case<908, radix_sort_config_908<Key, Value> >,
select_arch_case<ROCPRIM_ARCH_90a, radix_sort_config_90a<Key, Value> >,
select_arch_case<1030, radix_sort_config_1030<Key, Value> >,
radix_sort_config_900<Key, Value>
> { };
} // end namespace detail
END_ROCPRIM_NAMESPACE
/// @}
// end of group primitivesmodule_deviceconfigs
#endif // ROCPRIM_DEVICE_DEVICE_RADIX_SORT_CONFIG_HPP_
// Copyright (c) 2017-2022 Advanced Micro Devices, Inc. All rights reserved.
//
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to deal
// in the Software without restriction, including without limitation the rights
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
// copies of the Software, and to permit persons to whom the Software is
// furnished to do so, subject to the following conditions:
//
// The above copyright notice and this permission notice shall be included in
// all copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
// THE SOFTWARE.
#ifndef ROCPRIM_DEVICE_DEVICE_REDUCE_HPP_
#define ROCPRIM_DEVICE_DEVICE_REDUCE_HPP_
#include <type_traits>
#include <iterator>
#include <algorithm>
#include <chrono>
#include "../config.hpp"
#include "../detail/various.hpp"
#include "../detail/match_result_type.hpp"
#include "device_reduce_config.hpp"
#include "detail/device_reduce.hpp"
BEGIN_ROCPRIM_NAMESPACE
/// \addtogroup devicemodule
/// @{
namespace detail
{
/// Kernel entry point for one reduction pass; it only forwards to block_reduce_kernel_impl.
///
/// \p WithInitialValue, \p Config and \p ResultType are handed to the implementation
/// explicitly, the remaining template arguments follow from the call arguments.
/// Launch bounds are fixed to ROCPRIM_DEFAULT_MAX_BLOCK_SIZE.
template<
bool WithInitialValue,
class Config,
class ResultType,
class InputIterator,
class OutputIterator,
class InitValueType,
class BinaryFunction
>
ROCPRIM_KERNEL
__launch_bounds__(ROCPRIM_DEFAULT_MAX_BLOCK_SIZE)
void block_reduce_kernel(InputIterator input,
const size_t size,
OutputIterator output,
InitValueType initial_value,
BinaryFunction reduce_op)
{
block_reduce_kernel_impl<WithInitialValue, Config, ResultType>(
input, size, output, initial_value, reduce_op
);
}
// Debug helper. When the enclosing function's `debug_synchronous` flag is set, prints
// "name(size)", synchronizes `stream`, returns the synchronization error from the enclosing
// function if there is one, and otherwise prints the time elapsed since `start` in ms.
// Expects `debug_synchronous` and `stream` to be visible at the point of use.
// Wrapped in do { } while(0) so the invocation plus its trailing ';' forms exactly one
// statement (safe inside an unbraced if/else); arguments are parenthesized so that
// expressions such as `a ? b : c` can be passed.
#define ROCPRIM_DETAIL_HIP_SYNC(name, size, start) \
    do \
    { \
        if(debug_synchronous) \
        { \
            std::cout << (name) << "(" << (size) << ")"; \
            auto _error = cudaStreamSynchronize(stream); \
            if(_error != cudaSuccess) return _error; \
            auto _end = std::chrono::high_resolution_clock::now(); \
            auto _d = std::chrono::duration_cast<std::chrono::duration<double>>(_end - (start)); \
            std::cout << " " << _d.count() * 1000 << " ms" << '\n'; \
        } \
    } while(0)
// Post-launch check. Returns the result of cudaGetLastError() from the enclosing function
// when it is not cudaSuccess. When `debug_synchronous` is set it additionally prints
// "name(size)", synchronizes `stream` (returning any synchronization error) and prints the
// time elapsed since `start` in ms.
// Expects `debug_synchronous` and `stream` to be visible at the point of use.
// Wrapped in do { } while(0) so the invocation plus its trailing ';' forms exactly one
// statement; arguments are parenthesized; the inner status variable no longer uses a
// double-underscore name, which is reserved for the implementation.
#define ROCPRIM_DETAIL_HIP_SYNC_AND_RETURN_ON_ERROR(name, size, start) \
    do \
    { \
        auto _error = cudaGetLastError(); \
        if(_error != cudaSuccess) return _error; \
        if(debug_synchronous) \
        { \
            std::cout << (name) << "(" << (size) << ")"; \
            auto _sync_error = cudaStreamSynchronize(stream); \
            if(_sync_error != cudaSuccess) return _sync_error; \
            auto _end = std::chrono::high_resolution_clock::now(); \
            auto _d = std::chrono::duration_cast<std::chrono::duration<double>>(_end - (start)); \
            std::cout << " " << _d.count() * 1000 << " ms" << '\n'; \
        } \
    } while(0)
/// Host-side driver shared by both rocprim::reduce overloads.
///
/// With a null \p temporary_storage only the required byte count is written to
/// \p storage_size. Otherwise the input is reduced either by a single one-block launch or,
/// when more than one block is needed, by writing one partial result per block into
/// \p temporary_storage and calling reduce_impl again on those partial results.
///
/// \returns \p cudaSuccess, or the first error reported by a launch or synchronization.
template<
    bool WithInitialValue, // true when initial_value should be used in reduction
    class Config,
    class InputIterator,
    class OutputIterator,
    class InitValueType,
    class BinaryFunction
>
inline
cudaError_t reduce_impl(void * temporary_storage,
                        size_t& storage_size,
                        InputIterator input,
                        OutputIterator output,
                        const InitValueType initial_value,
                        const size_t size,
                        BinaryFunction reduce_op,
                        const cudaStream_t stream,
                        bool debug_synchronous)
{
    using value_t = typename std::iterator_traits<InputIterator>::value_type;
    using accum_t = typename ::rocprim::detail::match_result_type<
        value_t, BinaryFunction
    >::type;

    // Resolve default_config to the architecture-specific defaults
    using tuning = default_or_custom_config<
        Config,
        default_reduce_config<ROCPRIM_TARGET_ARCH, accum_t>
    >;

    constexpr unsigned int block_size = tuning::block_size;
    constexpr unsigned int items_per_thread = tuning::items_per_thread;
    constexpr auto items_per_block = block_size * items_per_thread;

    // Size query: report the storage requirement and do nothing else
    if(temporary_storage == nullptr)
    {
        const size_t required = reduce_get_temporary_storage_bytes<accum_t>(size, items_per_block);
        // Never report 0 so that the caller does not attempt a zero-byte allocation
        storage_size = required == 0 ? 4 : required;
        return cudaSuccess;
    }

    // Reference time point for the debug timings; set right before each launch
    std::chrono::high_resolution_clock::time_point start;

    static constexpr auto size_limit = tuning::size_limit;
    static constexpr auto number_of_blocks_limit = ::rocprim::max<size_t>(size_limit / items_per_block, 1);
    const auto number_of_blocks = (size + items_per_block - 1)/items_per_block;

    if(debug_synchronous)
    {
        std::cout << "block_size " << block_size << '\n'
                  << "number of blocks " << number_of_blocks << '\n'
                  << "number of blocks limit " << number_of_blocks_limit << '\n'
                  << "items_per_block " << items_per_block << '\n';
    }

    // Everything fits into one block: reduce straight into the output
    if(number_of_blocks <= 1)
    {
        if(debug_synchronous) start = std::chrono::high_resolution_clock::now();
        detail::block_reduce_kernel<WithInitialValue, tuning, accum_t>
            <<<dim3(1), dim3(block_size), 0, stream>>>(
                input, size, output, initial_value, reduce_op
            );
        ROCPRIM_DETAIL_HIP_SYNC_AND_RETURN_ON_ERROR("block_reduce_kernel", size, start);
        return cudaSuccess;
    }

    // One partial result per block lives at the front of the temporary storage
    accum_t * const partials = static_cast<accum_t*>(temporary_storage);

    // Each launch covers at most number_of_blocks_limit blocks worth of items
    static constexpr auto items_per_launch = number_of_blocks_limit * items_per_block;
    const auto launch_count = (size + items_per_launch - 1) / items_per_launch;

    size_t offset = 0;
    for(size_t launch = 0; launch < launch_count; ++launch)
    {
        const auto current_size = std::min<size_t>(size - offset, items_per_launch);
        const auto current_blocks = (current_size + items_per_block - 1) / items_per_block;

        if(debug_synchronous) start = std::chrono::high_resolution_clock::now();
        // The first pass is launched without the initial value
        detail::block_reduce_kernel<false, tuning, accum_t>
            <<<dim3(current_blocks),
               dim3(block_size),
               0,
               stream>>>(
                input + offset,
                current_size,
                partials + launch * number_of_blocks_limit,
                initial_value,
                reduce_op);
        ROCPRIM_DETAIL_HIP_SYNC_AND_RETURN_ON_ERROR("block_reduce_kernel", current_size, start);

        offset += items_per_launch;
    }

    // The nested call gets whatever storage follows the partial results
    void * nested_storage = static_cast<void*>(partials + number_of_blocks);
    size_t nested_storage_size = storage_size - (number_of_blocks * sizeof(accum_t));

    if(debug_synchronous) start = std::chrono::high_resolution_clock::now();
    const auto error = reduce_impl<WithInitialValue, tuning>(
        nested_storage,
        nested_storage_size,
        partials,          // input
        output,            // output
        initial_value,
        number_of_blocks,  // input size
        reduce_op,
        stream,
        debug_synchronous
    );
    if(error != cudaSuccess) return error;
    ROCPRIM_DETAIL_HIP_SYNC("nested_device_reduce", number_of_blocks, start);

    return cudaSuccess;
}
#undef ROCPRIM_DETAIL_HIP_SYNC_AND_RETURN_ON_ERROR
#undef ROCPRIM_DETAIL_HIP_SYNC
} // end of detail namespace
/// \brief Parallel reduction primitive for device level.
///
/// reduce function performs a device-wide reduction operation
/// using binary \p reduce_op operator.
///
/// \par Overview
/// * Does not support non-commutative reduction operators. Reduction operator should also be
/// associative. When used with non-associative functions the results may be non-deterministic
/// and/or vary in precision.
/// * Returns the required size of \p temporary_storage in \p storage_size
/// if \p temporary_storage is a null pointer.
/// * Ranges specified by \p input must have at least \p size elements, while \p output
/// only needs one element.
/// * By default, the input type is used for accumulation. A custom type
/// can be specified using <tt>rocprim::transform_iterator</tt>, see the example below.
///
/// \tparam Config - [optional] configuration of the primitive. It can be \p reduce_config or
/// a custom class with the same members.
/// \tparam InputIterator - random-access iterator type of the input range. Must meet the
/// requirements of a C++ InputIterator concept. It can be a simple pointer type.
/// \tparam OutputIterator - random-access iterator type of the output range. Must meet the
/// requirements of a C++ OutputIterator concept. It can be a simple pointer type.
/// \tparam InitValueType - type of the initial value.
/// \tparam BinaryFunction - type of binary function used for reduction. Default type
/// is \p rocprim::plus<T>, where \p T is a \p value_type of \p InputIterator.
///
/// \param [in] temporary_storage - pointer to a device-accessible temporary storage. When
/// a null pointer is passed, the required allocation size (in bytes) is written to
/// \p storage_size and function returns without performing the reduction operation.
/// \param [in,out] storage_size - reference to a size (in bytes) of \p temporary_storage.
/// \param [in] input - iterator to the first element in the range to reduce.
/// \param [out] output - iterator to the first element in the output range. It can be
/// same as \p input.
/// \param [in] initial_value - initial value to start the reduction.
/// \param [in] size - number of elements in the input range.
/// \param [in] reduce_op - binary operation function object that will be used for reduction.
/// The signature of the function should be equivalent to the following:
/// <tt>T f(const T &a, const T &b);</tt>. The signature does not need to have
/// <tt>const &</tt>, but function object must not modify the objects passed to it.
/// The default value is \p BinaryFunction().
/// \param [in] stream - [optional] HIP stream object. The default is \p 0 (default stream).
/// \param [in] debug_synchronous - [optional] If true, synchronization after every kernel
/// launch is forced in order to check for errors. The default value is \p false.
///
/// \returns \p cudaSuccess (\p 0) after successful reduction; otherwise a HIP runtime error of
/// type \p cudaError_t.
///
/// \par Example
/// \parblock
/// In this example a device-level min-reduction operation is performed on an array of
/// integer values (<tt>short</tt>s are reduced into <tt>int</tt>s) using custom operator.
///
/// \code{.cpp}
/// #include <rocprim/rocprim.hpp>
///
/// // custom reduce function
/// auto min_op =
/// [] __device__ (int a, int b) -> int
/// {
/// return a < b ? a : b;
/// };
///
/// // Prepare input and output (declare pointers, allocate device memory etc.)
/// size_t input_size; // e.g., 8
/// short * input; // e.g., [4, 7, 6, 2, 5, 1, 3, 8]
/// int * output; // empty array of 1 element
/// int start_value; // e.g., 9
///
/// size_t temporary_storage_size_bytes;
/// void * temporary_storage_ptr = nullptr;
/// // Get required size of the temporary storage
/// rocprim::reduce(
/// temporary_storage_ptr, temporary_storage_size_bytes,
/// input, output, start_value, input_size, min_op
/// );
///
/// // allocate temporary storage
/// cudaMalloc(&temporary_storage_ptr, temporary_storage_size_bytes);
///
/// // perform reduce
/// rocprim::reduce(
/// temporary_storage_ptr, temporary_storage_size_bytes,
/// input, output, start_value, input_size, min_op
/// );
/// // output: [1]
/// \endcode
///
/// The same example as above, but now a custom accumulator type is specified.
///
/// \code{.cpp}
/// #include <rocprim/rocprim.hpp>
///
/// auto min_op =
/// [] __device__ (int a, int b) -> int
/// {
/// return a < b ? a : b;
/// };
///
/// size_t input_size;
/// short * input;
/// int * output;
/// int start_value;
///
/// // Use a transform iterator to specify a custom accumulator type
/// auto input_iterator = rocprim::make_transform_iterator(
/// input, [] __device__ (short in) { return static_cast<int>(in); });
///
/// size_t temporary_storage_size_bytes;
/// void * temporary_storage_ptr = nullptr;
/// // Use the transform iterator
/// rocprim::reduce(
/// temporary_storage_ptr, temporary_storage_size_bytes,
/// input_iterator, output, start_value, input_size, min_op
/// );
///
/// cudaMalloc(&temporary_storage_ptr, temporary_storage_size_bytes);
///
/// rocprim::reduce(
/// temporary_storage_ptr, temporary_storage_size_bytes,
/// input_iterator, output, start_value, input_size, min_op
/// );
/// \endcode
/// \endparblock
template<
class Config = default_config,
class InputIterator,
class OutputIterator,
class InitValueType,
class BinaryFunction = ::rocprim::plus<typename std::iterator_traits<InputIterator>::value_type>
>
inline
cudaError_t reduce(void * temporary_storage,
size_t& storage_size,
InputIterator input,
OutputIterator output,
const InitValueType initial_value,
const size_t size,
BinaryFunction reduce_op = BinaryFunction(),
const cudaStream_t stream = 0,
bool debug_synchronous = false)
{
// Forward to the shared implementation with WithInitialValue = true, i.e. initial_value
// takes part in the reduction.
return detail::reduce_impl<true, Config>(
temporary_storage, storage_size,
input, output, initial_value, size,
reduce_op, stream, debug_synchronous
);
}
/// \brief Parallel reduce primitive for device level.
///
/// reduce function performs a device-wide reduction operation
/// using binary \p reduce_op operator.
///
/// \par Overview
/// * Does not support non-commutative reduction operators. Reduction operator should also be
/// associative. When used with non-associative functions the results may be non-deterministic
/// and/or vary in precision.
/// * Returns the required size of \p temporary_storage in \p storage_size
/// if \p temporary_storage is a null pointer.
/// * Ranges specified by \p input must have at least \p size elements, while \p output
/// only needs one element.
/// * By default, the input type is used for accumulation. A custom type
/// can be specified using <tt>rocprim::transform_iterator</tt>, see the example below.
///
/// \tparam Config - [optional] configuration of the primitive. It can be \p reduce_config or
/// a custom class with the same members.
/// \tparam InputIterator - random-access iterator type of the input range. Must meet the
/// requirements of a C++ InputIterator concept. It can be a simple pointer type.
/// \tparam OutputIterator - random-access iterator type of the output range. Must meet the
/// requirements of a C++ OutputIterator concept. It can be a simple pointer type.
/// \tparam BinaryFunction - type of binary function used for reduction. Default type
/// is \p rocprim::plus<T>, where \p T is a \p value_type of \p InputIterator.
///
/// \param [in] temporary_storage - pointer to a device-accessible temporary storage. When
/// a null pointer is passed, the required allocation size (in bytes) is written to
/// \p storage_size and function returns without performing the reduction operation.
/// \param [in,out] storage_size - reference to a size (in bytes) of \p temporary_storage.
/// \param [in] input - iterator to the first element in the range to reduce.
/// \param [out] output - iterator to the first element in the output range. It can be
/// same as \p input.
/// \param [in] size - number of elements in the input range.
/// \param [in] reduce_op - binary operation function object that will be used for reduction.
/// The signature of the function should be equivalent to the following:
/// <tt>T f(const T &a, const T &b);</tt>. The signature does not need to have
/// <tt>const &</tt>, but function object must not modify the objects passed to it.
/// Default is BinaryFunction().
/// \param [in] stream - [optional] HIP stream object. Default is \p 0 (default stream).
/// \param [in] debug_synchronous - [optional] If true, synchronization after every kernel
/// launch is forced in order to check for errors. Default value is \p false.
///
/// \returns \p cudaSuccess (\p 0) after successful reduction; otherwise a HIP runtime error of
/// type \p cudaError_t.
///
/// \par Example
/// \parblock
/// In this example a device-level sum operation is performed on an array of
/// integer values (<tt>short</tt>s are reduced into <tt>int</tt>s).
///
/// \code{.cpp}
/// #include <rocprim/rocprim.hpp>
///
/// // Prepare input and output (declare pointers, allocate device memory etc.)
/// size_t input_size; // e.g., 8
/// short * input; // e.g., [1, 2, 3, 4, 5, 6, 7, 8]
/// int * output; // empty array of 1 element
///
/// size_t temporary_storage_size_bytes;
/// void * temporary_storage_ptr = nullptr;
/// // Get required size of the temporary storage
/// rocprim::reduce(
/// temporary_storage_ptr, temporary_storage_size_bytes,
/// input, output, input_size, rocprim::plus<int>()
/// );
///
/// // allocate temporary storage
/// cudaMalloc(&temporary_storage_ptr, temporary_storage_size_bytes);
///
/// // perform reduce
/// rocprim::reduce(
/// temporary_storage_ptr, temporary_storage_size_bytes,
/// input, output, input_size, rocprim::plus<int>()
/// );
/// // output: [36]
/// \endcode
///
/// The same example as above, but now a custom accumulator type is specified.
///
/// \code{.cpp}
/// #include <rocprim/rocprim.hpp>
///
/// size_t input_size;
/// short * input;
/// int * output;
///
/// // Use a transform iterator to specify a custom accumulator type
/// auto input_iterator = rocprim::make_transform_iterator(
/// input, [] __device__ (short in) { return static_cast<int>(in); });
///
/// size_t temporary_storage_size_bytes;
/// void * temporary_storage_ptr = nullptr;
/// // Use the transform iterator
/// rocprim::reduce(
/// temporary_storage_ptr, temporary_storage_size_bytes,
/// input_iterator, output, input_size, rocprim::plus<int>()
/// );
///
/// cudaMalloc(&temporary_storage_ptr, temporary_storage_size_bytes);
///
/// rocprim::reduce(
/// temporary_storage_ptr, temporary_storage_size_bytes,
/// input_iterator, output, input_size, rocprim::plus<int>()
/// );
/// \endcode
/// \endparblock
template<
class Config = default_config,
class InputIterator,
class OutputIterator,
class BinaryFunction = ::rocprim::plus<typename std::iterator_traits<InputIterator>::value_type>
>
inline
cudaError_t reduce(void * temporary_storage,
size_t& storage_size,
InputIterator input,
OutputIterator output,
const size_t size,
BinaryFunction reduce_op = BinaryFunction(),
const cudaStream_t stream = 0,
bool debug_synchronous = false)
{
using input_type = typename std::iterator_traits<InputIterator>::value_type;
// Forward to the shared implementation with WithInitialValue = false. A value-initialized
// input_type fills the initial_value parameter (so input_type must be default-constructible);
// presumably the kernel ignores it in this mode - confirm in block_reduce_kernel_impl.
return detail::reduce_impl<false, Config>(
temporary_storage, storage_size,
input, output, input_type(), size,
reduce_op, stream, debug_synchronous
);
}
/// @}
// end of group devicemodule
END_ROCPRIM_NAMESPACE
#endif // ROCPRIM_DEVICE_DEVICE_REDUCE_HPP_
// Copyright (c) 2017-2022 Advanced Micro Devices, Inc. All rights reserved.
//
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to deal
// in the Software without restriction, including without limitation the rights
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
// copies of the Software, and to permit persons to whom the Software is
// furnished to do so, subject to the following conditions:
//
// The above copyright notice and this permission notice shall be included in
// all copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
// THE SOFTWARE.
#ifndef ROCPRIM_DEVICE_DEVICE_REDUCE_BY_KEY_HPP_
#define ROCPRIM_DEVICE_DEVICE_REDUCE_BY_KEY_HPP_
#include <iterator>
#include <iostream>
#include <chrono>
#include "../config.hpp"
#include "../detail/various.hpp"
#include "../detail/match_result_type.hpp"
#include "../functional.hpp"
#include "device_reduce_by_key_config.hpp"
#include "detail/device_reduce_by_key.hpp"
BEGIN_ROCPRIM_NAMESPACE
/// \addtogroup devicemodule
/// @{
namespace detail
{
/// Kernel entry point that forwards all of its arguments to fill_unique_counts.
///
/// \p BlockSize is both the launch bound and the first template argument of the
/// implementation; \p ItemsPerThread is passed through as the second.
template<
unsigned int BlockSize,
unsigned int ItemsPerThread,
class KeysInputIterator,
class KeyCompareFunction
>
ROCPRIM_KERNEL
__launch_bounds__(BlockSize)
void fill_unique_counts_kernel(KeysInputIterator keys_input,
unsigned int size,
unsigned int * unique_counts,
KeyCompareFunction key_compare_op,
unsigned int blocks_per_full_batch,
unsigned int full_batches)
{
fill_unique_counts<BlockSize, ItemsPerThread>(
keys_input, size,
unique_counts,
key_compare_op,
blocks_per_full_batch, full_batches
);
}
/// Kernel entry point that forwards all of its arguments to scan_unique_counts.
///
/// \p BlockSize is both the launch bound and the first template argument of the
/// implementation; \p ItemsPerThread is passed through as the second.
template<
unsigned int BlockSize,
unsigned int ItemsPerThread,
class UniqueCountOutputIterator
>
ROCPRIM_KERNEL
__launch_bounds__(BlockSize)
void scan_unique_counts_kernel(unsigned int * unique_counts,
UniqueCountOutputIterator unique_count_output,
unsigned int batches)
{
scan_unique_counts<BlockSize, ItemsPerThread>(unique_counts, unique_count_output, batches);
}
/// Kernel entry point that forwards all of its arguments to reduce_by_key (the device-side
/// implementation called with explicit \p BlockSize and \p ItemsPerThread).
///
/// \p BlockSize is also used as the launch bound.
template<
unsigned int BlockSize,
unsigned int ItemsPerThread,
class KeysInputIterator,
class ValuesInputIterator,
class Result,
class UniqueOutputIterator,
class AggregatesOutputIterator,
class KeyCompareFunction,
class BinaryFunction
>
ROCPRIM_KERNEL
__launch_bounds__(BlockSize)
void reduce_by_key_kernel(KeysInputIterator keys_input,
ValuesInputIterator values_input,
unsigned int size,
const unsigned int * unique_starts,
carry_out<Result> * carry_outs,
Result * leading_aggregates,
UniqueOutputIterator unique_output,
AggregatesOutputIterator aggregates_output,
KeyCompareFunction key_compare_op,
BinaryFunction reduce_op,
unsigned int blocks_per_full_batch,
unsigned int full_batches)
{
reduce_by_key<BlockSize, ItemsPerThread>(
keys_input, values_input, size,
unique_starts, carry_outs, leading_aggregates,
unique_output, aggregates_output,
key_compare_op, reduce_op,
blocks_per_full_batch, full_batches
);
}
/// Kernel entry point that forwards all of its arguments to scan_and_scatter_carry_outs.
///
/// \p BlockSize is both the launch bound and the first template argument of the
/// implementation; \p ItemsPerThread is passed through as the second.
template<
unsigned int BlockSize,
unsigned int ItemsPerThread,
class Result,
class AggregatesOutputIterator,
class BinaryFunction
>
ROCPRIM_KERNEL
__launch_bounds__(BlockSize)
void scan_and_scatter_carry_outs_kernel(const carry_out<Result> * carry_outs,
const Result * leading_aggregates,
AggregatesOutputIterator aggregates_output,
BinaryFunction reduce_op,
unsigned int batches)
{
scan_and_scatter_carry_outs<BlockSize, ItemsPerThread>(
carry_outs, leading_aggregates, aggregates_output,
reduce_op,
batches
);
}
// Post-launch check. Returns the result of cudaGetLastError() from the enclosing function
// when it is not cudaSuccess. When `debug_synchronous` is set it additionally prints
// "name(size)", synchronizes `stream` (returning any synchronization error) and prints the
// time elapsed since `start` in ms.
// Expects `debug_synchronous` and `stream` to be visible at the point of use.
// The expansion stays a plain brace block because the invocations in this file are written
// without a trailing ';'. Arguments are parenthesized so expressions can be passed, and the
// inner status variable avoids a double-underscore name, which is reserved for the
// implementation.
#define ROCPRIM_DETAIL_HIP_SYNC_AND_RETURN_ON_ERROR(name, size, start) \
    { \
        auto _error = cudaGetLastError(); \
        if(_error != cudaSuccess) return _error; \
        if(debug_synchronous) \
        { \
            std::cout << (name) << "(" << (size) << ")"; \
            auto _sync_error = cudaStreamSynchronize(stream); \
            if(_sync_error != cudaSuccess) return _sync_error; \
            auto _end = std::chrono::high_resolution_clock::now(); \
            auto _d = std::chrono::duration_cast<std::chrono::duration<double>>(_end - (start)); \
            std::cout << " " << _d.count() * 1000 << " ms" << '\n'; \
        } \
    }
/// Host-side driver behind rocprim::reduce_by_key.
///
/// With a null \p temporary_storage only the required byte count is written to
/// \p storage_size. Otherwise four kernels are launched on \p stream, in this order:
/// fill_unique_counts_kernel, scan_unique_counts_kernel, reduce_by_key_kernel and
/// scan_and_scatter_carry_outs_kernel.
///
/// \returns \p cudaSuccess, or the first error reported by a launch or synchronization.
template<
    class Config,
    class KeysInputIterator,
    class ValuesInputIterator,
    class UniqueOutputIterator,
    class AggregatesOutputIterator,
    class UniqueCountOutputIterator,
    class BinaryFunction,
    class KeyCompareFunction
>
inline
cudaError_t reduce_by_key_impl(void * temporary_storage,
                               size_t& storage_size,
                               KeysInputIterator keys_input,
                               ValuesInputIterator values_input,
                               const unsigned int size,
                               UniqueOutputIterator unique_output,
                               AggregatesOutputIterator aggregates_output,
                               UniqueCountOutputIterator unique_count_output,
                               BinaryFunction reduce_op,
                               KeyCompareFunction key_compare_op,
                               const cudaStream_t stream,
                               const bool debug_synchronous)
{
    using key_type = typename std::iterator_traits<KeysInputIterator>::value_type;
    using input_value_type = typename std::iterator_traits<ValuesInputIterator>::value_type;
    using result_type = typename ::rocprim::detail::match_result_type<
        input_value_type,
        BinaryFunction
    >::type;
    using carry_out_type = carry_out<result_type>;

    // Resolve default_config to the architecture-specific defaults
    using config = default_or_custom_config<
        Config,
        default_reduce_by_key_config<ROCPRIM_TARGET_ARCH, key_type, result_type>
    >;

    constexpr unsigned int reduce_block_size = config::reduce::block_size;
    constexpr unsigned int reduce_items_per_thread = config::reduce::items_per_thread;
    constexpr unsigned int scan_block_size = config::scan::block_size;
    constexpr unsigned int scan_items_per_thread = config::scan::items_per_thread;

    constexpr unsigned int items_per_block = reduce_block_size * reduce_items_per_thread;
    constexpr unsigned int scan_items_per_block = scan_block_size * scan_items_per_thread;

    // Split the input into blocks (at least one) and group the blocks into batches;
    // the number of batches never exceeds scan_items_per_block.
    const unsigned int blocks = std::max(1u, ::rocprim::detail::ceiling_div(size, items_per_block));
    const unsigned int blocks_per_full_batch = ::rocprim::detail::ceiling_div(blocks, scan_items_per_block);
    const unsigned int leftover_blocks = blocks % scan_items_per_block;
    const unsigned int full_batches = leftover_blocks != 0 ? leftover_blocks : scan_items_per_block;
    const unsigned int batches = (blocks_per_full_batch == 1) ? full_batches : scan_items_per_block;

    // Temporary storage holds three per-batch arrays, each padded by align_size
    const size_t unique_counts_bytes = ::rocprim::detail::align_size(batches * sizeof(unsigned int));
    const size_t carry_outs_bytes = ::rocprim::detail::align_size(batches * sizeof(carry_out_type));
    const size_t leading_aggregates_bytes = ::rocprim::detail::align_size(batches * sizeof(result_type));

    // Size query: report the storage requirement and do nothing else
    if(temporary_storage == nullptr)
    {
        storage_size = unique_counts_bytes + carry_outs_bytes + leading_aggregates_bytes;
        return cudaSuccess;
    }

    if(debug_synchronous)
    {
        std::cout << "blocks " << blocks << '\n'
                  << "blocks_per_full_batch " << blocks_per_full_batch << '\n'
                  << "full_batches " << full_batches << '\n'
                  << "batches " << batches << '\n'
                  << "storage_size " << storage_size << '\n';
        const cudaError_t sync_status = cudaStreamSynchronize(stream);
        if(sync_status != cudaSuccess) return sync_status;
    }

    // Carve the three arrays out of the temporary storage, back to back
    char * const base = static_cast<char *>(temporary_storage);
    unsigned int * const unique_counts = reinterpret_cast<unsigned int *>(base);
    carry_out_type * const carry_outs
        = reinterpret_cast<carry_out_type *>(base + unique_counts_bytes);
    result_type * const leading_aggregates
        = reinterpret_cast<result_type *>(base + unique_counts_bytes + carry_outs_bytes);

    // Reference time point for the debug timings; set right before each launch
    std::chrono::high_resolution_clock::time_point launch_begin;

    // 1) one block per batch
    if(debug_synchronous) launch_begin = std::chrono::high_resolution_clock::now();
    fill_unique_counts_kernel<reduce_block_size, reduce_items_per_thread>
        <<<dim3(batches), dim3(reduce_block_size), 0, stream>>>(
            keys_input, size, unique_counts, key_compare_op,
            blocks_per_full_batch, full_batches
        );
    ROCPRIM_DETAIL_HIP_SYNC_AND_RETURN_ON_ERROR("fill_unique_counts", size, launch_begin)

    // 2) a single block processes the per-batch counts
    if(debug_synchronous) launch_begin = std::chrono::high_resolution_clock::now();
    scan_unique_counts_kernel<scan_block_size, scan_items_per_thread>
        <<<dim3(1), dim3(scan_block_size), 0, stream>>>(
            unique_counts, unique_count_output,
            batches
        );
    ROCPRIM_DETAIL_HIP_SYNC_AND_RETURN_ON_ERROR("scan_unique_counts", config::scan::block_size, launch_begin)

    // 3) one block per batch; unique_counts is now passed read-only
    if(debug_synchronous) launch_begin = std::chrono::high_resolution_clock::now();
    reduce_by_key_kernel<reduce_block_size, reduce_items_per_thread>
        <<<dim3(batches), dim3(reduce_block_size), 0, stream>>>(
            keys_input, values_input, size,
            const_cast<const unsigned int *>(unique_counts), carry_outs, leading_aggregates,
            unique_output, aggregates_output,
            key_compare_op, reduce_op,
            blocks_per_full_batch, full_batches
        );
    ROCPRIM_DETAIL_HIP_SYNC_AND_RETURN_ON_ERROR("reduce_by_key", size, launch_begin)

    // 4) a single block processes the per-batch carry-outs
    if(debug_synchronous) launch_begin = std::chrono::high_resolution_clock::now();
    scan_and_scatter_carry_outs_kernel<scan_block_size, scan_items_per_thread>
        <<<dim3(1), dim3(scan_block_size), 0, stream>>>(
            const_cast<const carry_out_type *>(carry_outs),
            const_cast<const result_type *>(leading_aggregates),
            aggregates_output,
            reduce_op,
            batches
        );
    ROCPRIM_DETAIL_HIP_SYNC_AND_RETURN_ON_ERROR("scan_and_scatter_carry_outs", config::scan::block_size, launch_begin)

    return cudaSuccess;
}
#undef ROCPRIM_DETAIL_HIP_SYNC_AND_RETURN_ON_ERROR
} // end of detail namespace
/// \brief Parallel reduce-by-key primitive for device level.
///
/// reduce_by_key function performs a device-wide reduction operation of groups
/// of consecutive values having the same key using binary \p reduce_op operator. The first key of each group
/// is copied to \p unique_output and reduction of the group is written to \p aggregates_output.
/// The total number of groups is written to \p unique_count_output.
///
/// \par Overview
/// * Supports non-commutative reduction operators. However, a reduction operator should be
/// associative. When used with non-associative functions the results may be non-deterministic
/// and/or vary in precision.
/// * Returns the required size of \p temporary_storage in \p storage_size
/// if \p temporary_storage is a null pointer.
/// * Ranges specified by \p keys_input and \p values_input must have at least \p size elements.
/// * Range specified by \p unique_count_output must have at least 1 element.
/// * Ranges specified by \p unique_output and \p aggregates_output must have at least
/// <tt>*unique_count_output</tt> (i.e. the number of unique keys) elements.
///
/// \tparam Config - [optional] configuration of the primitive. It can be \p reduce_by_key_config or
/// a custom class with the same members.
/// \tparam KeysInputIterator - random-access iterator type of the input range. Must meet the
/// requirements of a C++ InputIterator concept. It can be a simple pointer type.
/// \tparam ValuesInputIterator - random-access iterator type of the input range. Must meet the
/// requirements of a C++ InputIterator concept. It can be a simple pointer type.
/// \tparam UniqueOutputIterator - random-access iterator type of the output range. Must meet the
/// requirements of a C++ OutputIterator concept. It can be a simple pointer type.
/// \tparam AggregatesOutputIterator - random-access iterator type of the output range. Must meet the
/// requirements of a C++ OutputIterator concept. It can be a simple pointer type.
/// \tparam UniqueCountOutputIterator - random-access iterator type of the output range. Must meet the
/// requirements of a C++ OutputIterator concept. It can be a simple pointer type.
/// \tparam BinaryFunction - type of binary function used for reduction. Default type
/// is \p rocprim::plus<T>, where \p T is a \p value_type of \p ValuesInputIterator.
/// \tparam KeyCompareFunction - type of binary function used to determine keys equality. Default type
/// is \p rocprim::equal_to<T>, where \p T is a \p value_type of \p KeysInputIterator.
///
/// \param [in] temporary_storage - pointer to a device-accessible temporary storage. When
/// a null pointer is passed, the required allocation size (in bytes) is written to
/// \p storage_size and function returns without performing the reduction operation.
/// \param [in,out] storage_size - reference to a size (in bytes) of \p temporary_storage.
/// \param [in] keys_input - iterator to the first element in the range of keys.
/// \param [in] values_input - iterator to the first element in the range of values to reduce.
/// \param [in] size - number of elements in the input range.
/// \param [out] unique_output - iterator to the first element in the output range of unique keys.
/// \param [out] aggregates_output - iterator to the first element in the output range of reductions.
/// \param [out] unique_count_output - iterator to total number of groups.
/// \param [in] reduce_op - binary operation function object that will be used for reduction.
/// The signature of the function should be equivalent to the following:
/// <tt>T f(const T &a, const T &b);</tt>. The signature does not need to have
/// <tt>const &</tt>, but function object must not modify the objects passed to it.
/// Default is BinaryFunction().
/// \param [in] key_compare_op - binary operation function object that will be used to determine keys equality.
/// The signature of the function should be equivalent to the following:
/// <tt>bool f(const T &a, const T &b);</tt>. The signature does not need to have
/// <tt>const &</tt>, but function object must not modify the objects passed to it.
/// Default is KeyCompareFunction().
/// \param [in] stream - [optional] HIP stream object. Default is \p 0 (default stream).
/// \param [in] debug_synchronous - [optional] If true, synchronization after every kernel
/// launch is forced in order to check for errors. Default value is \p false.
///
/// \returns \p cudaSuccess (\p 0) after successful reduction; otherwise a HIP runtime error of
/// type \p cudaError_t.
///
/// \par Example
/// \parblock
/// In this example a device-level sum operation is performed on an array of
/// integer values and integer keys.
///
/// \code{.cpp}
/// #include <rocprim/rocprim.hpp>
///
/// // Prepare input and output (declare pointers, allocate device memory etc.)
/// size_t input_size; // e.g., 8
/// int * keys_input; // e.g., [1, 1, 1, 2, 10, 10, 10, 88]
/// int * values_input; // e.g., [1, 2, 3, 4, 5, 6, 7, 8]
/// int * unique_output; // empty array of at least 4 elements
/// int * aggregates_output; // empty array of at least 4 elements
/// int * unique_count_output; // empty array of 1 element
///
/// size_t temporary_storage_size_bytes;
/// void * temporary_storage_ptr = nullptr;
/// // Get required size of the temporary storage
/// rocprim::reduce_by_key(
/// temporary_storage_ptr, temporary_storage_size_bytes,
/// keys_input, values_input, input_size,
/// unique_output, aggregates_output, unique_count_output
/// );
///
/// // allocate temporary storage
/// cudaMalloc(&temporary_storage_ptr, temporary_storage_size_bytes);
///
/// // perform reduction
/// rocprim::reduce_by_key(
/// temporary_storage_ptr, temporary_storage_size_bytes,
/// keys_input, values_input, input_size,
/// unique_output, aggregates_output, unique_count_output
/// );
/// // unique_output: [1, 2, 10, 88]
/// // aggregates_output: [6, 4, 18, 8]
/// // unique_count_output: [4]
/// \endcode
/// \endparblock
template<
    class Config = default_config,
    class KeysInputIterator,
    class ValuesInputIterator,
    class UniqueOutputIterator,
    class AggregatesOutputIterator,
    class UniqueCountOutputIterator,
    class BinaryFunction
        = ::rocprim::plus<typename std::iterator_traits<ValuesInputIterator>::value_type>,
    class KeyCompareFunction
        = ::rocprim::equal_to<typename std::iterator_traits<KeysInputIterator>::value_type>
>
inline cudaError_t reduce_by_key(void*                     temporary_storage,
                                 size_t&                   storage_size,
                                 KeysInputIterator         keys_input,
                                 ValuesInputIterator       values_input,
                                 unsigned int              size,
                                 UniqueOutputIterator      unique_output,
                                 AggregatesOutputIterator  aggregates_output,
                                 UniqueCountOutputIterator unique_count_output,
                                 BinaryFunction            reduce_op = BinaryFunction(),
                                 KeyCompareFunction        key_compare_op = KeyCompareFunction(),
                                 cudaStream_t              stream = 0,
                                 bool                      debug_synchronous = false)
{
    // Public entry point only: every argument is forwarded unchanged to the
    // implementation in the detail namespace, and its status is returned as-is.
    const cudaError_t status = detail::reduce_by_key_impl<Config>(temporary_storage,
                                                                  storage_size,
                                                                  keys_input,
                                                                  values_input,
                                                                  size,
                                                                  unique_output,
                                                                  aggregates_output,
                                                                  unique_count_output,
                                                                  reduce_op,
                                                                  key_compare_op,
                                                                  stream,
                                                                  debug_synchronous);
    return status;
}
/// @}
// end of group devicemodule
END_ROCPRIM_NAMESPACE
#endif // ROCPRIM_DEVICE_DEVICE_REDUCE_BY_KEY_HPP_
// Copyright (c) 2018-2019 Advanced Micro Devices, Inc. All rights reserved.
//
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to deal
// in the Software without restriction, including without limitation the rights
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
// copies of the Software, and to permit persons to whom the Software is
// furnished to do so, subject to the following conditions:
//
// The above copyright notice and this permission notice shall be included in
// all copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
// THE SOFTWARE.
#ifndef ROCPRIM_DEVICE_DEVICE_REDUCE_BY_KEY_CONFIG_HPP_
#define ROCPRIM_DEVICE_DEVICE_REDUCE_BY_KEY_CONFIG_HPP_
#include <type_traits>
#include "../config.hpp"
#include "../detail/various.hpp"
#include "config_types.hpp"
/// \addtogroup primitivesmodule_deviceconfigs
/// @{
BEGIN_ROCPRIM_NAMESPACE
/// \brief Configuration of device-level reduce-by-key operation.
///
/// \tparam ScanConfig - configuration of carry-outs scan kernel. Must be \p kernel_config.
/// \tparam ReduceConfig - configuration of the main reduce-by-key kernel. Must be \p kernel_config.
template<class ScanConfig, class ReduceConfig>
struct reduce_by_key_config
{
    /// \brief Kernel configuration used for the carry-outs scan kernel.
    using scan = ScanConfig;

    /// \brief Kernel configuration used for the main reduce-by-key kernel.
    using reduce = ReduceConfig;
};
namespace detail
{
// Reduce-by-key tuning that default_reduce_by_key_config selects for target architecture 803.
template<class Key, class Value>
struct reduce_by_key_config_803
{
    // ceiling_div of the combined key + value size by the size of two ints.
    static constexpr unsigned int item_scale
        = ::rocprim::detail::ceiling_div<unsigned int>(sizeof(Key) + sizeof(Value),
                                                       2 * sizeof(int));

    // Configuration of the carry-outs scan kernel.
    // NOTE(review): the kernel_config arguments are presumably <block size, items per thread>
    // - confirm against config_types.hpp.
    using scan = kernel_config<256, 4>;

    // Keys and values of at most 8 bytes each use the fixed kernel_config<256, 7>.
    // Anything larger takes its first parameter from limit_block_size and a second
    // parameter of 15 / item_scale, clamped to at least 1.
    using type = select_type<
        select_type_case<
            (sizeof(Key) <= 8 && sizeof(Value) <= 8),
            reduce_by_key_config<scan, kernel_config<256, 7>>
        >,
        reduce_by_key_config<
            scan,
            kernel_config<
                limit_block_size<256U, sizeof(Key) + sizeof(Value), ROCPRIM_WARP_SIZE_64>::value,
                ::rocprim::max(1u, 15u / item_scale)
            >
        >
    >;
};
// Reduce-by-key tuning that default_reduce_by_key_config selects for target architecture 900;
// it is also the last argument passed to select_arch there.
template<class Key, class Value>
struct reduce_by_key_config_900
{
    // ceiling_div of the combined key + value size by the size of two ints.
    static constexpr unsigned int item_scale
        = ::rocprim::detail::ceiling_div<unsigned int>(sizeof(Key) + sizeof(Value),
                                                       2 * sizeof(int));

    // Configuration of the carry-outs scan kernel.
    // NOTE(review): the kernel_config arguments are presumably <block size, items per thread>
    // - confirm against config_types.hpp.
    using scan = kernel_config<256, 2>;

    // Keys and values of at most 8 bytes each use the fixed kernel_config<256, 10>.
    // Anything larger takes its first parameter from limit_block_size and a second
    // parameter of 15 / item_scale, clamped to at least 1.
    using type = select_type<
        select_type_case<
            (sizeof(Key) <= 8 && sizeof(Value) <= 8),
            reduce_by_key_config<scan, kernel_config<256, 10>>
        >,
        reduce_by_key_config<
            scan,
            kernel_config<
                limit_block_size<256U, sizeof(Key) + sizeof(Value), ROCPRIM_WARP_SIZE_64>::value,
                ::rocprim::max(1u, 15u / item_scale)
            >
        >
    >;
};
// TODO: We need to update these parameters
// Reduce-by-key tuning that default_reduce_by_key_config selects for ROCPRIM_ARCH_90a.
// The values currently match reduce_by_key_config_900.
template<class Key, class Value>
struct reduce_by_key_config_90a
{
    // ceiling_div of the combined key + value size by the size of two ints.
    static constexpr unsigned int item_scale
        = ::rocprim::detail::ceiling_div<unsigned int>(sizeof(Key) + sizeof(Value),
                                                       2 * sizeof(int));

    // Configuration of the carry-outs scan kernel.
    // NOTE(review): the kernel_config arguments are presumably <block size, items per thread>
    // - confirm against config_types.hpp.
    using scan = kernel_config<256, 2>;

    // Keys and values of at most 8 bytes each use the fixed kernel_config<256, 10>.
    // Anything larger takes its first parameter from limit_block_size and a second
    // parameter of 15 / item_scale, clamped to at least 1.
    using type = select_type<
        select_type_case<
            (sizeof(Key) <= 8 && sizeof(Value) <= 8),
            reduce_by_key_config<scan, kernel_config<256, 10>>
        >,
        reduce_by_key_config<
            scan,
            kernel_config<
                limit_block_size<256U, sizeof(Key) + sizeof(Value), ROCPRIM_WARP_SIZE_64>::value,
                ::rocprim::max(1u, 15u / item_scale)
            >
        >
    >;
};
// TODO: We need to update these parameters
// Reduce-by-key tuning that default_reduce_by_key_config selects for target architecture 1030.
// Unlike the other architecture structs it passes ROCPRIM_WARP_SIZE_32 to limit_block_size.
template<class Key, class Value>
struct reduce_by_key_config_1030
{
    // ceiling_div of the combined key + value size by the size of two ints.
    static constexpr unsigned int item_scale
        = ::rocprim::detail::ceiling_div<unsigned int>(sizeof(Key) + sizeof(Value),
                                                       2 * sizeof(int));

    // Configuration of the carry-outs scan kernel.
    // NOTE(review): the kernel_config arguments are presumably <block size, items per thread>
    // - confirm against config_types.hpp.
    using scan = kernel_config<256, 2>;

    // Keys and values of at most 8 bytes each use the fixed kernel_config<256, 10>.
    // Anything larger takes its first parameter from limit_block_size and a second
    // parameter of 15 / item_scale, clamped to at least 1.
    using type = select_type<
        select_type_case<
            (sizeof(Key) <= 8 && sizeof(Value) <= 8),
            reduce_by_key_config<scan, kernel_config<256, 10>>
        >,
        reduce_by_key_config<
            scan,
            kernel_config<
                limit_block_size<256U, sizeof(Key) + sizeof(Value), ROCPRIM_WARP_SIZE_32>::value,
                ::rocprim::max(1u, 15u / item_scale)
            >
        >
    >;
};
// Associates a target architecture id with one of the reduce_by_key_config_* structs.
// NOTE(review): the trailing reduce_by_key_config_900 argument looks like the fallback used
// when TargetArch matches none of the listed cases - confirm against select_arch.
template<unsigned int TargetArch, class Key, class Value>
struct default_reduce_by_key_config
    : select_arch<
          TargetArch,
          select_arch_case<803, reduce_by_key_config_803<Key, Value>>,
          select_arch_case<900, reduce_by_key_config_900<Key, Value>>,
          select_arch_case<ROCPRIM_ARCH_90a, reduce_by_key_config_90a<Key, Value>>,
          select_arch_case<1030, reduce_by_key_config_1030<Key, Value>>,
          reduce_by_key_config_900<Key, Value>
      >
{
};
} // end namespace detail
END_ROCPRIM_NAMESPACE
/// @}
// end of group primitivesmodule_deviceconfigs
#endif // ROCPRIM_DEVICE_DEVICE_REDUCE_BY_KEY_CONFIG_HPP_
// Copyright (c) 2018-2019 Advanced Micro Devices, Inc. All rights reserved.
//
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to deal
// in the Software without restriction, including without limitation the rights
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
// copies of the Software, and to permit persons to whom the Software is
// furnished to do so, subject to the following conditions:
//
// The above copyright notice and this permission notice shall be included in
// all copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
// THE SOFTWARE.
#ifndef ROCPRIM_DEVICE_DEVICE_REDUCE_CONFIG_HPP_
#define ROCPRIM_DEVICE_DEVICE_REDUCE_CONFIG_HPP_
#include <type_traits>
#include "../config.hpp"
#include "../detail/various.hpp"
#include "../block/block_reduce.hpp"
#include "config_types.hpp"
#include "detail/device_config_helper.hpp"
/// \addtogroup primitivesmodule_deviceconfigs
/// @{
BEGIN_ROCPRIM_NAMESPACE
namespace detail
{
// Reduce tuning that default_reduce_config selects for target architecture 803.
template<class Value>
struct reduce_config_803
{
    // ceiling_div of the value size by the size of an int.
    static constexpr unsigned int item_scale
        = ::rocprim::detail::ceiling_div<unsigned int>(sizeof(Value), sizeof(int));

    // First parameter comes from limit_block_size, the second is 16 / item_scale clamped
    // to at least 1, and the block reduction algorithm is using_warp_reduce.
    using type = reduce_config<
        limit_block_size<256U, sizeof(Value), ROCPRIM_WARP_SIZE_64>::value,
        ::rocprim::max(1u, 16u / item_scale),
        ::rocprim::block_reduce_algorithm::using_warp_reduce
    >;
};
// Reduce tuning that default_reduce_config selects for target architecture 900;
// it is also the last argument passed to select_arch there.
template<class Value>
struct reduce_config_900
{
    // ceiling_div of the value size by the size of an int.
    static constexpr unsigned int item_scale
        = ::rocprim::detail::ceiling_div<unsigned int>(sizeof(Value), sizeof(int));

    // First parameter comes from limit_block_size, the second is 16 / item_scale clamped
    // to at least 1, and the block reduction algorithm is using_warp_reduce.
    using type = reduce_config<
        limit_block_size<256U, sizeof(Value), ROCPRIM_WARP_SIZE_64>::value,
        ::rocprim::max(1u, 16u / item_scale),
        ::rocprim::block_reduce_algorithm::using_warp_reduce
    >;
};
// TODO: We need to update these parameters
// Reduce tuning that default_reduce_config selects for ROCPRIM_ARCH_90a.
// The values currently match reduce_config_900.
template<class Value>
struct reduce_config_90a
{
    // ceiling_div of the value size by the size of an int.
    static constexpr unsigned int item_scale
        = ::rocprim::detail::ceiling_div<unsigned int>(sizeof(Value), sizeof(int));

    // First parameter comes from limit_block_size, the second is 16 / item_scale clamped
    // to at least 1, and the block reduction algorithm is using_warp_reduce.
    using type = reduce_config<
        limit_block_size<256U, sizeof(Value), ROCPRIM_WARP_SIZE_64>::value,
        ::rocprim::max(1u, 16u / item_scale),
        ::rocprim::block_reduce_algorithm::using_warp_reduce
    >;
};
// TODO: We need to update these parameters
// Reduce tuning that default_reduce_config selects for target architecture 1030.
// Unlike the other architecture structs it passes ROCPRIM_WARP_SIZE_32 to limit_block_size.
template<class Value>
struct reduce_config_1030
{
    // ceiling_div of the value size by the size of an int.
    static constexpr unsigned int item_scale
        = ::rocprim::detail::ceiling_div<unsigned int>(sizeof(Value), sizeof(int));

    // First parameter comes from limit_block_size, the second is 16 / item_scale clamped
    // to at least 1, and the block reduction algorithm is using_warp_reduce.
    using type = reduce_config<
        limit_block_size<256U, sizeof(Value), ROCPRIM_WARP_SIZE_32>::value,
        ::rocprim::max(1u, 16u / item_scale),
        ::rocprim::block_reduce_algorithm::using_warp_reduce
    >;
};
// Associates a target architecture id with one of the reduce_config_* structs.
// NOTE(review): the trailing reduce_config_900 argument looks like the fallback used when
// TargetArch matches none of the listed cases - confirm against select_arch.
template<unsigned int TargetArch, class Value>
struct default_reduce_config
    : select_arch<
          TargetArch,
          select_arch_case<803, reduce_config_803<Value>>,
          select_arch_case<900, reduce_config_900<Value>>,
          select_arch_case<ROCPRIM_ARCH_90a, reduce_config_90a<Value>>,
          select_arch_case<1030, reduce_config_1030<Value>>,
          reduce_config_900<Value>
      >
{
};
} // end namespace detail
END_ROCPRIM_NAMESPACE
/// @}
// end of group primitivesmodule_deviceconfigs
#endif // ROCPRIM_DEVICE_DEVICE_REDUCE_CONFIG_HPP_
// Copyright (c) 2018-2020 Advanced Micro Devices, Inc. All rights reserved.
//
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to deal
// in the Software without restriction, including without limitation the rights
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
// copies of the Software, and to permit persons to whom the Software is
// furnished to do so, subject to the following conditions:
//
// The above copyright notice and this permission notice shall be included in
// all copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
// THE SOFTWARE.
#ifndef ROCPRIM_DEVICE_DEVICE_RUN_LENGTH_ENCODE_HPP_
#define ROCPRIM_DEVICE_DEVICE_RUN_LENGTH_ENCODE_HPP_
#include <type_traits>
#include <iterator>
#include "../config.hpp"
#include "../detail/various.hpp"
#include "../iterator/constant_iterator.hpp"
#include "../iterator/counting_iterator.hpp"
#include "../iterator/discard_iterator.hpp"
#include "../iterator/zip_iterator.hpp"
#include "device_run_length_encode_config.hpp"
#include "device_reduce_by_key.hpp"
#include "device_select.hpp"
BEGIN_ROCPRIM_NAMESPACE
/// \addtogroup devicemodule
/// @{
namespace detail
{
// Error-check / debug-timing helper for the host-side launch code in this header.
// The macro expands textually in the caller's scope and relies on these caller locals:
//   error             - status of the preceding call; returned from the enclosing
//                       function when it is not cudaSuccess.
//   debug_synchronous - when true, the macro prints `name(size)`, synchronizes `stream`
//                       (returning any synchronization error), then prints the time
//                       elapsed since `start` in milliseconds.
//   stream            - the stream that is synchronized in debug mode.
// The inner `auto error` intentionally shadows the caller's `error` within the debug branch.
// The expansion is a braced block, so call sites are written without a trailing semicolon.
// Keep comments outside the macro body: a `//` comment on a backslash-continued line
// would swallow the line that follows it.
#define ROCPRIM_DETAIL_HIP_SYNC_AND_RETURN_ON_ERROR(name, size, start) \
{ \
if(error != cudaSuccess) return error; \
if(debug_synchronous) \
{ \
std::cout << name << "(" << size << ")"; \
auto error = cudaStreamSynchronize(stream); \
if(error != cudaSuccess) return error; \
auto end = std::chrono::high_resolution_clock::now(); \
auto d = std::chrono::duration_cast<std::chrono::duration<double>>(end - start); \
std::cout << " " << d.count() * 1000 << " ms" << '\n'; \
} \
}
} // end detail namespace
/// \brief Parallel run-length encoding for device level.
///
/// run_length_encode function performs a device-wide run-length encoding of runs (groups)
/// of consecutive values. The first value of each run is copied to \p unique_output and
/// the length of the run is written to \p counts_output.
/// The total number of runs is written to \p runs_count_output.
///
/// \par Overview
/// * Returns the required size of \p temporary_storage in \p storage_size
/// if \p temporary_storage is a null pointer.
/// * Range specified by \p input must have at least \p size elements.
/// * Range specified by \p runs_count_output must have at least 1 element.
/// * Ranges specified by \p unique_output and \p counts_output must have at least
/// <tt>*runs_count_output</tt> (i.e. the number of runs) elements.
///
/// \tparam Config - [optional] configuration of the primitive. It can be \p run_length_encode_config or
/// a custom class with the same members.
/// \tparam InputIterator - random-access iterator type of the input range. Must meet the
/// requirements of a C++ InputIterator concept. It can be a simple pointer type.
/// \tparam UniqueOutputIterator - random-access iterator type of the output range. Must meet the
/// requirements of a C++ OutputIterator concept. It can be a simple pointer type.
/// \tparam CountsOutputIterator - random-access iterator type of the output range. Must meet the
/// requirements of a C++ OutputIterator concept. It can be a simple pointer type.
/// \tparam RunsCountOutputIterator - random-access iterator type of the output range. Must meet the
/// requirements of a C++ OutputIterator concept. It can be a simple pointer type.
///
/// \param [in] temporary_storage - pointer to a device-accessible temporary storage. When
/// a null pointer is passed, the required allocation size (in bytes) is written to
/// \p storage_size and function returns without performing the operation.
/// \param [in,out] storage_size - reference to a size (in bytes) of \p temporary_storage.
/// \param [in] input - iterator to the first element in the range of values.
/// \param [in] size - number of elements in the input range.
/// \param [out] unique_output - iterator to the first element in the output range of unique values.
/// \param [out] counts_output - iterator to the first element in the output range of lengths.
/// \param [out] runs_count_output - iterator to total number of runs.
/// \param [in] stream - [optional] HIP stream object. Default is \p 0 (default stream).
/// \param [in] debug_synchronous - [optional] If true, synchronization after every kernel
/// launch is forced in order to check for errors. Default value is \p false.
///
/// \returns \p cudaSuccess (\p 0) after successful operation; otherwise a HIP runtime error of
/// type \p cudaError_t.
///
/// \par Example
/// \parblock
/// In this example a device-level run-length encoding operation is performed on an array of
/// integer values.
///
/// \code{.cpp}
/// #include <rocprim/rocprim.hpp>
///
/// // Prepare input and output (declare pointers, allocate device memory etc.)
/// size_t input_size; // e.g., 8
/// int * input; // e.g., [1, 1, 1, 2, 10, 10, 10, 88]
/// int * unique_output; // empty array of at least 4 elements
/// int * counts_output; // empty array of at least 4 elements
/// int * runs_count_output; // empty array of 1 element
///
/// size_t temporary_storage_size_bytes;
/// void * temporary_storage_ptr = nullptr;
/// // Get required size of the temporary storage
/// rocprim::run_length_encode(
/// temporary_storage_ptr, temporary_storage_size_bytes,
/// input, input_size,
/// unique_output, counts_output, runs_count_output
/// );
///
/// // allocate temporary storage
/// cudaMalloc(&temporary_storage_ptr, temporary_storage_size_bytes);
///
/// // perform encoding
/// rocprim::run_length_encode(
/// temporary_storage_ptr, temporary_storage_size_bytes,
/// input, input_size,
/// unique_output, counts_output, runs_count_output
/// );
/// // unique_output: [1, 2, 10, 88]
/// // counts_output: [3, 1, 3, 1]
/// // runs_count_output: [4]
/// \endcode
/// \endparblock
template<
    class Config = default_config,
    class InputIterator,
    class UniqueOutputIterator,
    class CountsOutputIterator,
    class RunsCountOutputIterator
>
inline cudaError_t run_length_encode(void*                   temporary_storage,
                                     size_t&                 storage_size,
                                     InputIterator           input,
                                     unsigned int            size,
                                     UniqueOutputIterator    unique_output,
                                     CountsOutputIterator    counts_output,
                                     RunsCountOutputIterator runs_count_output,
                                     cudaStream_t            stream = 0,
                                     bool                    debug_synchronous = false)
{
    using value_type      = typename std::iterator_traits<InputIterator>::value_type;
    using length_type     = unsigned int;
    using selected_config = detail::default_or_custom_config<
        Config,
        detail::default_run_length_encode_config
    >;

    // Run-length encoding is expressed as a reduce-by-key: the input values act as keys,
    // every element contributes a constant 1, and the per-key sum is the run length.
    const auto ones     = make_constant_iterator<length_type>(1);
    const auto add_op   = ::rocprim::plus<length_type>();
    const auto same_run = ::rocprim::equal_to<value_type>();

    return ::rocprim::reduce_by_key<typename selected_config::reduce_by_key>(temporary_storage,
                                                                             storage_size,
                                                                             input,
                                                                             ones,
                                                                             size,
                                                                             unique_output,
                                                                             counts_output,
                                                                             runs_count_output,
                                                                             add_op,
                                                                             same_run,
                                                                             stream,
                                                                             debug_synchronous);
}
/// \brief Parallel run-length encoding of non-trivial runs for device level.
///
/// run_length_encode_non_trivial_runs function performs a device-wide run-length encoding of
/// non-trivial runs (groups) of consecutive values (groups of more than one element).
/// The offset of the first value of each non-trivial run is copied to \p offsets_output and
/// the length of the run (the count of elements) is written to \p counts_output.
/// The total number of non-trivial runs is written to \p runs_count_output.
///
/// \par Overview
/// * Returns the required size of \p temporary_storage in \p storage_size
/// if \p temporary_storage is a null pointer.
/// * Range specified by \p input must have at least \p size elements.
/// * Range specified by \p runs_count_output must have at least 1 element.
/// * Ranges specified by \p offsets_output and \p counts_output must have at least
/// <tt>*runs_count_output</tt> (i.e. the number of non-trivial runs) elements.
///
/// \tparam Config - [optional] configuration of the primitive. It can be \p run_length_encode_config or
/// a custom class with the same members.
/// \tparam InputIterator - random-access iterator type of the input range. Must meet the
/// requirements of a C++ InputIterator concept. It can be a simple pointer type.
/// \tparam OffsetsOutputIterator - random-access iterator type of the output range. Must meet the
/// requirements of a C++ OutputIterator concept. It can be a simple pointer type.
/// \tparam CountsOutputIterator - random-access iterator type of the output range. Must meet the
/// requirements of a C++ OutputIterator concept. It can be a simple pointer type.
/// \tparam RunsCountOutputIterator - random-access iterator type of the output range. Must meet the
/// requirements of a C++ OutputIterator concept. It can be a simple pointer type.
///
/// \param [in] temporary_storage - pointer to a device-accessible temporary storage. When
/// a null pointer is passed, the required allocation size (in bytes) is written to
/// \p storage_size and function returns without performing the operation.
/// \param [in,out] storage_size - reference to a size (in bytes) of \p temporary_storage.
/// \param [in] input - iterator to the first element in the range of values.
/// \param [in] size - number of elements in the input range.
/// \param [out] offsets_output - iterator to the first element in the output range of offsets.
/// \param [out] counts_output - iterator to the first element in the output range of lengths.
/// \param [out] runs_count_output - iterator to total number of runs.
/// \param [in] stream - [optional] HIP stream object. Default is \p 0 (default stream).
/// \param [in] debug_synchronous - [optional] If true, synchronization after every kernel
/// launch is forced in order to check for errors. Default value is \p false.
///
/// \returns \p cudaSuccess (\p 0) after successful operation; otherwise a HIP runtime error of
/// type \p cudaError_t.
///
/// \par Example
/// \parblock
/// In this example a device-level run-length encoding of non-trivial runs is performed on an array of
/// integer values.
///
/// \code{.cpp}
/// #include <rocprim/rocprim.hpp>
///
/// // Prepare input and output (declare pointers, allocate device memory etc.)
/// size_t input_size; // e.g., 8
/// int * input; // e.g., [1, 1, 1, 2, 10, 10, 10, 88]
/// int * offsets_output; // empty array of at least 2 elements
/// int * counts_output; // empty array of at least 2 elements
/// int * runs_count_output; // empty array of 1 element
///
/// size_t temporary_storage_size_bytes;
/// void * temporary_storage_ptr = nullptr;
/// // Get required size of the temporary storage
/// rocprim::run_length_encode_non_trivial_runs(
/// temporary_storage_ptr, temporary_storage_size_bytes,
/// input, input_size,
/// offsets_output, counts_output, runs_count_output
/// );
///
/// // allocate temporary storage
/// cudaMalloc(&temporary_storage_ptr, temporary_storage_size_bytes);
///
/// // perform encoding
/// rocprim::run_length_encode_non_trivial_runs(
/// temporary_storage_ptr, temporary_storage_size_bytes,
/// input, input_size,
/// offsets_output, counts_output, runs_count_output
/// );
/// // offsets_output: [0, 4]
/// // counts_output: [3, 3]
/// // runs_count_output: [2]
/// \endcode
/// \endparblock
template<
    class Config = default_config,
    class InputIterator,
    class OffsetsOutputIterator,
    class CountsOutputIterator,
    class RunsCountOutputIterator
>
inline
cudaError_t run_length_encode_non_trivial_runs(void * temporary_storage,
                                               size_t& storage_size,
                                               InputIterator input,
                                               unsigned int size,
                                               OffsetsOutputIterator offsets_output,
                                               CountsOutputIterator counts_output,
                                               RunsCountOutputIterator runs_count_output,
                                               cudaStream_t stream = 0,
                                               bool debug_synchronous = false)
{
    using input_type = typename std::iterator_traits<InputIterator>::value_type;
    using offset_type = unsigned int;
    using count_type = unsigned int;
    using offset_count_pair = typename ::rocprim::tuple<offset_type, count_type>;

    using config = detail::default_or_custom_config<
        Config,
        detail::default_run_length_encode_config
    >;

    cudaError_t error;

    // Combines two (offset, count) pairs belonging to the same run.
    auto reduce_op = [] __device__ (const offset_count_pair& a, const offset_count_pair& b)
    {
        return offset_count_pair(
            ::rocprim::get<0>(a), // Always use offset of the first item of the run
            ::rocprim::get<1>(a) + ::rocprim::get<1>(b) // Number of items in the run
        );
    };
    // Keeps only runs with more than one element.
    auto non_trivial_runs_select_op = [] __device__ (const offset_count_pair& a)
    {
        return ::rocprim::get<1>(a) > 1;
    };

    // These stay null during the size queries; they are pointed into temporary_storage
    // further below once the caller has supplied it.
    offset_type * offsets_tmp = nullptr;
    count_type * counts_tmp = nullptr;
    count_type * all_runs_count_tmp = nullptr;

    // Calculate size of temporary storage for reduce_by_key operation
    size_t reduce_by_key_bytes = 0;
    error = ::rocprim::reduce_by_key<typename config::reduce_by_key>(
        nullptr, reduce_by_key_bytes,
        input,
        ::rocprim::make_zip_iterator(
            ::rocprim::make_tuple(
                ::rocprim::make_counting_iterator<offset_type>(0),
                ::rocprim::make_constant_iterator<count_type>(1)
            )
        ),
        size,
        ::rocprim::make_discard_iterator(),
        ::rocprim::make_zip_iterator(::rocprim::make_tuple(offsets_tmp, counts_tmp)),
        all_runs_count_tmp,
        reduce_op, ::rocprim::equal_to<input_type>(),
        stream, debug_synchronous
    );
    if(error != cudaSuccess) return error;
    reduce_by_key_bytes = ::rocprim::detail::align_size(reduce_by_key_bytes);

    // Calculate size of temporary storage for select operation
    // (queried with `size`, the largest number of runs the first stage can produce)
    size_t select_bytes = 0;
    error = ::rocprim::select<typename config::select>(
        nullptr, select_bytes,
        ::rocprim::make_zip_iterator(::rocprim::make_tuple(offsets_tmp, counts_tmp)),
        ::rocprim::make_zip_iterator(::rocprim::make_tuple(offsets_output, counts_output)),
        runs_count_output,
        size,
        non_trivial_runs_select_op,
        stream, debug_synchronous
    );
    if(error != cudaSuccess) return error;
    select_bytes = ::rocprim::detail::align_size(select_bytes);

    const size_t offsets_tmp_bytes = ::rocprim::detail::align_size(size * sizeof(offset_type));
    const size_t counts_tmp_bytes = ::rocprim::detail::align_size(size * sizeof(count_type));
    const size_t all_runs_count_tmp_bytes = sizeof(count_type);

    // Layout of temporary_storage:
    //   [scratch shared by reduce_by_key and select][offsets_tmp][counts_tmp][all_runs_count_tmp]
    // The two primitives run one after the other, so the scratch area only needs the
    // larger of their two requirements.
    if(temporary_storage == nullptr)
    {
        storage_size = ::rocprim::max(reduce_by_key_bytes, select_bytes) +
            offsets_tmp_bytes + counts_tmp_bytes + all_runs_count_tmp_bytes;
        return cudaSuccess;
    }

    char * ptr = reinterpret_cast<char *>(temporary_storage);
    ptr += ::rocprim::max(reduce_by_key_bytes, select_bytes);
    offsets_tmp = reinterpret_cast<offset_type *>(ptr);
    ptr += offsets_tmp_bytes;
    counts_tmp = reinterpret_cast<count_type *>(ptr);
    ptr += counts_tmp_bytes;
    all_runs_count_tmp = reinterpret_cast<count_type *>(ptr);

    std::chrono::high_resolution_clock::time_point start;

    // Stage 1: compute (offset, length) for every run, including trivial ones.
    if(debug_synchronous) start = std::chrono::high_resolution_clock::now();
    error = ::rocprim::reduce_by_key<typename config::reduce_by_key>(
        temporary_storage, reduce_by_key_bytes,
        input,
        ::rocprim::make_zip_iterator(
            ::rocprim::make_tuple(
                ::rocprim::make_counting_iterator<offset_type>(0),
                ::rocprim::make_constant_iterator<count_type>(1)
            )
        ),
        size,
        ::rocprim::make_discard_iterator(), // Ignore unique output
        ::rocprim::make_zip_iterator(::rocprim::make_tuple(offsets_tmp, counts_tmp)),
        all_runs_count_tmp,
        reduce_op, ::rocprim::equal_to<input_type>(),
        stream, debug_synchronous
    );
    ROCPRIM_DETAIL_HIP_SYNC_AND_RETURN_ON_ERROR("rocprim::reduce_by_key", size, start)

    // Read count of all runs (including trivial runs). The host needs this value as the
    // input size of the select stage, so the asynchronous copy is followed by a stream
    // synchronization.
    count_type all_runs_count = 0;
    error = cudaMemcpyAsync(&all_runs_count, all_runs_count_tmp, sizeof(count_type), cudaMemcpyDeviceToHost, stream);
    if(error != cudaSuccess) return error;
    error = cudaStreamSynchronize(stream);
    // A failed synchronization means all_runs_count may not hold the copied value;
    // report the error instead of letting the next call overwrite it.
    if(error != cudaSuccess) return error;

    // Stage 2: select non-trivial runs
    if(debug_synchronous) start = std::chrono::high_resolution_clock::now();
    error = ::rocprim::select<typename config::select>(
        temporary_storage, select_bytes,
        ::rocprim::make_zip_iterator(::rocprim::make_tuple(offsets_tmp, counts_tmp)),
        ::rocprim::make_zip_iterator(::rocprim::make_tuple(offsets_output, counts_output)),
        runs_count_output,
        all_runs_count,
        non_trivial_runs_select_op,
        stream, debug_synchronous
    );
    ROCPRIM_DETAIL_HIP_SYNC_AND_RETURN_ON_ERROR("rocprim::select", all_runs_count, start)

    return cudaSuccess;
}
#undef ROCPRIM_DETAIL_HIP_SYNC_AND_RETURN_ON_ERROR
/// @}
// end of group devicemodule
END_ROCPRIM_NAMESPACE
#endif // ROCPRIM_DEVICE_DEVICE_RUN_LENGTH_ENCODE_HPP_
// Copyright (c) 2018-2022 Advanced Micro Devices, Inc. All rights reserved.
//
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to deal
// in the Software without restriction, including without limitation the rights
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
// copies of the Software, and to permit persons to whom the Software is
// furnished to do so, subject to the following conditions:
//
// The above copyright notice and this permission notice shall be included in
// all copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
// THE SOFTWARE.
#ifndef ROCPRIM_DEVICE_DEVICE_RUN_LENGTH_ENCODE_CONFIG_HPP_
#define ROCPRIM_DEVICE_DEVICE_RUN_LENGTH_ENCODE_CONFIG_HPP_
#include <type_traits>
#include "../config.hpp"
#include "../detail/various.hpp"
#include "config_types.hpp"
/// \addtogroup primitivesmodule_deviceconfigs
/// @{
BEGIN_ROCPRIM_NAMESPACE
/// \brief Configuration of the device-level run-length encoding operation.
///
/// The structure only carries two nested configurations, exposed through the
/// member aliases \p reduce_by_key and \p select.
///
/// \tparam ReduceByKeyConfig - configuration of the device-level reduce-by-key operation.
/// Must be \p reduce_by_key_config or \p default_config.
/// \tparam SelectConfig - configuration of the device-level select operation.
/// Must be \p select_config or \p default_config. Defaults to \p default_config.
template<class ReduceByKeyConfig, class SelectConfig = default_config>
struct run_length_encode_config
{
    /// \brief Configuration of the device-level reduce-by-key operation.
    using reduce_by_key = ReduceByKeyConfig;

    /// \brief Configuration of the device-level select operation.
    using select = SelectConfig;
};
namespace detail
{
// Run-length encode configuration in which both nested configurations
// (reduce-by-key and select) are left as default_config.
using default_run_length_encode_config = run_length_encode_config<default_config, default_config>;
} // end namespace detail
END_ROCPRIM_NAMESPACE
/// @}
// end of group primitivesmodule_deviceconfigs
#endif // ROCPRIM_DEVICE_DEVICE_RUN_LENGTH_ENCODE_CONFIG_HPP_
// Copyright (c) 2017-2022 Advanced Micro Devices, Inc. All rights reserved.
//
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to deal
// in the Software without restriction, including without limitation the rights
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
// copies of the Software, and to permit persons to whom the Software is
// furnished to do so, subject to the following conditions:
//
// The above copyright notice and this permission notice shall be included in
// all copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
// THE SOFTWARE.
#ifndef ROCPRIM_DEVICE_DEVICE_SCAN_HPP_
#define ROCPRIM_DEVICE_DEVICE_SCAN_HPP_
#include <type_traits>
#include <iterator>
#include "../config.hpp"
#include "../functional.hpp"
#include "../type_traits.hpp"
#include "../types/future_value.hpp"
#include "../detail/various.hpp"
#include "device_scan_config.hpp"
#include "device_transform.hpp"
#include "detail/device_scan_common.hpp"
#include "detail/device_scan_lookback.hpp"
#include "detail/device_scan_reduce_then_scan.hpp"
BEGIN_ROCPRIM_NAMESPACE
/// \addtogroup devicemodule
/// @{
namespace detail
{
// Single kernel scan (the whole scan is performed by one thread block only).
//
// Thin kernel entry point: every argument is handed to
// single_scan_kernel_impl<Exclusive, Config>; the initial value is first passed
// through get_input_value() before being forwarded.
template<bool Exclusive,
         class Config,
         class InputIterator,
         class OutputIterator,
         class BinaryFunction,
         class InitValueType>
ROCPRIM_KERNEL
__launch_bounds__(ROCPRIM_DEFAULT_MAX_BLOCK_SIZE)
void single_scan_kernel(InputIterator       input,
                        const size_t        size,
                        const InitValueType initial_value,
                        OutputIterator      output,
                        BinaryFunction      scan_op)
{
    single_scan_kernel_impl<Exclusive, Config>(input,
                                               size,
                                               get_input_value(initial_value),
                                               output,
                                               scan_op);
}
// Reduce-then-scan kernels

// First pass of the reduce-then-scan scheme: calculates the block prefixes
// that final_scan_kernel later uses when performing the block scan operations.
// All arguments are forwarded unchanged to block_reduce_kernel_impl<Config>.
template<class Config,
         class InputIterator,
         class BinaryFunction,
         class ResultType>
ROCPRIM_KERNEL
__launch_bounds__(ROCPRIM_DEFAULT_MAX_BLOCK_SIZE)
void block_reduce_kernel(InputIterator  input,
                         BinaryFunction scan_op,
                         ResultType *   block_prefixes)
{
    block_reduce_kernel_impl<Config>(input, scan_op, block_prefixes);
}
// Second pass of the reduce-then-scan scheme.
//
// Kernel entry point that forwards to final_scan_kernel_impl<Exclusive, Config>;
// the initial value is passed through get_input_value() first. The four
// trailing parameters are optional and default to "not used"
// (null pointers / false); the host side sets them when one scan is split
// into several launches.
template<bool Exclusive,
         class Config,
         class InputIterator,
         class OutputIterator,
         class BinaryFunction,
         class InitValueType>
ROCPRIM_KERNEL
__launch_bounds__(ROCPRIM_DEFAULT_MAX_BLOCK_SIZE)
void final_scan_kernel(InputIterator                input,
                       const size_t                 size,
                       OutputIterator               output,
                       const InitValueType          initial_value,
                       BinaryFunction               scan_op,
                       input_type_t<InitValueType>* block_prefixes,
                       input_type_t<InitValueType>* previous_last_element = nullptr,
                       input_type_t<InitValueType>* new_last_element = nullptr,
                       bool                         override_first_value = false,
                       bool                         save_last_value = false)
{
    final_scan_kernel_impl<Exclusive, Config>(input,
                                              size,
                                              output,
                                              get_input_value(initial_value),
                                              scan_op,
                                              block_prefixes,
                                              previous_last_element,
                                              new_last_element,
                                              override_first_value,
                                              save_last_value);
}
// Single pass (look-back kernels)

// Kernel entry point of the single-pass look-back scan. It forwards every
// argument to lookback_scan_kernel_impl<Exclusive, Config>, passing the
// initial value through get_input_value() first. The four trailing
// parameters are optional and default to "not used" (null pointers / false);
// the host side sets them when one scan is split into several launches.
template<bool Exclusive,
         class Config,
         class InputIterator,
         class OutputIterator,
         class BinaryFunction,
         class InitValueType,
         class LookBackScanState>
ROCPRIM_KERNEL
__launch_bounds__(ROCPRIM_DEFAULT_MAX_BLOCK_SIZE)
void lookback_scan_kernel(InputIterator                  input,
                          OutputIterator                 output,
                          const size_t                   size,
                          const InitValueType            initial_value,
                          BinaryFunction                 scan_op,
                          LookBackScanState              lookback_scan_state,
                          const unsigned int             number_of_blocks,
                          ordered_block_id<unsigned int> ordered_bid,
                          input_type_t<InitValueType>*   previous_last_element = nullptr,
                          input_type_t<InitValueType>*   new_last_element = nullptr,
                          bool                           override_first_value = false,
                          bool                           save_last_value = false)
{
    lookback_scan_kernel_impl<Exclusive, Config>(input,
                                                 output,
                                                 size,
                                                 get_input_value(initial_value),
                                                 scan_op,
                                                 lookback_scan_state,
                                                 number_of_blocks,
                                                 ordered_bid,
                                                 previous_last_element,
                                                 new_last_element,
                                                 override_first_value,
                                                 save_last_value);
}
// Debug helper. When `debug_synchronous` is true it prints `name(size)`,
// synchronizes `stream`, and prints the time elapsed since `start` in
// milliseconds. If the synchronization fails, the error is returned from the
// ENCLOSING function.
//
// Expects `debug_synchronous` and `stream` to be visible at the expansion site.
// The body is wrapped in do { } while(false) so that the expansion is exactly
// one statement and is safe inside unbraced if/else; the invocation must be
// followed by a semicolon.
#define ROCPRIM_DETAIL_HIP_SYNC(name, size, start) \
    do \
    { \
        if(debug_synchronous) \
        { \
            std::cout << name << "(" << size << ")"; \
            auto error = cudaStreamSynchronize(stream); \
            if(error != cudaSuccess) return error; \
            auto end = std::chrono::high_resolution_clock::now(); \
            auto d = std::chrono::duration_cast<std::chrono::duration<double>>(end - start); \
            std::cout << " " << d.count() * 1000 << " ms" << '\n'; \
        } \
    } while(false)
// Post-launch check. Returns from the ENCLOSING function with the error
// reported by cudaGetLastError(), if there is one. When `debug_synchronous`
// is true it additionally prints `name(size)`, synchronizes `stream`
// (returning on failure) and prints the time elapsed since `start` in
// milliseconds.
//
// Expects `debug_synchronous` and `stream` to be visible at the expansion site.
// The expansion is a braced block, so call sites in this file use it both with
// and without a trailing semicolon.
#define ROCPRIM_DETAIL_HIP_SYNC_AND_RETURN_ON_ERROR(name, size, start) \
{ \
auto _error = cudaGetLastError(); \
if(_error != cudaSuccess) return _error; \
if(debug_synchronous) \
{ \
std::cout << name << "(" << size << ")"; \
auto __error = cudaStreamSynchronize(stream); \
if(__error != cudaSuccess) return __error; \
auto _end = std::chrono::high_resolution_clock::now(); \
auto _d = std::chrono::duration_cast<std::chrono::duration<double>>(_end - start); \
std::cout << " " << _d.count() * 1000 << " ms" << '\n'; \
} \
}
// Device-level scan, reduce-then-scan variant (enabled when
// Config::use_lookback is false).
//
// Behaviour:
//  * temporary_storage == nullptr: writes the required number of bytes to
//    storage_size (never 0) and returns cudaSuccess without launching anything.
//  * size == 0: returns cudaSuccess without launching anything.
//  * input fits in one block: launches single_scan_kernel.
//  * otherwise: the input is processed in chunks of at most
//    `aligned_size_limit` items. For each chunk block_reduce_kernel computes
//    the block prefixes, a nested inclusive scan_impl scans them in place, and
//    final_scan_kernel produces the output. When more than one chunk is
//    needed, the last value of a chunk is carried to the next one through two
//    slots placed after the prefixes in temporary storage.
//
// \param temporary_storage  device-accessible scratch memory, or nullptr to query the size.
// \param storage_size       in/out size of temporary_storage in bytes.
// \param input              iterator to the first element to scan.
// \param output             iterator to the first element of the output range.
// \param initial_value      initial value (forwarded to the kernels as-is).
// \param size               number of elements to scan.
// \param scan_op            binary scan operator.
// \param stream             stream on which all kernels are launched.
// \param debug_synchronous  when true, prints launch information and
//                           synchronizes after every kernel launch.
// \returns cudaSuccess, or the first runtime error encountered.
template<
    bool Exclusive,
    class Config,
    class InputIterator,
    class OutputIterator,
    class InitValueType,
    class BinaryFunction
>
inline
auto scan_impl(void * temporary_storage,
               size_t& storage_size,
               InputIterator input,
               OutputIterator output,
               const InitValueType initial_value,
               const size_t size,
               BinaryFunction scan_op,
               const cudaStream_t stream,
               bool debug_synchronous)
    -> typename std::enable_if<!Config::use_lookback, cudaError_t>::type
{
    using config = Config;
    using real_init_value_type = input_type_t<InitValueType>;

    constexpr unsigned int block_size = config::block_size;
    constexpr unsigned int items_per_thread = config::items_per_thread;
    constexpr auto items_per_block = block_size * items_per_thread;

    // Largest chunk handled by one sequence of launches: size_limit rounded
    // down to a whole number of blocks, but never less than one block.
    static constexpr size_t size_limit = config::size_limit;
    static constexpr size_t aligned_size_limit = ::rocprim::max<size_t>(size_limit - size_limit % items_per_block, items_per_block);
    size_t limited_size = std::min<size_t>(size, aligned_size_limit);
    const bool use_limited_size = limited_size == aligned_size_limit;

    size_t nested_prefixes_size_bytes = scan_get_temporary_storage_bytes<real_init_value_type>(limited_size, items_per_block);

    // Calculate required temporary storage
    if(temporary_storage == nullptr)
    {
        storage_size = nested_prefixes_size_bytes;
        if(use_limited_size)
            storage_size += 4 * sizeof(real_init_value_type);
        // Make sure user won't try to allocate 0 bytes memory, because
        // cudaMalloc will return nullptr when size is zero.
        storage_size = storage_size == 0 ? 4 : storage_size;
        return cudaSuccess;
    }

    // Start point for time measurements
    std::chrono::high_resolution_clock::time_point start;

    auto number_of_blocks = (size + items_per_block - 1)/items_per_block;
    if( number_of_blocks == 0u )
        return cudaSuccess;

    if(number_of_blocks > 1)
    {
        // size_t keeps the launch count in the same type as the loop index
        // and as `size`, so the division result is never truncated.
        const size_t number_of_launch = (size + limited_size - 1)/limited_size;
        for (size_t i = 0, offset = 0; i < number_of_launch; i++, offset+=limited_size )
        {
            size_t current_size = std::min<size_t>(size - offset, limited_size);
            number_of_blocks = (current_size + items_per_block - 1)/items_per_block;

            if(debug_synchronous)
            {
                std::cout << "use_limited_size " << use_limited_size << '\n';
                std::cout << "number_of_launch " << number_of_launch << '\n';
                std::cout << "index " << i << '\n';
                std::cout << "aligned_size_limit " << aligned_size_limit << '\n';
                std::cout << "size " << current_size << '\n';
                std::cout << "block_size " << block_size << '\n';
                std::cout << "number of blocks " << number_of_blocks << '\n';
                std::cout << "items_per_block " << items_per_block << '\n';
                std::cout.flush();
            }

            // Pointer to array with block_prefixes
            char * ptr = reinterpret_cast<char *>(temporary_storage);
            real_init_value_type* block_prefixes = reinterpret_cast<real_init_value_type*>(ptr);

            // Slots used to carry the last value from one launch to the next;
            // they live right after the (nested) prefix storage.
            real_init_value_type* previous_last_element = nullptr;
            real_init_value_type* new_last_element = nullptr;
            if(use_limited_size)
            {
                ptr += nested_prefixes_size_bytes;
                previous_last_element = reinterpret_cast<real_init_value_type*>(ptr);
                ptr += sizeof(real_init_value_type);
                new_last_element = reinterpret_cast<real_init_value_type*>(ptr);
            }

            // Grid size for block_reduce_kernel, we don't need to calculate reduction
            // of the last block as it will never be used as prefix for other blocks
            auto grid_size = number_of_blocks - 1;
            if( grid_size != 0 )
            {
                if(debug_synchronous) start = std::chrono::high_resolution_clock::now();
                detail::block_reduce_kernel<
                    config, InputIterator, BinaryFunction, real_init_value_type
                >
                <<<dim3(grid_size), dim3(block_size), 0, stream>>>(
                    input + offset, scan_op, block_prefixes
                );
                ROCPRIM_DETAIL_HIP_SYNC_AND_RETURN_ON_ERROR("block_reduce_kernel", current_size, start)

                // Inclusive scan, not the first launch: apply scan_op to the
                // value carried over from the previous launch and the first
                // block prefix, writing the result back to block_prefixes[0].
                if( !Exclusive && i > 0 )
                {
                    cudaError_t error = ::rocprim::transform(
                        previous_last_element, block_prefixes, block_prefixes, 1,
                        scan_op, stream, debug_synchronous
                    );
                    if(error != cudaSuccess) return error;
                }

                // TODO: Performance may increase if for (number_of_blocks < 8192) (or some other
                // threshold) we would just use CPU to calculate prefixes.

                // Calculate size of temporary storage for nested device scan operation
                void * nested_temp_storage = static_cast<void*>(block_prefixes + number_of_blocks);
                auto nested_temp_storage_size = storage_size - (number_of_blocks * sizeof(real_init_value_type));

                if(debug_synchronous) start = std::chrono::high_resolution_clock::now();
                auto error = scan_impl<false, config>(
                    nested_temp_storage,
                    nested_temp_storage_size,
                    block_prefixes, // input
                    block_prefixes, // output
                    real_init_value_type(), // dummy initial value
                    number_of_blocks, // input size
                    scan_op,
                    stream,
                    debug_synchronous
                );
                if(error != cudaSuccess) return error;
                ROCPRIM_DETAIL_HIP_SYNC("nested_device_scan", number_of_blocks, start);
            }

            // Grid size for final_scan_kernel
            grid_size = number_of_blocks;

            if(debug_synchronous) start = std::chrono::high_resolution_clock::now();
            detail::final_scan_kernel<
                Exclusive, // flag for exclusive scan operation
                config, // kernel configuration (block size, ipt)
                InputIterator, OutputIterator,
                BinaryFunction, InitValueType
            >
            <<<dim3(grid_size), dim3(block_size), 0, stream>>>(
                input + offset,
                current_size,
                output + offset,
                initial_value,
                scan_op,
                block_prefixes,
                previous_last_element,
                new_last_element,
                i != size_t(0) && ((!Exclusive && number_of_blocks == 1) || Exclusive),
                number_of_launch > 1
            );
            // Report the size of this launch, like the other per-launch labels.
            ROCPRIM_DETAIL_HIP_SYNC_AND_RETURN_ON_ERROR("final_scan_kernel", current_size, start);

            // Swap the last_elements if it's necessary
            if(number_of_launch > 1)
            {
                cudaError_t error = ::rocprim::transform(
                    new_last_element, previous_last_element, 1,
                    ::rocprim::identity<real_init_value_type>(),
                    stream, debug_synchronous
                );
                if(error != cudaSuccess) return error;
            }
        }
    }
    else
    {
        if(debug_synchronous)
        {
            std::cout << "block_size " << block_size << '\n';
            std::cout << "number of blocks " << number_of_blocks << '\n';
            std::cout << "items_per_block " << items_per_block << '\n';
        }

        if(debug_synchronous) start = std::chrono::high_resolution_clock::now();
        detail::single_scan_kernel<
            Exclusive, // flag for exclusive scan operation
            config, // kernel configuration (block size, ipt)
            InputIterator, OutputIterator, BinaryFunction
        >
        <<<dim3(1), dim3(block_size), 0, stream>>>(
            input, size, initial_value, output, scan_op
        );
        ROCPRIM_DETAIL_HIP_SYNC_AND_RETURN_ON_ERROR("single_scan_kernel", size, start);
    }
    return cudaSuccess;
}
// Device-level scan, single-pass look-back variant (enabled when
// Config::use_lookback is true).
//
// Behaviour:
//  * temporary_storage == nullptr: writes the required number of bytes to
//    storage_size and returns cudaSuccess without launching anything.
//  * size == 0: returns cudaSuccess without launching anything.
//  * input fits in one block and is below the size limit: launches
//    single_scan_kernel.
//  * otherwise: the input is processed in chunks of at most
//    `aligned_size_limit` items. Each chunk launches
//    init_lookback_scan_state_kernel followed by lookback_scan_kernel. When
//    more than one chunk is needed, the last value of a chunk is carried to
//    the next one through two slots located at the very end of the temporary
//    storage (their position is derived from storage_size).
//
// Temporary storage layout: look-back scan state, then the ordered block id,
// then (only when the size limit is reached) the two carry slots.
//
// \param temporary_storage  device-accessible scratch memory, or nullptr to query the size.
// \param storage_size       in/out size of temporary_storage in bytes.
// \param input              iterator to the first element to scan.
// \param output             iterator to the first element of the output range.
// \param initial_value      initial value (forwarded to the kernels as-is).
// \param size               number of elements to scan.
// \param scan_op            binary scan operator.
// \param stream             stream on which all kernels are launched.
// \param debug_synchronous  when true, prints launch information and
//                           synchronizes after every kernel launch.
// \returns cudaSuccess, or the first runtime error encountered.
template<
    bool Exclusive,
    class Config,
    class InputIterator,
    class OutputIterator,
    class InitValueType,
    class BinaryFunction
>
inline
auto scan_impl(void * temporary_storage,
               size_t& storage_size,
               InputIterator input,
               OutputIterator output,
               const InitValueType initial_value,
               const size_t size,
               BinaryFunction scan_op,
               const cudaStream_t stream,
               bool debug_synchronous)
    -> typename std::enable_if<Config::use_lookback, cudaError_t>::type
{
    using config = Config;
    using real_init_value_type = input_type_t<InitValueType>;

    using scan_state_type = detail::lookback_scan_state<real_init_value_type>;
    using scan_state_with_sleep_type = detail::lookback_scan_state<real_init_value_type, true>;
    using ordered_block_id_type = detail::ordered_block_id<unsigned int>;

    constexpr unsigned int block_size = config::block_size;
    constexpr unsigned int items_per_thread = config::items_per_thread;
    constexpr auto items_per_block = block_size * items_per_thread;

    // Largest chunk handled by one pair of launches: size_limit rounded down
    // to a whole number of blocks, but never less than one block.
    static constexpr size_t size_limit = config::size_limit;
    static constexpr size_t aligned_size_limit = ::rocprim::max<size_t>(size_limit - size_limit % items_per_block, items_per_block);
    size_t limited_size = std::min<size_t>(size, aligned_size_limit);
    const bool use_limited_size = limited_size == aligned_size_limit;
    unsigned int number_of_blocks = (limited_size + items_per_block - 1)/items_per_block;

    // Calculate required temporary storage
    size_t scan_state_bytes = ::rocprim::detail::align_size(
        // This is valid even with scan_state_with_sleep_type
        scan_state_type::get_storage_size(number_of_blocks)
    );
    size_t ordered_block_id_bytes = ordered_block_id_type::get_storage_size();
    if(temporary_storage == nullptr)
    {
        // storage_size is never zero
        storage_size = scan_state_bytes + ordered_block_id_bytes;
        if(use_limited_size)
            storage_size += 2 * sizeof(real_init_value_type);
        return cudaSuccess;
    }

    // Start point for time measurements
    std::chrono::high_resolution_clock::time_point start;

    if( number_of_blocks == 0u )
        return cudaSuccess;

    if(number_of_blocks > 1 || use_limited_size)
    {
        // Create and initialize lookback_scan_state obj
        auto scan_state = scan_state_type::create(temporary_storage, number_of_blocks);
        // NOTE(review): scan_state_with_sleep is never used below; it looks
        // like a leftover of an architecture-specific code path. It is kept
        // because create() is not visible here - confirm it has no side
        // effects before removing it.
        auto scan_state_with_sleep = scan_state_with_sleep_type::create(temporary_storage, number_of_blocks);

        // Create and initialize ordered_block_id obj
        auto ptr = reinterpret_cast<char*>(temporary_storage);
        auto ordered_bid = ordered_block_id_type::create(
            reinterpret_cast<ordered_block_id_type::id_type*>(ptr + scan_state_bytes)
        );

        // The last element: two slots at the very end of the temporary
        // storage ([previous_last_element][new_last_element]).
        real_init_value_type* previous_last_element = nullptr;
        real_init_value_type* new_last_element = nullptr;
        if(use_limited_size)
        {
            ptr += storage_size - sizeof(real_init_value_type);
            new_last_element = reinterpret_cast<real_init_value_type*>(ptr);
            ptr -= sizeof(real_init_value_type);
            previous_last_element = reinterpret_cast<real_init_value_type*>(ptr);
        }

        // The device-property query that used to sit here was removed: its
        // results were never read, so it only added host-side overhead to
        // every call.

        if(debug_synchronous) start = std::chrono::high_resolution_clock::now();

        size_t number_of_launch = (size + limited_size - 1)/limited_size;
        for (size_t i = 0, offset = 0; i < number_of_launch; i++, offset+=limited_size )
        {
            size_t current_size = std::min<size_t>(size - offset, limited_size);
            number_of_blocks = (current_size + items_per_block - 1)/items_per_block;
            // The init kernel is launched with enough threads to cover one
            // thread per block of the scan.
            auto grid_size = (number_of_blocks + block_size - 1)/block_size;

            if(debug_synchronous)
            {
                std::cout << "use_limited_size " << use_limited_size << '\n';
                std::cout << "aligned_size_limit " << aligned_size_limit << '\n';
                std::cout << "number_of_launch " << number_of_launch << '\n';
                std::cout << "index " << i << '\n';
                std::cout << "size " << current_size << '\n';
                std::cout << "block_size " << block_size << '\n';
                std::cout << "number of blocks " << number_of_blocks << '\n';
                std::cout << "items_per_block " << items_per_block << '\n';
            }

            init_lookback_scan_state_kernel<scan_state_type>
            <<<dim3(grid_size), dim3(block_size), 0, stream>>>(
                scan_state, number_of_blocks, ordered_bid
            );
            ROCPRIM_DETAIL_HIP_SYNC_AND_RETURN_ON_ERROR("init_lookback_scan_state_kernel", number_of_blocks, start)

            if(debug_synchronous) start = std::chrono::high_resolution_clock::now();
            grid_size = number_of_blocks;

            if(debug_synchronous)
            {
                std::cout << "use_limited_size " << use_limited_size << '\n';
                std::cout << "aligned_size_limit " << aligned_size_limit << '\n';
                std::cout << "size " << current_size << '\n';
                std::cout << "block_size " << block_size << '\n';
                std::cout << "number of blocks " << number_of_blocks << '\n';
                std::cout << "items_per_block " << items_per_block << '\n';
            }

            lookback_scan_kernel<
                Exclusive, // flag for exclusive scan operation
                config, // kernel configuration (block size, ipt)
                InputIterator, OutputIterator,
                BinaryFunction, InitValueType, scan_state_type
            >
            <<<dim3(grid_size), dim3(block_size), 0, stream>>>(
                input + offset, output + offset, current_size, initial_value,
                scan_op, scan_state, number_of_blocks, ordered_bid,
                previous_last_element, new_last_element,
                i != size_t(0), number_of_launch > 1
            );
            ROCPRIM_DETAIL_HIP_SYNC_AND_RETURN_ON_ERROR("lookback_scan_kernel", current_size, start)

            // Swap the last_elements
            if(number_of_launch > 1)
            {
                cudaError_t error = ::rocprim::transform(
                    new_last_element, previous_last_element, 1,
                    ::rocprim::identity<real_init_value_type>(),
                    stream, debug_synchronous
                );
                if(error != cudaSuccess) return error;
            }
        }
    }
    else
    {
        if(debug_synchronous)
        {
            std::cout << "size " << size << '\n';
            std::cout << "block_size " << block_size << '\n';
            std::cout << "number of blocks " << number_of_blocks << '\n';
            std::cout << "items_per_block " << items_per_block << '\n';
        }

        if(debug_synchronous) start = std::chrono::high_resolution_clock::now();
        single_scan_kernel<
            Exclusive, // flag for exclusive scan operation
            config, // kernel configuration (block size, ipt)
            InputIterator, OutputIterator, BinaryFunction
        >
        <<<dim3(1), dim3(block_size), 0, stream>>>(
            input, size, initial_value, output, scan_op
        );
        ROCPRIM_DETAIL_HIP_SYNC_AND_RETURN_ON_ERROR("single_scan_kernel", size, start);
    }
    return cudaSuccess;
}
#undef ROCPRIM_DETAIL_HIP_SYNC_AND_RETURN_ON_ERROR
#undef ROCPRIM_DETAIL_HIP_SYNC
} // end of detail namespace
/// \brief Parallel inclusive scan primitive for device level.
///
/// inclusive_scan function performs a device-wide inclusive prefix scan operation
/// using binary \p scan_op operator.
///
/// \par Overview
/// * Supports non-commutative scan operators. However, a scan operator should be
/// associative. When used with non-associative functions the results may be non-deterministic
/// and/or vary in precision.
/// * Returns the required size of \p temporary_storage in \p storage_size
/// if \p temporary_storage is a null pointer.
/// * Ranges specified by \p input and \p output must have at least \p size elements.
/// * By default, the input type is used for accumulation. A custom type
/// can be specified using <tt>rocprim::transform_iterator</tt>, see the example below.
///
/// \tparam Config - [optional] configuration of the primitive. It can be \p scan_config or
/// a custom class with the same members.
/// \tparam InputIterator - random-access iterator type of the input range. Must meet the
/// requirements of a C++ InputIterator concept. It can be a simple pointer type.
/// \tparam OutputIterator - random-access iterator type of the output range. Must meet the
/// requirements of a C++ OutputIterator concept. It can be a simple pointer type.
/// \tparam BinaryFunction - type of binary function used for scan. Default type
/// is \p rocprim::plus<T>, where \p T is a \p value_type of \p InputIterator.
///
/// \param [in] temporary_storage - pointer to a device-accessible temporary storage. When
/// a null pointer is passed, the required allocation size (in bytes) is written to
/// \p storage_size and function returns without performing the scan operation.
/// \param [in,out] storage_size - reference to a size (in bytes) of \p temporary_storage.
/// \param [in] input - iterator to the first element in the range to scan.
/// \param [out] output - iterator to the first element in the output range. It can be
/// same as \p input.
/// \param [in] size - number of elements in the input range.
/// \param [in] scan_op - binary operation function object that will be used for scan.
/// The signature of the function should be equivalent to the following:
/// <tt>T f(const T &a, const T &b);</tt>. The signature does not need to have
/// <tt>const &</tt>, but function object must not modify the objects passed to it.
/// Default is BinaryFunction().
/// \param [in] stream - [optional] HIP stream object. Default is \p 0 (default stream).
/// \param [in] debug_synchronous - [optional] If true, synchronization after every kernel
/// launch is forced in order to check for errors. Default value is \p false.
///
/// \returns \p cudaSuccess (\p 0) after successful scan; otherwise a HIP runtime error of
/// type \p cudaError_t.
///
/// \par Example
/// \parblock
/// In this example a device-level inclusive sum operation is performed on an array of
/// integer values (<tt>short</tt>s are scanned into <tt>int</tt>s).
///
/// \code{.cpp}
/// #include <rocprim/rocprim.hpp>
///
/// // Prepare input and output (declare pointers, allocate device memory etc.)
/// size_t input_size; // e.g., 8
/// short * input; // e.g., [1, 2, 3, 4, 5, 6, 7, 8]
/// int * output; // empty array of 8 elements
///
/// size_t temporary_storage_size_bytes;
/// void * temporary_storage_ptr = nullptr;
/// // Get required size of the temporary storage
/// rocprim::inclusive_scan(
/// temporary_storage_ptr, temporary_storage_size_bytes,
/// input, output, input_size, rocprim::plus<int>()
/// );
///
/// // allocate temporary storage
/// cudaMalloc(&temporary_storage_ptr, temporary_storage_size_bytes);
///
/// // perform scan
/// rocprim::inclusive_scan(
/// temporary_storage_ptr, temporary_storage_size_bytes,
/// input, output, input_size, rocprim::plus<int>()
/// );
/// // output: [1, 3, 6, 10, 15, 21, 28, 36]
/// \endcode
///
/// The same example as above, but now a custom accumulator type is specified.
///
/// \code{.cpp}
/// #include <rocprim/rocprim.hpp>
///
/// size_t input_size;
/// short * input;
/// int * output;
///
/// // Use a transform iterator to specify a custom accumulator type
/// auto input_iterator = rocprim::make_transform_iterator(
/// input, [] __device__ (short in) { return static_cast<int>(in); });
///
/// size_t temporary_storage_size_bytes;
/// void * temporary_storage_ptr = nullptr;
/// // Use the transform iterator
/// rocprim::inclusive_scan(
/// temporary_storage_ptr, temporary_storage_size_bytes,
/// input_iterator, output, input_size, rocprim::plus<int>()
/// );
///
/// cudaMalloc(&temporary_storage_ptr, temporary_storage_size_bytes);
///
/// rocprim::inclusive_scan(
/// temporary_storage_ptr, temporary_storage_size_bytes,
/// input_iterator, output, input_size, rocprim::plus<int>()
/// );
/// \endcode
/// \endparblock
template<
    class Config = default_config,
    class InputIterator,
    class OutputIterator,
    class BinaryFunction = ::rocprim::plus<typename std::iterator_traits<InputIterator>::value_type>
>
inline
cudaError_t inclusive_scan(void * temporary_storage,
                           size_t& storage_size,
                           InputIterator input,
                           OutputIterator output,
                           const size_t size,
                           BinaryFunction scan_op = BinaryFunction(),
                           const cudaStream_t stream = 0,
                           bool debug_synchronous = false)
{
    // Element type of the input range: it picks the default tuning and is
    // also the type of the placeholder initial value passed below.
    using value_type = typename std::iterator_traits<InputIterator>::value_type;

    // Tuning used when the caller left Config as default_config.
    using default_tuning = detail::default_scan_config<ROCPRIM_TARGET_ARCH, value_type>;
    using selected_config = detail::default_or_custom_config<Config, default_tuning>;

    // An inclusive scan is the non-exclusive flavour of scan_impl.
    constexpr bool exclusive = false;

    return detail::scan_impl<exclusive, selected_config>(
        temporary_storage,
        storage_size,
        input,
        output,
        value_type(), // dummy initial value (not used by the inclusive scan)
        size,
        scan_op,
        stream,
        debug_synchronous
    );
}
/// \brief Parallel exclusive scan primitive for device level.
///
/// exclusive_scan function performs a device-wide exclusive prefix scan operation
/// using binary \p scan_op operator.
///
/// \par Overview
/// * Supports non-commutative scan operators. However, a scan operator should be
/// associative. When used with non-associative functions the results may be non-deterministic
/// and/or vary in precision.
/// * Returns the required size of \p temporary_storage in \p storage_size
/// if \p temporary_storage is a null pointer.
/// * Ranges specified by \p input and \p output must have at least \p size elements.
///
/// \tparam Config - [optional] configuration of the primitive. It can be \p scan_config or
/// a custom class with the same members.
/// \tparam InputIterator - random-access iterator type of the input range. Must meet the
/// requirements of a C++ InputIterator concept. It can be a simple pointer type.
/// \tparam OutputIterator - random-access iterator type of the output range. Must meet the
/// requirements of a C++ OutputIterator concept. It can be a simple pointer type.
/// \tparam InitValueType - type of the initial value.
/// \tparam BinaryFunction - type of binary function used for scan. Default type
/// is \p rocprim::plus<T>, where \p T is a \p value_type of \p InputIterator.
///
/// \param [in] temporary_storage - pointer to a device-accessible temporary storage. When
/// a null pointer is passed, the required allocation size (in bytes) is written to
/// \p storage_size and function returns without performing the scan operation.
/// \param [in,out] storage_size - reference to a size (in bytes) of \p temporary_storage.
/// \param [in] input - iterator to the first element in the range to scan.
/// \param [out] output - iterator to the first element in the output range. It can be
/// same as \p input.
/// \param [in] initial_value - initial value to start the scan.
/// A rocprim::future_value may be passed to use a value that will be later computed.
/// \param [in] size - number of elements in the input range.
/// \param [in] scan_op - binary operation function object that will be used for scan.
/// The signature of the function should be equivalent to the following:
/// <tt>T f(const T &a, const T &b);</tt>. The signature does not need to have
/// <tt>const &</tt>, but function object must not modify the objects passed to it.
/// The default value is \p BinaryFunction().
/// \param [in] stream - [optional] HIP stream object. The default is \p 0 (default stream).
/// \param [in] debug_synchronous - [optional] If true, synchronization after every kernel
/// launch is forced in order to check for errors. The default value is \p false.
///
/// \returns \p cudaSuccess (\p 0) after successful scan; otherwise a HIP runtime error of
/// type \p cudaError_t.
///
/// \par Example
/// \parblock
/// In this example a device-level exclusive min-scan operation is performed on an array of
/// integer values (<tt>short</tt>s are scanned into <tt>int</tt>s) using custom operator.
///
/// \code{.cpp}
/// #include <rocprim/rocprim.hpp>
///
/// // custom scan function
/// auto min_op =
/// [] __device__ (int a, int b) -> int
/// {
/// return a < b ? a : b;
/// };
///
/// // Prepare input and output (declare pointers, allocate device memory etc.)
/// size_t input_size; // e.g., 8
/// short * input; // e.g., [4, 7, 6, 2, 5, 1, 3, 8]
/// int * output; // empty array of 8 elements
/// int start_value; // e.g., 9
///
/// size_t temporary_storage_size_bytes;
/// void * temporary_storage_ptr = nullptr;
/// // Get required size of the temporary storage
/// rocprim::exclusive_scan(
/// temporary_storage_ptr, temporary_storage_size_bytes,
/// input, output, start_value, input_size, min_op
/// );
///
/// // allocate temporary storage
/// cudaMalloc(&temporary_storage_ptr, temporary_storage_size_bytes);
///
/// // perform scan
/// rocprim::exclusive_scan(
/// temporary_storage_ptr, temporary_storage_size_bytes,
/// input, output, start_value, input_size, min_op
/// );
/// // output: [9, 4, 4, 4, 2, 2, 1, 1]
/// \endcode
/// \endparblock
template<
    class Config = default_config,
    class InputIterator,
    class OutputIterator,
    class InitValueType,
    class BinaryFunction = ::rocprim::plus<typename std::iterator_traits<InputIterator>::value_type>
>
inline
cudaError_t exclusive_scan(void * temporary_storage,
                           size_t& storage_size,
                           InputIterator input,
                           OutputIterator output,
                           const InitValueType initial_value,
                           const size_t size,
                           BinaryFunction scan_op = BinaryFunction(),
                           const cudaStream_t stream = 0,
                           bool debug_synchronous = false)
{
    // Type obtained from InitValueType through detail::input_type_t; it
    // selects the default tuning.
    using init_value_type = detail::input_type_t<InitValueType>;

    // Tuning used when the caller left Config as default_config.
    using default_tuning = detail::default_scan_config<ROCPRIM_TARGET_ARCH, init_value_type>;
    using selected_config = detail::default_or_custom_config<Config, default_tuning>;

    constexpr bool exclusive = true;

    return detail::scan_impl<exclusive, selected_config>(
        temporary_storage,
        storage_size,
        input,
        output,
        initial_value,
        size,
        scan_op,
        stream,
        debug_synchronous
    );
}
/// @}
// end of group devicemodule
END_ROCPRIM_NAMESPACE
#endif // ROCPRIM_DEVICE_DEVICE_SCAN_HPP_
// Copyright (c) 2017-2022 Advanced Micro Devices, Inc. All rights reserved.
//
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to deal
// in the Software without restriction, including without limitation the rights
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
// copies of the Software, and to permit persons to whom the Software is
// furnished to do so, subject to the following conditions:
//
// The above copyright notice and this permission notice shall be included in
// all copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
// THE SOFTWARE.
#ifndef ROCPRIM_DEVICE_DEVICE_SCAN_BY_KEY_HPP_
#define ROCPRIM_DEVICE_DEVICE_SCAN_BY_KEY_HPP_
#include "detail/device_scan_by_key.hpp"
#include "detail/lookback_scan_state.hpp"
#include "detail/ordered_block_id.hpp"
#include "config_types.hpp"
#include "device_scan_by_key_config.hpp"
#include "../config.hpp"
#include "../detail/various.hpp"
#include "../functional.hpp"
#include "../types/future_value.hpp"
#include "../types/tuple.hpp"
#include <cuda_runtime.h>
#include <iostream>
#include <iterator>
#include <type_traits>
BEGIN_ROCPRIM_NAMESPACE
namespace detail
{
// Kernel entry point of the device-level scan-by-key. Launch bounds come from
// Config::block_size. Every argument is forwarded to
// device_scan_by_key_kernel_impl<Exclusive, Config>; the initial value is
// passed through get_input_value() first.
template <bool Exclusive,
          typename Config,
          typename KeyInputIterator,
          typename InputIterator,
          typename OutputIterator,
          typename InitialValueType,
          typename CompareFunction,
          typename BinaryFunction,
          typename LookbackScanState,
          typename ResultType>
void __global__ __launch_bounds__(Config::block_size) device_scan_by_key_kernel(
    const KeyInputIterator                          keys,
    const InputIterator                             values,
    const OutputIterator                            output,
    const InitialValueType                          initial_value,
    const CompareFunction                           compare,
    const BinaryFunction                            scan_op,
    const LookbackScanState                         scan_state,
    const size_t                                    size,
    const size_t                                    starting_block,
    const size_t                                    number_of_blocks,
    const ordered_block_id<unsigned int>            ordered_bid,
    const ::rocprim::tuple<ResultType, bool>* const previous_last_value)
{
    device_scan_by_key_kernel_impl<Exclusive, Config>(
        keys, values, output,
        get_input_value(initial_value),
        compare, scan_op, scan_state,
        size, starting_block, number_of_blocks,
        ordered_bid, previous_last_value);
}
// Post-launch check. Returns from the ENCLOSING function with the error
// reported by cudaGetLastError(), if there is one. When `debug_synchronous`
// is true it additionally prints `name(size)`, synchronizes `stream`
// (returning on failure) and prints the time elapsed since `start` in
// milliseconds.
//
// Expects `debug_synchronous` and `stream` to be visible at the expansion site.
// The body is wrapped in do { } while(false), so the expansion is a single
// statement and the invocation must be followed by a semicolon.
#define ROCPRIM_DETAIL_HIP_SYNC_AND_RETURN_ON_ERROR(name, size, start) \
do \
{ \
auto _error = cudaGetLastError(); \
if(_error != cudaSuccess) \
return _error; \
if(debug_synchronous) \
{ \
std::cout << name << "(" << size << ")"; \
auto __error = cudaStreamSynchronize(stream); \
if(__error != cudaSuccess) \
return __error; \
auto _end = std::chrono::high_resolution_clock::now(); \
auto _d = std::chrono::duration_cast<std::chrono::duration<double>>(_end - start); \
std::cout << " " << _d.count() * 1000 << " ms" << '\n'; \
} \
} while(false)
/// \brief Host-side driver shared by inclusive_scan_by_key and exclusive_scan_by_key.
///
/// Two modes:
///  * \p temporary_storage is null: only the required temporary storage size is computed,
///    written to \p storage_size, and \p cudaSuccess is returned. Nothing is launched.
///  * otherwise: the input is processed in one or more launches of at most
///    \p aligned_size_limit items each. Every launch runs
///    init_lookback_scan_state_kernel followed by device_scan_by_key_kernel on \p stream.
///
/// Temporary storage layout as used below:
///   [ scan state (scan_state_bytes) | ordered block id | ... | wrapped_type (only if use_limited_size) ]
/// where the trailing wrapped_type slot occupies the last sizeof(wrapped_type) bytes of
/// the buffer as described by \p storage_size.
///
/// \tparam Exclusive - forwarded to device_scan_by_key_kernel.
/// \tparam Config - must provide \p block_size, \p items_per_thread and \p size_limit.
///
/// \param [in] temporary_storage - device storage, or null to query the size.
/// \param [in,out] storage_size - written in query mode; read in run mode to locate the
/// trailing previous_last_value slot.
/// \param [in] keys, input, output - iterators to the key, value and result ranges.
/// \param [in] initial_value - passed unchanged to device_scan_by_key_kernel.
/// \param [in] size - total number of items.
/// \param [in] scan_op, compare - forwarded to the kernel.
/// \param [in] stream - stream on which all kernels are launched.
/// \param [in] debug_synchronous - when true, prints launch parameters and timings to
/// std::cout and synchronizes \p stream after every kernel launch.
///
/// \returns \p cudaSuccess, or the first error reported by is_sleep_scan_state_used,
/// cudaGetLastError, or (debug mode only) cudaStreamSynchronize.
///
/// NOTE(review): this function uses std::chrono and std::min, but the include list of
/// this header names neither <chrono> nor <algorithm>; it presumably relies on
/// transitive includes - confirm.
template <bool Exclusive,
typename Config,
typename KeysInputIterator,
typename InputIterator,
typename OutputIterator,
typename InitValueType,
typename BinaryFunction,
typename CompareFunction>
inline cudaError_t scan_by_key_impl(void* const temporary_storage,
size_t& storage_size,
KeysInputIterator keys,
InputIterator input,
OutputIterator output,
const InitValueType initial_value,
const size_t size,
const BinaryFunction scan_op,
const CompareFunction compare,
const cudaStream_t stream,
const bool debug_synchronous)
{
using config = Config;
// Type of the initial value after input_type_t has been applied to InitValueType.
using real_init_value_type = input_type_t<InitValueType>;
// Element stored in the lookback scan state: a value paired with a bool.
using wrapped_type = ::rocprim::tuple<real_init_value_type, bool>;
using scan_state_type = detail::lookback_scan_state<wrapped_type>;
using scan_state_with_sleep_type = detail::lookback_scan_state<wrapped_type, true>;
using ordered_block_id_type = detail::ordered_block_id<unsigned int>;
constexpr unsigned int block_size = config::block_size;
constexpr unsigned int items_per_thread = config::items_per_thread;
constexpr auto items_per_block = block_size * items_per_thread;
static constexpr size_t size_limit = config::size_limit;
// size_limit rounded down to a multiple of items_per_block, but never less than one block.
static constexpr size_t aligned_size_limit
= ::rocprim::max<size_t>(size_limit - size_limit % items_per_block, items_per_block);
// Number of items handled by a single launch.
const size_t limited_size = std::min<size_t>(size, aligned_size_limit);
// True when the input reaches the per-launch limit; only then is the extra
// wrapped_type slot reserved and previous_last_value non-null.
const bool use_limited_size = limited_size == aligned_size_limit;
// Number of blocks in a single launch (or the only launch if it fits)
const unsigned int number_of_blocks = ceiling_div(limited_size, items_per_block);
// Calculate required temporary storage, this is valid even with scan_state_with_sleep_type
const size_t scan_state_bytes
= align_size(scan_state_type::get_storage_size(number_of_blocks));
// Size-query mode: report the storage requirement and do nothing else.
if(temporary_storage == nullptr)
{
const size_t ordered_block_id_bytes
= align_size(ordered_block_id_type::get_storage_size(), alignof(wrapped_type));
// storage_size is never zero
storage_size = scan_state_bytes + ordered_block_id_bytes
+ (use_limited_size ? sizeof(wrapped_type) : 0);
return cudaSuccess;
}
// No blocks to run (e.g. size == 0): nothing to launch.
if(number_of_blocks == 0u)
{
return cudaSuccess;
}
// Filled in by is_sleep_scan_state_used; a non-zero error code aborts the call.
bool use_sleep;
if(const cudaError_t error = is_sleep_scan_state_used(use_sleep))
{
return error;
}
// Call the provided function with either scan_state or scan_state_with_sleep based on
// the value of use_sleep. Both state objects are created over the same
// temporary_storage; only the selected one is passed to func.
auto with_scan_state
= [use_sleep,
scan_state = scan_state_type::create(temporary_storage, number_of_blocks),
scan_state_with_sleep = scan_state_with_sleep_type::create(
temporary_storage, number_of_blocks)](auto&& func) mutable -> decltype(auto) {
if(use_sleep)
{
return func(scan_state_with_sleep);
}
else
{
return func(scan_state);
}
};
// Create and initialize ordered_block_id obj
// It lives directly after the scan state bytes in the temporary storage.
auto* const ptr = static_cast<char*>(temporary_storage);
const auto ordered_bid = ordered_block_id_type::create(
reinterpret_cast<ordered_block_id_type::id_type*>(ptr + scan_state_bytes));
// The last element
// Stored in the final sizeof(wrapped_type) bytes of the buffer (per storage_size);
// null when the whole input fits below the per-launch limit.
auto* const previous_last_value
= use_limited_size
? reinterpret_cast<wrapped_type*>(ptr + storage_size - sizeof(wrapped_type))
: nullptr;
// Total number of blocks in all launches
const auto total_number_of_blocks = ceiling_div(size, items_per_block);
// limited_size is non-zero here because the number_of_blocks == 0 case returned above.
const size_t number_of_launch = ceiling_div(size, limited_size);
if(debug_synchronous)
{
std::cout << "----------------------------------\n";
std::cout << "size: " << size << '\n';
std::cout << "aligned_size_limit: " << aligned_size_limit << '\n';
std::cout << "use_limited_size: " << std::boolalpha << use_limited_size << '\n';
std::cout << "number_of_launch: " << number_of_launch << '\n';
std::cout << "block_size: " << block_size << '\n';
std::cout << "items_per_block: " << items_per_block << '\n';
std::cout << "----------------------------------\n";
}
// One iteration per launch; offset is the index of the first item of the launch.
for(size_t i = 0, offset = 0; i < number_of_launch; i++, offset += limited_size)
{
// The last launch may cover fewer than limited_size items.
const size_t current_size = std::min<size_t>(size - offset, limited_size);
const auto scan_blocks = ceiling_div(current_size, items_per_block);
// Grid for the init kernel: enough blocks of block_size threads to cover scan_blocks.
const auto init_grid_size = ceiling_div(scan_blocks, block_size);
// Start point for time measurements
std::chrono::high_resolution_clock::time_point start;
if(debug_synchronous)
{
std::cout << "index: " << i << '\n';
std::cout << "current_size: " << current_size << '\n';
std::cout << "number of blocks: " << scan_blocks << '\n';
start = std::chrono::high_resolution_clock::now();
}
// Reset the lookback state for this launch. From the second launch on, the
// previous_last_value slot is passed as well.
// NOTE(review): number_of_blocks - 1 is presumably the index of the last block of
// the previous launch - confirm against init_lookback_scan_state_kernel.
with_scan_state([&](const auto scan_state) {
init_lookback_scan_state_kernel<<<
dim3(init_grid_size),
dim3(block_size),
0,
stream>>>(
scan_state,
scan_blocks,
ordered_bid,
number_of_blocks - 1,
i > 0 ? previous_last_value : nullptr);
});
// May return from this function on a launch or synchronization error.
ROCPRIM_DETAIL_HIP_SYNC_AND_RETURN_ON_ERROR(
"init_lookback_scan_state_kernel", scan_blocks, start);
if(debug_synchronous)
{
start = std::chrono::high_resolution_clock::now();
}
// Scan this launch's slice. The iterators are advanced by offset, while the kernel
// still receives the total size, the index of this launch's first block
// (i * number_of_blocks) and the total block count over all launches.
with_scan_state([&](auto& scan_state) {
device_scan_by_key_kernel<Exclusive, config><<<
dim3(scan_blocks),
dim3(block_size),
0,
stream>>>(
keys + offset,
input + offset,
output + offset,
initial_value,
compare,
scan_op,
scan_state,
size,
i * number_of_blocks,
total_number_of_blocks,
ordered_bid,
i > 0 ? previous_last_value : nullptr);
});
// May return from this function on a launch or synchronization error.
ROCPRIM_DETAIL_HIP_SYNC_AND_RETURN_ON_ERROR(
"device_scan_by_key_kernel", current_size, start);
}
return cudaSuccess;
}
#undef ROCPRIM_DETAIL_HIP_SYNC_AND_RETURN_ON_ERROR
}
/// \addtogroup devicemodule
/// @{
/// \brief Parallel inclusive scan-by-key primitive for device level.
///
/// inclusive_scan_by_key function performs a device-wide inclusive prefix scan-by-key
/// operation using binary \p scan_op operator.
///
/// \par Overview
/// * Supports non-commutative scan operators. However, a scan operator should be
/// associative. When used with non-associative functions the results may be non-deterministic
/// and/or vary in precision.
/// * Returns the required size of \p temporary_storage in \p storage_size
/// if \p temporary_storage is a null pointer.
/// * Ranges specified by \p keys_input, \p values_input, and \p values_output must have
/// at least \p size elements.
///
/// \tparam Config - [optional] configuration of the primitive. It can be \p scan_by_key_config or
/// a custom class with the same members.
/// \tparam KeysInputIterator - random-access iterator type of the input range. It can be
/// a simple pointer type.
/// \tparam ValuesInputIterator - random-access iterator type of the input range. It can be
/// a simple pointer type.
/// \tparam ValuesOutputIterator - random-access iterator type of the output range. It can be
/// a simple pointer type.
/// \tparam BinaryFunction - type of binary function used for scan. Default type
/// is \p rocprim::plus<T>, where \p T is a \p value_type of \p ValuesInputIterator.
/// \tparam KeyCompareFunction - type of binary function used to determine keys equality. Default type
/// is \p rocprim::equal_to<T>, where \p T is a \p value_type of \p KeysInputIterator.
///
/// \param [in] temporary_storage - pointer to a device-accessible temporary storage. When
/// a null pointer is passed, the required allocation size (in bytes) is written to
/// \p storage_size and function returns without performing the scan operation.
/// \param [in,out] storage_size - reference to a size (in bytes) of \p temporary_storage.
/// \param [in] keys_input - iterator to the first element in the range of keys.
/// \param [in] values_input - iterator to the first element in the range of values to scan.
/// \param [out] values_output - iterator to the first element in the output value range.
/// \param [in] size - number of elements in the input range.
/// \param [in] scan_op - binary operation function object that will be used for scanning
/// input values.
/// The signature of the function should be equivalent to the following:
/// <tt>T f(const T &a, const T &b);</tt>. The signature does not need to have
/// <tt>const &</tt>, but function object must not modify the objects passed to it.
/// Default is BinaryFunction().
/// \param [in] key_compare_op - binary operation function object that will be used to determine keys equality.
/// The signature of the function should be equivalent to the following:
/// <tt>bool f(const T &a, const T &b);</tt>. The signature does not need to have
/// <tt>const &</tt>, but function object must not modify the objects passed to it.
/// Default is KeyCompareFunction().
/// \param [in] stream - [optional] HIP stream object. Default is \p 0 (default stream).
/// \param [in] debug_synchronous - [optional] If true, synchronization after every kernel
/// launch is forced in order to check for errors. Default value is \p false.
///
/// \returns \p cudaSuccess (\p 0) after successful scan; otherwise a HIP runtime error of
/// type \p cudaError_t.
///
/// \par Example
/// \parblock
/// In this example a device-level inclusive sum-by-key operation is performed on an array of
/// integer values (<tt>short</tt>s are scanned into <tt>int</tt>s).
///
/// \code{.cpp}
/// #include <rocprim/rocprim.hpp>
///
/// // Prepare input and output (declare pointers, allocate device memory etc.)
/// size_t size; // e.g., 8
/// int * keys_input; // e.g., [1, 1, 2, 2, 3, 3, 3, 5]
/// short * values_input; // e.g., [1, 2, 3, 4, 5, 6, 7, 8]
/// int * values_output; // empty array of 8 elements
///
/// size_t temporary_storage_size_bytes;
/// void * temporary_storage_ptr = nullptr;
/// // Get required size of the temporary storage
/// rocprim::inclusive_scan_by_key(
/// temporary_storage_ptr, temporary_storage_size_bytes,
/// keys_input, values_input,
/// values_output, size,
/// rocprim::plus<int>()
/// );
///
/// // allocate temporary storage
/// cudaMalloc(&temporary_storage_ptr, temporary_storage_size_bytes);
///
/// // perform scan-by-key
/// rocprim::inclusive_scan_by_key(
/// temporary_storage_ptr, temporary_storage_size_bytes,
/// keys_input, values_input,
/// values_output, size,
/// rocprim::plus<int>()
/// );
/// // values_output: [1, 3, 3, 7, 5, 11, 18, 8]
/// \endcode
/// \endparblock
template <typename Config = default_config,
          typename KeysInputIterator,
          typename ValuesInputIterator,
          typename ValuesOutputIterator,
          typename BinaryFunction
          = ::rocprim::plus<typename std::iterator_traits<ValuesInputIterator>::value_type>,
          typename KeyCompareFunction
          = ::rocprim::equal_to<typename std::iterator_traits<KeysInputIterator>::value_type>>
inline cudaError_t inclusive_scan_by_key(void* const                temporary_storage,
                                         size_t&                    storage_size,
                                         const KeysInputIterator    keys_input,
                                         const ValuesInputIterator  values_input,
                                         const ValuesOutputIterator values_output,
                                         const size_t               size,
                                         const BinaryFunction       scan_op = BinaryFunction(),
                                         const KeyCompareFunction   key_compare_op
                                         = KeyCompareFunction(),
                                         const cudaStream_t         stream            = 0,
                                         const bool                 debug_synchronous = false)
{
    // Element types of the two input ranges; they parameterize the default tuning.
    using keys_element_t   = typename std::iterator_traits<KeysInputIterator>::value_type;
    using values_element_t = typename std::iterator_traits<ValuesInputIterator>::value_type;

    // Architecture-specific tuning that applies when the caller left Config at default_config.
    using arch_default_config = detail::
        default_scan_by_key_config<ROCPRIM_TARGET_ARCH, keys_element_t, values_element_t>;
    using selected_config = detail::default_or_custom_config<Config, arch_default_config>;

    // The shared implementation always takes an initial value; the inclusive scan passes
    // a value-initialized element of the input value type.
    return detail::scan_by_key_impl</*Exclusive=*/false, selected_config>(temporary_storage,
                                                                          storage_size,
                                                                          keys_input,
                                                                          values_input,
                                                                          values_output,
                                                                          values_element_t(),
                                                                          size,
                                                                          scan_op,
                                                                          key_compare_op,
                                                                          stream,
                                                                          debug_synchronous);
}
/// \brief Parallel exclusive scan-by-key primitive for device level.
///
/// exclusive_scan_by_key function performs a device-wide exclusive prefix scan-by-key
/// operation using binary \p scan_op operator.
///
/// \par Overview
/// * Supports non-commutative scan operators. However, a scan operator should be
/// associative. When used with non-associative functions the results may be non-deterministic
/// and/or vary in precision.
/// * Returns the required size of \p temporary_storage in \p storage_size
/// if \p temporary_storage is a null pointer.
/// * Ranges specified by \p keys_input, \p values_input, and \p values_output must have
/// at least \p size elements.
///
/// \tparam Config - [optional] configuration of the primitive. It can be \p scan_by_key_config or
/// a custom class with the same members.
/// \tparam KeysInputIterator - random-access iterator type of the input range. It can be
/// a simple pointer type.
/// \tparam ValuesInputIterator - random-access iterator type of the input range. It can be
/// a simple pointer type.
/// \tparam ValuesOutputIterator - random-access iterator type of the output range. It can be
/// a simple pointer type.
/// \tparam InitValueType - type of the initial value.
/// \tparam BinaryFunction - type of binary function used for scan. Default type
/// is \p rocprim::plus<T>, where \p T is a \p value_type of \p ValuesInputIterator.
/// \tparam KeyCompareFunction - type of binary function used to determine keys equality. Default type
/// is \p rocprim::equal_to<T>, where \p T is a \p value_type of \p KeysInputIterator.
///
/// \param [in] temporary_storage - pointer to a device-accessible temporary storage. When
/// a null pointer is passed, the required allocation size (in bytes) is written to
/// \p storage_size and function returns without performing the scan operation.
/// \param [in,out] storage_size - reference to a size (in bytes) of \p temporary_storage.
/// \param [in] keys_input - iterator to the first element in the range of keys.
/// \param [in] values_input - iterator to the first element in the range of values to scan.
/// \param [out] values_output - iterator to the first element in the output value range.
/// \param [in] initial_value - initial value to start the scan.
/// A rocprim::future_value may be passed to use a value that will be later computed.
/// \param [in] size - number of elements in the input range.
/// \param [in] scan_op - binary operation function object that will be used for scanning
/// input values.
/// The signature of the function should be equivalent to the following:
/// <tt>T f(const T &a, const T &b);</tt>. The signature does not need to have
/// <tt>const &</tt>, but function object must not modify the objects passed to it.
/// Default is BinaryFunction().
/// \param [in] key_compare_op - binary operation function object that will be used to determine keys equality.
/// The signature of the function should be equivalent to the following:
/// <tt>bool f(const T &a, const T &b);</tt>. The signature does not need to have
/// <tt>const &</tt>, but function object must not modify the objects passed to it.
/// Default is KeyCompareFunction().
/// \param [in] stream - [optional] HIP stream object. Default is \p 0 (default stream).
/// \param [in] debug_synchronous - [optional] If true, synchronization after every kernel
/// launch is forced in order to check for errors. Default value is \p false.
///
/// \returns \p cudaSuccess (\p 0) after successful scan; otherwise a HIP runtime error of
/// type \p cudaError_t.
///
/// \par Example
/// \parblock
/// In this example a device-level exclusive sum-by-key operation is performed on an array of
/// integer values (<tt>short</tt>s are scanned into <tt>int</tt>s).
///
/// \code{.cpp}
/// #include <rocprim/rocprim.hpp>
///
/// // Prepare input and output (declare pointers, allocate device memory etc.)
/// size_t size; // e.g., 8
/// int * keys_input; // e.g., [1, 1, 1, 2, 2, 3, 3, 4]
/// short * values_input; // e.g., [1, 2, 3, 4, 5, 6, 7, 8]
/// int start_value; // e.g., 9
/// int * values_output; // empty array of 8 elements
///
/// size_t temporary_storage_size_bytes;
/// void * temporary_storage_ptr = nullptr;
/// // Get required size of the temporary storage
/// rocprim::exclusive_scan_by_key(
/// temporary_storage_ptr, temporary_storage_size_bytes,
/// keys_input, values_input,
/// values_output, start_value,
/// size,rocprim::plus<int>()
/// );
///
/// // allocate temporary storage
/// cudaMalloc(&temporary_storage_ptr, temporary_storage_size_bytes);
///
/// // perform scan-by-key
/// rocprim::exclusive_scan_by_key(
/// temporary_storage_ptr, temporary_storage_size_bytes,
/// keys_input, values_input,
/// values_output, start_value,
/// size,rocprim::plus<int>()
/// );
/// // values_output: [9, 10, 12, 9, 13, 9, 15, 9]
/// \endcode
/// \endparblock
template <typename Config = default_config,
          typename KeysInputIterator,
          typename ValuesInputIterator,
          typename ValuesOutputIterator,
          typename InitialValueType,
          typename BinaryFunction
          = ::rocprim::plus<typename std::iterator_traits<ValuesInputIterator>::value_type>,
          typename KeyCompareFunction
          = ::rocprim::equal_to<typename std::iterator_traits<KeysInputIterator>::value_type>>
inline cudaError_t exclusive_scan_by_key(void* const                temporary_storage,
                                         size_t&                    storage_size,
                                         const KeysInputIterator    keys_input,
                                         const ValuesInputIterator  values_input,
                                         const ValuesOutputIterator values_output,
                                         const InitialValueType     initial_value,
                                         const size_t               size,
                                         const BinaryFunction       scan_op = BinaryFunction(),
                                         const KeyCompareFunction   key_compare_op
                                         = KeyCompareFunction(),
                                         const cudaStream_t         stream            = 0,
                                         const bool                 debug_synchronous = false)
{
    // Element type of the key range.
    using keys_element_t = typename std::iterator_traits<KeysInputIterator>::value_type;
    // The default tuning is chosen for the type that detail::input_type_t derives from
    // InitialValueType, not for the value type of the input iterator.
    using init_element_t = detail::input_type_t<InitialValueType>;

    // Architecture-specific tuning that applies when the caller left Config at default_config.
    using arch_default_config
        = detail::default_scan_by_key_config<ROCPRIM_TARGET_ARCH, keys_element_t, init_element_t>;
    using selected_config = detail::default_or_custom_config<Config, arch_default_config>;

    // initial_value is forwarded as received.
    return detail::scan_by_key_impl</*Exclusive=*/true, selected_config>(temporary_storage,
                                                                         storage_size,
                                                                         keys_input,
                                                                         values_input,
                                                                         values_output,
                                                                         initial_value,
                                                                         size,
                                                                         scan_op,
                                                                         key_compare_op,
                                                                         stream,
                                                                         debug_synchronous);
}
/// @}
// end of group devicemodule
END_ROCPRIM_NAMESPACE
#endif // ROCPRIM_DEVICE_DEVICE_SCAN_BY_KEY_HPP_
// Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved.
//
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to deal
// in the Software without restriction, including without limitation the rights
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
// copies of the Software, and to permit persons to whom the Software is
// furnished to do so, subject to the following conditions:
//
// The above copyright notice and this permission notice shall be included in
// all copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
// THE SOFTWARE.
#ifndef ROCPRIM_DEVICE_DEVICE_SCAN_BY_KEY_CONFIG_HPP_
#define ROCPRIM_DEVICE_DEVICE_SCAN_BY_KEY_CONFIG_HPP_
#include <type_traits>
#include "../config.hpp"
#include "../detail/various.hpp"
#include "config_types.hpp"
/// \addtogroup primitivesmodule_deviceconfigs
/// @{
BEGIN_ROCPRIM_NAMESPACE
/// \brief Configuration of device-level scan-by-key operation.
///
/// \tparam BlockSize - number of threads in a block.
/// \tparam ItemsPerThread - number of items processed by each thread.
/// \tparam UseLookback - whether to use lookback scan or reduce-then-scan algorithm.
/// \tparam BlockLoadMethod - method for loading input values.
/// \tparam BlockStoreMethod - method for storing values.
/// \tparam BlockScanMethod - algorithm for block scan.
/// \tparam SizeLimit - limit on the number of items for a single scan kernel launch.
template<unsigned int                    BlockSize,
         unsigned int                    ItemsPerThread,
         bool                            UseLookback,
         ::rocprim::block_load_method    BlockLoadMethod,
         ::rocprim::block_store_method   BlockStoreMethod,
         ::rocprim::block_scan_algorithm BlockScanMethod,
         unsigned int                    SizeLimit = ROCPRIM_GRID_SIZE_LIMIT>
struct scan_by_key_config
{
    // Each member below republishes the template argument of the matching name so that
    // code taking a generic Config can read it as Config::<member>.

    /// \brief Threads per block (\p BlockSize).
    static constexpr unsigned int block_size = BlockSize;

    /// \brief Items handled by one thread (\p ItemsPerThread).
    static constexpr unsigned int items_per_thread = ItemsPerThread;

    /// \brief Lookback scan versus reduce-then-scan selector (\p UseLookback).
    static constexpr bool use_lookback = UseLookback;

    /// \brief How input values are loaded (\p BlockLoadMethod).
    static constexpr ::rocprim::block_load_method block_load_method = BlockLoadMethod;

    /// \brief How results are stored (\p BlockStoreMethod).
    static constexpr ::rocprim::block_store_method block_store_method = BlockStoreMethod;

    /// \brief Block-level scan algorithm (\p BlockScanMethod).
    static constexpr ::rocprim::block_scan_algorithm block_scan_method = BlockScanMethod;

    /// \brief Upper bound on the items of one scan kernel launch (\p SizeLimit).
    static constexpr unsigned int size_limit = SizeLimit;
};
namespace detail
{
template<class Key, class Value>
struct scan_by_key_config_900
{
static constexpr unsigned int item_scale =
::rocprim::detail::ceiling_div<unsigned int>(sizeof(Key) + sizeof(Value), 2 * sizeof(int));
using type = scan_config<
limit_block_size<256U, sizeof(Key) + sizeof(Value), ROCPRIM_WARP_SIZE_64>::value,
::rocprim::max(1u, 16u / item_scale),
ROCPRIM_DETAIL_USE_LOOKBACK_SCAN,
::rocprim::block_load_method::block_load_transpose,
::rocprim::block_store_method::block_store_transpose,
::rocprim::block_scan_algorithm::using_warp_scan
>;
};
template<class Key, class Value>
struct scan_by_key_config_90a
{
static constexpr unsigned int item_scale =
::rocprim::detail::ceiling_div<unsigned int>(sizeof(Key) + sizeof(Value), 2 * sizeof(int));
using type = scan_config<
limit_block_size<256U, sizeof(Key) + sizeof(Value), ROCPRIM_WARP_SIZE_64>::value,
::rocprim::max(1u, 16u / item_scale),
ROCPRIM_DETAIL_USE_LOOKBACK_SCAN,
::rocprim::block_load_method::block_load_transpose,
::rocprim::block_store_method::block_store_transpose,
::rocprim::block_scan_algorithm::using_warp_scan
>;
};
template<class Key, class Value>
struct scan_by_key_config_908
{
static constexpr unsigned int item_scale =
::rocprim::detail::ceiling_div<unsigned int>(sizeof(Key) + sizeof(Value), 2 * sizeof(int));
using type = scan_config<
limit_block_size<256U, sizeof(Key) + sizeof(Value), ROCPRIM_WARP_SIZE_64>::value,
::rocprim::max(1u, 20u / item_scale),
ROCPRIM_DETAIL_USE_LOOKBACK_SCAN,
::rocprim::block_load_method::block_load_transpose,
::rocprim::block_store_method::block_store_transpose,
::rocprim::block_scan_algorithm::using_warp_scan
>;
};
// TODO: We need to update these parameters
template<class Key, class Value>
struct scan_by_key_config_1030
{
static constexpr unsigned int item_scale =
::rocprim::detail::ceiling_div<unsigned int>(sizeof(Key) + sizeof(Value), 2 * sizeof(int));
using type = scan_config<
limit_block_size<256U, sizeof(Key) + sizeof(Value), ROCPRIM_WARP_SIZE_32>::value,
::rocprim::max(1u, 15u / item_scale),
ROCPRIM_DETAIL_USE_LOOKBACK_SCAN,
::rocprim::block_load_method::block_load_transpose,
::rocprim::block_store_method::block_store_transpose,
::rocprim::block_scan_algorithm::using_warp_scan
>;
};
/// Picks the default scan-by-key configuration for \p TargetArch and the given
/// \p Key / \p Value types by deriving from select_arch.
/// Tuned cases are listed for architectures 900, ROCPRIM_ARCH_90a, 908 and 1030.
/// NOTE(review): the trailing scan_by_key_config_900 is presumably the fallback used
/// when no case matches; select_arch is defined elsewhere - confirm.
template<unsigned int TargetArch, class Key, class Value>
struct default_scan_by_key_config
: select_arch<
TargetArch,
select_arch_case<900, scan_by_key_config_900<Key, Value>>,
select_arch_case<ROCPRIM_ARCH_90a, scan_by_key_config_90a<Key, Value>>,
select_arch_case<908, scan_by_key_config_908<Key, Value>>,
select_arch_case<1030, scan_by_key_config_1030<Key, Value>>,
scan_by_key_config_900<Key, Value>
> { };
} // end namespace detail
END_ROCPRIM_NAMESPACE
/// @}
// end of group primitivesmodule_deviceconfigs
#endif // ROCPRIM_DEVICE_DEVICE_SCAN_BY_KEY_CONFIG_HPP_
// Copyright (c) 2018-2022 Advanced Micro Devices, Inc. All rights reserved.
//
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to deal
// in the Software without restriction, including without limitation the rights
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
// copies of the Software, and to permit persons to whom the Software is
// furnished to do so, subject to the following conditions:
//
// The above copyright notice and this permission notice shall be included in
// all copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
// THE SOFTWARE.
#ifndef ROCPRIM_DEVICE_DEVICE_SCAN_CONFIG_HPP_
#define ROCPRIM_DEVICE_DEVICE_SCAN_CONFIG_HPP_
#include <type_traits>
#include "../config.hpp"
#include "../detail/various.hpp"
#include "../block/block_load.hpp"
#include "../block/block_store.hpp"
#include "../block/block_scan.hpp"
#include "config_types.hpp"
/// \addtogroup primitivesmodule_deviceconfigs
/// @{
BEGIN_ROCPRIM_NAMESPACE
/// \brief Configuration of device-level scan primitives.
///
/// \tparam BlockSize - number of threads in a block.
/// \tparam ItemsPerThread - number of items processed by each thread.
/// \tparam UseLookback - whether to use lookback scan or reduce-then-scan algorithm.
/// \tparam BlockLoadMethod - method for loading input values.
/// \tparam BlockStoreMethod - method for storing values.
/// \tparam BlockScanMethod - algorithm for block scan.
/// \tparam SizeLimit - limit on the number of items for a single scan kernel launch.
template<unsigned int                    BlockSize,
         unsigned int                    ItemsPerThread,
         bool                            UseLookback,
         ::rocprim::block_load_method    BlockLoadMethod,
         ::rocprim::block_store_method   BlockStoreMethod,
         ::rocprim::block_scan_algorithm BlockScanMethod,
         unsigned int                    SizeLimit = ROCPRIM_GRID_SIZE_LIMIT>
struct scan_config
{
    // Each member below republishes the template argument of the matching name so that
    // code taking a generic Config can read it as Config::<member>.

    /// \brief Threads per block (\p BlockSize).
    static constexpr unsigned int block_size = BlockSize;

    /// \brief Items handled by one thread (\p ItemsPerThread).
    static constexpr unsigned int items_per_thread = ItemsPerThread;

    /// \brief Lookback scan versus reduce-then-scan selector (\p UseLookback).
    static constexpr bool use_lookback = UseLookback;

    /// \brief How input values are loaded (\p BlockLoadMethod).
    static constexpr ::rocprim::block_load_method block_load_method = BlockLoadMethod;

    /// \brief How results are stored (\p BlockStoreMethod).
    static constexpr ::rocprim::block_store_method block_store_method = BlockStoreMethod;

    /// \brief Block-level scan algorithm (\p BlockScanMethod).
    static constexpr ::rocprim::block_scan_algorithm block_scan_method = BlockScanMethod;

    /// \brief Upper bound on the items of one scan kernel launch (\p SizeLimit).
    static constexpr unsigned int size_limit = SizeLimit;
};
namespace detail
{
/// Default scan tuning registered for target architecture 803.
template<typename Value>
struct scan_config_803
{
    /// Size of one \p Value in units of \p int, rounded up.
    static constexpr unsigned int item_scale
        = ::rocprim::detail::ceiling_div<unsigned int>(sizeof(Value), sizeof(int));

    using type = scan_config<
        // Block size: 256 threads, adjusted by limit_block_size for the value size.
        limit_block_size<256U, sizeof(Value), ROCPRIM_WARP_SIZE_64>::value,
        // Items per thread: 16 scaled down by item_scale, never below one.
        ::rocprim::max(1u, 16u / item_scale),
        ROCPRIM_DETAIL_USE_LOOKBACK_SCAN,
        ::rocprim::block_load_method::block_load_transpose,
        ::rocprim::block_store_method::block_store_transpose,
        ::rocprim::block_scan_algorithm::using_warp_scan>;
};
/// Default scan tuning registered for target architecture 900.
template<typename Value>
struct scan_config_900
{
    /// Size of one \p Value in units of \p int, rounded up.
    static constexpr unsigned int item_scale
        = ::rocprim::detail::ceiling_div<unsigned int>(sizeof(Value), sizeof(int));

    using type = scan_config<
        // Block size: 256 threads, adjusted by limit_block_size for the value size.
        limit_block_size<256U, sizeof(Value), ROCPRIM_WARP_SIZE_64>::value,
        // Items per thread: 16 scaled down by item_scale, never below one.
        ::rocprim::max(1u, 16u / item_scale),
        ROCPRIM_DETAIL_USE_LOOKBACK_SCAN,
        ::rocprim::block_load_method::block_load_transpose,
        ::rocprim::block_store_method::block_store_transpose,
        ::rocprim::block_scan_algorithm::using_warp_scan>;
};
// TODO: We need to update these parameters
/// Default scan tuning registered for target architecture ROCPRIM_ARCH_90a.
template<typename Value>
struct scan_config_90a
{
    /// Size of one \p Value in units of \p int, rounded up.
    static constexpr unsigned int item_scale
        = ::rocprim::detail::ceiling_div<unsigned int>(sizeof(Value), sizeof(int));

    using type = scan_config<
        // Block size: 256 threads, adjusted by limit_block_size for the value size.
        limit_block_size<256U, sizeof(Value), ROCPRIM_WARP_SIZE_64>::value,
        // Items per thread: 16 scaled down by item_scale, never below one.
        ::rocprim::max(1u, 16u / item_scale),
        ROCPRIM_DETAIL_USE_LOOKBACK_SCAN,
        ::rocprim::block_load_method::block_load_transpose,
        ::rocprim::block_store_method::block_store_transpose,
        ::rocprim::block_scan_algorithm::using_warp_scan>;
};
/// Default scan tuning registered for target architecture 908.
template<typename Value>
struct scan_config_908
{
    /// Size of one \p Value in units of \p int, rounded up.
    static constexpr unsigned int item_scale
        = ::rocprim::detail::ceiling_div<unsigned int>(sizeof(Value), sizeof(int));

    using type = scan_config<
        // Block size: 256 threads, adjusted by limit_block_size for the value size.
        limit_block_size<256U, sizeof(Value), ROCPRIM_WARP_SIZE_64>::value,
        // Items per thread: 20 scaled down by item_scale, never below one.
        ::rocprim::max(1u, 20u / item_scale),
        ROCPRIM_DETAIL_USE_LOOKBACK_SCAN,
        ::rocprim::block_load_method::block_load_transpose,
        ::rocprim::block_store_method::block_store_transpose,
        ::rocprim::block_scan_algorithm::using_warp_scan>;
};
// TODO: We need to update these parameters
/// Default scan tuning registered for target architecture 1030.
template<typename Value>
struct scan_config_1030
{
    /// Size of one \p Value in units of \p int, rounded up.
    static constexpr unsigned int item_scale
        = ::rocprim::detail::ceiling_div<unsigned int>(sizeof(Value), sizeof(int));

    using type = scan_config<
        // Block size: 256 threads, adjusted by limit_block_size using the 32-wide warp size.
        limit_block_size<256U, sizeof(Value), ROCPRIM_WARP_SIZE_32>::value,
        // Items per thread: 15 scaled down by item_scale, never below one.
        ::rocprim::max(1u, 15u / item_scale),
        ROCPRIM_DETAIL_USE_LOOKBACK_SCAN,
        ::rocprim::block_load_method::block_load_transpose,
        ::rocprim::block_store_method::block_store_transpose,
        ::rocprim::block_scan_algorithm::using_warp_scan>;
};
/// Picks the default scan configuration for \p TargetArch and the given \p Value type
/// by deriving from select_arch.
/// Tuned cases are listed for architectures 803, 900, ROCPRIM_ARCH_90a, 908 and 1030.
/// NOTE(review): the trailing scan_config_900 is presumably the fallback used when no
/// case matches; select_arch is defined elsewhere - confirm.
template<unsigned int TargetArch, class Value>
struct default_scan_config
: select_arch<
TargetArch,
select_arch_case<803, scan_config_803<Value>>,
select_arch_case<900, scan_config_900<Value>>,
select_arch_case<ROCPRIM_ARCH_90a, scan_config_90a<Value>>,
select_arch_case<908, scan_config_908<Value>>,
select_arch_case<1030, scan_config_1030<Value>>,
scan_config_900<Value>
> { };
} // end namespace detail
END_ROCPRIM_NAMESPACE
/// @}
// end of group primitivesmodule_deviceconfigs
#endif // ROCPRIM_DEVICE_DEVICE_SCAN_CONFIG_HPP_
// Copyright (c) 2017-2022 Advanced Micro Devices, Inc. All rights reserved.
//
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to deal
// in the Software without restriction, including without limitation the rights
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
// copies of the Software, and to permit persons to whom the Software is
// furnished to do so, subject to the following conditions:
//
// The above copyright notice and this permission notice shall be included in
// all copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
// THE SOFTWARE.
#ifndef ROCPRIM_DEVICE_DEVICE_SEGMENTED_RADIX_SORT_HPP_
#define ROCPRIM_DEVICE_DEVICE_SEGMENTED_RADIX_SORT_HPP_
#include <iostream>
#include <iterator>
#include <type_traits>
#include <utility>
#include "../config.hpp"
#include "../detail/various.hpp"
#include "../detail/radix_sort.hpp"
#include "../intrinsics.hpp"
#include "../functional.hpp"
#include "../types.hpp"
#include "../block/block_load.hpp"
#include "../iterator/counting_iterator.hpp"
#include "../iterator/reverse_iterator.hpp"
#include "detail/device_segmented_radix_sort.hpp"
#include "device_partition.hpp"
#include "device_segmented_radix_sort_config.hpp"
/// \addtogroup devicemodule
/// @{
BEGIN_ROCPRIM_NAMESPACE
namespace detail
{
/// \brief Kernel entry point for the non-partitioned segmented sort.
///
/// The kernel does no work of its own: every argument is forwarded unchanged to
/// \p segmented_sort<Config, Descending>. \p BlockSize is used only for the
/// launch bounds of the kernel.
template<typename Config,
         bool Descending,
         unsigned int BlockSize,
         typename KeysInputIterator,
         typename KeysOutputIterator,
         typename ValuesInputIterator,
         typename ValuesOutputIterator,
         typename OffsetIterator>
ROCPRIM_KERNEL
__launch_bounds__(BlockSize)
void segmented_sort_kernel(
    KeysInputIterator                                                 keys_input,
    typename std::iterator_traits<KeysInputIterator>::value_type*     keys_tmp,
    KeysOutputIterator                                                keys_output,
    ValuesInputIterator                                               values_input,
    typename std::iterator_traits<ValuesInputIterator>::value_type*   values_tmp,
    ValuesOutputIterator                                              values_output,
    bool                                                              to_output,
    OffsetIterator                                                    begin_offsets,
    OffsetIterator                                                    end_offsets,
    unsigned int                                                      long_iterations,
    unsigned int                                                      short_iterations,
    unsigned int                                                      begin_bit,
    unsigned int                                                      end_bit)
{
    // Pure forwarding: argument order matches the segmented_sort call exactly.
    segmented_sort<Config, Descending>(keys_input,
                                       keys_tmp,
                                       keys_output,
                                       values_input,
                                       values_tmp,
                                       values_output,
                                       to_output,
                                       begin_offsets,
                                       end_offsets,
                                       long_iterations,
                                       short_iterations,
                                       begin_bit,
                                       end_bit);
}
/// \brief Kernel entry point for the segments selected as "large" by the partitioning step.
///
/// Every argument, including the \p segment_indices iterator, is forwarded
/// unchanged to \p segmented_sort_large<Config, Descending>. \p BlockSize is
/// used only for the launch bounds of the kernel.
template<typename Config,
         bool Descending,
         unsigned int BlockSize,
         typename KeysInputIterator,
         typename KeysOutputIterator,
         typename ValuesInputIterator,
         typename ValuesOutputIterator,
         typename SegmentIndexIterator,
         typename OffsetIterator>
ROCPRIM_KERNEL
__launch_bounds__(BlockSize)
void segmented_sort_large_kernel(
    KeysInputIterator                                                 keys_input,
    typename std::iterator_traits<KeysInputIterator>::value_type*     keys_tmp,
    KeysOutputIterator                                                keys_output,
    ValuesInputIterator                                               values_input,
    typename std::iterator_traits<ValuesInputIterator>::value_type*   values_tmp,
    ValuesOutputIterator                                              values_output,
    bool                                                              to_output,
    SegmentIndexIterator                                              segment_indices,
    OffsetIterator                                                    begin_offsets,
    OffsetIterator                                                    end_offsets,
    unsigned int                                                      long_iterations,
    unsigned int                                                      short_iterations,
    unsigned int                                                      begin_bit,
    unsigned int                                                      end_bit)
{
    // Pure forwarding: argument order matches the segmented_sort_large call exactly.
    segmented_sort_large<Config, Descending>(keys_input,
                                             keys_tmp,
                                             keys_output,
                                             values_input,
                                             values_tmp,
                                             values_output,
                                             to_output,
                                             segment_indices,
                                             begin_offsets,
                                             end_offsets,
                                             long_iterations,
                                             short_iterations,
                                             begin_bit,
                                             end_bit);
}
/// \brief Kernel entry point shared by the "small" and "medium" segment sorts.
///
/// Every argument is forwarded unchanged to \p segmented_sort_small<Config, Descending>;
/// the caller picks the small or medium behaviour through \p Config.
/// \p BlockSize is used only for the launch bounds of the kernel.
template<typename Config,
         bool Descending,
         unsigned int BlockSize,
         typename KeysInputIterator,
         typename KeysOutputIterator,
         typename ValuesInputIterator,
         typename ValuesOutputIterator,
         typename SegmentIndexIterator,
         typename OffsetIterator>
ROCPRIM_KERNEL
__launch_bounds__(BlockSize)
void segmented_sort_small_or_medium_kernel(
    KeysInputIterator                                                 keys_input,
    typename std::iterator_traits<KeysInputIterator>::value_type*     keys_tmp,
    KeysOutputIterator                                                keys_output,
    ValuesInputIterator                                               values_input,
    typename std::iterator_traits<ValuesInputIterator>::value_type*   values_tmp,
    ValuesOutputIterator                                              values_output,
    bool                                                              to_output,
    unsigned int                                                      num_segments,
    SegmentIndexIterator                                              segment_indices,
    OffsetIterator                                                    begin_offsets,
    OffsetIterator                                                    end_offsets,
    unsigned int                                                      begin_bit,
    unsigned int                                                      end_bit)
{
    // Pure forwarding: argument order matches the segmented_sort_small call exactly.
    segmented_sort_small<Config, Descending>(keys_input,
                                             keys_tmp,
                                             keys_output,
                                             values_input,
                                             values_tmp,
                                             values_output,
                                             to_output,
                                             num_segments,
                                             segment_indices,
                                             begin_offsets,
                                             end_offsets,
                                             begin_bit,
                                             end_bit);
}
// Post-launch error check used after each kernel launch in this file.
//
// Expands to a braced statement (use it WITHOUT a trailing semicolon requirement) that:
//   1. returns the value of cudaGetLastError() from the enclosing function if it
//      is not cudaSuccess;
//   2. when `debug_synchronous` is true, prints `name(size)`, synchronizes
//      `stream`, returns the synchronization error if there is one, and finally
//      prints the time elapsed since `start` in milliseconds.
//
// The expansion relies on `debug_synchronous` and `stream` being visible in the
// calling scope. `start` must be a std::chrono::high_resolution_clock time point.
//
// The macro-local variables deliberately avoid double underscores, because
// identifiers containing `__` are reserved for the implementation.
#define ROCPRIM_DETAIL_HIP_SYNC_AND_RETURN_ON_ERROR(name, size, start) \
    { \
        const auto _launch_error = cudaGetLastError(); \
        if(_launch_error != cudaSuccess) return _launch_error; \
        if(debug_synchronous) \
        { \
            std::cout << name << "(" << size << ")"; \
            const auto _sync_error = cudaStreamSynchronize(stream); \
            if(_sync_error != cudaSuccess) return _sync_error; \
            const auto _end = std::chrono::high_resolution_clock::now(); \
            const auto _d = std::chrono::duration_cast<std::chrono::duration<double>>(_end - (start)); \
            std::cout << " " << _d.count() * 1000 << " ms" << '\n'; \
        } \
    }
/// Function object that exposes the two-way \p partition call behind the same
/// call signature as \p ThreeWayPartitioner, so both can be used interchangeably.
///
/// The second-part output, the unselected output and the second predicate are
/// accepted but ignored; only the first output and the first predicate are
/// passed on to \p partition.
struct TwoWayPartitioner
{
    template<typename InputIterator,
             typename FirstOutputIterator,
             typename SecondOutputIterator,
             typename UnselectedOutputIterator,
             typename SelectedCountOutputIterator,
             typename FirstUnaryPredicate,
             typename SecondUnaryPredicate>
    cudaError_t operator()(void*                       temporary_storage,
                           size_t&                     storage_size,
                           InputIterator               input,
                           FirstOutputIterator         output_first_part,
                           SecondOutputIterator        /*output_second_part*/,
                           UnselectedOutputIterator    /*output_unselected*/,
                           SelectedCountOutputIterator selected_count_output,
                           const size_t                size,
                           FirstUnaryPredicate         select_first_part_op,
                           SecondUnaryPredicate        /*select_second_part_op*/,
                           const cudaStream_t          stream,
                           const bool                  debug_synchronous)
    {
        // Forward only the arguments that the two-way partition understands.
        const cudaError_t status = partition(temporary_storage,
                                             storage_size,
                                             input,
                                             output_first_part,
                                             selected_count_output,
                                             size,
                                             select_first_part_op,
                                             stream,
                                             debug_synchronous);
        return status;
    }
};
/// Function object that wraps \p partition_three_way.
///
/// It has the same call signature as \p TwoWayPartitioner; here every argument
/// is passed on unchanged to \p partition_three_way.
struct ThreeWayPartitioner
{
    template<typename InputIterator,
             typename FirstOutputIterator,
             typename SecondOutputIterator,
             typename UnselectedOutputIterator,
             typename SelectedCountOutputIterator,
             typename FirstUnaryPredicate,
             typename SecondUnaryPredicate>
    cudaError_t operator()(void*                       temporary_storage,
                           size_t&                     storage_size,
                           InputIterator               input,
                           FirstOutputIterator         output_first_part,
                           SecondOutputIterator        output_second_part,
                           UnselectedOutputIterator    output_unselected,
                           SelectedCountOutputIterator selected_count_output,
                           const size_t                size,
                           FirstUnaryPredicate         select_first_part_op,
                           SecondUnaryPredicate        select_second_part_op,
                           const cudaStream_t          stream,
                           const bool                  debug_synchronous)
    {
        // Straight pass-through of all arguments, in the same order.
        const cudaError_t status = partition_three_way(temporary_storage,
                                                       storage_size,
                                                       input,
                                                       output_first_part,
                                                       output_second_part,
                                                       output_unselected,
                                                       selected_count_output,
                                                       size,
                                                       select_first_part_op,
                                                       select_second_part_op,
                                                       stream,
                                                       debug_synchronous);
        return status;
    }
};
/// \brief Shared implementation behind the segmented radix sort entry points.
///
/// The function works in two modes:
/// * If \p temporary_storage is a null pointer, only the required temporary
///   storage size is computed and written to \p storage_size (never 0), and the
///   function returns without launching any sort kernel.
/// * Otherwise the temporary storage is split into the buffers needed below and
///   the sort kernels are launched on \p stream. When partitioning is enabled
///   (allowed by the config and \p segments reaches the configured threshold),
///   segment indices are first partitioned into large / medium / small groups,
///   the group counts are copied back to the host (this synchronizes \p stream),
///   and one kernel is launched per non-empty group. Without partitioning a
///   single kernel with one block per segment is launched.
///
/// \param [in] temporary_storage - device temporary storage, or null to query the size.
/// \param [in,out] storage_size - written in the size-query mode; printed in debug mode.
/// \param [in] keys_input, values_input - input ranges.
/// \param [in] keys_tmp, values_tmp - if \p keys_tmp is non-null the caller-provided
/// temporary buffers are used; otherwise both are carved out of \p temporary_storage.
/// \param [out] keys_output, values_output - output ranges.
/// \param [in] size - number of elements; used to size the temporary key/value buffers.
/// \param [out] is_result_in_output - always assigned before any early return; computed
/// from the iteration count parity (presumably tells the caller which buffer holds the
/// final result - confirm against the callers that use double buffers).
/// \param [in] segments - number of segments.
/// \param [in] begin_offsets, end_offsets - per-segment begin/end offsets.
/// \param [in] begin_bit, end_bit - bit range of the key used for sorting.
/// \param [in] stream - stream on which all work is enqueued.
/// \param [in] debug_synchronous - if true, prints diagnostics and synchronizes
/// after every kernel launch.
///
/// \returns \p cudaSuccess on success, otherwise the first runtime error encountered.
template<class Config,
         bool Descending,
         class KeysInputIterator,
         class KeysOutputIterator,
         class ValuesInputIterator,
         class ValuesOutputIterator,
         class OffsetIterator>
inline
cudaError_t segmented_radix_sort_impl(void * temporary_storage,
                                      size_t& storage_size,
                                      KeysInputIterator keys_input,
                                      typename std::iterator_traits<KeysInputIterator>::value_type * keys_tmp,
                                      KeysOutputIterator keys_output,
                                      ValuesInputIterator values_input,
                                      typename std::iterator_traits<ValuesInputIterator>::value_type * values_tmp,
                                      ValuesOutputIterator values_output,
                                      unsigned int size,
                                      bool& is_result_in_output,
                                      unsigned int segments,
                                      OffsetIterator begin_offsets,
                                      OffsetIterator end_offsets,
                                      unsigned int begin_bit,
                                      unsigned int end_bit,
                                      cudaStream_t stream,
                                      bool debug_synchronous)
{
    using key_type = typename std::iterator_traits<KeysInputIterator>::value_type;
    using value_type = typename std::iterator_traits<ValuesInputIterator>::value_type;
    using segment_index_type = unsigned int;
    using segment_index_iterator = counting_iterator<segment_index_type>;

    static_assert(
        std::is_same<key_type, typename std::iterator_traits<KeysOutputIterator>::value_type>::value,
        "KeysInputIterator and KeysOutputIterator must have the same value_type"
    );
    static_assert(
        std::is_same<value_type, typename std::iterator_traits<ValuesOutputIterator>::value_type>::value,
        "ValuesInputIterator and ValuesOutputIterator must have the same value_type"
    );

    using config = default_or_custom_config<
        Config,
        default_segmented_radix_sort_config<ROCPRIM_TARGET_ARCH, key_type, value_type>
    >;

    static constexpr bool with_values = !std::is_same<value_type, ::rocprim::empty_type>::value;
    static constexpr bool partitioning_allowed =
        !std::is_same<typename config::warp_sort_config, DisabledWarpSortConfig>::value;
    static constexpr unsigned int max_small_segment_length
        = config::warp_sort_config::items_per_thread_small
          * config::warp_sort_config::logical_warp_size_small;
    static constexpr unsigned int small_segments_per_block
        = config::warp_sort_config::block_size_small
          / config::warp_sort_config::logical_warp_size_small;
    static constexpr unsigned int max_medium_segment_length
        = config::warp_sort_config::items_per_thread_medium
          * config::warp_sort_config::logical_warp_size_medium;
    static constexpr unsigned int medium_segments_per_block
        = config::warp_sort_config::block_size_medium
          / config::warp_sort_config::logical_warp_size_medium;
    static_assert(
        max_small_segment_length <= max_medium_segment_length,
        "The max length of small segments cannot be higher than the max length of medium segments");

    // Don't waste cycles on 3-way partitioning, if the small and medium segments are equal length
    static constexpr bool three_way_partitioning
        = max_small_segment_length < max_medium_segment_length;
    using partitioner_type
        = std::conditional_t<three_way_partitioning, ThreeWayPartitioner, TwoWayPartitioner>;
    partitioner_type partitioner;

    // Predicates used by the partitioner: a segment is "large" when it is longer
    // than the medium limit, and "medium" when it is longer than the small limit.
    const auto large_segment_selector = [=](const unsigned int segment_index) mutable -> bool
    {
        const unsigned int segment_length
            = end_offsets[segment_index] - begin_offsets[segment_index];
        return segment_length > max_medium_segment_length;
    };
    const auto medium_segment_selector = [=](const unsigned int segment_index) mutable -> bool
    {
        const unsigned int segment_length = end_offsets[segment_index] - begin_offsets[segment_index];
        return segment_length > max_small_segment_length;
    };

    // A non-null keys_tmp means the caller supplied the temporary buffers itself.
    const bool with_double_buffer = keys_tmp != nullptr;

    const unsigned int bits = end_bit - begin_bit;
    const unsigned int iterations = ::rocprim::detail::ceiling_div(bits, config::long_radix_bits);
    const bool to_output = with_double_buffer || (iterations - 1) % 2 == 0;
    is_result_in_output = (iterations % 2 == 0) != to_output;

    // Split the iterations into "long" and "short" radix passes so that together
    // they cover exactly `bits` bits.
    const unsigned int radix_bits_diff = config::long_radix_bits - config::short_radix_bits;
    const unsigned int short_iterations = radix_bits_diff != 0
        ? ::rocprim::min(iterations, (config::long_radix_bits * iterations - bits) / radix_bits_diff)
        : 0;
    const unsigned int long_iterations = iterations - short_iterations;

    const bool do_partitioning = partitioning_allowed
                                 && segments >= config::warp_sort_config::partitioning_threshold;

    const size_t keys_bytes = ::rocprim::detail::align_size(size * sizeof(key_type));
    const size_t values_bytes = with_values ? ::rocprim::detail::align_size(size * sizeof(value_type)) : 0;
    const size_t large_and_small_segment_indices_bytes
        = ::rocprim::detail::align_size(segments * sizeof(segment_index_type));
    const size_t medium_segment_indices_bytes
        = three_way_partitioning
              ? ::rocprim::detail::align_size(segments * sizeof(segment_index_type))
              : 0;
    static constexpr size_t segment_count_output_size = three_way_partitioning ? 2 : 1;
    const size_t segment_count_output_bytes
        = ::rocprim::detail::align_size(segment_count_output_size * sizeof(segment_index_type));

    segment_index_type* large_segment_indices_output{};
    // The total number of large and small segments is not above the number of segments
    // The same buffer is filled with the large and small indices from both directions
    auto small_segment_indices_output
        = make_reverse_iterator(large_segment_indices_output + segments);
    segment_index_type* medium_segment_indices_output{};
    segment_index_type* segment_count_output{};
    size_t partition_storage_size{};
    void* partition_temporary_storage{};

    // Size-query mode: report the required temporary storage and return.
    if(temporary_storage == nullptr)
    {
        storage_size = with_double_buffer ? 0 : (keys_bytes + values_bytes);
        if(do_partitioning)
        {
            storage_size += large_and_small_segment_indices_bytes;
            storage_size += medium_segment_indices_bytes;
            storage_size += segment_count_output_bytes;
            // partition_temporary_storage is null here, so this call only
            // queries the partitioner's own temporary storage requirement.
            const auto partition_result = partitioner(partition_temporary_storage,
                                                      partition_storage_size,
                                                      segment_index_iterator{},
                                                      large_segment_indices_output,
                                                      medium_segment_indices_output,
                                                      small_segment_indices_output,
                                                      segment_count_output,
                                                      segments,
                                                      large_segment_selector,
                                                      medium_segment_selector,
                                                      stream,
                                                      debug_synchronous);
            if(cudaSuccess != partition_result)
            {
                return partition_result;
            }
            storage_size += partition_storage_size;
        }

        // Make sure user won't try to allocate 0 bytes memory, otherwise
        // user may again pass nullptr as temporary_storage
        storage_size = storage_size == 0 ? 4 : storage_size;
        return cudaSuccess;
    }

    if(segments == 0u)
    {
        return cudaSuccess;
    }

    if(debug_synchronous)
    {
        std::cout << "begin_bit " << begin_bit << '\n';
        std::cout << "end_bit " << end_bit << '\n';
        std::cout << "bits " << bits << '\n';
        std::cout << "segments " << segments << '\n';
        std::cout << "radix_bits_diff " << radix_bits_diff << '\n';
        std::cout << "storage_size " << storage_size << '\n';
        std::cout << "iterations " << iterations << '\n';
        std::cout << "long_iterations " << long_iterations << '\n';
        std::cout << "short_iterations " << short_iterations << '\n';
        std::cout << "do_partitioning " << do_partitioning << '\n';
        std::cout << "config::sort::block_size: " << config::sort::block_size << '\n';
        std::cout << "config::sort::items_per_thread: " << config::sort::items_per_thread << '\n';
        cudaError_t error = cudaStreamSynchronize(stream);
        if(error != cudaSuccess) return error;
    }

    // Carve the individual buffers out of the temporary storage, in the same
    // order in which their sizes were accumulated in the size-query mode.
    char* ptr = reinterpret_cast<char*>(temporary_storage);
    if(!with_double_buffer)
    {
        keys_tmp = reinterpret_cast<key_type*>(ptr);
        ptr += keys_bytes;
        values_tmp = with_values ? reinterpret_cast<value_type*>(ptr) : nullptr;
        ptr += values_bytes;
    }
    large_segment_indices_output = reinterpret_cast<segment_index_type*>(ptr);
    ptr += large_and_small_segment_indices_bytes;
    medium_segment_indices_output = reinterpret_cast<segment_index_type*>(ptr);
    ptr += medium_segment_indices_bytes;
    small_segment_indices_output = make_reverse_iterator(large_segment_indices_output + segments);
    segment_count_output = reinterpret_cast<segment_index_type*>(ptr);
    ptr += segment_count_output_bytes;
    partition_temporary_storage = ptr;
    ptr += partition_storage_size;

    if(do_partitioning)
    {
        cudaError_t result = partitioner(partition_temporary_storage,
                                         partition_storage_size,
                                         segment_index_iterator{},
                                         large_segment_indices_output,
                                         medium_segment_indices_output,
                                         small_segment_indices_output,
                                         segment_count_output,
                                         segments,
                                         large_segment_selector,
                                         medium_segment_selector,
                                         stream,
                                         debug_synchronous);
        if(cudaSuccess != result)
        {
            return result;
        }

        segment_index_type segment_counts[segment_count_output_size]{};
        // Copy exactly the size of the host array. segment_count_output_bytes is
        // rounded up by align_size and may therefore be larger than
        // sizeof(segment_counts); using it here could write past this stack array.
        result = cudaMemcpyAsync(&segment_counts,
                                 segment_count_output,
                                 sizeof(segment_counts),
                                 cudaMemcpyDeviceToHost,
                                 stream);
        if(cudaSuccess != result)
        {
            return result;
        }
        // The counts are needed on the host to size the launches below.
        result = cudaStreamSynchronize(stream);
        if(cudaSuccess != result)
        {
            return result;
        }

        const auto large_segment_count = segment_counts[0];
        const auto medium_segment_count = three_way_partitioning ? segment_counts[1] : 0;
        const auto small_segment_count = segments - large_segment_count - medium_segment_count;
        if(debug_synchronous)
        {
            std::cout << "large_segment_count " << large_segment_count << '\n';
            std::cout << "medium_segment_count " << medium_segment_count << '\n';
            std::cout << "small_segment_count " << small_segment_count << '\n';
        }

        // Large segments: one block per segment.
        if(large_segment_count > 0)
        {
            std::chrono::high_resolution_clock::time_point start;
            if(debug_synchronous) start = std::chrono::high_resolution_clock::now();
            segmented_sort_large_kernel<config, Descending, config::sort::block_size>
                <<<dim3(large_segment_count), dim3(config::sort::block_size), 0, stream>>>(
                    keys_input, keys_tmp, keys_output, values_input, values_tmp, values_output,
                    to_output, large_segment_indices_output,
                    begin_offsets, end_offsets,
                    long_iterations, short_iterations,
                    begin_bit, end_bit
                );
            ROCPRIM_DETAIL_HIP_SYNC_AND_RETURN_ON_ERROR("segmented_sort:large_segments",
                                                        large_segment_count,
                                                        start)
        }

        // Medium segments: several segments share one block.
        if(three_way_partitioning && medium_segment_count > 0)
        {
            const auto medium_segment_grid_size
                = ::rocprim::detail::ceiling_div(medium_segment_count, medium_segments_per_block);
            std::chrono::high_resolution_clock::time_point start;
            if(debug_synchronous)
                start = std::chrono::high_resolution_clock::now();
            segmented_sort_small_or_medium_kernel<
                select_warp_sort_helper_config_medium_t<typename config::warp_sort_config>,
                Descending,
                config::warp_sort_config::block_size_medium>
                <<<dim3(medium_segment_grid_size),
                   dim3(config::warp_sort_config::block_size_medium),
                   0,
                   stream>>>(
                    keys_input,
                    keys_tmp,
                    keys_output,
                    values_input,
                    values_tmp,
                    values_output,
                    is_result_in_output,
                    medium_segment_count,
                    medium_segment_indices_output,
                    begin_offsets,
                    end_offsets,
                    begin_bit,
                    end_bit);
            ROCPRIM_DETAIL_HIP_SYNC_AND_RETURN_ON_ERROR("segmented_sort:medium_segments",
                                                        medium_segment_count,
                                                        start)
        }

        // Small segments: several segments share one block.
        if(small_segment_count > 0)
        {
            const auto small_segment_grid_size = ::rocprim::detail::ceiling_div(small_segment_count,
                                                                                small_segments_per_block);
            std::chrono::high_resolution_clock::time_point start;
            if(debug_synchronous) start = std::chrono::high_resolution_clock::now();
            segmented_sort_small_or_medium_kernel<
                select_warp_sort_helper_config_small_t<typename config::warp_sort_config>,
                Descending,
                config::warp_sort_config::block_size_small>
                <<<dim3(small_segment_grid_size),
                   dim3(config::warp_sort_config::block_size_small),
                   0,
                   stream>>>(
                    keys_input,
                    keys_tmp,
                    keys_output,
                    values_input,
                    values_tmp,
                    values_output,
                    is_result_in_output,
                    small_segment_count,
                    small_segment_indices_output,
                    begin_offsets,
                    end_offsets,
                    begin_bit,
                    end_bit);
            ROCPRIM_DETAIL_HIP_SYNC_AND_RETURN_ON_ERROR("segmented_sort:small_segments",
                                                        small_segment_count,
                                                        start)
        }
    }
    else
    {
        // No partitioning: a single launch with one block per segment.
        std::chrono::high_resolution_clock::time_point start;
        if(debug_synchronous) start = std::chrono::high_resolution_clock::now();
        segmented_sort_kernel<config, Descending, config::sort::block_size>
            <<<dim3(segments), dim3(config::sort::block_size), 0, stream>>>(
                keys_input, keys_tmp, keys_output, values_input, values_tmp, values_output,
                to_output,
                begin_offsets, end_offsets,
                long_iterations, short_iterations,
                begin_bit, end_bit
            );
        ROCPRIM_DETAIL_HIP_SYNC_AND_RETURN_ON_ERROR("segmented_sort", segments, start)
    }
    return cudaSuccess;
}
#undef ROCPRIM_DETAIL_HIP_SYNC_AND_RETURN_ON_ERROR
} // end namespace detail
/// \brief Parallel ascending radix sort primitive for device level.
///
/// \p segmented_radix_sort_keys function performs a device-wide radix sort across multiple,
/// non-overlapping sequences of keys. Function sorts input keys in ascending order.
///
/// \par Overview
/// * The contents of the inputs are not altered by the sorting function.
/// * Returns the required size of \p temporary_storage in \p storage_size
/// if \p temporary_storage is a null pointer.
/// * \p Key type (a \p value_type of \p KeysInputIterator and \p KeysOutputIterator) must be
/// an arithmetic type (that is, an integral type or a floating-point type).
/// * Ranges specified by \p keys_input and \p keys_output must have at least \p size elements.
/// * Ranges specified by \p begin_offsets and \p end_offsets must have
/// at least \p segments elements. They may use the same sequence <tt>offsets</tt> of at least
/// <tt>segments + 1</tt> elements: <tt>offsets</tt> for \p begin_offsets and
/// <tt>offsets + 1</tt> for \p end_offsets.
/// * If \p Key is an integer type and the range of keys is known in advance, the performance
/// can be improved by setting \p begin_bit and \p end_bit, for example if all keys are in range
/// [100, 10000], <tt>begin_bit = 0</tt> and <tt>end_bit = 14</tt> will cover the whole range.
///
/// \tparam Config - [optional] configuration of the primitive. It can be
/// \p segmented_radix_sort_config or a custom class with the same members.
/// \tparam KeysInputIterator - random-access iterator type of the input range. Must meet the
/// requirements of a C++ InputIterator concept. It can be a simple pointer type.
/// \tparam KeysOutputIterator - random-access iterator type of the output range. Must meet the
/// requirements of a C++ OutputIterator concept. It can be a simple pointer type.
/// \tparam OffsetIterator - random-access iterator type of segment offsets. Must meet the
/// requirements of a C++ InputIterator concept. It can be a simple pointer type.
///
/// \param [in] temporary_storage - pointer to a device-accessible temporary storage. When
/// a null pointer is passed, the required allocation size (in bytes) is written to
/// \p storage_size and function returns without performing the sort operation.
/// \param [in,out] storage_size - reference to a size (in bytes) of \p temporary_storage.
/// \param [in] keys_input - pointer to the first element in the range to sort.
/// \param [out] keys_output - pointer to the first element in the output range.
/// \param [in] size - number of elements in the input range.
/// \param [in] segments - number of segments in the input range.
/// \param [in] begin_offsets - iterator to the first element in the range of beginning offsets.
/// \param [in] end_offsets - iterator to the first element in the range of ending offsets.
/// \param [in] begin_bit - [optional] index of the first (least significant) bit used in
/// key comparison. Must be in range <tt>[0; 8 * sizeof(Key))</tt>. Default value: \p 0.
/// Non-default value not supported for floating-point key-types.
/// \param [in] end_bit - [optional] past-the-end index (most significant) bit used in
/// key comparison. Must be in range <tt>(begin_bit; 8 * sizeof(Key)]</tt>. Default
/// value: \p <tt>8 * sizeof(Key)</tt>. Non-default value not supported for floating-point key-types.
/// \param [in] stream - [optional] HIP stream object. Default is \p 0 (default stream).
/// \param [in] debug_synchronous - [optional] If true, synchronization after every kernel
/// launch is forced in order to check for errors. Default value is \p false.
///
/// \returns \p cudaSuccess (\p 0) after successful sort; otherwise a HIP runtime error of
/// type \p cudaError_t.
///
/// \par Example
/// \parblock
/// In this example a device-level ascending radix sort is performed on an array of
/// \p float values.
///
/// \code{.cpp}
/// #include <rocprim/rocprim.hpp>
///
/// // Prepare input and output (declare pointers, allocate device memory etc.)
/// size_t input_size; // e.g., 8
/// float * input; // e.g., [0.6, 0.3, 0.65, 0.4, 0.2, 0.08, 1, 0.7]
/// float * output; // empty array of 8 elements
/// unsigned int segments; // e.g., 3
/// int * offsets; // e.g. [0, 2, 3, 8]
///
/// size_t temporary_storage_size_bytes;
/// void * temporary_storage_ptr = nullptr;
/// // Get required size of the temporary storage
/// rocprim::segmented_radix_sort_keys(
/// temporary_storage_ptr, temporary_storage_size_bytes,
/// input, output, input_size,
/// segments, offsets, offsets + 1
/// );
///
/// // allocate temporary storage
/// cudaMalloc(&temporary_storage_ptr, temporary_storage_size_bytes);
///
/// // perform sort
/// rocprim::segmented_radix_sort_keys(
/// temporary_storage_ptr, temporary_storage_size_bytes,
/// input, output, input_size,
/// segments, offsets, offsets + 1
/// );
/// // keys_output: [0.3, 0.6, 0.65, 0.08, 0.2, 0.4, 0.7, 1]
/// \endcode
/// \endparblock
template<class Config = default_config,
         class KeysInputIterator,
         class KeysOutputIterator,
         class OffsetIterator,
         class Key = typename std::iterator_traits<KeysInputIterator>::value_type>
inline
cudaError_t segmented_radix_sort_keys(void*              temporary_storage,
                                      size_t&            storage_size,
                                      KeysInputIterator  keys_input,
                                      KeysOutputIterator keys_output,
                                      unsigned int       size,
                                      unsigned int       segments,
                                      OffsetIterator     begin_offsets,
                                      OffsetIterator     end_offsets,
                                      unsigned int       begin_bit = 0,
                                      unsigned int       end_bit = 8 * sizeof(Key),
                                      cudaStream_t       stream = 0,
                                      bool               debug_synchronous = false)
{
    // Ascending order is selected through the second template argument.
    constexpr bool sort_descending = false;

    // Keys-only sort: a null empty_type pointer stands in for both value ranges,
    // and no caller-provided temporary buffers are passed (nullptr).
    empty_type* no_values = nullptr;

    // Assigned by the implementation; this entry point has no use for it.
    bool result_in_output_unused;

    return detail::segmented_radix_sort_impl<Config, sort_descending>(temporary_storage,
                                                                      storage_size,
                                                                      keys_input,
                                                                      nullptr,
                                                                      keys_output,
                                                                      no_values,
                                                                      nullptr,
                                                                      no_values,
                                                                      size,
                                                                      result_in_output_unused,
                                                                      segments,
                                                                      begin_offsets,
                                                                      end_offsets,
                                                                      begin_bit,
                                                                      end_bit,
                                                                      stream,
                                                                      debug_synchronous);
}
/// \brief Parallel descending radix sort primitive for device level.
///
/// \p segmented_radix_sort_keys_desc function performs a device-wide radix sort across multiple,
/// non-overlapping sequences of keys. Function sorts input keys in descending order.
///
/// \par Overview
/// * The contents of the inputs are not altered by the sorting function.
/// * Returns the required size of \p temporary_storage in \p storage_size
/// if \p temporary_storage is a null pointer.
/// * \p Key type (a \p value_type of \p KeysInputIterator and \p KeysOutputIterator) must be
/// an arithmetic type (that is, an integral type or a floating-point type).
/// * Ranges specified by \p keys_input and \p keys_output must have at least \p size elements.
/// * Ranges specified by \p begin_offsets and \p end_offsets must have
/// at least \p segments elements. They may use the same sequence <tt>offsets</tt> of at least
/// <tt>segments + 1</tt> elements: <tt>offsets</tt> for \p begin_offsets and
/// <tt>offsets + 1</tt> for \p end_offsets.
/// * If \p Key is an integer type and the range of keys is known in advance, the performance
/// can be improved by setting \p begin_bit and \p end_bit, for example if all keys are in range
/// [100, 10000], <tt>begin_bit = 0</tt> and <tt>end_bit = 14</tt> will cover the whole range.
///
/// \tparam Config - [optional] configuration of the primitive. It can be
/// \p segmented_radix_sort_config or a custom class with the same members.
/// \tparam KeysInputIterator - random-access iterator type of the input range. Must meet the
/// requirements of a C++ InputIterator concept. It can be a simple pointer type.
/// \tparam KeysOutputIterator - random-access iterator type of the output range. Must meet the
/// requirements of a C++ OutputIterator concept. It can be a simple pointer type.
/// \tparam OffsetIterator - random-access iterator type of segment offsets. Must meet the
/// requirements of a C++ InputIterator concept. It can be a simple pointer type.
///
/// \param [in] temporary_storage - pointer to a device-accessible temporary storage. When
/// a null pointer is passed, the required allocation size (in bytes) is written to
/// \p storage_size and function returns without performing the sort operation.
/// \param [in,out] storage_size - reference to a size (in bytes) of \p temporary_storage.
/// \param [in] keys_input - pointer to the first element in the range to sort.
/// \param [out] keys_output - pointer to the first element in the output range.
/// \param [in] size - number of elements in the input range.
/// \param [in] segments - number of segments in the input range.
/// \param [in] begin_offsets - iterator to the first element in the range of beginning offsets.
/// \param [in] end_offsets - iterator to the first element in the range of ending offsets.
/// \param [in] begin_bit - [optional] index of the first (least significant) bit used in
/// key comparison. Must be in range <tt>[0; 8 * sizeof(Key))</tt>. Default value: \p 0.
/// Non-default value not supported for floating-point key-types.
/// \param [in] end_bit - [optional] past-the-end index (most significant) bit used in
/// key comparison. Must be in range <tt>(begin_bit; 8 * sizeof(Key)]</tt>. Default
/// value: \p <tt>8 * sizeof(Key)</tt>. Non-default value not supported for floating-point key-types.
/// \param [in] stream - [optional] HIP stream object. Default is \p 0 (default stream).
/// \param [in] debug_synchronous - [optional] If true, synchronization after every kernel
/// launch is forced in order to check for errors. Default value is \p false.
///
/// \returns \p cudaSuccess (\p 0) after successful sort; otherwise a HIP runtime error of
/// type \p cudaError_t.
///
/// \par Example
/// \parblock
/// In this example a device-level descending radix sort is performed on an array of
/// integer values.
///
/// \code{.cpp}
/// #include <rocprim/rocprim.hpp>
///
/// // Prepare input and output (declare pointers, allocate device memory etc.)
/// size_t input_size; // e.g., 8
/// int * input; // e.g., [6, 3, 5, 4, 2, 8, 1, 7]
/// int * output; // empty array of 8 elements
/// unsigned int segments; // e.g., 3
/// int * offsets; // e.g. [0, 2, 3, 8]
///
/// size_t temporary_storage_size_bytes;
/// void * temporary_storage_ptr = nullptr;
/// // Get required size of the temporary storage
/// rocprim::segmented_radix_sort_keys_desc(
/// temporary_storage_ptr, temporary_storage_size_bytes,
/// input, output, input_size,
/// segments, offsets, offsets + 1
/// );
///
/// // allocate temporary storage
/// cudaMalloc(&temporary_storage_ptr, temporary_storage_size_bytes);
///
/// // perform sort
/// rocprim::segmented_radix_sort_keys_desc(
/// temporary_storage_ptr, temporary_storage_size_bytes,
/// input, output, input_size,
/// segments, offsets, offsets + 1
/// );
/// // keys_output: [6, 3, 5, 8, 7, 4, 2, 1]
/// \endcode
/// \endparblock
template<class Config = default_config,
         class KeysInputIterator,
         class KeysOutputIterator,
         class OffsetIterator,
         class Key = typename std::iterator_traits<KeysInputIterator>::value_type>
inline
cudaError_t segmented_radix_sort_keys_desc(void*              temporary_storage,
                                           size_t&            storage_size,
                                           KeysInputIterator  keys_input,
                                           KeysOutputIterator keys_output,
                                           unsigned int       size,
                                           unsigned int       segments,
                                           OffsetIterator     begin_offsets,
                                           OffsetIterator     end_offsets,
                                           unsigned int       begin_bit = 0,
                                           unsigned int       end_bit = 8 * sizeof(Key),
                                           cudaStream_t       stream = 0,
                                           bool               debug_synchronous = false)
{
    // Descending order is selected through the second template argument.
    constexpr bool sort_descending = true;

    // Keys-only sort: a null empty_type pointer stands in for both value ranges,
    // and no caller-provided temporary buffers are passed (nullptr).
    empty_type* no_values = nullptr;

    // Assigned by the implementation; this entry point has no use for it.
    bool result_in_output_unused;

    return detail::segmented_radix_sort_impl<Config, sort_descending>(temporary_storage,
                                                                      storage_size,
                                                                      keys_input,
                                                                      nullptr,
                                                                      keys_output,
                                                                      no_values,
                                                                      nullptr,
                                                                      no_values,
                                                                      size,
                                                                      result_in_output_unused,
                                                                      segments,
                                                                      begin_offsets,
                                                                      end_offsets,
                                                                      begin_bit,
                                                                      end_bit,
                                                                      stream,
                                                                      debug_synchronous);
}
/// \brief Parallel ascending radix sort-by-key primitive for device level.
///
/// \p segmented_radix_sort_pairs function performs a device-wide radix sort across multiple,
/// non-overlapping sequences of (key, value) pairs. Function sorts input pairs in ascending order of keys.
///
/// \par Overview
/// * The contents of the inputs are not altered by the sorting function.
/// * Returns the required size of \p temporary_storage in \p storage_size
/// if \p temporary_storage is a null pointer.
/// * \p Key type (a \p value_type of \p KeysInputIterator and \p KeysOutputIterator) must be
/// an arithmetic type (that is, an integral type or a floating-point type).
/// * Ranges specified by \p keys_input, \p keys_output, \p values_input and \p values_output must
/// have at least \p size elements.
/// * Ranges specified by \p begin_offsets and \p end_offsets must have
/// at least \p segments elements. They may use the same sequence <tt>offsets</tt> of at least
/// <tt>segments + 1</tt> elements: <tt>offsets</tt> for \p begin_offsets and
/// <tt>offsets + 1</tt> for \p end_offsets.
/// * If \p Key is an integer type and the range of keys is known in advance, the performance
/// can be improved by setting \p begin_bit and \p end_bit, for example if all keys are in range
/// [100, 10000], <tt>begin_bit = 0</tt> and <tt>end_bit = 14</tt> will cover the whole range.
///
/// \tparam Config - [optional] configuration of the primitive. It can be
/// \p segmented_radix_sort_config or a custom class with the same members.
/// \tparam KeysInputIterator - random-access iterator type of the input range. Must meet the
/// requirements of a C++ InputIterator concept. It can be a simple pointer type.
/// \tparam KeysOutputIterator - random-access iterator type of the output range. Must meet the
/// requirements of a C++ OutputIterator concept. It can be a simple pointer type.
/// \tparam ValuesInputIterator - random-access iterator type of the input range. Must meet the
/// requirements of a C++ InputIterator concept. It can be a simple pointer type.
/// \tparam ValuesOutputIterator - random-access iterator type of the output range. Must meet the
/// requirements of a C++ OutputIterator concept. It can be a simple pointer type.
/// \tparam OffsetIterator - random-access iterator type of segment offsets. Must meet the
/// requirements of a C++ InputIterator concept. It can be a simple pointer type.
///
/// \param [in] temporary_storage - pointer to a device-accessible temporary storage. When
/// a null pointer is passed, the required allocation size (in bytes) is written to
/// \p storage_size and function returns without performing the sort operation.
/// \param [in,out] storage_size - reference to a size (in bytes) of \p temporary_storage.
/// \param [in] keys_input - pointer to the first element in the range to sort.
/// \param [out] keys_output - pointer to the first element in the output range.
/// \param [in] values_input - pointer to the first element in the range to sort.
/// \param [out] values_output - pointer to the first element in the output range.
/// \param [in] size - number of elements in the input range.
/// \param [in] segments - number of segments in the input range.
/// \param [in] begin_offsets - iterator to the first element in the range of beginning offsets.
/// \param [in] end_offsets - iterator to the first element in the range of ending offsets.
/// \param [in] begin_bit - [optional] index of the first (least significant) bit used in
/// key comparison. Must be in range <tt>[0; 8 * sizeof(Key))</tt>. Default value: \p 0.
/// Non-default value not supported for floating-point key-types.
/// \param [in] end_bit - [optional] past-the-end index (most significant) bit used in
/// key comparison. Must be in range <tt>(begin_bit; 8 * sizeof(Key)]</tt>. Default
/// value: \p <tt>8 * sizeof(Key)</tt>. Non-default value not supported for floating-point key-types.
/// \param [in] stream - [optional] HIP stream object. Default is \p 0 (default stream).
/// \param [in] debug_synchronous - [optional] If true, synchronization after every kernel
/// launch is forced in order to check for errors. Default value is \p false.
///
/// \returns \p cudaSuccess (\p 0) after successful sort; otherwise a HIP runtime error of
/// type \p cudaError_t.
///
/// \par Example
/// \parblock
/// In this example a device-level ascending radix sort is performed where input keys are
/// represented by an array of unsigned integers and input values by an array of <tt>double</tt>s.
///
/// \code{.cpp}
/// #include <rocprim/rocprim.hpp>
///
/// // Prepare input and output (declare pointers, allocate device memory etc.)
/// size_t input_size; // e.g., 8
/// unsigned int * keys_input; // e.g., [ 6, 3, 5, 4, 1, 8, 1, 7]
/// double * values_input; // e.g., [-5, 2, -4, 3, -1, -8, -2, 7]
/// unsigned int * keys_output; // empty array of 8 elements
/// double * values_output; // empty array of 8 elements
/// unsigned int segments; // e.g., 3
/// int * offsets; // e.g. [0, 2, 3, 8]
///
/// // Keys are in range [0; 8], so we can limit compared bit to bits on indexes
/// // 0, 1, 2, 3, and 4. In order to do this begin_bit is set to 0 and end_bit
/// // is set to 5.
///
/// size_t temporary_storage_size_bytes;
/// void * temporary_storage_ptr = nullptr;
/// // Get required size of the temporary storage
/// rocprim::segmented_radix_sort_pairs(
/// temporary_storage_ptr, temporary_storage_size_bytes,
/// keys_input, keys_output, values_input, values_output, input_size,
/// segments, offsets, offsets + 1,
/// 0, 5
/// );
///
/// // allocate temporary storage
/// cudaMalloc(&temporary_storage_ptr, temporary_storage_size_bytes);
///
/// // perform sort
/// rocprim::segmented_radix_sort_pairs(
/// temporary_storage_ptr, temporary_storage_size_bytes,
/// keys_input, keys_output, values_input, values_output, input_size,
/// segments, offsets, offsets + 1,
/// 0, 5
/// );
/// // keys_output: [3, 6, 5, 1, 1, 4, 7, 8]
/// // values_output: [2, -5, -4, -1, -2, 3, 7, -8]
/// \endcode
/// \endparblock
template<class Config = default_config,
         class KeysInputIterator,
         class KeysOutputIterator,
         class ValuesInputIterator,
         class ValuesOutputIterator,
         class OffsetIterator,
         class Key = typename std::iterator_traits<KeysInputIterator>::value_type>
inline cudaError_t segmented_radix_sort_pairs(void*                temporary_storage,
                                              size_t&              storage_size,
                                              KeysInputIterator    keys_input,
                                              KeysOutputIterator   keys_output,
                                              ValuesInputIterator  values_input,
                                              ValuesOutputIterator values_output,
                                              unsigned int         size,
                                              unsigned int         segments,
                                              OffsetIterator       begin_offsets,
                                              OffsetIterator       end_offsets,
                                              unsigned int         begin_bit = 0,
                                              unsigned int         end_bit   = 8 * sizeof(Key),
                                              cudaStream_t         stream    = 0,
                                              bool                 debug_synchronous = false)
{
    // Second template argument of the shared implementation: false here for the
    // ascending sort, true in the *_desc counterpart.
    constexpr bool descending = false;

    // The implementation reports a flag through this reference; this overload has
    // separate input and output ranges and never consults it.
    bool unused_result_flag;

    // The nullptr arguments sit in the slots that the double_buffer overloads fill
    // with a buffer pointer. NOTE(review): presumably they tell the implementation
    // that no caller-provided scratch buffer exists; confirm against
    // detail::segmented_radix_sort_impl.
    const cudaError_t status = detail::segmented_radix_sort_impl<Config, descending>(
        temporary_storage,
        storage_size,
        keys_input,
        nullptr,
        keys_output,
        values_input,
        nullptr,
        values_output,
        size,
        unused_result_flag,
        segments,
        begin_offsets,
        end_offsets,
        begin_bit,
        end_bit,
        stream,
        debug_synchronous);
    return status;
}
/// \brief Parallel descending radix sort-by-key primitive for device level.
///
/// \p segmented_radix_sort_pairs_desc function performs a device-wide radix sort across multiple,
/// non-overlapping sequences of (key, value) pairs. Function sorts input pairs in descending order of keys.
///
/// \par Overview
/// * The contents of the inputs are not altered by the sorting function.
/// * Returns the required size of \p temporary_storage in \p storage_size
/// if \p temporary_storage is a null pointer.
/// * \p Key type (a \p value_type of \p KeysInputIterator and \p KeysOutputIterator) must be
/// an arithmetic type (that is, an integral type or a floating-point type).
/// * Ranges specified by \p keys_input, \p keys_output, \p values_input and \p values_output must
/// have at least \p size elements.
/// * Ranges specified by \p begin_offsets and \p end_offsets must have
/// at least \p segments elements. They may use the same sequence <tt>offsets</tt> of at least
/// <tt>segments + 1</tt> elements: <tt>offsets</tt> for \p begin_offsets and
/// <tt>offsets + 1</tt> for \p end_offsets.
/// * If \p Key is an integer type and the range of keys is known in advance, the performance
/// can be improved by setting \p begin_bit and \p end_bit, for example if all keys are in range
/// [100, 10000], <tt>begin_bit = 0</tt> and <tt>end_bit = 14</tt> will cover the whole range.
///
/// \tparam Config - [optional] configuration of the primitive. It can be
/// \p segmented_radix_sort_config or a custom class with the same members.
/// \tparam KeysInputIterator - random-access iterator type of the input range. Must meet the
/// requirements of a C++ InputIterator concept. It can be a simple pointer type.
/// \tparam KeysOutputIterator - random-access iterator type of the output range. Must meet the
/// requirements of a C++ OutputIterator concept. It can be a simple pointer type.
/// \tparam ValuesInputIterator - random-access iterator type of the input range. Must meet the
/// requirements of a C++ InputIterator concept. It can be a simple pointer type.
/// \tparam ValuesOutputIterator - random-access iterator type of the output range. Must meet the
/// requirements of a C++ OutputIterator concept. It can be a simple pointer type.
/// \tparam OffsetIterator - random-access iterator type of segment offsets. Must meet the
/// requirements of a C++ InputIterator concept. It can be a simple pointer type.
///
/// \param [in] temporary_storage - pointer to a device-accessible temporary storage. When
/// a null pointer is passed, the required allocation size (in bytes) is written to
/// \p storage_size and function returns without performing the sort operation.
/// \param [in,out] storage_size - reference to a size (in bytes) of \p temporary_storage.
/// \param [in] keys_input - pointer to the first element in the range to sort.
/// \param [out] keys_output - pointer to the first element in the output range.
/// \param [in] values_input - pointer to the first element in the range to sort.
/// \param [out] values_output - pointer to the first element in the output range.
/// \param [in] size - number of elements in the input range.
/// \param [in] segments - number of segments in the input range.
/// \param [in] begin_offsets - iterator to the first element in the range of beginning offsets.
/// \param [in] end_offsets - iterator to the first element in the range of ending offsets.
/// \param [in] begin_bit - [optional] index of the first (least significant) bit used in
/// key comparison. Must be in range <tt>[0; 8 * sizeof(Key))</tt>. Default value: \p 0.
/// Non-default value not supported for floating-point key-types.
/// \param [in] end_bit - [optional] past-the-end index (most significant) bit used in
/// key comparison. Must be in range <tt>(begin_bit; 8 * sizeof(Key)]</tt>. Default
/// value: \p <tt>8 * sizeof(Key)</tt>. Non-default value not supported for floating-point key-types.
/// \param [in] stream - [optional] HIP stream object. Default is \p 0 (default stream).
/// \param [in] debug_synchronous - [optional] If true, synchronization after every kernel
/// launch is forced in order to check for errors. Default value is \p false.
///
/// \returns \p cudaSuccess (\p 0) after successful sort; otherwise a HIP runtime error of
/// type \p cudaError_t.
///
/// \par Example
/// \parblock
/// In this example a device-level descending radix sort is performed where input keys are
/// represented by an array of integers and input values by an array of <tt>double</tt>s.
///
/// \code{.cpp}
/// #include <rocprim/rocprim.hpp>
///
/// // Prepare input and output (declare pointers, allocate device memory etc.)
/// size_t input_size; // e.g., 8
/// int * keys_input; // e.g., [ 6, 3, 5, 4, 1, 8, 1, 7]
/// double * values_input; // e.g., [-5, 2, -4, 3, -1, -8, -2, 7]
/// int * keys_output; // empty array of 8 elements
/// double * values_output; // empty array of 8 elements
/// unsigned int segments; // e.g., 3
/// int * offsets; // e.g. [0, 2, 3, 8]
///
/// size_t temporary_storage_size_bytes;
/// void * temporary_storage_ptr = nullptr;
/// // Get required size of the temporary storage
/// rocprim::segmented_radix_sort_pairs_desc(
/// temporary_storage_ptr, temporary_storage_size_bytes,
/// keys_input, keys_output, values_input, values_output,
/// input_size,
/// segments, offsets, offsets + 1
/// );
///
/// // allocate temporary storage
/// cudaMalloc(&temporary_storage_ptr, temporary_storage_size_bytes);
///
/// // perform sort
/// rocprim::segmented_radix_sort_pairs_desc(
/// temporary_storage_ptr, temporary_storage_size_bytes,
/// keys_input, keys_output, values_input, values_output,
/// input_size,
/// segments, offsets, offsets + 1
/// );
/// // keys_output: [ 6, 3, 5, 8, 7, 4, 1, 1]
/// // values_output: [-5, 2, -4, -8, 7, 3, -1, -2]
/// \endcode
/// \endparblock
template<class Config = default_config,
         class KeysInputIterator,
         class KeysOutputIterator,
         class ValuesInputIterator,
         class ValuesOutputIterator,
         class OffsetIterator,
         class Key = typename std::iterator_traits<KeysInputIterator>::value_type>
inline cudaError_t segmented_radix_sort_pairs_desc(void*                temporary_storage,
                                                   size_t&              storage_size,
                                                   KeysInputIterator    keys_input,
                                                   KeysOutputIterator   keys_output,
                                                   ValuesInputIterator  values_input,
                                                   ValuesOutputIterator values_output,
                                                   unsigned int         size,
                                                   unsigned int         segments,
                                                   OffsetIterator       begin_offsets,
                                                   OffsetIterator       end_offsets,
                                                   unsigned int         begin_bit = 0,
                                                   unsigned int         end_bit = 8 * sizeof(Key),
                                                   cudaStream_t         stream  = 0,
                                                   bool                 debug_synchronous = false)
{
    // Second template argument of the shared implementation: true here for the
    // descending sort, false in the ascending counterpart.
    constexpr bool descending = true;

    // The implementation reports a flag through this reference; this overload has
    // separate input and output ranges and never consults it.
    bool unused_result_flag;

    // The nullptr arguments sit in the slots that the double_buffer overloads fill
    // with a buffer pointer. NOTE(review): presumably they tell the implementation
    // that no caller-provided scratch buffer exists; confirm against
    // detail::segmented_radix_sort_impl.
    const cudaError_t status = detail::segmented_radix_sort_impl<Config, descending>(
        temporary_storage,
        storage_size,
        keys_input,
        nullptr,
        keys_output,
        values_input,
        nullptr,
        values_output,
        size,
        unused_result_flag,
        segments,
        begin_offsets,
        end_offsets,
        begin_bit,
        end_bit,
        stream,
        debug_synchronous);
    return status;
}
/// \brief Parallel ascending radix sort primitive for device level.
///
/// \p segmented_radix_sort_keys function performs a device-wide radix sort across multiple,
/// non-overlapping sequences of keys. Function sorts input keys in ascending order.
///
/// \par Overview
/// * The contents of both buffers of \p keys may be altered by the sorting function.
/// * \p current() of \p keys is used as the input.
/// * The function will update \p current() of \p keys to point to the buffer
/// that contains the output range.
/// * Returns the required size of \p temporary_storage in \p storage_size
/// if \p temporary_storage is a null pointer.
/// * The function requires small \p temporary_storage as it does not need
/// a temporary buffer of \p size elements.
/// * \p Key type must be an arithmetic type (that is, an integral type or a floating-point
/// type).
/// * Buffers of \p keys must have at least \p size elements.
/// * Ranges specified by \p begin_offsets and \p end_offsets must have
/// at least \p segments elements. They may use the same sequence <tt>offsets</tt> of at least
/// <tt>segments + 1</tt> elements: <tt>offsets</tt> for \p begin_offsets and
/// <tt>offsets + 1</tt> for \p end_offsets.
/// * If \p Key is an integer type and the range of keys is known in advance, the performance
/// can be improved by setting \p begin_bit and \p end_bit, for example if all keys are in range
/// [100, 10000], <tt>begin_bit = 0</tt> and <tt>end_bit = 14</tt> will cover the whole range.
///
/// \tparam Config - [optional] configuration of the primitive. It can be
/// \p segmented_radix_sort_config or a custom class with the same members.
/// \tparam Key - key type. Must be an integral type or a floating-point type.
/// \tparam OffsetIterator - random-access iterator type of segment offsets. Must meet the
/// requirements of a C++ InputIterator concept. It can be a simple pointer type.
///
/// \param [in] temporary_storage - pointer to a device-accessible temporary storage. When
/// a null pointer is passed, the required allocation size (in bytes) is written to
/// \p storage_size and function returns without performing the sort operation.
/// \param [in,out] storage_size - reference to a size (in bytes) of \p temporary_storage.
/// \param [in,out] keys - reference to the double-buffer of keys, its \p current()
/// contains the input range and will be updated to point to the output range.
/// \param [in] size - number of elements in the input range.
/// \param [in] segments - number of segments in the input range.
/// \param [in] begin_offsets - iterator to the first element in the range of beginning offsets.
/// \param [in] end_offsets - iterator to the first element in the range of ending offsets.
/// \param [in] begin_bit - [optional] index of the first (least significant) bit used in
/// key comparison. Must be in range <tt>[0; 8 * sizeof(Key))</tt>. Default value: \p 0.
/// Non-default value not supported for floating-point key-types.
/// \param [in] end_bit - [optional] past-the-end index (most significant) bit used in
/// key comparison. Must be in range <tt>(begin_bit; 8 * sizeof(Key)]</tt>. Default
/// value: \p <tt>8 * sizeof(Key)</tt>. Non-default value not supported for floating-point key-types.
/// \param [in] stream - [optional] HIP stream object. Default is \p 0 (default stream).
/// \param [in] debug_synchronous - [optional] If true, synchronization after every kernel
/// launch is forced in order to check for errors. Default value is \p false.
///
/// \returns \p cudaSuccess (\p 0) after successful sort; otherwise a HIP runtime error of
/// type \p cudaError_t.
///
/// \par Example
/// \parblock
/// In this example a device-level ascending radix sort is performed on an array of
/// \p float values.
///
/// \code{.cpp}
/// #include <rocprim/rocprim.hpp>
///
/// // Prepare input and tmp (declare pointers, allocate device memory etc.)
/// size_t input_size; // e.g., 8
/// float * input; // e.g., [0.6, 0.3, 0.65, 0.4, 0.2, 0.08, 1, 0.7]
/// float * tmp; // empty array of 8 elements
/// unsigned int segments; // e.g., 3
/// int * offsets; // e.g. [0, 2, 3, 8]
/// // Create double-buffer
/// rocprim::double_buffer<float> keys(input, tmp);
///
/// size_t temporary_storage_size_bytes;
/// void * temporary_storage_ptr = nullptr;
/// // Get required size of the temporary storage
/// rocprim::segmented_radix_sort_keys(
/// temporary_storage_ptr, temporary_storage_size_bytes,
/// keys, input_size,
/// segments, offsets, offsets + 1
/// );
///
/// // allocate temporary storage
/// cudaMalloc(&temporary_storage_ptr, temporary_storage_size_bytes);
///
/// // perform sort
/// rocprim::segmented_radix_sort_keys(
/// temporary_storage_ptr, temporary_storage_size_bytes,
/// keys, input_size,
/// segments, offsets, offsets + 1
/// );
/// // keys.current(): [0.3, 0.6, 0.65, 0.08, 0.2, 0.4, 0.7, 1]
/// \endcode
/// \endparblock
template<class Config = default_config, class Key, class OffsetIterator>
inline cudaError_t segmented_radix_sort_keys(void*               temporary_storage,
                                             size_t&             storage_size,
                                             double_buffer<Key>& keys,
                                             unsigned int        size,
                                             unsigned int        segments,
                                             OffsetIterator      begin_offsets,
                                             OffsetIterator      end_offsets,
                                             unsigned int        begin_bit = 0,
                                             unsigned int        end_bit   = 8 * sizeof(Key),
                                             cudaStream_t        stream    = 0,
                                             bool                debug_synchronous = false)
{
    // Keys-only sort: a typed null pointer of empty_type fills all three value
    // slots of the shared key/value implementation.
    empty_type* values = nullptr;

    // Assigned by the implementation through a reference. It starts as false so
    // the check below never reads an indeterminate value if the implementation
    // returns before assigning it; keys.current() is then left unchanged.
    bool is_result_in_output = false;

    // keys.current() is passed as the input range, keys.alternate() as the output
    // range. The second template argument is false for ascending order.
    const cudaError_t error = detail::segmented_radix_sort_impl<Config, false>(
        temporary_storage, storage_size,
        keys.current(), keys.current(), keys.alternate(),
        values, values, values,
        size, is_result_in_output,
        segments, begin_offsets, end_offsets,
        begin_bit, end_bit,
        stream, debug_synchronous
    );

    // A null temporary_storage is only a query for storage_size, so no sort ran.
    // Otherwise swap the buffers when the implementation reports that the sorted
    // data is in the range passed as output (keys.alternate()), so that
    // keys.current() refers to the result.
    if(temporary_storage != nullptr && is_result_in_output)
    {
        keys.swap();
    }
    return error;
}
/// \brief Parallel descending radix sort primitive for device level.
///
/// \p segmented_radix_sort_keys_desc function performs a device-wide radix sort across multiple,
/// non-overlapping sequences of keys. Function sorts input keys in descending order.
///
/// \par Overview
/// * The contents of both buffers of \p keys may be altered by the sorting function.
/// * \p current() of \p keys is used as the input.
/// * The function will update \p current() of \p keys to point to the buffer
/// that contains the output range.
/// * Returns the required size of \p temporary_storage in \p storage_size
/// if \p temporary_storage is a null pointer.
/// * The function requires small \p temporary_storage as it does not need
/// a temporary buffer of \p size elements.
/// * \p Key type must be an arithmetic type (that is, an integral type or a floating-point
/// type).
/// * Buffers of \p keys must have at least \p size elements.
/// * Ranges specified by \p begin_offsets and \p end_offsets must have
/// at least \p segments elements. They may use the same sequence <tt>offsets</tt> of at least
/// <tt>segments + 1</tt> elements: <tt>offsets</tt> for \p begin_offsets and
/// <tt>offsets + 1</tt> for \p end_offsets.
/// * If \p Key is an integer type and the range of keys is known in advance, the performance
/// can be improved by setting \p begin_bit and \p end_bit, for example if all keys are in range
/// [100, 10000], <tt>begin_bit = 0</tt> and <tt>end_bit = 14</tt> will cover the whole range.
///
/// \tparam Config - [optional] configuration of the primitive. It can be
/// \p segmented_radix_sort_config or a custom class with the same members.
/// \tparam Key - key type. Must be an integral type or a floating-point type.
/// \tparam OffsetIterator - random-access iterator type of segment offsets. Must meet the
/// requirements of a C++ InputIterator concept. It can be a simple pointer type.
///
/// \param [in] temporary_storage - pointer to a device-accessible temporary storage. When
/// a null pointer is passed, the required allocation size (in bytes) is written to
/// \p storage_size and function returns without performing the sort operation.
/// \param [in,out] storage_size - reference to a size (in bytes) of \p temporary_storage.
/// \param [in,out] keys - reference to the double-buffer of keys, its \p current()
/// contains the input range and will be updated to point to the output range.
/// \param [in] size - number of elements in the input range.
/// \param [in] segments - number of segments in the input range.
/// \param [in] begin_offsets - iterator to the first element in the range of beginning offsets.
/// \param [in] end_offsets - iterator to the first element in the range of ending offsets.
/// \param [in] begin_bit - [optional] index of the first (least significant) bit used in
/// key comparison. Must be in range <tt>[0; 8 * sizeof(Key))</tt>. Default value: \p 0.
/// Non-default value not supported for floating-point key-types.
/// \param [in] end_bit - [optional] past-the-end index (most significant) bit used in
/// key comparison. Must be in range <tt>(begin_bit; 8 * sizeof(Key)]</tt>. Default
/// value: \p <tt>8 * sizeof(Key)</tt>. Non-default value not supported for floating-point key-types.
/// \param [in] stream - [optional] HIP stream object. Default is \p 0 (default stream).
/// \param [in] debug_synchronous - [optional] If true, synchronization after every kernel
/// launch is forced in order to check for errors. Default value is \p false.
///
/// \returns \p cudaSuccess (\p 0) after successful sort; otherwise a HIP runtime error of
/// type \p cudaError_t.
///
/// \par Example
/// \parblock
/// In this example a device-level descending radix sort is performed on an array of
/// integer values.
///
/// \code{.cpp}
/// #include <rocprim/rocprim.hpp>
///
/// // Prepare input and tmp (declare pointers, allocate device memory etc.)
/// size_t input_size; // e.g., 8
/// int * input; // e.g., [6, 3, 5, 4, 2, 8, 1, 7]
/// int * tmp; // empty array of 8 elements
/// unsigned int segments; // e.g., 3
/// int * offsets; // e.g. [0, 2, 3, 8]
/// // Create double-buffer
/// rocprim::double_buffer<int> keys(input, tmp);
///
/// size_t temporary_storage_size_bytes;
/// void * temporary_storage_ptr = nullptr;
/// // Get required size of the temporary storage
/// rocprim::segmented_radix_sort_keys_desc(
/// temporary_storage_ptr, temporary_storage_size_bytes,
/// keys, input_size,
/// segments, offsets, offsets + 1
/// );
///
/// // allocate temporary storage
/// cudaMalloc(&temporary_storage_ptr, temporary_storage_size_bytes);
///
/// // perform sort
/// rocprim::segmented_radix_sort_keys_desc(
/// temporary_storage_ptr, temporary_storage_size_bytes,
/// keys, input_size,
/// segments, offsets, offsets + 1
/// );
/// // keys.current(): [6, 3, 5, 8, 7, 4, 2, 1]
/// \endcode
/// \endparblock
template<class Config = default_config, class Key, class OffsetIterator>
inline cudaError_t segmented_radix_sort_keys_desc(void*               temporary_storage,
                                                  size_t&             storage_size,
                                                  double_buffer<Key>& keys,
                                                  unsigned int        size,
                                                  unsigned int        segments,
                                                  OffsetIterator      begin_offsets,
                                                  OffsetIterator      end_offsets,
                                                  unsigned int        begin_bit = 0,
                                                  unsigned int        end_bit = 8 * sizeof(Key),
                                                  cudaStream_t        stream  = 0,
                                                  bool                debug_synchronous = false)
{
    // Keys-only sort: a typed null pointer of empty_type fills all three value
    // slots of the shared key/value implementation.
    empty_type* values = nullptr;

    // Assigned by the implementation through a reference. It starts as false so
    // the check below never reads an indeterminate value if the implementation
    // returns before assigning it; keys.current() is then left unchanged.
    bool is_result_in_output = false;

    // keys.current() is passed as the input range, keys.alternate() as the output
    // range. The second template argument is true for descending order.
    const cudaError_t error = detail::segmented_radix_sort_impl<Config, true>(
        temporary_storage, storage_size,
        keys.current(), keys.current(), keys.alternate(),
        values, values, values,
        size, is_result_in_output,
        segments, begin_offsets, end_offsets,
        begin_bit, end_bit,
        stream, debug_synchronous
    );

    // A null temporary_storage is only a query for storage_size, so no sort ran.
    // Otherwise swap the buffers when the implementation reports that the sorted
    // data is in the range passed as output (keys.alternate()), so that
    // keys.current() refers to the result.
    if(temporary_storage != nullptr && is_result_in_output)
    {
        keys.swap();
    }
    return error;
}
/// \brief Parallel ascending radix sort-by-key primitive for device level.
///
/// \p segmented_radix_sort_pairs function performs a device-wide radix sort across multiple,
/// non-overlapping sequences of (key, value) pairs. Function sorts input pairs in ascending order of keys.
///
/// \par Overview
/// * The contents of both buffers of \p keys and \p values may be altered by the sorting function.
/// * \p current() of \p keys and \p values are used as the input.
/// * The function will update \p current() of \p keys and \p values to point to buffers
/// that contain the output range.
/// * Returns the required size of \p temporary_storage in \p storage_size
/// if \p temporary_storage is a null pointer.
/// * The function requires small \p temporary_storage as it does not need
/// a temporary buffer of \p size elements.
/// * \p Key type must be an arithmetic type (that is, an integral type or a floating-point
/// type).
/// * Buffers of \p keys and \p values must have at least \p size elements.
/// * Ranges specified by \p begin_offsets and \p end_offsets must have
/// at least \p segments elements. They may use the same sequence <tt>offsets</tt> of at least
/// <tt>segments + 1</tt> elements: <tt>offsets</tt> for \p begin_offsets and
/// <tt>offsets + 1</tt> for \p end_offsets.
/// * If \p Key is an integer type and the range of keys is known in advance, the performance
/// can be improved by setting \p begin_bit and \p end_bit, for example if all keys are in range
/// [100, 10000], <tt>begin_bit = 0</tt> and <tt>end_bit = 14</tt> will cover the whole range.
///
/// \tparam Config - [optional] configuration of the primitive. It can be
/// \p segmented_radix_sort_config or a custom class with the same members.
/// \tparam Key - key type. Must be an integral type or a floating-point type.
/// \tparam Value - value type.
/// \tparam OffsetIterator - random-access iterator type of segment offsets. Must meet the
/// requirements of a C++ InputIterator concept. It can be a simple pointer type.
///
/// \param [in] temporary_storage - pointer to a device-accessible temporary storage. When
/// a null pointer is passed, the required allocation size (in bytes) is written to
/// \p storage_size and function returns without performing the sort operation.
/// \param [in,out] storage_size - reference to a size (in bytes) of \p temporary_storage.
/// \param [in,out] keys - reference to the double-buffer of keys, its \p current()
/// contains the input range and will be updated to point to the output range.
/// \param [in,out] values - reference to the double-buffer of values, its \p current()
/// contains the input range and will be updated to point to the output range.
/// \param [in] size - number of elements in the input range.
/// \param [in] segments - number of segments in the input range.
/// \param [in] begin_offsets - iterator to the first element in the range of beginning offsets.
/// \param [in] end_offsets - iterator to the first element in the range of ending offsets.
/// \param [in] begin_bit - [optional] index of the first (least significant) bit used in
/// key comparison. Must be in range <tt>[0; 8 * sizeof(Key))</tt>. Default value: \p 0.
/// Non-default value not supported for floating-point key-types.
/// \param [in] end_bit - [optional] past-the-end index (most significant) bit used in
/// key comparison. Must be in range <tt>(begin_bit; 8 * sizeof(Key)]</tt>. Default
/// value: \p <tt>8 * sizeof(Key)</tt>. Non-default value not supported for floating-point key-types.
/// \param [in] stream - [optional] HIP stream object. Default is \p 0 (default stream).
/// \param [in] debug_synchronous - [optional] If true, synchronization after every kernel
/// launch is forced in order to check for errors. Default value is \p false.
///
/// \returns \p cudaSuccess (\p 0) after successful sort; otherwise a HIP runtime error of
/// type \p cudaError_t.
///
/// \par Example
/// \parblock
/// In this example a device-level ascending radix sort is performed where input keys are
/// represented by an array of unsigned integers and input values by an array of <tt>double</tt>s.
///
/// \code{.cpp}
/// #include <rocprim/rocprim.hpp>
///
/// // Prepare input and tmp (declare pointers, allocate device memory etc.)
/// size_t input_size; // e.g., 8
/// unsigned int * keys_input; // e.g., [ 6, 3, 5, 4, 1, 8, 1, 7]
/// double * values_input; // e.g., [-5, 2, -4, 3, -1, -8, -2, 7]
/// unsigned int * keys_tmp; // empty array of 8 elements
/// double* values_tmp; // empty array of 8 elements
/// unsigned int segments; // e.g., 3
/// int * offsets; // e.g. [0, 2, 3, 8]
/// // Create double-buffers
/// rocprim::double_buffer<unsigned int> keys(keys_input, keys_tmp);
/// rocprim::double_buffer<double> values(values_input, values_tmp);
///
/// // Keys are in range [0; 8], so we can limit compared bit to bits on indexes
/// // 0, 1, 2, 3, and 4. In order to do this begin_bit is set to 0 and end_bit
/// // is set to 5.
///
/// size_t temporary_storage_size_bytes;
/// void * temporary_storage_ptr = nullptr;
/// // Get required size of the temporary storage
/// rocprim::segmented_radix_sort_pairs(
/// temporary_storage_ptr, temporary_storage_size_bytes,
/// keys, values, input_size,
/// segments, offsets, offsets + 1,
/// 0, 5
/// );
///
/// // allocate temporary storage
/// cudaMalloc(&temporary_storage_ptr, temporary_storage_size_bytes);
///
/// // perform sort
/// rocprim::segmented_radix_sort_pairs(
/// temporary_storage_ptr, temporary_storage_size_bytes,
/// keys, values, input_size,
/// segments, offsets, offsets + 1,
/// 0, 5
/// );
/// // keys.current(): [3, 6, 5, 1, 1, 4, 7, 8]
/// // values.current(): [2, -5, -4, -1, -2, 3, 7, -8]
/// \endcode
/// \endparblock
template<class Config = default_config, class Key, class Value, class OffsetIterator>
inline cudaError_t segmented_radix_sort_pairs(void*                 temporary_storage,
                                              size_t&               storage_size,
                                              double_buffer<Key>&   keys,
                                              double_buffer<Value>& values,
                                              unsigned int          size,
                                              unsigned int          segments,
                                              OffsetIterator        begin_offsets,
                                              OffsetIterator        end_offsets,
                                              unsigned int          begin_bit = 0,
                                              unsigned int          end_bit   = 8 * sizeof(Key),
                                              cudaStream_t          stream    = 0,
                                              bool                  debug_synchronous = false)
{
    // Assigned by the implementation through a reference. It starts as false so
    // the check below never reads an indeterminate value if the implementation
    // returns before assigning it; both current() buffers are then left unchanged.
    bool is_result_in_output = false;

    // For keys and values alike, current() is passed as the input range and
    // alternate() as the output range. The second template argument is false for
    // ascending order.
    const cudaError_t error = detail::segmented_radix_sort_impl<Config, false>(
        temporary_storage, storage_size,
        keys.current(), keys.current(), keys.alternate(),
        values.current(), values.current(), values.alternate(),
        size, is_result_in_output,
        segments, begin_offsets, end_offsets,
        begin_bit, end_bit,
        stream, debug_synchronous
    );

    // A null temporary_storage is only a query for storage_size, so no sort ran.
    // Otherwise swap both double-buffers together when the implementation reports
    // that the sorted data is in the ranges passed as output (the alternate()
    // buffers), so that current() of keys and values refers to the result.
    if(temporary_storage != nullptr && is_result_in_output)
    {
        keys.swap();
        values.swap();
    }
    return error;
}
/// \brief Parallel descending radix sort-by-key primitive for device level.
///
/// \p segmented_radix_sort_pairs_desc function performs a device-wide radix sort across multiple,
/// non-overlapping sequences of (key, value) pairs. Function sorts input pairs in descending order of keys.
///
/// \par Overview
/// * The contents of both buffers of \p keys and \p values may be altered by the sorting function.
/// * \p current() of \p keys and \p values are used as the input.
/// * The function will update \p current() of \p keys and \p values to point to buffers
/// that contain the output range.
/// * Returns the required size of \p temporary_storage in \p storage_size
/// if \p temporary_storage is a null pointer.
/// * The function requires small \p temporary_storage as it does not need
/// a temporary buffer of \p size elements.
/// * \p Key type must be an arithmetic type (that is, an integral type or a floating-point
/// type).
/// * Buffers of \p keys and \p values must have at least \p size elements.
/// * Ranges specified by \p begin_offsets and \p end_offsets must have
/// at least \p segments elements. They may use the same sequence <tt>offsets</tt> of at least
/// <tt>segments + 1</tt> elements: <tt>offsets</tt> for \p begin_offsets and
/// <tt>offsets + 1</tt> for \p end_offsets.
/// * If \p Key is an integer type and the range of keys is known in advance, the performance
/// can be improved by setting \p begin_bit and \p end_bit, for example if all keys are in range
/// [100, 10000], <tt>begin_bit = 0</tt> and <tt>end_bit = 14</tt> will cover the whole range.
///
/// \tparam Config - [optional] configuration of the primitive. It can be
/// \p segmented_radix_sort_config or a custom class with the same members.
/// \tparam Key - key type. Must be an integral type or a floating-point type.
/// \tparam Value - value type.
/// \tparam OffsetIterator - random-access iterator type of segment offsets. Must meet the
/// requirements of a C++ InputIterator concept. It can be a simple pointer type.
///
/// \param [in] temporary_storage - pointer to a device-accessible temporary storage. When
/// a null pointer is passed, the required allocation size (in bytes) is written to
/// \p storage_size and function returns without performing the sort operation.
/// \param [in,out] storage_size - reference to a size (in bytes) of \p temporary_storage.
/// \param [in,out] keys - reference to the double-buffer of keys, its \p current()
/// contains the input range and will be updated to point to the output range.
/// \param [in,out] values - reference to the double-buffer of values, its \p current()
/// contains the input range and will be updated to point to the output range.
/// \param [in] size - number of elements in the input range.
/// \param [in] segments - number of segments in the input range.
/// \param [in] begin_offsets - iterator to the first element in the range of beginning offsets.
/// \param [in] end_offsets - iterator to the first element in the range of ending offsets.
/// \param [in] begin_bit - [optional] index of the first (least significant) bit used in
/// key comparison. Must be in range <tt>[0; 8 * sizeof(Key))</tt>. Default value: \p 0.
/// Non-default value not supported for floating-point key-types.
/// \param [in] end_bit - [optional] past-the-end index (most significant) bit used in
/// key comparison. Must be in range <tt>(begin_bit; 8 * sizeof(Key)]</tt>. Default
/// value: \p <tt>8 * sizeof(Key)</tt>. Non-default value not supported for floating-point key-types.
/// \param [in] stream - [optional] HIP stream object. Default is \p 0 (default stream).
/// \param [in] debug_synchronous - [optional] If true, synchronization after every kernel
/// launch is forced in order to check for errors. Default value is \p false.
///
/// \returns \p cudaSuccess (\p 0) after successful sort; otherwise a HIP runtime error of
/// type \p cudaError_t.
///
/// \par Example
/// \parblock
/// In this example a device-level descending radix sort is performed where input keys are
/// represented by an array of integers and input values by an array of <tt>double</tt>s.
///
/// \code{.cpp}
/// #include <rocprim/rocprim.hpp>
///
/// // Prepare input and tmp (declare pointers, allocate device memory etc.)
/// size_t input_size; // e.g., 8
/// int * keys_input; // e.g., [ 6, 3, 5, 4, 1, 8, 1, 7]
/// double * values_input; // e.g., [-5, 2, -4, 3, -1, -8, -2, 7]
/// int * keys_tmp; // empty array of 8 elements
/// double * values_tmp; // empty array of 8 elements
/// unsigned int segments; // e.g., 3
/// int * offsets; // e.g. [0, 2, 3, 8]
/// // Create double-buffers
/// rocprim::double_buffer<int> keys(keys_input, keys_tmp);
/// rocprim::double_buffer<double> values(values_input, values_tmp);
///
/// size_t temporary_storage_size_bytes;
/// void * temporary_storage_ptr = nullptr;
/// // Get required size of the temporary storage
/// rocprim::segmented_radix_sort_pairs_desc(
/// temporary_storage_ptr, temporary_storage_size_bytes,
/// keys, values, input_size,
/// segments, offsets, offsets + 1
/// );
///
/// // allocate temporary storage
/// cudaMalloc(&temporary_storage_ptr, temporary_storage_size_bytes);
///
/// // perform sort
/// rocprim::segmented_radix_sort_pairs_desc(
/// temporary_storage_ptr, temporary_storage_size_bytes,
/// keys, values, input_size,
/// segments, offsets, offsets + 1
/// );
/// // keys.current(): [ 6, 3, 5, 8, 7, 4, 1, 1]
/// // values.current(): [-5, 2, -4, -8, 7, 3, -1, -2]
/// \endcode
/// \endparblock
template<
    class Config = default_config,
    class Key,
    class Value,
    class OffsetIterator
>
inline
cudaError_t segmented_radix_sort_pairs_desc(void * temporary_storage,
                                            size_t& storage_size,
                                            double_buffer<Key>& keys,
                                            double_buffer<Value>& values,
                                            unsigned int size,
                                            unsigned int segments,
                                            OffsetIterator begin_offsets,
                                            OffsetIterator end_offsets,
                                            unsigned int begin_bit = 0,
                                            unsigned int end_bit = 8 * sizeof(Key),
                                            cudaStream_t stream = 0,
                                            bool debug_synchronous = false)
{
    // Start from "no swap needed": if the implementation returns (for example with an
    // error) before assigning this flag, the check below must not read an indeterminate
    // value.
    bool is_result_in_output = false;

    // The double-buffers are unpacked into raw pointers: current() is passed twice and
    // alternate() once for both keys and values; see segmented_radix_sort_impl for the
    // role of each buffer argument. The `true` template argument presumably selects the
    // descending order - confirm against segmented_radix_sort_impl.
    const cudaError_t error = detail::segmented_radix_sort_impl<Config, true>(
        temporary_storage, storage_size,
        keys.current(), keys.current(), keys.alternate(),
        values.current(), values.current(), values.alternate(),
        size, is_result_in_output,
        segments, begin_offsets, end_offsets,
        begin_bit, end_bit,
        stream, debug_synchronous
    );

    // A null temporary_storage is only a size query, so the buffers are left untouched.
    // Otherwise flip both double-buffers when the implementation reports that the result
    // ended up in the other buffer, so that current() refers to the sorted data.
    if(temporary_storage != nullptr && is_result_in_output)
    {
        keys.swap();
        values.swap();
    }
    return error;
}
END_ROCPRIM_NAMESPACE
/// @}
// end of group devicemodule
#endif // ROCPRIM_DEVICE_DEVICE_SEGMENTED_RADIX_SORT_HPP_
// Copyright (c) 2018-2020 Advanced Micro Devices, Inc. All rights reserved.
//
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to deal
// in the Software without restriction, including without limitation the rights
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
// copies of the Software, and to permit persons to whom the Software is
// furnished to do so, subject to the following conditions:
//
// The above copyright notice and this permission notice shall be included in
// all copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
// THE SOFTWARE.
#ifndef ROCPRIM_DEVICE_DEVICE_SEGMENTED_RADIX_SORT_CONFIG_HPP_
#define ROCPRIM_DEVICE_DEVICE_SEGMENTED_RADIX_SORT_CONFIG_HPP_
#include <algorithm>
#include <type_traits>
#include "../config.hpp"
#include "../detail/various.hpp"
#include "config_types.hpp"
/// \addtogroup primitivesmodule_deviceconfigs
/// @{
BEGIN_ROCPRIM_NAMESPACE
/// \brief Tuning parameters for the warp sort stage of the device segmented radix sort.
/// Segments that are short enough are sorted on warp level.
///
/// \tparam LogicalWarpSizeSmall - logical warp size (in threads) of the kernel that sorts
/// small segments.
/// \tparam ItemsPerThreadSmall - items handled by each thread of the small-segment kernel.
/// \tparam BlockSizeSmall - threads per block of the small-segment kernel.
/// \tparam PartitioningThreshold - once the number of segments reaches this threshold, the
/// segments are partitioned into a small, a medium and a large segment collection, and each
/// collection is sorted by a different kernel. Below the threshold all segments are sorted
/// by a single kernel.
/// \tparam EnableUnpartitionedWarpSort - when \p true, warp sort may be used for the small
/// segments even if the total number of segments is below \p PartitioningThreshold.
/// \tparam LogicalWarpSizeMedium - logical warp size (in threads) of the kernel that sorts
/// medium segments. Defaults to the larger of 32 and \p LogicalWarpSizeSmall.
/// \tparam ItemsPerThreadMedium - items handled by each thread of the medium-segment kernel.
/// Defaults to the larger of 4 and \p ItemsPerThreadSmall.
/// \tparam BlockSizeMedium - threads per block of the medium-segment kernel.
template<unsigned int LogicalWarpSizeSmall,
         unsigned int ItemsPerThreadSmall,
         unsigned int BlockSizeSmall        = 256,
         unsigned int PartitioningThreshold = 3000,
         bool EnableUnpartitionedWarpSort   = true,
         unsigned int LogicalWarpSizeMedium
         = (LogicalWarpSizeSmall > 32u ? LogicalWarpSizeSmall : 32u),
         unsigned int ItemsPerThreadMedium
         = (ItemsPerThreadSmall > 4u ? ItemsPerThreadSmall : 4u),
         unsigned int BlockSizeMedium = 256>
struct WarpSortConfig
{
    // ---- small segment kernel ----

    /// \brief Logical warp size (in threads) of the small segment processing kernel.
    static constexpr unsigned int logical_warp_size_small = LogicalWarpSizeSmall;
    /// \brief Items processed per thread in the small segment processing kernel.
    static constexpr unsigned int items_per_thread_small = ItemsPerThreadSmall;
    /// \brief Threads per block in the small segment processing kernel.
    static constexpr unsigned int block_size_small = BlockSizeSmall;

    // ---- medium segment kernel ----

    /// \brief Logical warp size (in threads) of the medium segment processing kernel.
    static constexpr unsigned int logical_warp_size_medium = LogicalWarpSizeMedium;
    /// \brief Items processed per thread in the medium segment processing kernel.
    static constexpr unsigned int items_per_thread_medium = ItemsPerThreadMedium;
    /// \brief Threads per block in the medium segment processing kernel.
    static constexpr unsigned int block_size_medium = BlockSizeMedium;

    // ---- partitioning ----

    /// \brief When the number of segments is at least \p partitioning_threshold, the segments
    /// are partitioned by size and every group is handled by its own specialized kernel.
    static constexpr unsigned int partitioning_threshold = PartitioningThreshold;
    /// \brief When \p true, warp sort may be applied to the small segments even if the number
    /// of segments stays below \p partitioning_threshold.
    static constexpr bool enable_unpartitioned_warp_sort = EnableUnpartitionedWarpSort;

    // A medium warp has to be able to hold everything a small warp can hold.
    static_assert(logical_warp_size_small * items_per_thread_small
                      <= logical_warp_size_medium * items_per_thread_medium,
                  "The number of items processed by a small warp cannot be larger than the "
                  "number of items processed by a medium warp");
};
/// \brief Marker configuration stating that warp level sorting is disabled in the
/// device segmented radix sort configuration.
///
/// It exposes the same members as \p WarpSortConfig; every size is 1, the partitioning
/// threshold is 0 and the unpartitioned warp sort is switched off.
struct DisabledWarpSortConfig
{
    // ---- small segment kernel ----

    /// \brief Logical warp size (in threads) of the small segment processing kernel.
    static constexpr unsigned int logical_warp_size_small = 1;
    /// \brief Items processed per thread in the small segment processing kernel.
    static constexpr unsigned int items_per_thread_small = 1;
    /// \brief Threads per block in the small segment processing kernel.
    static constexpr unsigned int block_size_small = 1;

    // ---- medium segment kernel ----

    /// \brief Logical warp size (in threads) of the medium segment processing kernel.
    static constexpr unsigned int logical_warp_size_medium = 1;
    /// \brief Items processed per thread in the medium segment processing kernel.
    static constexpr unsigned int items_per_thread_medium = 1;
    /// \brief Threads per block in the medium segment processing kernel.
    static constexpr unsigned int block_size_medium = 1;

    // ---- partitioning ----

    /// \brief When the number of segments is at least \p partitioning_threshold, the segments
    /// are partitioned by size and every group is handled by its own specialized kernel.
    static constexpr unsigned int partitioning_threshold = 0;
    /// \brief When \p true, warp sort may be applied to the small segments even if the number
    /// of segments stays below \p partitioning_threshold.
    static constexpr bool enable_unpartitioned_warp_sort = false;
};
/// \brief Selects the appropriate \p WarpSortConfig based on the size of the key type.
///
/// \tparam Key - the type of the sorted keys.
/// \tparam MediumWarpSize - the logical warp size of the medium segment processing kernel.
template<class Key, unsigned int MediumWarpSize = ROCPRIM_WARP_SIZE_32>
using select_warp_sort_config_t
= std::conditional_t<sizeof(Key) < 2,
DisabledWarpSortConfig,
WarpSortConfig<32, //< logical warp size - small kernel
4, //< items per thread - small kernel
256, //< block size - small kernel
3000, //< partitioning threshold
(sizeof(Key) > 2), //< enable unpartitioned warp sort
MediumWarpSize, //< logical warp size - medium kernel
4, //< items per thread - medium kernel
256 //< block size - medium kernel
>>;
/// \brief Configuration of device-level segmented radix sort operation.
///
/// Radix sort is executed in a few iterations (passes) depending on the total number of bits
/// to be sorted (\p begin_bit and \p end_bit). Each iteration sorts either \p LongRadixBits or
/// \p ShortRadixBits bits, chosen to cover the whole bit range in an optimal way.
///
/// For example, if \p LongRadixBits is 7, \p ShortRadixBits is 6, \p begin_bit is 0 and
/// \p end_bit is 32 there will be 5 iterations: 7 + 7 + 6 + 6 + 6 = 32 bits.
///
/// If a segment's element count is low (at most items-per-thread times logical-warp-size of
/// \p warp_sort_config), it is sorted by a special warp-level sorting method.
///
/// \tparam LongRadixBits - number of bits in long iterations.
/// \tparam ShortRadixBits - number of bits in short iterations, must be equal to or less than
/// \p LongRadixBits (enforced at compile time).
/// \tparam SortConfig - configuration of radix sort kernel. Must be \p kernel_config.
/// \tparam WarpSortConfig - configuration of the warp sort that is used on the short segments.
template<
    unsigned int LongRadixBits,
    unsigned int ShortRadixBits,
    class SortConfig,
    class WarpSortConfig = DisabledWarpSortConfig
>
struct segmented_radix_sort_config
{
    // Reject configurations that violate the documented requirement instead of letting
    // them reach the sort implementation.
    static_assert(ShortRadixBits <= LongRadixBits,
                  "ShortRadixBits must be equal to or less than LongRadixBits");

    /// \brief Number of bits in long iterations.
    static constexpr unsigned int long_radix_bits = LongRadixBits;
    /// \brief Number of bits in short iterations.
    static constexpr unsigned int short_radix_bits = ShortRadixBits;
    /// \brief Configuration of radix sort kernel.
    using sort = SortConfig;
    /// \brief Configuration of the warp sort method.
    using warp_sort_config = WarpSortConfig;
};
namespace detail
{
template<class Key, class Value>
struct segmented_radix_sort_config_803
{
static constexpr unsigned int item_scale =
::rocprim::detail::ceiling_div<unsigned int>(::rocprim::max(sizeof(Key), sizeof(Value)), sizeof(int));
using type = select_type<
select_type_case<
(sizeof(Key) == 1 && sizeof(Value) <= 8),
segmented_radix_sort_config<8, 7, kernel_config<256, 10>, select_warp_sort_config_t<Key> >
>,
select_type_case<
(sizeof(Key) == 2 && sizeof(Value) <= 8),
segmented_radix_sort_config<8, 7, kernel_config<256, 10>, select_warp_sort_config_t<Key> >
>,
select_type_case<
(sizeof(Key) == 4 && sizeof(Value) <= 8),
segmented_radix_sort_config<7, 6, kernel_config<256, 15>, select_warp_sort_config_t<Key> >
>,
select_type_case<
(sizeof(Key) == 8 && sizeof(Value) <= 8),
segmented_radix_sort_config<7, 6, kernel_config<256, 13>, select_warp_sort_config_t<Key> >
>,
segmented_radix_sort_config<7, 6, kernel_config<256, ::rocprim::max(1u, 15u / item_scale)>, select_warp_sort_config_t<Key> >
>;
};
/// Specialization for target architecture 803 used when Value is \p empty_type (presumably
/// the keys-only sort). The configuration depends on sizeof(Key) only.
/// NOTE(review): only key sizes 1, 2, 4 and 8 are listed and there is no unconditional
/// alternative - confirm how \p select_type behaves for other key sizes.
template<class Key>
struct segmented_radix_sort_config_803<Key, empty_type>
    : select_type<
          select_type_case<sizeof(Key) == 1,
                           segmented_radix_sort_config<8,
                                                       7,
                                                       kernel_config<256, 10>,
                                                       select_warp_sort_config_t<Key>>>,
          select_type_case<sizeof(Key) == 2,
                           segmented_radix_sort_config<8,
                                                       7,
                                                       kernel_config<256, 10>,
                                                       select_warp_sort_config_t<Key>>>,
          select_type_case<sizeof(Key) == 4,
                           segmented_radix_sort_config<7,
                                                       6,
                                                       kernel_config<256, 9>,
                                                       select_warp_sort_config_t<Key>>>,
          select_type_case<sizeof(Key) == 8,
                           segmented_radix_sort_config<7,
                                                       6,
                                                       kernel_config<256, 7>,
                                                       select_warp_sort_config_t<Key>>>>
{};
template<class Key, class Value>
struct segmented_radix_sort_config_900
{
static constexpr unsigned int item_scale =
::rocprim::detail::ceiling_div<unsigned int>(::rocprim::max(sizeof(Key), sizeof(Value)), sizeof(int));
using type = select_type<
select_type_case<
(sizeof(Key) == 1 && sizeof(Value) <= 8),
segmented_radix_sort_config<4, 4, kernel_config<256, 10>, select_warp_sort_config_t<Key> >
>,
select_type_case<
(sizeof(Key) == 2 && sizeof(Value) <= 8),
segmented_radix_sort_config<6, 5, kernel_config<256, 10>, select_warp_sort_config_t<Key> >
>,
select_type_case<
(sizeof(Key) == 4 && sizeof(Value) <= 8),
segmented_radix_sort_config<7, 6, kernel_config<256, 15>, select_warp_sort_config_t<Key> >
>,
select_type_case<
(sizeof(Key) == 8 && sizeof(Value) <= 8),
segmented_radix_sort_config<7, 6, kernel_config<256, 15>, select_warp_sort_config_t<Key> >
>,
segmented_radix_sort_config<7, 6, kernel_config<256, ::rocprim::max(1u, 15u / item_scale)>, select_warp_sort_config_t<Key> >
>;
};
/// Specialization for target architecture 900 used when Value is \p empty_type (presumably
/// the keys-only sort). The configuration depends on sizeof(Key) only.
/// NOTE(review): only key sizes 1, 2, 4 and 8 are listed and there is no unconditional
/// alternative - confirm how \p select_type behaves for other key sizes.
template<class Key>
struct segmented_radix_sort_config_900<Key, empty_type>
    : select_type<
          select_type_case<sizeof(Key) == 1,
                           segmented_radix_sort_config<4,
                                                       3,
                                                       kernel_config<256, 10>,
                                                       select_warp_sort_config_t<Key>>>,
          select_type_case<sizeof(Key) == 2,
                           segmented_radix_sort_config<6,
                                                       5,
                                                       kernel_config<256, 10>,
                                                       select_warp_sort_config_t<Key>>>,
          select_type_case<sizeof(Key) == 4,
                           segmented_radix_sort_config<7,
                                                       6,
                                                       kernel_config<256, 17>,
                                                       select_warp_sort_config_t<Key>>>,
          select_type_case<sizeof(Key) == 8,
                           segmented_radix_sort_config<7,
                                                       6,
                                                       kernel_config<256, 15>,
                                                       select_warp_sort_config_t<Key>>>>
{};
template<class Key, class Value>
struct segmented_radix_sort_config_90a
{
static constexpr unsigned int item_scale =
::rocprim::detail::ceiling_div<unsigned int>(::rocprim::max(sizeof(Key), sizeof(Value)), sizeof(int));
using type = select_type<
select_type_case<
(sizeof(Key) == 1 && sizeof(Value) <= 8),
segmented_radix_sort_config<4,
4,
kernel_config<256, 10>,
select_warp_sort_config_t<Key, ROCPRIM_WARP_SIZE_64>>>,
select_type_case<
(sizeof(Key) == 2 && sizeof(Value) <= 8),
segmented_radix_sort_config<6,
5,
kernel_config<256, 10>,
select_warp_sort_config_t<Key, ROCPRIM_WARP_SIZE_64>>>,
select_type_case<
(sizeof(Key) == 4 && sizeof(Value) <= 8),
segmented_radix_sort_config<7,
6,
kernel_config<256, 15>,
select_warp_sort_config_t<Key, ROCPRIM_WARP_SIZE_64>>>,
select_type_case<
(sizeof(Key) == 8 && sizeof(Value) <= 8),
segmented_radix_sort_config<7,
6,
kernel_config<256, 15>,
select_warp_sort_config_t<Key, ROCPRIM_WARP_SIZE_64>>>,
segmented_radix_sort_config<7,
6,
kernel_config<256, ::rocprim::max(1u, 15u / item_scale)>,
select_warp_sort_config_t<Key, ROCPRIM_WARP_SIZE_64>>>;
};
/// Specialization of the 90a configuration used when Value is \p empty_type (presumably the
/// keys-only sort). The configuration depends on sizeof(Key) only and the warp sort selection
/// uses \p ROCPRIM_WARP_SIZE_64 as the medium warp size.
/// NOTE(review): only key sizes 1, 2, 4 and 8 are listed and there is no unconditional
/// alternative - confirm how \p select_type behaves for other key sizes.
template<class Key>
struct segmented_radix_sort_config_90a<Key, empty_type>
    : select_type<
        select_type_case<sizeof(Key) == 1, segmented_radix_sort_config<4, 3, kernel_config<256, 10>, select_warp_sort_config_t<Key, ROCPRIM_WARP_SIZE_64> > >,
        select_type_case<sizeof(Key) == 2, segmented_radix_sort_config<6, 5, kernel_config<256, 10>, select_warp_sort_config_t<Key, ROCPRIM_WARP_SIZE_64> > >,
        select_type_case<sizeof(Key) == 4, segmented_radix_sort_config<7, 6, kernel_config<256, 17>, select_warp_sort_config_t<Key, ROCPRIM_WARP_SIZE_64> > >,
        select_type_case<sizeof(Key) == 8, segmented_radix_sort_config<7, 6, kernel_config<256, 15>, select_warp_sort_config_t<Key, ROCPRIM_WARP_SIZE_64> > >
    > { };
template<class Key, class Value>
struct segmented_radix_sort_config_1030
{
static constexpr unsigned int item_scale =
::rocprim::detail::ceiling_div<unsigned int>(::rocprim::max(sizeof(Key), sizeof(Value)), sizeof(int));
using type = select_type<
select_type_case<
(sizeof(Key) == 1 && sizeof(Value) <= 8),
segmented_radix_sort_config<4, 4, kernel_config<256, 10>, select_warp_sort_config_t<Key> >
>,
select_type_case<
(sizeof(Key) == 2 && sizeof(Value) <= 8),
segmented_radix_sort_config<6, 5, kernel_config<256, 10>, select_warp_sort_config_t<Key> >
>,
select_type_case<
(sizeof(Key) == 4 && sizeof(Value) <= 8),
segmented_radix_sort_config<7, 6, kernel_config<256, 15>, select_warp_sort_config_t<Key> >
>,
select_type_case<
(sizeof(Key) == 8 && sizeof(Value) <= 8),
segmented_radix_sort_config<7, 6, kernel_config<256, 15>, select_warp_sort_config_t<Key> >
>,
segmented_radix_sort_config<7, 6, kernel_config<256, ::rocprim::max(1u, 15u / item_scale)>, select_warp_sort_config_t<Key> >
>;
};
/// Specialization for target architecture 1030 used when Value is \p empty_type (presumably
/// the keys-only sort). The configuration depends on sizeof(Key) only.
/// NOTE(review): only key sizes 1, 2, 4 and 8 are listed and there is no unconditional
/// alternative - confirm how \p select_type behaves for other key sizes.
template<class Key>
struct segmented_radix_sort_config_1030<Key, empty_type>
    : select_type<
          select_type_case<sizeof(Key) == 1,
                           segmented_radix_sort_config<4,
                                                       3,
                                                       kernel_config<256, 10>,
                                                       select_warp_sort_config_t<Key>>>,
          select_type_case<sizeof(Key) == 2,
                           segmented_radix_sort_config<6,
                                                       5,
                                                       kernel_config<256, 10>,
                                                       select_warp_sort_config_t<Key>>>,
          select_type_case<sizeof(Key) == 4,
                           segmented_radix_sort_config<7,
                                                       6,
                                                       kernel_config<256, 17>,
                                                       select_warp_sort_config_t<Key>>>,
          select_type_case<sizeof(Key) == 8,
                           segmented_radix_sort_config<7,
                                                       6,
                                                       kernel_config<256, 15>,
                                                       select_warp_sort_config_t<Key>>>>
{};
/// Maps a target architecture to its segmented radix sort configuration for the given
/// key and value types.
///
/// The cases 906, 908 and \p ROCPRIM_ARCH_90a all use the 90a configuration. The last
/// argument of \p select_arch is not wrapped in a \p select_arch_case; presumably it is
/// used when no case matches \p TargetArch.
template<unsigned int TargetArch, class Key, class Value>
struct default_segmented_radix_sort_config
    : select_arch<TargetArch,
                  // dedicated configurations
                  select_arch_case<803, detail::segmented_radix_sort_config_803<Key, Value>>,
                  select_arch_case<900, detail::segmented_radix_sort_config_900<Key, Value>>,
                  // cases sharing the 90a configuration
                  select_arch_case<906, detail::segmented_radix_sort_config_90a<Key, Value>>,
                  select_arch_case<908, detail::segmented_radix_sort_config_90a<Key, Value>>,
                  select_arch_case<ROCPRIM_ARCH_90a,
                                   detail::segmented_radix_sort_config_90a<Key, Value>>,
                  select_arch_case<1030, detail::segmented_radix_sort_config_1030<Key, Value>>,
                  // final alternative: the 900 configuration
                  detail::segmented_radix_sort_config_900<Key, Value>>
{};
} // end namespace detail
END_ROCPRIM_NAMESPACE
/// @}
// end of group primitivesmodule_deviceconfigs
#endif // ROCPRIM_DEVICE_DEVICE_SEGMENTED_RADIX_SORT_CONFIG_HPP_
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment