Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
ModelZoo
GLM-130B_fastertransformer
Commits
f8a481f8
Commit
f8a481f8
authored
Oct 13, 2023
by
zhouxiang
Browse files
添加dtk中的cub头文件
parent
7b7c64c5
Changes
147
Hide whitespace changes
Inline
Side-by-side
Showing
20 changed files
with
9680 additions
and
0 deletions
+9680
-0
3rdparty/cub/rocprim/device/device_histogram_config.hpp
3rdparty/cub/rocprim/device/device_histogram_config.hpp
+128
-0
3rdparty/cub/rocprim/device/device_merge.hpp
3rdparty/cub/rocprim/device/device_merge.hpp
+438
-0
3rdparty/cub/rocprim/device/device_merge_config.hpp
3rdparty/cub/rocprim/device/device_merge_config.hpp
+159
-0
3rdparty/cub/rocprim/device/device_merge_sort.hpp
3rdparty/cub/rocprim/device/device_merge_sort.hpp
+590
-0
3rdparty/cub/rocprim/device/device_merge_sort_config.hpp
3rdparty/cub/rocprim/device/device_merge_sort_config.hpp
+223
-0
3rdparty/cub/rocprim/device/device_partition.hpp
3rdparty/cub/rocprim/device/device_partition.hpp
+707
-0
3rdparty/cub/rocprim/device/device_radix_sort.hpp
3rdparty/cub/rocprim/device/device_radix_sort.hpp
+1677
-0
3rdparty/cub/rocprim/device/device_radix_sort_config.hpp
3rdparty/cub/rocprim/device/device_radix_sort_config.hpp
+390
-0
3rdparty/cub/rocprim/device/device_reduce.hpp
3rdparty/cub/rocprim/device/device_reduce.hpp
+496
-0
3rdparty/cub/rocprim/device/device_reduce_by_key.hpp
3rdparty/cub/rocprim/device/device_reduce_by_key.hpp
+413
-0
3rdparty/cub/rocprim/device/device_reduce_by_key_config.hpp
3rdparty/cub/rocprim/device/device_reduce_by_key_config.hpp
+143
-0
3rdparty/cub/rocprim/device/device_reduce_config.hpp
3rdparty/cub/rocprim/device/device_reduce_config.hpp
+115
-0
3rdparty/cub/rocprim/device/device_run_length_encode.hpp
3rdparty/cub/rocprim/device/device_run_length_encode.hpp
+411
-0
3rdparty/cub/rocprim/device/device_run_length_encode_config.hpp
...ty/cub/rocprim/device/device_run_length_encode_config.hpp
+66
-0
3rdparty/cub/rocprim/device/device_scan.hpp
3rdparty/cub/rocprim/device/device_scan.hpp
+826
-0
3rdparty/cub/rocprim/device/device_scan_by_key.hpp
3rdparty/cub/rocprim/device/device_scan_by_key.hpp
+558
-0
3rdparty/cub/rocprim/device/device_scan_by_key_config.hpp
3rdparty/cub/rocprim/device/device_scan_by_key_config.hpp
+158
-0
3rdparty/cub/rocprim/device/device_scan_config.hpp
3rdparty/cub/rocprim/device/device_scan_config.hpp
+180
-0
3rdparty/cub/rocprim/device/device_segmented_radix_sort.hpp
3rdparty/cub/rocprim/device/device_segmented_radix_sort.hpp
+1640
-0
3rdparty/cub/rocprim/device/device_segmented_radix_sort_config.hpp
...cub/rocprim/device/device_segmented_radix_sort_config.hpp
+362
-0
No files found.
Too many changes to show.
To preserve performance only
147 of 147+
files are displayed.
Plain diff
Email patch
3rdparty/cub/rocprim/device/device_histogram_config.hpp
0 → 100644
View file @
f8a481f8
// Copyright (c) 2018-2022 Advanced Micro Devices, Inc. All rights reserved.
//
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to deal
// in the Software without restriction, including without limitation the rights
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
// copies of the Software, and to permit persons to whom the Software is
// furnished to do so, subject to the following conditions:
//
// The above copyright notice and this permission notice shall be included in
// all copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
// THE SOFTWARE.
#ifndef ROCPRIM_DEVICE_DEVICE_HISTOGRAM_CONFIG_HPP_
#define ROCPRIM_DEVICE_DEVICE_HISTOGRAM_CONFIG_HPP_
#include <type_traits>
#include "../config.hpp"
#include "../detail/various.hpp"
#include "config_types.hpp"
/// \addtogroup primitivesmodule_deviceconfigs
/// @{
BEGIN_ROCPRIM_NAMESPACE
/// \brief Configuration of device-level histogram operation.
///
/// \tparam HistogramConfig - configuration of histogram kernel. Must be \p kernel_config.
/// \tparam MaxGridSize - maximum number of blocks to launch.
/// \tparam SharedImplMaxBins - maximum total number of bins for all active channels
/// for the shared memory histogram implementation (samples -> shared memory bins -> global memory bins),
/// when exceeded the global memory implementation is used (samples -> global memory bins).
template<class HistogramConfig,
         unsigned int MaxGridSize       = 1024,
         unsigned int SharedImplMaxBins = 2048>
struct histogram_config
{
#ifndef DOXYGEN_SHOULD_SKIP_THIS
    // The kernel configuration type, re-exported unchanged.
    using histogram = HistogramConfig;

    // Compile-time copies of the two non-type template arguments.
    static constexpr unsigned int max_grid_size        = MaxGridSize;
    static constexpr unsigned int shared_impl_max_bins = SharedImplMaxBins;
#endif
};
#ifndef DOXYGEN_SHOULD_SKIP_THIS
// Namespace-scope definitions of the static constexpr data members declared in
// histogram_config. No initializer appears here: the in-class declarations
// already provide the values.
template<class Config, unsigned int GridLimit, unsigned int BinLimit>
constexpr unsigned int histogram_config<Config, GridLimit, BinLimit>::max_grid_size;

template<class Config, unsigned int GridLimit, unsigned int BinLimit>
constexpr unsigned int histogram_config<Config, GridLimit, BinLimit>::shared_impl_max_bins;
#endif
namespace
detail
{
// Histogram configuration used by default_histogram_config for architecture 803.
// ActiveChannels is accepted for interface symmetry but is not used here.
template<class Sample, unsigned int Channels, unsigned int ActiveChannels>
struct histogram_config_803
{
    // sizeof(Sample) expressed in units of sizeof(int) via ceiling_div
    // (presumably rounded up, as the helper's name suggests).
    static constexpr unsigned int item_scale
        = ::rocprim::detail::ceiling_div(sizeof(Sample), sizeof(int));

    // kernel_config arguments: 256, then 10 / Channels / item_scale combined
    // with 1u through ::rocprim::max so that the second argument is not zero.
    using type = histogram_config<
        kernel_config<256, ::rocprim::max(10u / Channels / item_scale, 1u)>>;
};
// Histogram configuration used by default_histogram_config for architecture 900
// and as its trailing (non-case) argument.
// ActiveChannels is accepted for interface symmetry but is not used here.
template<class Sample, unsigned int Channels, unsigned int ActiveChannels>
struct histogram_config_900
{
    // sizeof(Sample) expressed in units of sizeof(int) via ceiling_div
    // (presumably rounded up, as the helper's name suggests).
    static constexpr unsigned int item_scale
        = ::rocprim::detail::ceiling_div(sizeof(Sample), sizeof(int));

    // kernel_config arguments: 256, then 8 / Channels / item_scale combined
    // with 1u through ::rocprim::max so that the second argument is not zero.
    using type = histogram_config<
        kernel_config<256, ::rocprim::max(8u / Channels / item_scale, 1u)>>;
};
// TODO: We need to update these parameters
// Histogram configuration used by default_histogram_config for ROCPRIM_ARCH_90a.
// The values currently mirror histogram_config_900.
// ActiveChannels is accepted for interface symmetry but is not used here.
template<class Sample, unsigned int Channels, unsigned int ActiveChannels>
struct histogram_config_90a
{
    // sizeof(Sample) expressed in units of sizeof(int) via ceiling_div
    // (presumably rounded up, as the helper's name suggests).
    static constexpr unsigned int item_scale
        = ::rocprim::detail::ceiling_div(sizeof(Sample), sizeof(int));

    // kernel_config arguments: 256, then 8 / Channels / item_scale combined
    // with 1u through ::rocprim::max so that the second argument is not zero.
    using type = histogram_config<
        kernel_config<256, ::rocprim::max(8u / Channels / item_scale, 1u)>>;
};
// TODO: We need to update these parameters
// Histogram configuration used by default_histogram_config for architecture 1030.
// The values currently mirror histogram_config_900.
// ActiveChannels is accepted for interface symmetry but is not used here.
template<class Sample, unsigned int Channels, unsigned int ActiveChannels>
struct histogram_config_1030
{
    // sizeof(Sample) expressed in units of sizeof(int) via ceiling_div
    // (presumably rounded up, as the helper's name suggests).
    static constexpr unsigned int item_scale
        = ::rocprim::detail::ceiling_div(sizeof(Sample), sizeof(int));

    // kernel_config arguments: 256, then 8 / Channels / item_scale combined
    // with 1u through ::rocprim::max so that the second argument is not zero.
    using type = histogram_config<
        kernel_config<256, ::rocprim::max(8u / Channels / item_scale, 1u)>>;
};
// Picks a histogram configuration for TargetArch through select_arch.
// One select_arch_case is listed per tuned architecture (803, 900,
// ROCPRIM_ARCH_90a, 1030); the trailing histogram_config_900 argument is
// presumably the fallback for any other architecture (select_arch is defined
// elsewhere - confirm there).
template<unsigned int TargetArch,
         class Sample,
         unsigned int Channels,
         unsigned int ActiveChannels>
struct default_histogram_config
    : select_arch<
          TargetArch,
          select_arch_case<803, histogram_config_803<Sample, Channels, ActiveChannels>>,
          select_arch_case<900, histogram_config_900<Sample, Channels, ActiveChannels>>,
          select_arch_case<ROCPRIM_ARCH_90a, histogram_config_90a<Sample, Channels, ActiveChannels>>,
          select_arch_case<1030, histogram_config_1030<Sample, Channels, ActiveChannels>>,
          histogram_config_900<Sample, Channels, ActiveChannels>>
{
};
}
// end namespace detail
END_ROCPRIM_NAMESPACE
/// @}
// end of group primitivesmodule_deviceconfigs
#endif // ROCPRIM_DEVICE_DEVICE_HISTOGRAM_CONFIG_HPP_
3rdparty/cub/rocprim/device/device_merge.hpp
0 → 100644
View file @
f8a481f8
// Copyright (c) 2017-2022 Advanced Micro Devices, Inc. All rights reserved.
//
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to deal
// in the Software without restriction, including without limitation the rights
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
// copies of the Software, and to permit persons to whom the Software is
// furnished to do so, subject to the following conditions:
//
// The above copyright notice and this permission notice shall be included in
// all copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
// THE SOFTWARE.
#ifndef ROCPRIM_DEVICE_DEVICE_MERGE_HPP_
#define ROCPRIM_DEVICE_DEVICE_MERGE_HPP_
#include <type_traits>
#include <iterator>
#include "../config.hpp"
#include "../detail/various.hpp"
#include "device_merge_config.hpp"
#include "detail/device_merge.hpp"
BEGIN_ROCPRIM_NAMESPACE
/// \addtogroup devicemodule
/// @{
namespace
detail
{
// Kernel entry point for the partition step of the device merge.
// It only forwards its arguments, unchanged and in the same order, to
// partition_kernel_impl; launch bounds are ROCPRIM_DEFAULT_MAX_BLOCK_SIZE.
template<class IndexIterator,
         class KeysInputIterator1,
         class KeysInputIterator2,
         class BinaryFunction>
ROCPRIM_KERNEL
__launch_bounds__(ROCPRIM_DEFAULT_MAX_BLOCK_SIZE)
void partition_kernel(IndexIterator      index,
                      KeysInputIterator1 keys_input1,
                      KeysInputIterator2 keys_input2,
                      const size_t       input1_size,
                      const size_t       input2_size,
                      const unsigned int spacing,
                      BinaryFunction     compare_function)
{
    partition_kernel_impl(index,
                          keys_input1,
                          keys_input2,
                          input1_size,
                          input2_size,
                          spacing,
                          compare_function);
}
// Kernel entry point for the merge step of the device merge.
// BlockSize doubles as the launch bound; BlockSize and ItemsPerThread are
// passed on as explicit template arguments to merge_kernel_impl, which
// receives every runtime argument unchanged.
template<unsigned int BlockSize,
         unsigned int ItemsPerThread,
         class IndexIterator,
         class KeysInputIterator1,
         class KeysInputIterator2,
         class KeysOutputIterator,
         class ValuesInputIterator1,
         class ValuesInputIterator2,
         class ValuesOutputIterator,
         class BinaryFunction>
ROCPRIM_KERNEL
__launch_bounds__(BlockSize)
void merge_kernel(IndexIterator        index,
                  KeysInputIterator1   keys_input1,
                  KeysInputIterator2   keys_input2,
                  KeysOutputIterator   keys_output,
                  ValuesInputIterator1 values_input1,
                  ValuesInputIterator2 values_input2,
                  ValuesOutputIterator values_output,
                  const size_t         input1_size,
                  const size_t         input2_size,
                  BinaryFunction       compare_function)
{
    merge_kernel_impl<BlockSize, ItemsPerThread>(index,
                                                 keys_input1,
                                                 keys_input2,
                                                 keys_output,
                                                 values_input1,
                                                 values_input2,
                                                 values_output,
                                                 input1_size,
                                                 input2_size,
                                                 compare_function);
}
// Post-launch error check used by the host-side merge implementation.
//
// Behaviour:
//   * Reads cudaGetLastError(); on anything other than cudaSuccess the
//     ENCLOSING function returns that error immediately.
//   * If `debug_synchronous` is true, prints `name(size)`, synchronizes
//     `stream` (returning on failure), then prints the time elapsed since
//     `start` in milliseconds.
//
// Requirements on the expansion site:
//   * Variables named `debug_synchronous` and `stream` must be in scope.
//   * `start` must be a std::chrono::high_resolution_clock::time_point.
//   * The enclosing function must return a type convertible from the error.
//
// The body is wrapped in do { ... } while(0) so that `MACRO(...);` is exactly
// one statement and can be used safely in an unbraced if/else. Local names
// avoid double underscores, which are reserved for the implementation.
#define ROCPRIM_DETAIL_HIP_SYNC_AND_RETURN_ON_ERROR(name, size, start) \
    do \
    { \
        const auto _launch_error = cudaGetLastError(); \
        if(_launch_error != cudaSuccess) return _launch_error; \
        if(debug_synchronous) \
        { \
            std::cout << name << "(" << size << ")"; \
            const auto _sync_error = cudaStreamSynchronize(stream); \
            if(_sync_error != cudaSuccess) return _sync_error; \
            const auto _end = std::chrono::high_resolution_clock::now(); \
            const auto _d = std::chrono::duration_cast<std::chrono::duration<double>>(_end - (start)); \
            std::cout << " " << _d.count() * 1000 << " ms" << '\n'; \
        } \
    } while(0)
// Host-side driver shared by both public merge() overloads.
//
// When temporary_storage is null, only the required temporary storage size is
// written to storage_size and cudaSuccess is returned. Otherwise two kernels
// are launched on `stream`: partition_kernel followed by merge_kernel, each
// checked with ROCPRIM_DETAIL_HIP_SYNC_AND_RETURN_ON_ERROR. With
// debug_synchronous set, launch parameters and timings go to std::cout.
//
// Returns cudaSuccess, or the first error reported after a kernel launch.
template<class Config,
         class KeysInputIterator1,
         class KeysInputIterator2,
         class KeysOutputIterator,
         class ValuesInputIterator1,
         class ValuesInputIterator2,
         class ValuesOutputIterator,
         class BinaryFunction>
inline
cudaError_t merge_impl(void *               temporary_storage,
                       size_t &             storage_size,
                       KeysInputIterator1   keys_input1,
                       KeysInputIterator2   keys_input2,
                       KeysOutputIterator   keys_output,
                       ValuesInputIterator1 values_input1,
                       ValuesInputIterator2 values_input2,
                       ValuesOutputIterator values_output,
                       const size_t         input1_size,
                       const size_t         input2_size,
                       BinaryFunction       compare_function,
                       const cudaStream_t   stream,
                       bool                 debug_synchronous)
{
    using key_type   = typename std::iterator_traits<KeysInputIterator1>::value_type;
    using value_type = typename std::iterator_traits<ValuesInputIterator1>::value_type;

    // Get default config if Config is default_config
    using config = detail::default_or_custom_config<
        Config,
        detail::default_merge_config<ROCPRIM_TARGET_ARCH, key_type, value_type>>;

    static constexpr unsigned int block_size       = config::block_size;
    static constexpr unsigned int half_block       = block_size / 2;
    static constexpr unsigned int items_per_thread = config::items_per_thread;
    static constexpr auto         items_per_block  = block_size * items_per_thread;

    // Number of items_per_block-sized tiles covering both inputs (rounded up);
    // one merge block is launched per tile.
    const unsigned int tile_count
        = ((input1_size + input2_size) + items_per_block - 1) / items_per_block;
    // One unsigned int per tile boundary, hence tile_count + 1 entries.
    const size_t index_bytes = (tile_count + 1) * sizeof(unsigned int);

    if(temporary_storage == nullptr)
    {
        // Size query only; storage_size is never zero.
        storage_size = index_bytes;
        return cudaSuccess;
    }

    if(tile_count == 0u)
    {
        // Both inputs are empty: nothing to launch.
        return cudaSuccess;
    }

    if(debug_synchronous)
    {
        std::cout << "block_size " << block_size << '\n';
        std::cout << "number of blocks " << tile_count << '\n';
        std::cout << "items_per_block " << items_per_block << '\n';
    }

    // The temporary storage is used as an array of unsigned int indices.
    unsigned int * tile_index = reinterpret_cast<unsigned int *>(temporary_storage);

    // Grid for the partition kernel: (tile_count + 1) work items spread over
    // blocks of half_block threads, rounded up.
    const unsigned int partition_grid = ((tile_count + 1) + half_block - 1) / half_block;

    // Start point for time measurements (only set when debug_synchronous).
    std::chrono::high_resolution_clock::time_point launch_time;

    if(debug_synchronous)
    {
        launch_time = std::chrono::high_resolution_clock::now();
    }
    detail::partition_kernel<<<dim3(partition_grid), dim3(half_block), 0, stream>>>(
        tile_index,
        keys_input1,
        keys_input2,
        input1_size,
        input2_size,
        items_per_block,
        compare_function);
    ROCPRIM_DETAIL_HIP_SYNC_AND_RETURN_ON_ERROR("partition_kernel", input1_size, launch_time);

    if(debug_synchronous)
    {
        launch_time = std::chrono::high_resolution_clock::now();
    }
    detail::merge_kernel<block_size, items_per_thread>
        <<<dim3(tile_count), dim3(block_size), 0, stream>>>(
            tile_index,
            keys_input1,
            keys_input2,
            keys_output,
            values_input1,
            values_input2,
            values_output,
            input1_size,
            input2_size,
            compare_function);
    ROCPRIM_DETAIL_HIP_SYNC_AND_RETURN_ON_ERROR("merge_kernel", input1_size, launch_time);

    return cudaSuccess;
}
#undef ROCPRIM_DETAIL_HIP_SYNC_AND_RETURN_ON_ERROR
}
// end of detail namespace
/// \brief Parallel merge primitive for device level.
///
/// \p merge function performs a device-wide merge.
/// Function merges two ordered sets of input values based on comparison function.
///
/// \par Overview
/// * The contents of the inputs are not altered by the merging function.
/// * Returns the required size of \p temporary_storage in \p storage_size
/// if \p temporary_storage is a null pointer.
/// * Accepts custom compare_functions for merging across the device.
///
/// \tparam Config - [optional] configuration of the primitive. It can be \p merge_config or
/// a custom class with the same members.
/// \tparam InputIterator1 - random-access iterator type of the first input range. Must meet the
/// requirements of a C++ InputIterator concept. It can be a simple pointer type.
/// \tparam InputIterator2 - random-access iterator type of the second input range. Must meet the
/// requirements of a C++ InputIterator concept. It can be a simple pointer type.
/// \tparam OutputIterator - random-access iterator type of the output range. Must meet the
/// requirements of a C++ OutputIterator concept. It can be a simple pointer type.
///
/// \param [in] temporary_storage - pointer to a device-accessible temporary storage. When
/// a null pointer is passed, the required allocation size (in bytes) is written to
/// \p storage_size and function returns without performing the merge operation.
/// \param [in,out] storage_size - reference to a size (in bytes) of \p temporary_storage.
/// \param [in] input1 - iterator to the first element in the first range to merge.
/// \param [in] input2 - iterator to the first element in the second range to merge.
/// \param [out] output - iterator to the first element in the output range.
/// \param [in] input1_size - number of elements in the first input range.
/// \param [in] input2_size - number of elements in the second input range.
/// \param [in] compare_function - binary operation function object that will be used for comparison.
/// The signature of the function should be equivalent to the following:
/// <tt>bool f(const T &a, const T &b);</tt>. The signature does not need to have
/// <tt>const &</tt>, but function object must not modify the objects passed to it.
/// The default value is \p BinaryFunction().
/// \param [in] stream - [optional] HIP stream object. Default is \p 0 (default stream).
/// \param [in] debug_synchronous - [optional] If true, synchronization after every kernel
/// launch is forced in order to check for errors. Default value is \p false.
///
/// \returns \p cudaSuccess (\p 0) after successful merge; otherwise a HIP runtime error of
/// type \p cudaError_t.
///
/// \par Example
/// \parblock
/// In this example a device-level ascending merge is performed on an array of
/// \p int values.
///
/// \code{.cpp}
/// #include <rocprim/rocprim.hpp>
///
/// // Prepare input and output (declare pointers, allocate device memory etc.)
/// size_t input_size1; // e.g., 4
/// size_t input_size2; // e.g., 4
/// int * input1; // e.g., [0, 1, 2, 3]
/// int * input2; // e.g., [0, 1, 2, 3]
/// int * output; // empty array of 8 elements
///
/// size_t temporary_storage_size_bytes;
/// void * temporary_storage_ptr = nullptr;
/// // Get required size of the temporary storage
/// rocprim::merge(
/// temporary_storage_ptr, temporary_storage_size_bytes,
/// input1, input2, output, input_size1, input_size2
/// );
///
/// // allocate temporary storage
/// cudaMalloc(&temporary_storage_ptr, temporary_storage_size_bytes);
///
/// // perform merge
/// rocprim::merge(
/// temporary_storage_ptr, temporary_storage_size_bytes,
/// input1, input2, output, input_size1, input_size2
/// );
/// // output: [0, 0, 1, 1, 2, 2, 3, 3]
/// \endcode
/// \endparblock
template<class Config = default_config,
         class InputIterator1,
         class InputIterator2,
         class OutputIterator,
         class BinaryFunction
             = ::rocprim::less<typename std::iterator_traits<InputIterator1>::value_type>>
inline
cudaError_t merge(void *             temporary_storage,
                  size_t &           storage_size,
                  InputIterator1     input1,
                  InputIterator2     input2,
                  OutputIterator     output,
                  const size_t       input1_size,
                  const size_t       input2_size,
                  BinaryFunction     compare_function  = BinaryFunction(),
                  const cudaStream_t stream            = 0,
                  bool               debug_synchronous = false)
{
    // Keys-only merge: a null empty_type pointer stands in for all three
    // value iterators expected by merge_impl.
    empty_type * no_values = nullptr;

    return detail::merge_impl<Config>(temporary_storage,
                                      storage_size,
                                      input1,
                                      input2,
                                      output,
                                      no_values,
                                      no_values,
                                      no_values,
                                      input1_size,
                                      input2_size,
                                      compare_function,
                                      stream,
                                      debug_synchronous);
}
/// \brief Parallel merge primitive for device level.
///
/// \p merge function performs a device-wide merge of (key, value) pairs.
/// Function merges two ordered sets of input keys and corresponding values
/// based on key comparison function.
///
/// \par Overview
/// * The contents of the inputs are not altered by the merging function.
/// * Returns the required size of \p temporary_storage in \p storage_size
/// if \p temporary_storage is a null pointer.
/// * Accepts custom compare_functions for merging across the device.
///
/// \tparam Config - [optional] configuration of the primitive. It can be \p merge_config or
/// a custom class with the same members.
/// \tparam KeysInputIterator1 - random-access iterator type of the first keys input range. Must meet the
/// requirements of a C++ InputIterator concept. It can be a simple pointer type.
/// \tparam KeysInputIterator2 - random-access iterator type of the second keys input range. Must meet the
/// requirements of a C++ InputIterator concept. It can be a simple pointer type.
/// \tparam KeysOutputIterator - random-access iterator type of the keys output range. Must meet the
/// requirements of a C++ OutputIterator concept. It can be a simple pointer type.
/// \tparam ValuesInputIterator1 - random-access iterator type of the first values input range. Must meet the
/// requirements of a C++ InputIterator concept. It can be a simple pointer type.
/// \tparam ValuesInputIterator2 - random-access iterator type of the second values input range. Must meet the
/// requirements of a C++ InputIterator concept. It can be a simple pointer type.
/// \tparam ValuesOutputIterator - random-access iterator type of the values output range. Must meet the
/// requirements of a C++ OutputIterator concept. It can be a simple pointer type.
///
/// \param [in] temporary_storage - pointer to a device-accessible temporary storage. When
/// a null pointer is passed, the required allocation size (in bytes) is written to
/// \p storage_size and function returns without performing the merge operation.
/// \param [in,out] storage_size - reference to a size (in bytes) of \p temporary_storage.
/// \param [in] keys_input1 - iterator to the first key in the first range to merge.
/// \param [in] keys_input2 - iterator to the first key in the second range to merge.
/// \param [out] keys_output - iterator to the first key in the output range.
/// \param [in] values_input1 - iterator to the first value in the first range to merge.
/// \param [in] values_input2 - iterator to the first value in the second range to merge.
/// \param [out] values_output - iterator to the first value in the output range.
/// \param [in] input1_size - number of elements in the first input range.
/// \param [in] input2_size - number of elements in the second input range.
/// \param [in] compare_function - binary operation function object that will be used for key comparison.
/// The signature of the function should be equivalent to the following:
/// <tt>bool f(const T &a, const T &b);</tt>. The signature does not need to have
/// <tt>const &</tt>, but function object must not modify the objects passed to it.
/// The default value is \p BinaryFunction().
/// \param [in] stream - [optional] HIP stream object. Default is \p 0 (default stream).
/// \param [in] debug_synchronous - [optional] If true, synchronization after every kernel
/// launch is forced in order to check for errors. Default value is \p false.
///
/// \returns \p cudaSuccess (\p 0) after successful merge; otherwise a HIP runtime error of
/// type \p cudaError_t.
///
/// \par Example
/// \parblock
/// In this example a device-level ascending merge is performed on an array of
/// \p int values.
///
/// \code{.cpp}
/// #include <rocprim/rocprim.hpp>
///
/// // Prepare input and output (declare pointers, allocate device memory etc.)
/// size_t input_size1; // e.g., 4
/// size_t input_size2; // e.g., 4
/// int * keys_input1; // e.g., [0, 1, 2, 3]
/// int * keys_input2; // e.g., [0, 1, 2, 3]
/// int * keys_output; // empty array of 8 elements
/// int * values_input1; // e.g., [10, 11, 12, 13]
/// int * values_input2; // e.g., [20, 21, 22, 23]
/// int * values_output; // empty array of 8 elements
///
/// size_t temporary_storage_size_bytes;
/// void * temporary_storage_ptr = nullptr;
/// // Get required size of the temporary storage
/// rocprim::merge(
/// temporary_storage_ptr, temporary_storage_size_bytes,
/// keys_input1, keys_input2, keys_output,
/// values_input1, values_input2, values_output,
/// input_size1, input_size2
/// );
///
/// // allocate temporary storage
/// cudaMalloc(&temporary_storage_ptr, temporary_storage_size_bytes);
///
/// // perform merge
/// rocprim::merge(
/// temporary_storage_ptr, temporary_storage_size_bytes,
/// keys_input1, keys_input2, keys_output,
/// values_input1, values_input2, values_output,
/// input_size1, input_size2
/// );
/// // keys_output: [0, 0, 1, 1, 2, 2, 3, 3]
/// // values_output: [10, 20, 11, 21, 12, 22, 13, 23]
/// \endcode
/// \endparblock
template<class Config = default_config,
         class KeysInputIterator1,
         class KeysInputIterator2,
         class KeysOutputIterator,
         class ValuesInputIterator1,
         class ValuesInputIterator2,
         class ValuesOutputIterator,
         class BinaryFunction
             = ::rocprim::less<typename std::iterator_traits<KeysInputIterator1>::value_type>>
inline
cudaError_t merge(void *               temporary_storage,
                  size_t &             storage_size,
                  KeysInputIterator1   keys_input1,
                  KeysInputIterator2   keys_input2,
                  KeysOutputIterator   keys_output,
                  ValuesInputIterator1 values_input1,
                  ValuesInputIterator2 values_input2,
                  ValuesOutputIterator values_output,
                  const size_t         input1_size,
                  const size_t         input2_size,
                  BinaryFunction       compare_function  = BinaryFunction(),
                  const cudaStream_t   stream            = 0,
                  bool                 debug_synchronous = false)
{
    // Key/value merge: every argument is handed to merge_impl as-is; only the
    // Config template argument is given explicitly, the rest are deduced.
    return detail::merge_impl<Config>(temporary_storage,
                                      storage_size,
                                      keys_input1,
                                      keys_input2,
                                      keys_output,
                                      values_input1,
                                      values_input2,
                                      values_output,
                                      input1_size,
                                      input2_size,
                                      compare_function,
                                      stream,
                                      debug_synchronous);
}
/// @}
// end of group devicemodule
END_ROCPRIM_NAMESPACE
#endif // ROCPRIM_DEVICE_DEVICE_MERGE_HPP_
3rdparty/cub/rocprim/device/device_merge_config.hpp
0 → 100644
View file @
f8a481f8
// Copyright (c) 2018-2019 Advanced Micro Devices, Inc. All rights reserved.
//
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to deal
// in the Software without restriction, including without limitation the rights
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
// copies of the Software, and to permit persons to whom the Software is
// furnished to do so, subject to the following conditions:
//
// The above copyright notice and this permission notice shall be included in
// all copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
// THE SOFTWARE.
#ifndef ROCPRIM_DEVICE_DEVICE_MERGE_CONFIG_HPP_
#define ROCPRIM_DEVICE_DEVICE_MERGE_CONFIG_HPP_
#include <type_traits>
#include "../config.hpp"
#include "../detail/various.hpp"
#include "config_types.hpp"
/// \addtogroup primitivesmodule_deviceconfigs
/// @{
BEGIN_ROCPRIM_NAMESPACE
/// \brief Configuration of device-level merge primitives.
///
/// Alias of \p kernel_config with the same two arguments, in the same order.
///
/// \tparam BlockSize - first \p kernel_config argument.
/// \tparam ItemsPerThread - second \p kernel_config argument.
template<unsigned int BlockSize, unsigned int ItemsPerThread>
using merge_config = kernel_config<BlockSize, ItemsPerThread>;
namespace
detail
{
// Merge configuration for architecture 803 when values accompany the keys.
template<class Key, class Value>
struct merge_config_803
{
    // The larger of sizeof(Key) and sizeof(Value), expressed in units of
    // sizeof(int) via ceiling_div (presumably rounded up).
    static constexpr unsigned int item_scale = ::rocprim::detail::ceiling_div<unsigned int>(
        ::rocprim::max(sizeof(Key), sizeof(Value)),
        sizeof(int));

    // TODO Tune when merge-by-key is ready
    // Second argument is 10 / item_scale, combined with 1u through
    // ::rocprim::max so that it is not zero.
    using type = merge_config<256, ::rocprim::max(1u, 10u / item_scale)>;
};
// Merge configuration for architecture 803, keys only (Value is empty_type).
template<class Key>
struct merge_config_803<Key, empty_type>
{
    // sizeof(Key) expressed in units of sizeof(int) via ceiling_div
    // (presumably rounded up).
    static constexpr unsigned int item_scale
        = ::rocprim::detail::ceiling_div<unsigned int>(sizeof(Key), sizeof(int));

    // Cases are listed by key size (<= 2, <= 4, <= 8 bytes); the last argument
    // is the generic formula also used by the key/value configuration.
    using type = select_type<
        select_type_case<(sizeof(Key) <= 2), merge_config<256, 11>>,
        select_type_case<(sizeof(Key) <= 4), merge_config<256, 10>>,
        select_type_case<(sizeof(Key) <= 8), merge_config<256, 7>>,
        merge_config<256, ::rocprim::max(1u, 10u / item_scale)>>;
};
// Merge configuration for architecture 900 when values accompany the keys.
template<class Key, class Value>
struct merge_config_900
{
    // The larger of sizeof(Key) and sizeof(Value), expressed in units of
    // sizeof(int) via ceiling_div (presumably rounded up).
    static constexpr unsigned int item_scale = ::rocprim::detail::ceiling_div<unsigned int>(
        ::rocprim::max(sizeof(Key), sizeof(Value)),
        sizeof(int));

    // TODO Tune when merge-by-key is ready
    // Second argument is 10 / item_scale, combined with 1u through
    // ::rocprim::max so that it is not zero.
    using type = merge_config<256, ::rocprim::max(1u, 10u / item_scale)>;
};
// Merge configuration for architecture 900, keys only (Value is empty_type).
template<class Key>
struct merge_config_900<Key, empty_type>
{
    // sizeof(Key) expressed in units of sizeof(int) via ceiling_div
    // (presumably rounded up).
    static constexpr unsigned int item_scale
        = ::rocprim::detail::ceiling_div<unsigned int>(sizeof(Key), sizeof(int));

    // Cases are listed by key size (<= 2, <= 4, <= 8 bytes); the last argument
    // is the generic formula also used by the key/value configuration.
    using type = select_type<
        select_type_case<(sizeof(Key) <= 2), merge_config<256, 11>>,
        select_type_case<(sizeof(Key) <= 4), merge_config<256, 10>>,
        select_type_case<(sizeof(Key) <= 8), merge_config<256, 7>>,
        merge_config<256, ::rocprim::max(1u, 10u / item_scale)>>;
};
// TODO: We need to update these parameters
// Merge configuration for ROCPRIM_ARCH_90a when values accompany the keys.
// The values currently mirror merge_config_900.
template<class Key, class Value>
struct merge_config_90a
{
    // The larger of sizeof(Key) and sizeof(Value), expressed in units of
    // sizeof(int) via ceiling_div (presumably rounded up).
    static constexpr unsigned int item_scale = ::rocprim::detail::ceiling_div<unsigned int>(
        ::rocprim::max(sizeof(Key), sizeof(Value)),
        sizeof(int));

    // TODO Tune when merge-by-key is ready
    // Second argument is 10 / item_scale, combined with 1u through
    // ::rocprim::max so that it is not zero.
    using type = merge_config<256, ::rocprim::max(1u, 10u / item_scale)>;
};
// Merge configuration for ROCPRIM_ARCH_90a, keys only (Value is empty_type).
template<class Key>
struct merge_config_90a<Key, empty_type>
{
    // sizeof(Key) expressed in units of sizeof(int) via ceiling_div
    // (presumably rounded up).
    static constexpr unsigned int item_scale
        = ::rocprim::detail::ceiling_div<unsigned int>(sizeof(Key), sizeof(int));

    // Cases are listed by key size (<= 2, <= 4, <= 8 bytes); the last argument
    // is the generic formula also used by the key/value configuration.
    using type = select_type<
        select_type_case<(sizeof(Key) <= 2), merge_config<256, 11>>,
        select_type_case<(sizeof(Key) <= 4), merge_config<256, 10>>,
        select_type_case<(sizeof(Key) <= 8), merge_config<256, 7>>,
        merge_config<256, ::rocprim::max(1u, 10u / item_scale)>>;
};
// TODO: We need to update these parameters
// Merge configuration for architecture 1030 when values accompany the keys.
// The values currently mirror merge_config_900.
template<class Key, class Value>
struct merge_config_1030
{
    // The larger of sizeof(Key) and sizeof(Value), expressed in units of
    // sizeof(int) via ceiling_div (presumably rounded up).
    static constexpr unsigned int item_scale = ::rocprim::detail::ceiling_div<unsigned int>(
        ::rocprim::max(sizeof(Key), sizeof(Value)),
        sizeof(int));

    // TODO Tune when merge-by-key is ready
    // Second argument is 10 / item_scale, combined with 1u through
    // ::rocprim::max so that it is not zero.
    using type = merge_config<256, ::rocprim::max(1u, 10u / item_scale)>;
};
// Merge configuration for architecture 1030, keys only (Value is empty_type).
template<class Key>
struct merge_config_1030<Key, empty_type>
{
    // sizeof(Key) expressed in units of sizeof(int) via ceiling_div
    // (presumably rounded up).
    static constexpr unsigned int item_scale
        = ::rocprim::detail::ceiling_div<unsigned int>(sizeof(Key), sizeof(int));

    // Cases are listed by key size (<= 2, <= 4, <= 8 bytes); the last argument
    // is the generic formula also used by the key/value configuration.
    using type = select_type<
        select_type_case<(sizeof(Key) <= 2), merge_config<256, 11>>,
        select_type_case<(sizeof(Key) <= 4), merge_config<256, 10>>,
        select_type_case<(sizeof(Key) <= 8), merge_config<256, 7>>,
        merge_config<256, ::rocprim::max(1u, 10u / item_scale)>>;
};
// Picks a merge configuration for TargetArch through select_arch.
// One select_arch_case is listed per tuned architecture (803, 900,
// ROCPRIM_ARCH_90a, 1030); the trailing merge_config_900 argument is
// presumably the fallback for any other architecture (select_arch is defined
// elsewhere - confirm there).
template<unsigned int TargetArch, class Key, class Value>
struct default_merge_config
    : select_arch<
          TargetArch,
          select_arch_case<803, merge_config_803<Key, Value>>,
          select_arch_case<900, merge_config_900<Key, Value>>,
          select_arch_case<ROCPRIM_ARCH_90a, merge_config_90a<Key, Value>>,
          select_arch_case<1030, merge_config_1030<Key, Value>>,
          merge_config_900<Key, Value>>
{
};
}
// end namespace detail
END_ROCPRIM_NAMESPACE
/// @}
// end of group primitivesmodule_deviceconfigs
#endif // ROCPRIM_DEVICE_DEVICE_MERGE_CONFIG_HPP_
3rdparty/cub/rocprim/device/device_merge_sort.hpp
0 → 100644
View file @
f8a481f8
// Copyright (c) 2017-2022 Advanced Micro Devices, Inc. All rights reserved.
//
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to deal
// in the Software without restriction, including without limitation the rights
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
// copies of the Software, and to permit persons to whom the Software is
// furnished to do so, subject to the following conditions:
//
// The above copyright notice and this permission notice shall be included in
// all copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
// THE SOFTWARE.
#ifndef ROCPRIM_DEVICE_DEVICE_SORT_HPP_
#define ROCPRIM_DEVICE_DEVICE_SORT_HPP_
#include <type_traits>
#include <iterator>
#include "../config.hpp"
#include "../detail/various.hpp"
#include "detail/device_merge.hpp"
#include "detail/device_merge_sort.hpp"
#include "detail/device_merge_sort_mergepath.hpp"
#include "device_transform.hpp"
#include "device_merge_sort_config.hpp"
BEGIN_ROCPRIM_NAMESPACE
/// \addtogroup devicemodule
/// @{
namespace
detail
{
/// \brief Kernel entry point for the block-sort step.
///
/// Thin wrapper: forwards every argument to
/// \p block_sort_kernel_impl<BlockSize, ItemsPerThread>. The kernel is
/// declared with launch bounds of \p BlockSize threads.
template<unsigned int BlockSize,
         unsigned int ItemsPerThread,
         class KeysInputIterator,
         class KeysOutputIterator,
         class ValuesInputIterator,
         class ValuesOutputIterator,
         class OffsetT,
         class BinaryFunction>
ROCPRIM_KERNEL
__launch_bounds__(BlockSize)
void block_sort_kernel(KeysInputIterator    keys_input,
                       KeysOutputIterator   keys_output,
                       ValuesInputIterator  values_input,
                       ValuesOutputIterator values_output,
                       const OffsetT        size,
                       BinaryFunction       compare_function)
{
    block_sort_kernel_impl<BlockSize, ItemsPerThread>(keys_input,
                                                      keys_output,
                                                      values_input,
                                                      values_output,
                                                      size,
                                                      compare_function);
}
/// \brief Kernel entry point for the block-merge step (variant without a
/// precomputed partition table).
///
/// Thin wrapper: forwards every argument to
/// \p block_merge_kernel_impl<BlockSize>. The kernel is declared with launch
/// bounds of \p BlockSize threads.
template<unsigned int BlockSize,
         class KeysInputIterator,
         class KeysOutputIterator,
         class ValuesInputIterator,
         class ValuesOutputIterator,
         class OffsetT,
         class BinaryFunction>
ROCPRIM_KERNEL
__launch_bounds__(BlockSize)
void block_merge_kernel(KeysInputIterator    keys_input,
                        KeysOutputIterator   keys_output,
                        ValuesInputIterator  values_input,
                        ValuesOutputIterator values_output,
                        const OffsetT        input_size,
                        const OffsetT        sorted_block_size,
                        BinaryFunction       compare_function)
{
    block_merge_kernel_impl<BlockSize>(keys_input,
                                       keys_output,
                                       values_input,
                                       values_output,
                                       input_size,
                                       sorted_block_size,
                                       compare_function);
}
/// \brief Kernel entry point for the block-merge step (variant that takes a
/// table of merge partitions).
///
/// Thin wrapper: forwards every argument, including \p merge_partitions, to
/// \p block_merge_kernel_impl<BlockSize, ItemsPerThread>. The kernel is
/// declared with launch bounds of \p BlockSize threads.
template<unsigned int BlockSize,
         unsigned int ItemsPerThread,
         class KeysInputIterator,
         class KeysOutputIterator,
         class ValuesInputIterator,
         class ValuesOutputIterator,
         class OffsetT,
         class BinaryFunction>
ROCPRIM_KERNEL
__launch_bounds__(BlockSize)
void block_merge_kernel(KeysInputIterator    keys_input,
                        KeysOutputIterator   keys_output,
                        ValuesInputIterator  values_input,
                        ValuesOutputIterator values_output,
                        const OffsetT        input_size,
                        const OffsetT        sorted_block_size,
                        BinaryFunction       compare_function,
                        const OffsetT*       merge_partitions)
{
    block_merge_kernel_impl<BlockSize, ItemsPerThread>(keys_input,
                                                       keys_output,
                                                       values_input,
                                                       values_output,
                                                       input_size,
                                                       sorted_block_size,
                                                       compare_function,
                                                       merge_partitions);
}
// Debug helper macro. When `debug_synchronous` is true it prints `name(size)`,
// synchronizes `stream` with cudaStreamSynchronize, returns the error from the
// enclosing function if synchronization failed, and otherwise prints the time
// elapsed since `start` in milliseconds.
// The expansion reads `debug_synchronous` and `stream` from the scope where it
// is used, and declares locals named `error`, `end` and `d`.
// It expands to a bare `if` statement (no enclosing braces or do/while), so it
// should not be placed directly before an `else`.
#define ROCPRIM_DETAIL_HIP_SYNC(name, size, start) \
if(debug_synchronous) \
{ \
std::cout << name << "(" << size << ")"; \
auto error = cudaStreamSynchronize(stream); \
if(error != cudaSuccess) return error; \
auto end = std::chrono::high_resolution_clock::now(); \
auto d = std::chrono::duration_cast<std::chrono::duration<double>>(end - start); \
std::cout << " " << d.count() * 1000 << " ms" << '\n'; \
}
// Post-launch check macro. It always queries cudaGetLastError and returns that
// error from the enclosing function if it is not cudaSuccess. In addition, when
// `debug_synchronous` is true, it prints `name(size)`, synchronizes `stream`,
// returns on a synchronization error, and prints the time elapsed since `start`
// in milliseconds.
// The expansion reads `debug_synchronous` and `stream` from the scope where it
// is used. It expands to a braced block, so a trailing semicolon at the use
// site is optional.
#define ROCPRIM_DETAIL_HIP_SYNC_AND_RETURN_ON_ERROR(name, size, start) \
{ \
auto _error = cudaGetLastError(); \
if(_error != cudaSuccess) return _error; \
if(debug_synchronous) \
{ \
std::cout << name << "(" << size << ")"; \
auto __error = cudaStreamSynchronize(stream); \
if(__error != cudaSuccess) return __error; \
auto _end = std::chrono::high_resolution_clock::now(); \
auto _d = std::chrono::duration_cast<std::chrono::duration<double>>(_end - start); \
std::cout << " " << _d.count() * 1000 << " ms" << '\n'; \
} \
}
/// \brief Computes the merge-path split point for every partition of a merge pass.
///
/// Each thread handles one partition id. It locates the pair of adjacent sorted
/// runs (each \p sorted_block_size items long, clamped to \p input_size) that the
/// partition belongs to, calls \p merge_path on that pair at the partition's
/// diagonal, and stores the resulting position (offset by the start of the first
/// run) in \p merge_partitions[partition_id].
///
/// \tparam BlockSize - block size of this partition kernel.
/// \tparam ItemsPerTile - items per tile of the block merge kernel.
///
/// \param [in] keys - keys holding the sorted runs to be merged.
/// \param [in] input_size - total number of keys.
/// \param [in] num_partitions - number of entries to write to \p merge_partitions.
/// \param [out] merge_partitions - receives one split position per partition.
/// \param [in] compare_op - comparison functor passed to \p merge_path.
/// \param [in] sorted_block_size - length of each already-sorted run.
template<unsigned int BlockSize, // BlockSize of the partition kernel
         unsigned int ItemsPerTile, // ItemsPerTile of the block merge kernel
         typename KeysInputIterator,
         typename OffsetT,
         typename CompareOpT>
ROCPRIM_KERNEL
__launch_bounds__(BlockSize)
void device_mergepath_partition_kernel(KeysInputIterator  keys,
                                       const OffsetT      input_size,
                                       const unsigned int num_partitions,
                                       OffsetT*           merge_partitions,
                                       const CompareOpT   compare_op,
                                       const OffsetT      sorted_block_size)
{
    // One thread per partition; surplus threads have nothing to do.
    const OffsetT partition_id = blockIdx.x * BlockSize + threadIdx.x;
    if(partition_id >= num_partitions)
    {
        return;
    }

    // A tile-group is the set of tiles covering two adjacent sorted runs.
    const unsigned int tiles_per_run   = sorted_block_size / ItemsPerTile;
    const unsigned int tiles_per_group = tiles_per_run * 2;
    // NOTE(review): the masking below presumably relies on tiles_per_group being
    // a power of two - confirm against the launch configuration.
    const unsigned int group_mask = tiles_per_group - 1;

    // id of the first tile in the current tile-group
    const unsigned int group_first_tile = ~group_mask & partition_id;
    // index of the first item in the current tile-group
    const OffsetT group_first_item = ItemsPerTile * group_first_tile;
    // id of the current tile in the current tile-group
    const unsigned int tile_in_group = group_mask & partition_id;

    // Bounds of the two runs, clamped so they never reach past the input.
    const OffsetT left_begin  = rocprim::min(input_size, group_first_item);
    const OffsetT left_end    = rocprim::min(input_size, group_first_item + sorted_block_size);
    const OffsetT right_begin = left_end;
    const OffsetT right_end   = rocprim::min(input_size, right_begin + sorted_block_size);

    // Diagonal of this tile inside the merged pair, clamped to the pair's length.
    const OffsetT diagonal
        = rocprim::min<OffsetT>(right_end - left_begin, ItemsPerTile * tile_in_group);

    const OffsetT split = ::rocprim::detail::merge_path(keys + left_begin,
                                                        keys + right_begin,
                                                        left_end - left_begin,
                                                        right_end - right_begin,
                                                        diagonal,
                                                        compare_op);

    merge_partitions[partition_id] = left_begin + split;
}
/// \brief Host-side driver of the device-wide merge sort of keys and (optionally) values.
///
/// Launches \p block_sort_kernel once to sort blocks of \p sort_items_per_block items
/// into the temporary buffers, then repeatedly merges pairs of sorted runs, doubling the
/// run length on every pass and alternating between the temporary buffers and the output
/// ranges. If the last pass leaves the result in the temporary buffers, it is copied to
/// the outputs with \p ::rocprim::transform.
///
/// Temporary storage layout (in this order): merge partition table (only when the
/// mergepath implementation is used), keys buffer, values buffer. Every section size is
/// rounded up with \p align_size so that each buffer starts on an aligned offset.
///
/// \param [in] temporary_storage - device scratch memory. When null, only \p storage_size
/// is written and nothing is launched.
/// \param [in,out] storage_size - receives the required scratch size in bytes when
/// \p temporary_storage is null.
/// \param [in] keys_input - first key of the range to sort.
/// \param [out] keys_output - first element of the key output range.
/// \param [in] values_input - first value of the range to sort (unused when the value
/// type is \p ::rocprim::empty_type).
/// \param [out] values_output - first element of the value output range.
/// \param [in] size - number of items to sort.
/// \param [in] compare_function - comparison functor forwarded to every kernel.
/// \param [in] stream - stream on which all work is enqueued.
/// \param [in] debug_synchronous - when true, launch parameters are printed and every
/// kernel launch is followed by a synchronization and a timing printout.
///
/// \returns \p cudaSuccess, or the first error reported by a launch check, a
/// synchronization, or the final copy.
template<class Config,
         class KeysInputIterator,
         class KeysOutputIterator,
         class ValuesInputIterator,
         class ValuesOutputIterator,
         class BinaryFunction>
inline cudaError_t merge_sort_impl(void*                temporary_storage,
                                   size_t&              storage_size,
                                   KeysInputIterator    keys_input,
                                   KeysOutputIterator   keys_output,
                                   ValuesInputIterator  values_input,
                                   ValuesOutputIterator values_output,
                                   const unsigned int   size,
                                   BinaryFunction       compare_function,
                                   const cudaStream_t   stream,
                                   bool                 debug_synchronous)
{
    using OffsetT    = unsigned int;
    using key_type   = typename std::iterator_traits<KeysInputIterator>::value_type;
    using value_type = typename std::iterator_traits<ValuesInputIterator>::value_type;
    constexpr bool with_values = !std::is_same<value_type, ::rocprim::empty_type>::value;

    // Get default config if Config is default_config
    using config = default_or_custom_config<
        Config,
        default_merge_sort_config<ROCPRIM_TARGET_ARCH, key_type, value_type>>;

    static constexpr unsigned int sort_block_size       = config::sort_config::block_size;
    static constexpr unsigned int sort_items_per_thread = config::sort_config::items_per_thread;
    static constexpr unsigned int sort_items_per_block
        = sort_block_size * sort_items_per_thread;

    static constexpr unsigned int merge_impl1_block_size
        = config::merge_impl1_config::block_size;
    static constexpr unsigned int merge_impl1_items_per_thread
        = config::merge_impl1_config::items_per_thread;
    static constexpr unsigned int merge_impl1_items_per_block
        = merge_impl1_block_size * merge_impl1_items_per_thread;

    static constexpr unsigned int merge_partition_block_size
        = config::merge_mergepath_partition_config::block_size;
    static constexpr unsigned int merge_mergepath_block_size
        = config::merge_mergepath_config::block_size;
    static constexpr unsigned int merge_mergepath_items_per_thread
        = config::merge_mergepath_config::items_per_thread;
    static constexpr unsigned int merge_mergepath_items_per_block
        = merge_mergepath_block_size * merge_mergepath_items_per_thread;

    static_assert(merge_mergepath_items_per_block >= sort_items_per_block,
                  "merge_mergepath_items_per_block must be greater than or equal to sort_items_per_block");
    static_assert(sort_items_per_block % config::merge_impl1_config::block_size == 0,
                  "Merge block size must be a divisor of the items per block of the sort step");

    const size_t keys_bytes = ::rocprim::detail::align_size(size * sizeof(key_type));
    const size_t values_bytes
        = with_values ? ::rocprim::detail::align_size(size * sizeof(value_type)) : 0;

    const unsigned int sort_number_of_blocks = ceiling_div(size, sort_items_per_block);
    const unsigned int merge_impl1_number_of_blocks
        = ceiling_div(size, merge_impl1_items_per_block);
    const unsigned int merge_mergepath_number_of_blocks
        = ceiling_div(size, merge_mergepath_items_per_block);

    // The mergepath implementation is only used for inputs above the configured threshold.
    const bool use_mergepath = size > config::min_input_size_mergepath;

    // variables below used for mergepath
    const unsigned int merge_num_partitions = merge_mergepath_number_of_blocks + 1;
    const unsigned int merge_partition_number_of_blocks
        = ceiling_div(merge_num_partitions, merge_partition_block_size);
    // The partition table is the first section of the temporary storage. Its size is
    // rounded up with align_size (like the key/value sections) so that the keys buffer
    // that follows it is not placed on a merely 4-byte-aligned offset, which would be
    // misaligned for key types with a stricter alignment requirement.
    const size_t d_merge_partitions_bytes
        = use_mergepath
              ? ::rocprim::detail::align_size(merge_num_partitions * sizeof(OffsetT))
              : 0;

    if(temporary_storage == nullptr)
    {
        storage_size = d_merge_partitions_bytes + keys_bytes + values_bytes;
        // Make sure user won't try to allocate 0 bytes memory
        storage_size = storage_size == 0 ? 4 : storage_size;
        return cudaSuccess;
    }

    if(size == 0u)
    {
        return cudaSuccess;
    }

    if(debug_synchronous)
    {
        std::cout << "-----" << '\n';
        std::cout << "size: " << size << '\n';
        std::cout << "sort_block_size: " << sort_block_size << '\n';
        std::cout << "sort_items_per_thread: " << sort_items_per_thread << '\n';
        std::cout << "sort_items_per_block: " << sort_items_per_block << '\n';
        std::cout << "sort_number_of_blocks: " << sort_number_of_blocks << '\n';
        std::cout << "merge_impl1_block_size: " << merge_impl1_block_size << '\n';
        std::cout << "merge_impl1_number_of_blocks: " << merge_impl1_number_of_blocks << '\n';
        std::cout << "merge_impl1_items_per_thread: " << merge_impl1_items_per_thread << '\n';
        std::cout << "merge_impl1_items_per_block: " << merge_impl1_items_per_block << '\n';
        std::cout << "merge_mergepath_block_size: " << merge_mergepath_block_size << '\n';
        std::cout << "merge_mergepath_number_of_blocks: " << merge_mergepath_number_of_blocks << '\n';
        std::cout << "merge_mergepath_items_per_thread: " << merge_mergepath_items_per_thread << '\n';
        std::cout << "merge_mergepath_items_per_block: " << merge_mergepath_items_per_block << '\n';
        std::cout << "num_partitions: " << merge_num_partitions << '\n';
        std::cout << "merge_mergepath_partition_block_size: " << merge_partition_block_size << '\n';
        std::cout << "merge_mergepath_partition_number_of_blocks: " << merge_partition_number_of_blocks << '\n';
    }

    // Carve the temporary storage into its three sections.
    char*    ptr                = reinterpret_cast<char*>(temporary_storage);
    OffsetT* d_merge_partitions = reinterpret_cast<OffsetT*>(ptr);
    ptr += d_merge_partitions_bytes;
    key_type* keys_buffer = reinterpret_cast<key_type*>(ptr);
    ptr += keys_bytes;
    value_type* values_buffer = with_values ? reinterpret_cast<value_type*>(ptr) : nullptr;

    // Start point for time measurements
    std::chrono::high_resolution_clock::time_point start;

    // Step 1: sort each block of sort_items_per_block items into the temporary buffers.
    if(debug_synchronous) start = std::chrono::high_resolution_clock::now();
    block_sort_kernel<sort_block_size, sort_items_per_thread>
        <<<dim3(sort_number_of_blocks), dim3(sort_block_size), 0, stream>>>(
            keys_input, keys_buffer, values_input, values_buffer, size, compare_function);
    ROCPRIM_DETAIL_HIP_SYNC_AND_RETURN_ON_ERROR("block_sort_kernel", size, start);

    // Step 2: merge passes. `temporary_store` is true while the current result lives in
    // the temporary buffers; it flips on every pass.
    // NOTE(review): `block` is a 32-bit OffsetT, so `block *= 2` can wrap around for
    // sizes above 2^31 - confirm that callers never pass such sizes.
    bool temporary_store = true;
    for(OffsetT block = sort_items_per_block; block < size; block *= 2)
    {
        temporary_store = !temporary_store;

        // Runs one merge pass from the given inputs to the given outputs.
        const auto merge_step = [&](auto keys_input_,
                                    auto keys_output_,
                                    auto values_input_,
                                    auto values_output_) -> cudaError_t
        {
            if(use_mergepath)
            {
                // First compute the per-tile split points, then merge tile by tile.
                if(debug_synchronous) start = std::chrono::high_resolution_clock::now();
                device_mergepath_partition_kernel<merge_partition_block_size,
                                                  merge_mergepath_items_per_block>
                    <<<dim3(merge_partition_number_of_blocks),
                       dim3(merge_partition_block_size),
                       0,
                       stream>>>(keys_input_,
                                 size,
                                 merge_num_partitions,
                                 d_merge_partitions,
                                 compare_function,
                                 block);
                ROCPRIM_DETAIL_HIP_SYNC_AND_RETURN_ON_ERROR("device_mergepath_partition_kernel", size, start);

                if(debug_synchronous) start = std::chrono::high_resolution_clock::now();
                block_merge_kernel<merge_mergepath_block_size, merge_mergepath_items_per_thread>
                    <<<dim3(merge_mergepath_number_of_blocks),
                       dim3(merge_mergepath_block_size),
                       0,
                       stream>>>(keys_input_,
                                 keys_output_,
                                 values_input_,
                                 values_output_,
                                 size,
                                 block,
                                 compare_function,
                                 d_merge_partitions);
                ROCPRIM_DETAIL_HIP_SYNC_AND_RETURN_ON_ERROR("block_merge_kernel", size, start);
            }
            else
            {
                if(debug_synchronous) start = std::chrono::high_resolution_clock::now();
                block_merge_kernel<merge_impl1_block_size>
                    <<<dim3(merge_impl1_number_of_blocks),
                       dim3(merge_impl1_block_size),
                       0,
                       stream>>>(keys_input_,
                                 keys_output_,
                                 values_input_,
                                 values_output_,
                                 size,
                                 block,
                                 compare_function);
                ROCPRIM_DETAIL_HIP_SYNC_AND_RETURN_ON_ERROR("block_merge_kernel", size, start);
            }
            return cudaSuccess;
        };

        const cudaError_t step_error
            = temporary_store
                  ? merge_step(keys_output, keys_buffer, values_output, values_buffer)
                  : merge_step(keys_buffer, keys_output, values_buffer, values_output);
        if(step_error != cudaSuccess) return step_error;
    }

    // Step 3: if the sorted data ended up in the temporary buffers, copy it out.
    if(temporary_store)
    {
        const cudaError_t keys_copy_error
            = ::rocprim::transform(keys_buffer,
                                   keys_output,
                                   size,
                                   ::rocprim::identity<key_type>(),
                                   stream,
                                   debug_synchronous);
        if(keys_copy_error != cudaSuccess) return keys_copy_error;

        if(with_values)
        {
            const cudaError_t values_copy_error
                = ::rocprim::transform(values_buffer,
                                       values_output,
                                       size,
                                       ::rocprim::identity<value_type>(),
                                       stream,
                                       debug_synchronous);
            if(values_copy_error != cudaSuccess) return values_copy_error;
        }
    }

    return cudaSuccess;
}
#undef ROCPRIM_DETAIL_HIP_SYNC_AND_RETURN_ON_ERROR
#undef ROCPRIM_DETAIL_HIP_SYNC
}
// end of detail namespace
/// \brief Parallel merge sort primitive for device level.
///
/// \p merge_sort function performs a device-wide merge sort
/// of keys. Function sorts input keys based on comparison function.
///
/// \par Overview
/// * The contents of the inputs are not altered by the sorting function.
/// * Returns the required size of \p temporary_storage in \p storage_size
/// if \p temporary_storage is a null pointer.
/// * Accepts custom compare_functions for sorting across the device.
///
/// \tparam KeysInputIterator - random-access iterator type of the input range. Must meet the
/// requirements of a C++ InputIterator concept. It can be a simple pointer type.
/// \tparam KeysOutputIterator - random-access iterator type of the output range. Must meet the
/// requirements of a C++ OutputIterator concept. It can be a simple pointer type.
///
/// \param [in] temporary_storage - pointer to a device-accessible temporary storage. When
/// a null pointer is passed, the required allocation size (in bytes) is written to
/// \p storage_size and function returns without performing the sort operation.
/// \param [in,out] storage_size - reference to a size (in bytes) of \p temporary_storage.
/// \param [in] keys_input - pointer to the first element in the range to sort.
/// \param [out] keys_output - pointer to the first element in the output range.
/// \param [in] size - number of elements in the input range.
/// \param [in] compare_function - binary operation function object that will be used for comparison.
/// The signature of the function should be equivalent to the following:
/// <tt>bool f(const T &a, const T &b);</tt>. The signature does not need to have
/// <tt>const &</tt>, but function object must not modify the objects passed to it.
/// The default value is \p BinaryFunction().
/// \param [in] stream - [optional] HIP stream object. Default is \p 0 (default stream).
/// \param [in] debug_synchronous - [optional] If true, synchronization after every kernel
/// launch is forced in order to check for errors. Default value is \p false.
///
/// \returns \p cudaSuccess (\p 0) after successful sort; otherwise a HIP runtime error of
/// type \p cudaError_t.
///
/// \par Example
/// \parblock
/// In this example a device-level ascending merge sort is performed on an array of
/// \p float values.
///
/// \code{.cpp}
/// #include <rocprim/rocprim.hpp>
///
/// // Prepare input and output (declare pointers, allocate device memory etc.)
/// size_t input_size; // e.g., 8
/// float * input; // e.g., [0.6, 0.3, 0.65, 0.4, 0.2, 0.08, 1, 0.7]
/// float * output; // empty array of 8 elements
///
/// size_t temporary_storage_size_bytes;
/// void * temporary_storage_ptr = nullptr;
/// // Get required size of the temporary storage
/// rocprim::merge_sort(
/// temporary_storage_ptr, temporary_storage_size_bytes,
/// input, output, input_size
/// );
///
/// // allocate temporary storage
/// cudaMalloc(&temporary_storage_ptr, temporary_storage_size_bytes);
///
/// // perform sort
/// rocprim::merge_sort(
/// temporary_storage_ptr, temporary_storage_size_bytes,
/// input, output, input_size
/// );
/// // keys_output: [0.08, 0.2, 0.3, 0.4, 0.6, 0.65, 0.7, 1]
/// \endcode
/// \endparblock
/// \note \p size must be representable as <tt>unsigned int</tt>. Larger sizes are rejected
/// with \p cudaErrorInvalidValue instead of being silently truncated.
template<class Config = default_config,
         class KeysInputIterator,
         class KeysOutputIterator,
         class BinaryFunction
         = ::rocprim::less<typename std::iterator_traits<KeysInputIterator>::value_type>>
inline cudaError_t merge_sort(void*              temporary_storage,
                              size_t&            storage_size,
                              KeysInputIterator  keys_input,
                              KeysOutputIterator keys_output,
                              const size_t       size,
                              BinaryFunction     compare_function  = BinaryFunction(),
                              const cudaStream_t stream            = 0,
                              bool               debug_synchronous = false)
{
    // detail::merge_sort_impl takes the item count as unsigned int. Passing a larger
    // size_t would wrap modulo 2^32 and sort only part of the input, so reject it.
    constexpr size_t max_supported_size = ~0u;
    if(size > max_supported_size)
    {
        return cudaErrorInvalidValue;
    }

    // Keys-only sort: values are represented by null pointers to empty_type.
    empty_type* values = nullptr;
    return detail::merge_sort_impl<Config>(temporary_storage,
                                           storage_size,
                                           keys_input,
                                           keys_output,
                                           values,
                                           values,
                                           static_cast<unsigned int>(size),
                                           compare_function,
                                           stream,
                                           debug_synchronous);
}
/// \brief Parallel ascending merge sort-by-key primitive for device level.
///
/// \p merge_sort function performs a device-wide merge sort
/// of (key, value) pairs. Function sorts input pairs based on comparison function.
///
/// \par Overview
/// * The contents of the inputs are not altered by the sorting function.
/// * Returns the required size of \p temporary_storage in \p storage_size
/// if \p temporary_storage is a null pointer.
/// * Accepts custom compare_functions for sorting across the device.
///
/// \tparam KeysInputIterator - random-access iterator type of the input range. Must meet the
/// requirements of a C++ InputIterator concept. It can be a simple pointer type.
/// \tparam KeysOutputIterator - random-access iterator type of the output range. Must meet the
/// requirements of a C++ OutputIterator concept. It can be a simple pointer type.
/// \tparam ValuesInputIterator - random-access iterator type of the input range. Must meet the
/// requirements of a C++ InputIterator concept. It can be a simple pointer type.
/// \tparam ValuesOutputIterator - random-access iterator type of the output range. Must meet the
/// requirements of a C++ OutputIterator concept. It can be a simple pointer type.
///
/// \param [in] temporary_storage - pointer to a device-accessible temporary storage. When
/// a null pointer is passed, the required allocation size (in bytes) is written to
/// \p storage_size and function returns without performing the sort operation.
/// \param [in,out] storage_size - reference to a size (in bytes) of \p temporary_storage.
/// \param [in] keys_input - pointer to the first element in the range to sort.
/// \param [out] keys_output - pointer to the first element in the output range.
/// \param [in] values_input - pointer to the first element in the range to sort.
/// \param [out] values_output - pointer to the first element in the output range.
/// \param [in] size - number of elements in the input range.
/// \param [in] compare_function - binary operation function object that will be used for comparison.
/// The signature of the function should be equivalent to the following:
/// <tt>bool f(const T &a, const T &b);</tt>. The signature does not need to have
/// <tt>const &</tt>, but function object must not modify the objects passed to it.
/// The default value is \p BinaryFunction().
/// \param [in] stream - [optional] HIP stream object. Default is \p 0 (default stream).
/// \param [in] debug_synchronous - [optional] If true, synchronization after every kernel
/// launch is forced in order to check for errors. Default value is \p false.
///
/// \returns \p cudaSuccess (\p 0) after successful sort; otherwise a HIP runtime error of
/// type \p cudaError_t.
///
/// \par Example
/// \parblock
/// In this example a device-level ascending merge sort is performed where input keys are
/// represented by an array of unsigned integers and input values by an array of <tt>double</tt>s.
///
/// \code{.cpp}
/// #include <rocprim/rocprim.hpp>
///
/// // Prepare input and output (declare pointers, allocate device memory etc.)
/// size_t input_size; // e.g., 8
/// unsigned int * keys_input; // e.g., [ 6, 3, 5, 4, 1, 8, 2, 7]
/// double * values_input; // e.g., [-5, 2, -4, 3, -1, -8, -2, 7]
/// unsigned int * keys_output; // empty array of 8 elements
/// double * values_output; // empty array of 8 elements
///
/// size_t temporary_storage_size_bytes;
/// void * temporary_storage_ptr = nullptr;
/// // Get required size of the temporary storage
/// rocprim::merge_sort(
/// temporary_storage_ptr, temporary_storage_size_bytes,
/// keys_input, keys_output, values_input, values_output,
/// input_size
/// );
///
/// // allocate temporary storage
/// cudaMalloc(&temporary_storage_ptr, temporary_storage_size_bytes);
///
/// // perform sort
/// rocprim::merge_sort(
/// temporary_storage_ptr, temporary_storage_size_bytes,
/// keys_input, keys_output, values_input, values_output,
/// input_size
/// );
/// // keys_output: [ 1, 2, 3, 4, 5, 6, 7, 8]
/// // values_output: [-1, -2, 2, 3, -4, -5, 7, -8]
/// \endcode
/// \endparblock
/// \note \p size must be representable as <tt>unsigned int</tt>. Larger sizes are rejected
/// with \p cudaErrorInvalidValue instead of being silently truncated.
template<class Config = default_config,
         class KeysInputIterator,
         class KeysOutputIterator,
         class ValuesInputIterator,
         class ValuesOutputIterator,
         class BinaryFunction
         = ::rocprim::less<typename std::iterator_traits<KeysInputIterator>::value_type>>
inline cudaError_t merge_sort(void*                temporary_storage,
                              size_t&              storage_size,
                              KeysInputIterator    keys_input,
                              KeysOutputIterator   keys_output,
                              ValuesInputIterator  values_input,
                              ValuesOutputIterator values_output,
                              const size_t         size,
                              BinaryFunction       compare_function  = BinaryFunction(),
                              const cudaStream_t   stream            = 0,
                              bool                 debug_synchronous = false)
{
    // detail::merge_sort_impl takes the item count as unsigned int. Passing a larger
    // size_t would wrap modulo 2^32 and sort only part of the input, so reject it.
    constexpr size_t max_supported_size = ~0u;
    if(size > max_supported_size)
    {
        return cudaErrorInvalidValue;
    }

    return detail::merge_sort_impl<Config>(temporary_storage,
                                           storage_size,
                                           keys_input,
                                           keys_output,
                                           values_input,
                                           values_output,
                                           static_cast<unsigned int>(size),
                                           compare_function,
                                           stream,
                                           debug_synchronous);
}
/// @}
// end of group devicemodule
END_ROCPRIM_NAMESPACE
#endif // ROCPRIM_DEVICE_DEVICE_SORT_HPP_
3rdparty/cub/rocprim/device/device_merge_sort_config.hpp
0 → 100644
View file @
f8a481f8
// Copyright (c) 2018-2022 Advanced Micro Devices, Inc. All rights reserved.
//
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to deal
// in the Software without restriction, including without limitation the rights
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
// copies of the Software, and to permit persons to whom the Software is
// furnished to do so, subject to the following conditions:
//
// The above copyright notice and this permission notice shall be included in
// all copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
// THE SOFTWARE.
#ifndef ROCPRIM_DEVICE_DEVICE_MERGE_SORT_CONFIG_HPP_
#define ROCPRIM_DEVICE_DEVICE_MERGE_SORT_CONFIG_HPP_
#include <type_traits>
#include "../config.hpp"
#include "../detail/various.hpp"
#include "../functional.hpp"
#include "config_types.hpp"
/// \addtogroup primitivesmodule_deviceconfigs
/// @{
BEGIN_ROCPRIM_NAMESPACE
namespace
detail
{
/// \brief Bundles the per-kernel configurations used by the device merge sort.
///
/// Exposes one \p kernel_config alias per kernel (block sort, impl1 merge,
/// mergepath partition, mergepath merge) plus the input-size threshold for
/// choosing the mergepath implementation. The impl1 merge and mergepath
/// partition configurations always use 1 as their second argument.
template<unsigned int SortBlockSize,
         unsigned int SortItemsPerThread,
         unsigned int MergeImpl1BlockSize,
         unsigned int MergeImplMPPartitionBlockSize,
         unsigned int MergeImplMPBlockSize,
         unsigned int MergeImplMPItemsPerThread,
         unsigned int MinInputSizeMergepath>
struct merge_sort_config_impl
{
    /// Configuration of the block-sort kernel.
    using sort_config = kernel_config<SortBlockSize, SortItemsPerThread>;
    /// Configuration of the impl1 block-merge kernel.
    using merge_impl1_config = kernel_config<MergeImpl1BlockSize, 1>;
    /// Configuration of the mergepath partition kernel.
    using merge_mergepath_partition_config = kernel_config<MergeImplMPPartitionBlockSize, 1>;
    /// Configuration of the mergepath block-merge kernel.
    using merge_mergepath_config = kernel_config<MergeImplMPBlockSize, MergeImplMPItemsPerThread>;
    /// Input-size threshold for the mergepath implementation.
    static constexpr unsigned int min_input_size_mergepath = MinInputSizeMergepath;
};
}
/// \brief Configuration of device-level merge sort primitives.
///
/// The template parameters are documented in declaration order. Note that
/// \p MergeImpl1BlockSize comes first, and that the arguments are passed to
/// \p detail::merge_sort_config_impl in a different order.
///
/// \tparam MergeImpl1BlockSize - block size in the block merge step using impl1 (used when input_size <= MinInputSizeMergepath)
/// \tparam SortBlockSize - block size in the block-sort step (defaults to \p MergeImpl1BlockSize)
/// \tparam SortItemsPerThread - ItemsPerThread in the block-sort step
/// \tparam MergeImplMPPartitionBlockSize - block size of the partition kernel in the block merge step using mergepath impl
/// \tparam MergeImplMPBlockSize - block size in the block merge step using mergepath impl
/// (defaults to the smaller of \p SortBlockSize and 128)
/// \tparam MergeImplMPItemsPerThread - ItemsPerThread in the block merge step using mergepath impl
/// \tparam MinInputSizeMergepath - breakpoint of input-size to use mergepath impl for block merge step
template<unsigned int MergeImpl1BlockSize           = 512,
         unsigned int SortBlockSize                 = MergeImpl1BlockSize,
         unsigned int SortItemsPerThread            = 1,
         unsigned int MergeImplMPPartitionBlockSize = 128,
         // Minimum of SortBlockSize and 128, written as a conditional expression so that
         // this header does not depend on std::min from <algorithm>, which it does not include.
         unsigned int MergeImplMPBlockSize = (SortBlockSize < 128u ? SortBlockSize : 128u),
         unsigned int MergeImplMPItemsPerThread
         = SortBlockSize * SortItemsPerThread / MergeImplMPBlockSize,
         unsigned int MinInputSizeMergepath = 200000>
using merge_sort_config = detail::merge_sort_config_impl<SortBlockSize,
                                                         SortItemsPerThread,
                                                         MergeImpl1BlockSize,
                                                         MergeImplMPPartitionBlockSize,
                                                         MergeImplMPBlockSize,
                                                         MergeImplMPItemsPerThread,
                                                         MinInputSizeMergepath>;
namespace
detail
{
/// \brief Merge sort configuration used for the 803 architecture case (keys with values).
///
/// Selects a block size from \p sizeof(Key) when the value is at most 8 bytes.
/// Every other combination goes through \p limit_block_size with the combined
/// key and value size.
template<class Key, class Value>
struct merge_sort_config_803
{
    using type = select_type<
        select_type_case<(sizeof(Key) == 1 && sizeof(Value) <= 8), merge_sort_config<64U>>,
        select_type_case<(sizeof(Key) == 2 && sizeof(Value) <= 8), merge_sort_config<256U>>,
        select_type_case<(sizeof(Key) == 4 && sizeof(Value) <= 8), merge_sort_config<512U>>,
        select_type_case<(sizeof(Key) == 8 && sizeof(Value) <= 8), merge_sort_config<1024U>>,
        // Fallback for all remaining key/value sizes.
        merge_sort_config<
            limit_block_size<1024U, sizeof(Key) + sizeof(Value), ROCPRIM_WARP_SIZE_64>::value>>;
};
/// \brief 803 merge sort configuration for \p rocprim::half keys with values.
///
/// Always goes through \p limit_block_size, starting from 256 and using the
/// combined size of a half key and the value.
template<class Value>
struct merge_sort_config_803<rocprim::half, Value>
{
    using type = merge_sort_config<
        limit_block_size<256U, sizeof(rocprim::half) + sizeof(Value), ROCPRIM_WARP_SIZE_64>::value>;
};
/// \brief 803 merge sort configuration for keys-only sorts (\p Value == \p empty_type).
///
/// Unlike the primary template, this specialization derives from the selected
/// configuration instead of exposing it through a nested \p type alias.
/// Only key sizes of 1, 2, 4 and >= 8 bytes have a case; there is no default entry.
template<class Key>
struct merge_sort_config_803<Key, empty_type>
    : select_type<
          select_type_case<sizeof(Key) == 1, merge_sort_config<64U>>,
          select_type_case<sizeof(Key) == 2, merge_sort_config<256U>>,
          select_type_case<sizeof(Key) == 4, merge_sort_config<256U>>,
          select_type_case<sizeof(Key) >= 8,
                           merge_sort_config<
                               limit_block_size<512U, sizeof(Key), ROCPRIM_WARP_SIZE_64>::value>>>
{};
/// \brief 803 merge sort configuration for keys-only sorts of \p rocprim::half.
///
/// Fixed choice: \p merge_sort_config<256U>.
template<>
struct merge_sort_config_803<rocprim::half, empty_type>
{
    using type = merge_sort_config<256U>;
};
/// \brief Merge sort configuration used for the 900 architecture case.
///
/// Primary template; the unnamed bool parameter defaults to
/// \p is_scalar<Key>::value, and a separate specialization handles \p false.
/// For values of at most 16 bytes the configuration is picked from \p sizeof(Key).
/// Each entry passes <MergeImpl1BlockSize, SortBlockSize, SortItemsPerThread>
/// to \p merge_sort_config. Everything else goes through \p limit_block_size.
template<class Key, class Value, bool = is_scalar<Key>::value>
struct merge_sort_config_900
{
    using type = select_type<
        // clang-format off
        select_type_case<(sizeof(Key) == 1 && sizeof(Value) <= 16), merge_sort_config<512U, 512U, 2U>>,
        select_type_case<(sizeof(Key) == 2 && sizeof(Value) <= 16), merge_sort_config<512U, 256U, 4U>>,
        select_type_case<(sizeof(Key) == 4 && sizeof(Value) <= 16), merge_sort_config<512U, 256U, 4U>>,
        select_type_case<(sizeof(Key) == 8 && sizeof(Value) <= 16), merge_sort_config<256U, 256U, 4U>>,
        // clang-format on
        // Fallback for all remaining key/value sizes.
        merge_sort_config<
            limit_block_size<1024U,
                             ::rocprim::max(sizeof(Key) + sizeof(unsigned int), sizeof(Value)),
                             ROCPRIM_WARP_SIZE_64>::value>>;
};
/// \brief 900 merge sort configuration for the \p false case of the bool parameter,
/// i.e. keys for which \p is_scalar<Key>::value is false.
///
/// Keys of 8 or 16 bytes with values of at most 16 bytes get
/// \p merge_sort_config<512U, 512U, 2U>. Everything else goes through
/// \p limit_block_size starting from 512.
template<class Key, class Value>
struct merge_sort_config_900<Key, Value, false>
{
    using type = select_type<
        // clang-format off
        select_type_case<(sizeof(Key) == 8  && sizeof(Value) <= 16), merge_sort_config<512U, 512U, 2U>>,
        select_type_case<(sizeof(Key) == 16 && sizeof(Value) <= 16), merge_sort_config<512U, 512U, 2U>>,
        // clang-format on
        // Fallback for all remaining key/value sizes.
        merge_sort_config<
            limit_block_size<512U,
                             ::rocprim::max(sizeof(Key) + sizeof(unsigned int), sizeof(Value)),
                             ROCPRIM_WARP_SIZE_64>::value>>;
};
// TODO: We need to update these parameters
/// \brief Merge sort configuration used for the 1030 architecture case (keys with values).
///
/// The entries mirror the 803 configuration: block size chosen from
/// \p sizeof(Key) for values of at most 8 bytes, otherwise \p limit_block_size.
// NOTE(review): this fallback passes ROCPRIM_WARP_SIZE_64 while the other 1030
// specializations pass ROCPRIM_WARP_SIZE_32 - confirm which one is intended.
template<class Key, class Value>
struct merge_sort_config_1030
{
    using type = select_type<
        select_type_case<(sizeof(Key) == 1 && sizeof(Value) <= 8), merge_sort_config<64U>>,
        select_type_case<(sizeof(Key) == 2 && sizeof(Value) <= 8), merge_sort_config<256U>>,
        select_type_case<(sizeof(Key) == 4 && sizeof(Value) <= 8), merge_sort_config<512U>>,
        select_type_case<(sizeof(Key) == 8 && sizeof(Value) <= 8), merge_sort_config<1024U>>,
        // Fallback for all remaining key/value sizes.
        merge_sort_config<
            limit_block_size<1024U, sizeof(Key) + sizeof(Value), ROCPRIM_WARP_SIZE_64>::value>>;
};
/// \brief 1030 merge sort configuration for \p rocprim::half keys with values.
///
/// Always goes through \p limit_block_size, starting from 256 and using the
/// combined size of a half key and the value.
template<class Value>
struct merge_sort_config_1030<rocprim::half, Value>
{
    using type = merge_sort_config<
        limit_block_size<256U, sizeof(rocprim::half) + sizeof(Value), ROCPRIM_WARP_SIZE_32>::value>;
};
/// \brief 1030 merge sort configuration for keys-only sorts (\p Value == \p empty_type).
///
/// Unlike the primary template, this specialization derives from the selected
/// configuration instead of exposing it through a nested \p type alias.
/// Only key sizes of 1, 2, 4 and >= 8 bytes have a case; there is no default entry.
template<class Key>
struct merge_sort_config_1030<Key, empty_type>
    : select_type<
          select_type_case<sizeof(Key) == 1, merge_sort_config<64U>>,
          select_type_case<sizeof(Key) == 2, merge_sort_config<256U>>,
          select_type_case<sizeof(Key) == 4, merge_sort_config<256U>>,
          select_type_case<sizeof(Key) >= 8,
                           merge_sort_config<
                               limit_block_size<512U, sizeof(Key), ROCPRIM_WARP_SIZE_32>::value>>>
{};
/// \brief 1030 merge sort configuration for keys-only sorts of \p rocprim::half.
///
/// Fixed choice: \p merge_sort_config<256U>.
template<>
struct merge_sort_config_1030<rocprim::half, empty_type>
{
    using type = merge_sort_config<256U>;
};
/// \brief Default merge sort configuration for a given target architecture.
///
/// Dispatches on \p TargetArch to the per-architecture configuration
/// (803, 900, 1030). Any other architecture value falls back to the 900
/// configuration.
template<unsigned int TargetArch, class Key, class Value>
struct default_merge_sort_config
    : select_arch<TargetArch,
                  select_arch_case<803, merge_sort_config_803<Key, Value>>,
                  select_arch_case<900, merge_sort_config_900<Key, Value>>,
                  select_arch_case<1030, merge_sort_config_1030<Key, Value>>,
                  merge_sort_config_900<Key, Value>>
{};
}
// end namespace detail
END_ROCPRIM_NAMESPACE
/// @}
// end of group primitivesmodule_deviceconfigs
#endif // ROCPRIM_DEVICE_DEVICE_MERGE_SORT_CONFIG_HPP_
3rdparty/cub/rocprim/device/device_partition.hpp
0 → 100644
View file @
f8a481f8
// Copyright (c) 2017-2022 Advanced Micro Devices, Inc. All rights reserved.
//
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to deal
// in the Software without restriction, including without limitation the rights
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
// copies of the Software, and to permit persons to whom the Software is
// furnished to do so, subject to the following conditions:
//
// The above copyright notice and this permission notice shall be included in
// all copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
// THE SOFTWARE.
#ifndef ROCPRIM_DEVICE_DEVICE_PARTITION_HPP_
#define ROCPRIM_DEVICE_DEVICE_PARTITION_HPP_
#include <algorithm>
#include <type_traits>
#include <iterator>
#include "../config.hpp"
#include "../functional.hpp"
#include "../types.hpp"
#include "../type_traits.hpp"
#include "../detail/various.hpp"
#include "device_select_config.hpp"
#include "detail/device_scan_common.hpp"
#include "detail/device_partition.hpp"
#include "device_transform.hpp"
BEGIN_ROCPRIM_NAMESPACE
/// \addtogroup devicemodule
/// @{
namespace
detail
{
/// \brief Kernel entry point of the device-level partition.
///
/// Does no work of its own: every argument is handed over unchanged, and in the
/// same order, to partition_kernel_impl<SelectMethod, OnlySelected, Config>.
/// The kernel is compiled with launch bounds of Config::block_size threads.
template<select_method SelectMethod,
         bool          OnlySelected,
         class Config,
         class KeyIterator,
         class ValueIterator,
         class FlagIterator,
         class OutputKeyIterator,
         class OutputValueIterator,
         class InequalityOp,
         class OffsetLookbackScanState,
         class... UnaryPredicates>
ROCPRIM_KERNEL
__launch_bounds__(Config::block_size)
void partition_kernel(KeyIterator                    keys_input,
                      ValueIterator                  values_input,
                      FlagIterator                   flags,
                      OutputKeyIterator              keys_output,
                      OutputValueIterator            values_output,
                      size_t*                        selected_count,
                      size_t*                        prev_selected_count,
                      const size_t                   size,
                      InequalityOp                   inequality_op,
                      OffsetLookbackScanState        offset_scan_state,
                      const unsigned int             number_of_blocks,
                      ordered_block_id<unsigned int> ordered_bid,
                      UnaryPredicates...             predicates)
{
    partition_kernel_impl<SelectMethod, OnlySelected, Config>(keys_input,
                                                              values_input,
                                                              flags,
                                                              keys_output,
                                                              values_output,
                                                              selected_count,
                                                              prev_selected_count,
                                                              size,
                                                              inequality_op,
                                                              offset_scan_state,
                                                              number_of_blocks,
                                                              ordered_bid,
                                                              predicates...);
}
// Debug helper: when `debug_synchronous` (taken from the expansion scope) is true,
// prints `name(size)`, synchronizes `stream` (also taken from the expansion scope),
// returns the synchronization error from the enclosing function if there is one,
// and otherwise prints the time elapsed since `start` in milliseconds.
//
// The expansion is a single braced statement so that an `else` following the macro
// cannot attach to the macro's internal `if`. Like the sibling
// ROCPRIM_DETAIL_HIP_SYNC_AND_RETURN_ON_ERROR it needs no trailing semicolon.
// Macro arguments are parenthesized so that arbitrary expressions can be passed.
#define ROCPRIM_DETAIL_HIP_SYNC(name, size, start) \
    { \
        if(debug_synchronous) \
        { \
            std::cout << (name) << "(" << (size) << ")"; \
            auto error = cudaStreamSynchronize(stream); \
            if(error != cudaSuccess) return error; \
            auto end = std::chrono::high_resolution_clock::now(); \
            auto d = std::chrono::duration_cast<std::chrono::duration<double>>(end - (start)); \
            std::cout << " " << d.count() * 1000 << " ms" << '\n'; \
        } \
    }
// Post-launch check used after every kernel launch in this file.
//
// 1. Reads cudaGetLastError() and returns it from the enclosing function when it is
//    not cudaSuccess.
// 2. When `debug_synchronous` (taken from the expansion scope) is true: prints
//    `name(size)`, synchronizes `stream` (also from the expansion scope), returns the
//    synchronization error if any, and otherwise prints the time elapsed since
//    `start` in milliseconds.
//
// The expansion is a braced block and call sites do not put a semicolon after it.
// Local names avoid double underscores (reserved identifiers) and carry a
// `rocprim_` prefix so they are unlikely to clash with names used in the arguments;
// the arguments themselves are parenthesized.
#define ROCPRIM_DETAIL_HIP_SYNC_AND_RETURN_ON_ERROR(name, size, start) \
    { \
        auto rocprim_launch_status_ = cudaGetLastError(); \
        if(rocprim_launch_status_ != cudaSuccess) return rocprim_launch_status_; \
        if(debug_synchronous) \
        { \
            std::cout << (name) << "(" << (size) << ")"; \
            auto rocprim_sync_status_ = cudaStreamSynchronize(stream); \
            if(rocprim_sync_status_ != cudaSuccess) return rocprim_sync_status_; \
            auto rocprim_end_ = std::chrono::high_resolution_clock::now(); \
            auto rocprim_elapsed_ = std::chrono::duration_cast<std::chrono::duration<double>>(rocprim_end_ - (start)); \
            std::cout << " " << rocprim_elapsed_.count() * 1000 << " ms" << '\n'; \
        } \
    }
/// \brief Host-side driver shared by the device-level partition entry points.
///
/// With a null \p temporary_storage it only writes the required number of bytes to
/// \p storage_size and returns cudaSuccess. Otherwise it lays out the temporary
/// storage (look-back scan state, ordered block id, then two groups of selected-count
/// slots), zeroes the count slots and processes the input in one or more launches of
/// at most \p aligned_size_limit items each. Every launch runs
/// init_lookback_scan_state_kernel followed by partition_kernel, and the two count
/// pointers are swapped after each launch. Finally the counts are copied to
/// \p selected_count_output with rocprim::transform (two values when two predicates
/// are given, one otherwise).
///
/// \param [in] temporary_storage - device temporary storage, or nullptr to query the size.
/// \param [in,out] storage_size - receives the required size when \p temporary_storage is null.
/// \param [in] keys_input, values_input, flags - input iterators; each is advanced by the
/// launch offset before being passed to the kernel.
/// \param [out] keys_output, values_output - output iterators, passed to the kernel as is.
/// \param [out] selected_count_output - receives the selected count(s).
/// \param [in] size - number of input items.
/// \param [in] inequality_op - forwarded to the kernel.
/// \param [in] stream - stream used for the memset, the kernel launches and the final transform.
/// \param [in] debug_synchronous - if true, prints launch parameters and synchronizes
/// after each kernel to report errors and timings.
/// \param [in] predicates - zero or more unary predicates forwarded to the kernel.
/// \return the first error reported by the runtime, or cudaSuccess.
template<
    // Method of selection: flag, predicate, unique
    select_method SelectMethod,
    // if true, it doesn't copy rejected values to output
    bool OnlySelected,
    class Config,
    class OffsetT,
    class KeyIterator,
    class ValueIterator, // can be rocprim::empty_type* for key only
    class FlagIterator,
    class OutputKeyIterator,
    class OutputValueIterator, // can be rocprim::empty_type* for key only
    class InequalityOp,
    class SelectedCountOutputIterator,
    class... UnaryPredicates>
inline cudaError_t partition_impl(void*                       temporary_storage,
                                  size_t&                     storage_size,
                                  KeyIterator                 keys_input,
                                  ValueIterator               values_input,
                                  FlagIterator                flags,
                                  OutputKeyIterator           keys_output,
                                  OutputValueIterator         values_output,
                                  SelectedCountOutputIterator selected_count_output,
                                  const size_t                size,
                                  InequalityOp                inequality_op,
                                  const cudaStream_t          stream,
                                  bool                        debug_synchronous,
                                  UnaryPredicates...          predicates)
{
    using offset_type = OffsetT;
    using key_type    = typename std::iterator_traits<KeyIterator>::value_type;
    using value_type  = typename std::iterator_traits<ValueIterator>::value_type;
    // Get default config if Config is default_config
    using config = default_or_custom_config<
        Config,
        default_select_config<ROCPRIM_TARGET_ARCH, key_type, value_type>>;

    using offset_scan_state_type = detail::lookback_scan_state<offset_type>;
    using ordered_block_id_type  = detail::ordered_block_id<unsigned int>;

    static constexpr unsigned int block_size       = config::block_size;
    static constexpr unsigned int items_per_thread = config::items_per_thread;
    static constexpr auto         items_per_block  = block_size * items_per_thread;
    static constexpr bool         is_three_way     = sizeof...(UnaryPredicates) == 2;
    // Number of selected-count values produced: two for a three-way partition, one otherwise
    static constexpr int          selected_count_values = is_three_way ? 2 : 1;

    // Largest multiple of items_per_block not exceeding size_limit (at least one block)
    static constexpr size_t size_limit = config::size_limit;
    static constexpr size_t aligned_size_limit
        = ::rocprim::max<size_t>(size_limit - (size_limit % items_per_block), items_per_block);

    const size_t limited_size     = std::min<size_t>(size, aligned_size_limit);
    const bool   use_limited_size = limited_size == aligned_size_limit;

    const unsigned int number_of_blocks = static_cast<unsigned int>(
        ::rocprim::detail::ceiling_div(limited_size, items_per_block));

    // Calculate required temporary storage
    size_t offset_scan_state_bytes = ::rocprim::detail::align_size(
        offset_scan_state_type::get_storage_size(number_of_blocks));
    size_t ordered_block_id_bytes = ::rocprim::detail::align_size(
        ordered_block_id_type::get_storage_size(),
        alignof(size_t));

    if(temporary_storage == nullptr)
    {
        // storage_size is never zero
        storage_size = offset_scan_state_bytes + ordered_block_id_bytes
                       + (sizeof(size_t) * 2 * selected_count_values);
        return cudaSuccess;
    }

    // Start point for time measurements
    std::chrono::high_resolution_clock::time_point start;

    // Create and initialize lookback_scan_state obj
    auto offset_scan_state = offset_scan_state_type::create(temporary_storage, number_of_blocks);
    // Create and initialize ordered_block_id obj
    auto ptr         = reinterpret_cast<char*>(temporary_storage);
    auto ordered_bid = ordered_block_id_type::create(
        reinterpret_cast<ordered_block_id_type::id_type*>(ptr + offset_scan_state_bytes));

    // The two groups of count slots sit back to back behind the ordered block id
    size_t* selected_count = reinterpret_cast<size_t*>(
        ptr + offset_scan_state_bytes + ordered_block_id_bytes);
    size_t* prev_selected_count = reinterpret_cast<size_t*>(
        ptr + offset_scan_state_bytes + ordered_block_id_bytes
        + selected_count_values * sizeof(size_t));

    // Memset selected_count and prev_selected_count at once
    const cudaError_t error = cudaMemsetAsync(selected_count,
                                              0,
                                              sizeof(*selected_count) * 2 * selected_count_values,
                                              stream);
    if(error != cudaSuccess) return error;

    const size_t number_of_launches = ::rocprim::detail::ceiling_div(size, aligned_size_limit);

    if(debug_synchronous)
    {
        std::cout << "use_limited_size " << use_limited_size << '\n';
        std::cout << "aligned_size_limit " << aligned_size_limit << '\n';
        std::cout << "number_of_launches " << number_of_launches << '\n';
        std::cout << "size " << size << '\n';
        std::cout << "block_size " << block_size << '\n';
        std::cout << "number of blocks " << number_of_blocks << '\n';
        std::cout << "items_per_block " << items_per_block << '\n';
    }

    for(size_t i = 0, offset = 0; i < number_of_launches; i++, offset += limited_size)
    {
        // The last launch may cover fewer items than the others
        const unsigned int current_size
            = static_cast<unsigned int>(std::min<size_t>(size - offset, limited_size));

        const unsigned int current_number_of_blocks
            = ::rocprim::detail::ceiling_div(current_size, items_per_block);

        // Grid for the init kernel: enough threads for number_of_blocks entries
        auto grid_size = ::rocprim::detail::ceiling_div(number_of_blocks, block_size);

        if(debug_synchronous)
        {
            std::cout << "current size " << current_size << '\n';
            std::cout << "current number of blocks " << current_number_of_blocks << '\n';

            start = std::chrono::high_resolution_clock::now();
        }

        init_lookback_scan_state_kernel<offset_scan_state_type>
            <<<dim3(grid_size), dim3(block_size), 0, stream>>>(offset_scan_state,
                                                               current_number_of_blocks,
                                                               ordered_bid);
        ROCPRIM_DETAIL_HIP_SYNC_AND_RETURN_ON_ERROR("init_offset_scan_state_kernel",
                                                    current_number_of_blocks,
                                                    start)

        if(debug_synchronous) start = std::chrono::high_resolution_clock::now();

        // The partition kernel is launched with one block per tile of this launch
        grid_size = current_number_of_blocks;

        partition_kernel<SelectMethod, OnlySelected, config>
            <<<dim3(grid_size), dim3(block_size), 0, stream>>>(keys_input + offset,
                                                               values_input + offset,
                                                               flags + offset,
                                                               keys_output,
                                                               values_output,
                                                               selected_count,
                                                               prev_selected_count,
                                                               current_size,
                                                               inequality_op,
                                                               offset_scan_state,
                                                               current_number_of_blocks,
                                                               ordered_bid,
                                                               predicates...);
        ROCPRIM_DETAIL_HIP_SYNC_AND_RETURN_ON_ERROR("partition_kernel", size, start)

        // The counts written by this launch become the "previous" counts of the next one
        std::swap(selected_count, prev_selected_count);
    }

    // After the last swap prev_selected_count refers to the most recently written counts
    return ::rocprim::transform(prev_selected_count,
                                selected_count_output,
                                selected_count_values,
                                ::rocprim::identity<>{},
                                stream,
                                debug_synchronous);
}
#undef ROCPRIM_DETAIL_HIP_SYNC_AND_RETURN_ON_ERROR
#undef ROCPRIM_DETAIL_HIP_SYNC
}
// end of detail namespace
/// \brief Parallel select primitive for device level using range of flags.
///
/// Performs a device-wide partition based on input \p flags. Partition copies
/// the values from \p input to \p output in such a way that all values for which the corresponding
/// items from \p flags are \p true (or can be implicitly converted to \p true) precede
/// the elements for which the corresponding items from \p flags are \p false.
///
/// \par Overview
/// * Returns the required size of \p temporary_storage in \p storage_size
/// if \p temporary_storage is a null pointer.
/// * Ranges specified by \p input, \p flags and \p output must have at least \p size elements.
/// * Range specified by \p selected_count_output must have at least 1 element.
/// * Values of \p flag range should be implicitly convertible to `bool` type.
/// * Relative order is preserved for the elements for which the corresponding values from \p flags
/// are \p true. Other elements are copied in reverse order.
///
/// \tparam Config - [optional] configuration of the primitive. It can be \p select_config or
/// a custom class with the same members.
/// \tparam InputIterator - random-access iterator type of the input range. It can be
/// a simple pointer type.
/// \tparam FlagIterator - random-access iterator type of the flag range. It can be
/// a simple pointer type.
/// \tparam OutputIterator - random-access iterator type of the output range. It can be
/// a simple pointer type.
/// \tparam SelectedCountOutputIterator - random-access iterator type of the selected_count_output
/// value. It can be a simple pointer type.
///
/// \param [in] temporary_storage - pointer to a device-accessible temporary storage. When
/// a null pointer is passed, the required allocation size (in bytes) is written to
/// \p storage_size and function returns without performing the select operation.
/// \param [in,out] storage_size - reference to a size (in bytes) of \p temporary_storage.
/// \param [in] input - iterator to the first element in the range to select values from.
/// \param [in] flags - iterator to the selection flag corresponding to the first element from \p input range.
/// \param [out] output - iterator to the first element in the output range.
/// \param [out] selected_count_output - iterator to the total number of selected values (length of \p output).
/// \param [in] size - number of elements in the input range.
/// \param [in] stream - [optional] HIP stream object. The default is \p 0 (default stream).
/// \param [in] debug_synchronous - [optional] If true, synchronization after every kernel
/// launch is forced in order to check for errors. The default value is \p false.
///
/// \par Example
/// \parblock
/// In this example a device-level partition operation is performed on an array of
/// integer values with array of <tt>char</tt>s used as flags.
///
/// \code{.cpp}
/// #include <rocprim/rocprim.hpp>
///
/// // Prepare input and output (declare pointers, allocate device memory etc.)
/// size_t input_size; // e.g., 8
/// int * input; // e.g., [1, 2, 3, 4, 5, 6, 7, 8]
/// char * flags; // e.g., [0, 1, 1, 0, 0, 1, 0, 1]
/// int * output; // empty array of 8 elements
/// size_t * output_count; // empty array of 1 element
///
/// size_t temporary_storage_size_bytes;
/// void * temporary_storage_ptr = nullptr;
/// // Get required size of the temporary storage
/// rocprim::partition(
/// temporary_storage_ptr, temporary_storage_size_bytes,
/// input, flags,
/// output, output_count,
/// input_size
/// );
///
/// // allocate temporary storage
/// cudaMalloc(&temporary_storage_ptr, temporary_storage_size_bytes);
///
/// // perform partition
/// rocprim::partition(
/// temporary_storage_ptr, temporary_storage_size_bytes,
/// input, flags,
/// output, output_count,
/// input_size
/// );
/// // output: [2, 3, 6, 8, 7, 5, 4, 1]
/// // output_count: 4
/// \endcode
/// \endparblock
template<class Config = default_config,
         class InputIterator,
         class FlagIterator,
         class OutputIterator,
         class SelectedCountOutputIterator>
inline cudaError_t partition(void*                       temporary_storage,
                             size_t&                     storage_size,
                             InputIterator               input,
                             FlagIterator                flags,
                             OutputIterator              output,
                             SelectedCountOutputIterator selected_count_output,
                             const size_t                size,
                             const cudaStream_t          stream            = 0,
                             const bool                  debug_synchronous = false)
{
    // Flag-based partitioning needs neither an inequality operation nor a real
    // predicate, so empty_type objects are passed in both positions.
    using placeholder_type = ::rocprim::empty_type;
    using offset_type      = unsigned int;

    // Keys only: the same null pointer stands in for the input and output values.
    placeholder_type* const absent_values = nullptr;

    return detail::partition_impl<detail::select_method::flag, false, Config, offset_type>(
        temporary_storage,
        storage_size,
        input,
        absent_values,
        flags,
        output,
        absent_values,
        selected_count_output,
        size,
        placeholder_type(),
        stream,
        debug_synchronous,
        placeholder_type());
}
/// \brief Parallel select primitive for device level using selection predicate.
///
/// Performs a device-wide partition using selection predicate. Partition copies
/// the values from \p input to \p output in such a way that all values for which
/// the \p predicate returns \p true precede the elements for which it returns \p false.
///
/// \par Overview
/// * Returns the required size of \p temporary_storage in \p storage_size
/// if \p temporary_storage is a null pointer.
/// * Ranges specified by \p input and \p output must have at least \p size elements.
/// * Range specified by \p selected_count_output must have at least 1 element.
/// * Relative order is preserved for the elements for which the \p predicate returns \p true. Other
/// elements are copied in reverse order.
///
/// \tparam Config - [optional] configuration of the primitive. It can be \p select_config or
/// a custom class with the same members.
/// \tparam InputIterator - random-access iterator type of the input range. It can be
/// a simple pointer type.
/// \tparam OutputIterator - random-access iterator type of the output range. It can be
/// a simple pointer type.
/// \tparam SelectedCountOutputIterator - random-access iterator type of the selected_count_output
/// value. It can be a simple pointer type.
/// \tparam UnaryPredicate - type of a unary selection predicate.
///
/// \param [in] temporary_storage - pointer to a device-accessible temporary storage. When
/// a null pointer is passed, the required allocation size (in bytes) is written to
/// \p storage_size and function returns without performing the select operation.
/// \param [in,out] storage_size - reference to a size (in bytes) of \p temporary_storage.
/// \param [in] input - iterator to the first element in the range to select values from.
/// \param [out] output - iterator to the first element in the output range.
/// \param [out] selected_count_output - iterator to the total number of selected values (length of \p output).
/// \param [in] size - number of elements in the input range.
/// \param [in] predicate - unary function object which returns \p true if the element should be
/// ordered before other elements.
/// The signature of the function should be equivalent to the following:
/// <tt>bool f(const T &a);</tt>. The signature does not need to have
/// <tt>const &</tt>, but function object must not modify the object passed to it.
/// \param [in] stream - [optional] HIP stream object. The default is \p 0 (default stream).
/// \param [in] debug_synchronous - [optional] If true, synchronization after every kernel
/// launch is forced in order to check for errors. The default value is \p false.
///
/// \par Example
/// \parblock
/// In this example a device-level partition operation is performed on an array of
/// integer values, even values are copied before odd values.
///
/// \code{.cpp}
/// #include <rocprim/rocprim.hpp>
///
/// auto predicate =
/// [] __device__ (int a) -> bool
/// {
/// return (a%2) == 0;
/// };
///
/// // Prepare input and output (declare pointers, allocate device memory etc.)
/// size_t input_size; // e.g., 8
/// int * input; // e.g., [1, 2, 3, 4, 5, 6, 7, 8]
/// int * output; // empty array of 8 elements
/// size_t * output_count; // empty array of 1 element
///
/// size_t temporary_storage_size_bytes;
/// void * temporary_storage_ptr = nullptr;
/// // Get required size of the temporary storage
/// rocprim::partition(
/// temporary_storage_ptr, temporary_storage_size_bytes,
/// input,
/// output, output_count,
/// input_size,
/// predicate
/// );
///
/// // allocate temporary storage
/// cudaMalloc(&temporary_storage_ptr, temporary_storage_size_bytes);
///
/// // perform partition
/// rocprim::partition(
/// temporary_storage_ptr, temporary_storage_size_bytes,
/// input,
/// output, output_count,
/// input_size,
/// predicate
/// );
/// // output: [2, 4, 6, 8, 7, 5, 3, 1]
/// // output_count: 4
/// \endcode
/// \endparblock
template<class Config = default_config,
         class InputIterator,
         class OutputIterator,
         class SelectedCountOutputIterator,
         class UnaryPredicate>
inline cudaError_t partition(void*                       temporary_storage,
                             size_t&                     storage_size,
                             InputIterator               input,
                             OutputIterator              output,
                             SelectedCountOutputIterator selected_count_output,
                             const size_t                size,
                             UnaryPredicate              predicate,
                             const cudaStream_t          stream            = 0,
                             const bool                  debug_synchronous = false)
{
    // Predicate-based partitioning uses no flags and no inequality operation;
    // empty_type fills both roles.
    using placeholder_type = ::rocprim::empty_type;
    using offset_type      = unsigned int;

    // No flag range: a null pointer to empty_type is passed instead.
    placeholder_type* absent_flags = nullptr;
    // Keys only: the same null pointer stands in for the input and output values.
    placeholder_type* const absent_values = nullptr;

    return detail::partition_impl<detail::select_method::predicate, false, Config, offset_type>(
        temporary_storage,
        storage_size,
        input,
        absent_values,
        absent_flags,
        output,
        absent_values,
        selected_count_output,
        size,
        placeholder_type(),
        stream,
        debug_synchronous,
        predicate);
}
/// \brief Parallel select primitive for device level using two selection predicates.
///
/// Performs a device-wide three-way partition using two selection predicates. Partition copies
/// the values from \p input to either \p output_first_part or \p output_second_part or
/// \p output_unselected according to the following criteria:
/// The value is copied to \p output_first_part if the predicate \p select_first_part_op invoked
/// with the value returns \p true. It is copied to \p output_second_part if \p select_first_part_op
/// returns \p false and \p select_second_part_op returns \p true, and it is copied to
/// \p output_unselected otherwise.
///
/// \par Overview
/// * Returns the required size of \p temporary_storage in \p storage_size
/// if \p temporary_storage is a null pointer.
/// * Range specified by \p selected_count_output must have at least 2 elements.
/// * Relative order is preserved for the elements.
/// * The number of elements written to \p output_first_part is equal to the number of elements
/// in the input for which \p select_first_part_op returned \p true.
/// * The number of elements written to \p output_second_part is equal to the number of elements
/// in the input for which \p select_first_part_op returned \p false and \p select_second_part_op
/// returned \p true.
/// * The number of elements written to \p output_unselected is equal to the number of input elements
/// minus the number of elements written to \p output_first_part minus the number of elements written
/// to \p output_second_part.
///
/// \tparam Config - [optional] configuration of the primitive. It can be \p select_config or
/// a custom class with the same members.
/// \tparam InputIterator - random-access iterator type of the input range. It can be
/// a simple pointer type.
/// \tparam FirstOutputIterator - random-access iterator type of the first output range. It can be
/// a simple pointer type.
/// \tparam SecondOutputIterator - random-access iterator type of the second output range. It can be
/// a simple pointer type.
/// \tparam UnselectedOutputIterator - random-access iterator type of the unselected output range.
/// It can be a simple pointer type.
/// \tparam SelectedCountOutputIterator - random-access iterator type of the selected_count_output
/// value. It can be a simple pointer type.
/// \tparam FirstUnaryPredicate - type of the first unary selection predicate.
/// \tparam SecondUnaryPredicate - type of the second unary selection predicate.
///
/// \param [in] temporary_storage - pointer to a device-accessible temporary storage. When
/// a null pointer is passed, the required allocation size (in bytes) is written to
/// \p storage_size and function returns without performing the select operation.
/// \param [in,out] storage_size - reference to a size (in bytes) of \p temporary_storage.
/// \param [in] input - iterator to the first element in the range to select values from.
/// \param [out] output_first_part - iterator to the first element in the first output range.
/// \param [out] output_second_part - iterator to the first element in the second output range.
/// \param [out] output_unselected - iterator to the first element in the unselected output range.
/// \param [out] selected_count_output - iterator to the total number of selected values in
/// \p output_first_part and \p output_second_part respectively.
/// \param [in] size - number of elements in the input range.
/// \param [in] select_first_part_op - unary function object which returns \p true if the element
/// should be in \p output_first_part range
/// The signature of the function should be equivalent to the following:
/// <tt>bool f(const T &a);</tt>. The signature does not need to have
/// <tt>const &</tt>, but function object must not modify the object passed to it.
/// \param [in] select_second_part_op - unary function object which returns \p true if the element
/// should be in \p output_second_part range (given that \p select_first_part_op returned \p false)
/// The signature of the function should be equivalent to the following:
/// <tt>bool f(const T &a);</tt>. The signature does not need to have
/// <tt>const &</tt>, but function object must not modify the object passed to it.
/// \param [in] stream - [optional] HIP stream object. The default is \p 0 (default stream).
/// \param [in] debug_synchronous - [optional] If true, synchronization after every kernel
/// launch is forced in order to check for errors. The default value is \p false.
///
/// \par Example
/// \parblock
/// In this example a device-level three-way partition operation is performed on an array of
/// integer values, even values are copied to the first partition, odd and 3-divisible values
/// are copied to the second partition, and the rest of the values are copied to the
/// unselected partition
///
/// \code{.cpp}
/// #include <rocprim/rocprim.hpp>
///
/// auto first_predicate =
/// [] __device__ (int a) -> bool
/// {
/// return (a%2) == 0;
/// };
/// auto second_predicate =
/// [] __device__ (int a) -> bool
/// {
/// return (a%3) == 0;
/// };
///
/// // Prepare input and output (declare pointers, allocate device memory etc.)
/// size_t input_size; // e.g., 8
/// int * input; // e.g., [1, 2, 3, 4, 5, 6, 7, 8]
/// int * output_first_part; // array of 8 elements
/// int * output_second_part; // array of 8 elements
/// int * output_unselected; // array of 8 elements
/// size_t * output_count; // array of 2 elements
///
/// size_t temporary_storage_size_bytes;
/// void * temporary_storage_ptr = nullptr;
/// // Get required size of the temporary storage
/// rocprim::partition_three_way(
/// temporary_storage_ptr, temporary_storage_size_bytes,
/// input,
/// output_first_part, output_second_part, output_unselected,
/// output_count,
/// input_size,
/// first_predicate,
/// second_predicate
/// );
///
/// // allocate temporary storage
/// cudaMalloc(&temporary_storage_ptr, temporary_storage_size_bytes);
///
/// // perform partition
/// rocprim::partition_three_way(
/// temporary_storage_ptr, temporary_storage_size_bytes,
/// input,
/// output_first_part, output_second_part, output_unselected,
/// output_count,
/// input_size,
/// first_predicate,
/// second_predicate
/// );
/// // elements denoted by '*' were not modified
/// // output_first_part: [2, 4, 6, 8, *, *, *, *]
/// // output_second_part: [3, *, *, *, *, *, *, *]
/// // output_unselected: [1, 5, 7, *, *, *, *, *]
/// // output_count: [4, 1]
/// \endcode
/// \endparblock
template<class Config = default_config,
         typename InputIterator,
         typename FirstOutputIterator,
         typename SecondOutputIterator,
         typename UnselectedOutputIterator,
         typename SelectedCountOutputIterator,
         typename FirstUnaryPredicate,
         typename SecondUnaryPredicate>
inline cudaError_t partition_three_way(void*                       temporary_storage,
                                       size_t&                     storage_size,
                                       InputIterator               input,
                                       FirstOutputIterator         output_first_part,
                                       SecondOutputIterator        output_second_part,
                                       UnselectedOutputIterator    output_unselected,
                                       SelectedCountOutputIterator selected_count_output,
                                       const size_t                size,
                                       FirstUnaryPredicate         select_first_part_op,
                                       SecondUnaryPredicate        select_second_part_op,
                                       const cudaStream_t          stream            = 0,
                                       const bool                  debug_synchronous = false)
{
    // Neither flags nor an inequality operation are used here; empty_type fills in.
    using placeholder_type = ::rocprim::empty_type;
    // The offset type is a two-component vector type for the three-way case.
    using offset_type = uint2;

    // The three destinations travel through partition_impl as one tuple each
    // for keys and for (absent) values.
    using key_outputs_type = tuple<FirstOutputIterator,
                                   SecondOutputIterator,
                                   UnselectedOutputIterator>;
    using value_outputs_type = tuple<placeholder_type*, placeholder_type*, placeholder_type*>;

    placeholder_type*        absent_flags        = nullptr;
    placeholder_type* const  absent_input_values = nullptr; // key only
    const value_outputs_type absent_output_values{nullptr, nullptr, nullptr}; // key only
    key_outputs_type         key_outputs{output_first_part, output_second_part, output_unselected};

    // Passing two predicates is what makes partition_impl treat this as three-way.
    return detail::partition_impl<detail::select_method::predicate, false, Config, offset_type>(
        temporary_storage,
        storage_size,
        input,
        absent_input_values,
        absent_flags,
        key_outputs,
        absent_output_values,
        selected_count_output,
        size,
        placeholder_type(),
        stream,
        debug_synchronous,
        select_first_part_op,
        select_second_part_op);
}
/// @}
// end of group devicemodule
END_ROCPRIM_NAMESPACE
#endif // ROCPRIM_DEVICE_DEVICE_PARTITION_HPP_
3rdparty/cub/rocprim/device/device_radix_sort.hpp
0 → 100644
View file @
f8a481f8
// Copyright (c) 2017-2022 Advanced Micro Devices, Inc. All rights reserved.
//
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to deal
// in the Software without restriction, including without limitation the rights
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
// copies of the Software, and to permit persons to whom the Software is
// furnished to do so, subject to the following conditions:
//
// The above copyright notice and this permission notice shall be included in
// all copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
// THE SOFTWARE.
#ifndef ROCPRIM_DEVICE_DEVICE_RADIX_SORT_HPP_
#define ROCPRIM_DEVICE_DEVICE_RADIX_SORT_HPP_
#include <iostream>
#include <iterator>
#include <type_traits>
#include <utility>
#include "../config.hpp"
#include "../detail/various.hpp"
#include "../detail/radix_sort.hpp"
#include "../intrinsics.hpp"
#include "../functional.hpp"
#include "../types.hpp"
#include "device_radix_sort_config.hpp"
#include "device_transform.hpp"
#include "detail/device_radix_sort.hpp"
#include "specialization/device_radix_single_sort.hpp"
#include "specialization/device_radix_merge_sort.hpp"
/// \addtogroup devicemodule
/// @{
BEGIN_ROCPRIM_NAMESPACE
namespace
detail
{
/// \brief Kernel entry point that forwards all of its arguments, unchanged, to
/// fill_digit_counts<BlockSize, ItemsPerThread, RadixBits, Descending>.
///
/// Compiled with launch bounds of \p BlockSize threads.
template<unsigned int BlockSize,
         unsigned int ItemsPerThread,
         unsigned int RadixBits,
         bool         Descending,
         class KeysInputIterator,
         class Offset>
ROCPRIM_KERNEL
__launch_bounds__(BlockSize)
void fill_digit_counts_kernel(KeysInputIterator keys_input,
                              Offset            size,
                              Offset*           batch_digit_counts,
                              unsigned int      bit,
                              unsigned int      current_radix_bits,
                              unsigned int      blocks_per_full_batch,
                              unsigned int      full_batches)
{
    fill_digit_counts<BlockSize, ItemsPerThread, RadixBits, Descending>(keys_input,
                                                                        size,
                                                                        batch_digit_counts,
                                                                        bit,
                                                                        current_radix_bits,
                                                                        blocks_per_full_batch,
                                                                        full_batches);
}
/// \brief Kernel entry point that forwards its arguments, unchanged, to
/// scan_batches<BlockSize, ItemsPerThread, RadixBits>.
///
/// Compiled with launch bounds of \p BlockSize threads.
template<unsigned int BlockSize,
         unsigned int ItemsPerThread,
         unsigned int RadixBits,
         class Offset>
ROCPRIM_KERNEL
__launch_bounds__(BlockSize)
void scan_batches_kernel(Offset*      batch_digit_counts,
                         Offset*      digit_counts,
                         unsigned int batches)
{
    scan_batches<BlockSize, ItemsPerThread, RadixBits>(batch_digit_counts,
                                                       digit_counts,
                                                       batches);
}
/// \brief Kernel entry point that forwards \p digit_counts to scan_digits<RadixBits>.
///
/// Compiled with launch bounds of ROCPRIM_DEFAULT_MAX_BLOCK_SIZE threads.
template<unsigned int RadixBits, class Offset>
ROCPRIM_KERNEL
__launch_bounds__(ROCPRIM_DEFAULT_MAX_BLOCK_SIZE)
void scan_digits_kernel(Offset* digit_counts)
{
    scan_digits<RadixBits>(digit_counts);
}
/// \brief Kernel entry point that forwards all of its arguments, unchanged, to
/// sort_and_scatter<BlockSize, ItemsPerThread, RadixBits, Descending>.
///
/// Compiled with launch bounds of \p BlockSize threads.
template<unsigned int BlockSize,
         unsigned int ItemsPerThread,
         unsigned int RadixBits,
         bool         Descending,
         class KeysInputIterator,
         class KeysOutputIterator,
         class ValuesInputIterator,
         class ValuesOutputIterator,
         class Offset>
ROCPRIM_KERNEL
__launch_bounds__(BlockSize)
void sort_and_scatter_kernel(KeysInputIterator    keys_input,
                             KeysOutputIterator   keys_output,
                             ValuesInputIterator  values_input,
                             ValuesOutputIterator values_output,
                             Offset               size,
                             const Offset*        batch_digit_starts,
                             const Offset*        digit_starts,
                             unsigned int         bit,
                             unsigned int         current_radix_bits,
                             unsigned int         blocks_per_full_batch,
                             unsigned int         full_batches)
{
    sort_and_scatter<BlockSize, ItemsPerThread, RadixBits, Descending>(keys_input,
                                                                       keys_output,
                                                                       values_input,
                                                                       values_output,
                                                                       size,
                                                                       batch_digit_starts,
                                                                       digit_starts,
                                                                       bit,
                                                                       current_radix_bits,
                                                                       blocks_per_full_batch,
                                                                       full_batches);
}
// Post-launch check, defined here only if no definition is already active.
//
// 1. Reads cudaGetLastError() and returns it from the enclosing function when it is
//    not cudaSuccess.
// 2. When `debug_synchronous` (taken from the expansion scope) is true: prints
//    `name(size)`, synchronizes `stream` (also from the expansion scope), returns the
//    synchronization error if any, and otherwise prints the time elapsed since
//    `start` in milliseconds.
//
// The expansion is a braced block, so it can be used without a trailing semicolon.
// Local names avoid double underscores (reserved identifiers) and carry a
// `rocprim_` prefix so they are unlikely to clash with names used in the arguments;
// the arguments themselves are parenthesized.
#ifndef ROCPRIM_DETAIL_HIP_SYNC_AND_RETURN_ON_ERROR
#define ROCPRIM_DETAIL_HIP_SYNC_AND_RETURN_ON_ERROR(name, size, start) \
    { \
        auto rocprim_launch_status_ = cudaGetLastError(); \
        if(rocprim_launch_status_ != cudaSuccess) return rocprim_launch_status_; \
        if(debug_synchronous) \
        { \
            std::cout << (name) << "(" << (size) << ")"; \
            auto rocprim_sync_status_ = cudaStreamSynchronize(stream); \
            if(rocprim_sync_status_ != cudaSuccess) return rocprim_sync_status_; \
            auto rocprim_end_ = std::chrono::high_resolution_clock::now(); \
            auto rocprim_elapsed_ = std::chrono::duration_cast<std::chrono::duration<double>>(rocprim_end_ - (start)); \
            std::cout << " " << rocprim_elapsed_.count() * 1000 << " ms" << '\n'; \
        } \
    }
#endif
/// Runs one pass of the multi-pass radix sort on \p RadixBits bits starting at \p bit.
///
/// A pass launches four kernels on \p stream, each followed by the
/// ROCPRIM_DETAIL_HIP_SYNC_AND_RETURN_ON_ERROR check:
///   1. fill_digit_counts_kernel over the pass's source keys,
///   2. scan_batches_kernel,
///   3. scan_digits_kernel,
///   4. sort_and_scatter_kernel from the pass's source to its destination.
///
/// Source and destination buffers are selected by \p from_input and \p to_output:
///   - from_input  &&  to_output : input  -> output
///   - from_input  && !to_output : input  -> tmp
///   - !from_input &&  to_output : tmp    -> output
///   - !from_input && !to_output : output -> tmp
///
/// \param size - number of items, already converted to the counter type \p Offset.
/// \param batch_digit_counts, digit_counts - counter scratch arrays written by steps 1-3
/// and read by step 4.
/// \param bit - index of the first bit handled by this pass.
/// \param end_bit - past-the-end bit of the whole sort; used to shorten the last pass.
/// \param blocks_per_full_batch, full_batches - forwarded to the kernels.
/// \param batches - grid size of steps 1 and 4; also forwarded to step 2.
/// \param debug_synchronous - when true, prints pass parameters and per-kernel timings.
///
/// \returns cudaSuccess, or the first error reported after a kernel launch.
template<
    class Config,
    unsigned int RadixBits,
    bool Descending,
    class KeysInputIterator,
    class KeysOutputIterator,
    class ValuesInputIterator,
    class ValuesOutputIterator,
    class Offset
>
inline
cudaError_t radix_sort_iteration(KeysInputIterator keys_input,
                                 typename std::iterator_traits<KeysInputIterator>::value_type * keys_tmp,
                                 KeysOutputIterator keys_output,
                                 ValuesInputIterator values_input,
                                 typename std::iterator_traits<ValuesInputIterator>::value_type * values_tmp,
                                 ValuesOutputIterator values_output,
                                 Offset size,
                                 Offset * batch_digit_counts,
                                 Offset * digit_counts,
                                 bool from_input,
                                 bool to_output,
                                 unsigned int bit,
                                 unsigned int end_bit,
                                 unsigned int blocks_per_full_batch,
                                 unsigned int full_batches,
                                 unsigned int batches,
                                 cudaStream_t stream,
                                 bool debug_synchronous)
{
    // Number of distinct digit values in this pass.
    constexpr unsigned int radix_size = 1 << RadixBits;

    // Compile-time launch configuration, named once instead of at every launch.
    constexpr auto sort_block_size       = Config::sort::block_size;
    constexpr auto sort_items_per_thread = Config::sort::items_per_thread;
    constexpr auto scan_block_size       = Config::scan::block_size;
    constexpr auto scan_items_per_thread = Config::scan::items_per_thread;

    // When (end_bit - bit) is not a multiple of RadixBits the last pass
    // works with a shorter digit mask.
    const unsigned int current_radix_bits = ::rocprim::min(RadixBits, end_bit - bit);

    const dim3 sort_grid(batches);
    const dim3 sort_block(sort_block_size);

    std::chrono::high_resolution_clock::time_point start;
    if(debug_synchronous)
    {
        std::cout << "RadixBits " << RadixBits << '\n';
        std::cout << "bit " << bit << '\n';
        std::cout << "current_radix_bits " << current_radix_bits << '\n';
        start = std::chrono::high_resolution_clock::now();
    }

    // Step 1: digit counting over whichever buffer holds this pass's source keys.
    // The three branches cannot be merged because the key sources have different types.
    if(from_input)
    {
        fill_digit_counts_kernel<sort_block_size, sort_items_per_thread, RadixBits, Descending>
            <<<sort_grid, sort_block, 0, stream>>>(
                keys_input, size,
                batch_digit_counts,
                bit, current_radix_bits,
                blocks_per_full_batch, full_batches
            );
    }
    else if(to_output)
    {
        fill_digit_counts_kernel<sort_block_size, sort_items_per_thread, RadixBits, Descending>
            <<<sort_grid, sort_block, 0, stream>>>(
                keys_tmp, size,
                batch_digit_counts,
                bit, current_radix_bits,
                blocks_per_full_batch, full_batches
            );
    }
    else
    {
        fill_digit_counts_kernel<sort_block_size, sort_items_per_thread, RadixBits, Descending>
            <<<sort_grid, sort_block, 0, stream>>>(
                keys_output, size,
                batch_digit_counts,
                bit, current_radix_bits,
                blocks_per_full_batch, full_batches
            );
    }
    ROCPRIM_DETAIL_HIP_SYNC_AND_RETURN_ON_ERROR("fill_digit_counts", size, start)

    // Step 2: one block per digit value.
    if(debug_synchronous) start = std::chrono::high_resolution_clock::now();
    scan_batches_kernel<scan_block_size, scan_items_per_thread, RadixBits>
        <<<dim3(radix_size), dim3(scan_block_size), 0, stream>>>(
            batch_digit_counts, digit_counts, batches
        );
    ROCPRIM_DETAIL_HIP_SYNC_AND_RETURN_ON_ERROR("scan_batches", radix_size * scan_block_size, start)

    // Step 3: a single block with one thread per digit value.
    if(debug_synchronous) start = std::chrono::high_resolution_clock::now();
    scan_digits_kernel<RadixBits>
        <<<dim3(1), dim3(radix_size), 0, stream>>>(
            digit_counts
        );
    ROCPRIM_DETAIL_HIP_SYNC_AND_RETURN_ON_ERROR("scan_digits", radix_size, start)

    // Step 4: the counters are only read from here on.
    if(debug_synchronous) start = std::chrono::high_resolution_clock::now();
    const Offset * const batch_digit_starts = batch_digit_counts;
    const Offset * const digit_starts = digit_counts;
    if(from_input && to_output)
    {
        sort_and_scatter_kernel<sort_block_size, sort_items_per_thread, RadixBits, Descending>
            <<<sort_grid, sort_block, 0, stream>>>(
                keys_input, keys_output, values_input, values_output, size,
                batch_digit_starts, digit_starts,
                bit, current_radix_bits,
                blocks_per_full_batch, full_batches
            );
    }
    else if(from_input)
    {
        sort_and_scatter_kernel<sort_block_size, sort_items_per_thread, RadixBits, Descending>
            <<<sort_grid, sort_block, 0, stream>>>(
                keys_input, keys_tmp, values_input, values_tmp, size,
                batch_digit_starts, digit_starts,
                bit, current_radix_bits,
                blocks_per_full_batch, full_batches
            );
    }
    else if(to_output)
    {
        sort_and_scatter_kernel<sort_block_size, sort_items_per_thread, RadixBits, Descending>
            <<<sort_grid, sort_block, 0, stream>>>(
                keys_tmp, keys_output, values_tmp, values_output, size,
                batch_digit_starts, digit_starts,
                bit, current_radix_bits,
                blocks_per_full_batch, full_batches
            );
    }
    else
    {
        sort_and_scatter_kernel<sort_block_size, sort_items_per_thread, RadixBits, Descending>
            <<<sort_grid, sort_block, 0, stream>>>(
                keys_output, keys_tmp, values_output, values_tmp, size,
                batch_digit_starts, digit_starts,
                bit, current_radix_bits,
                blocks_per_full_batch, full_batches
            );
    }
    ROCPRIM_DETAIL_HIP_SYNC_AND_RETURN_ON_ERROR("sort_and_scatter", size, start)

    return cudaSuccess;
}
/// Sorts a small input with a single radix_sort_single call.
///
/// Two-phase protocol:
///   - \p temporary_storage == nullptr: only \p storage_size is written (the
///     aligned minimum, since this path needs no scratch memory) and the
///     function returns without sorting.
///   - otherwise the sort is performed from the input iterators directly into
///     the output iterators.
///
/// \param [out] is_result_in_output - set to \p true after a successful sort
/// and to \p false when \p size is zero (nothing was written anywhere). It is
/// left untouched by the storage-size query and when an error is returned.
/// \param [in] debug_synchronous - when true, prints the temporary storage
/// pointer and synchronizes \p stream before sorting.
///
/// \returns cudaSuccess, or the error reported by cudaStreamSynchronize /
/// radix_sort_single.
template<
    class Config,
    bool Descending,
    class KeysInputIterator,
    class KeysOutputIterator,
    class ValuesInputIterator,
    class ValuesOutputIterator
>
inline
cudaError_t radix_sort_single_impl(void * temporary_storage,
                                   size_t& storage_size,
                                   KeysInputIterator keys_input,
                                   KeysOutputIterator keys_output,
                                   ValuesInputIterator values_input,
                                   ValuesOutputIterator values_output,
                                   unsigned int size,
                                   bool& is_result_in_output,
                                   unsigned int begin_bit,
                                   unsigned int end_bit,
                                   cudaStream_t stream,
                                   bool debug_synchronous)
{
    using key_type = typename std::iterator_traits<KeysInputIterator>::value_type;
    using value_type = typename std::iterator_traits<ValuesInputIterator>::value_type;

    using config = default_or_custom_config<
        Config,
        default_radix_sort_config<ROCPRIM_TARGET_ARCH, key_type, value_type>
    >;

    // No scratch memory is used here; a non-zero size is still reported for the query.
    const size_t minimum_bytes = ::rocprim::detail::align_size(1);
    if(temporary_storage == nullptr)
    {
        storage_size = minimum_bytes;
        return cudaSuccess;
    }

    if(size == 0u)
    {
        // Give the out-parameter a defined value on this success path too, so
        // that a caller inspecting it after an empty sort does not read an
        // indeterminate bool.
        is_result_in_output = false;
        return cudaSuccess;
    }

    if(debug_synchronous)
    {
        std::cout << "temporary_storage " << temporary_storage << '\n';
        cudaError_t error = cudaStreamSynchronize(stream);
        if(error != cudaSuccess) return error;
    }

    cudaError_t error = radix_sort_single<config, Descending>(
        keys_input, keys_output, values_input, values_output,
        size, begin_bit, end_bit,
        stream, debug_synchronous
    );
    if(error != cudaSuccess) return error;

    // The single-kernel sort always writes straight to the output range.
    is_result_in_output = true;
    return cudaSuccess;
}
/// Sorts a medium-sized input by delegating to radix_sort_merge.
///
/// Two-phase protocol:
///   - \p temporary_storage == nullptr: writes the required scratch size to
///     \p storage_size and returns. When the caller supplies its own
///     \p keys_tmp (double-buffer mode) only the aligned minimum is requested;
///     otherwise room for one aligned copy of the keys (and of the values, if
///     any) is requested.
///   - otherwise: without a caller-supplied \p keys_tmp the temporary key and
///     value buffers are carved out of \p temporary_storage, then
///     radix_sort_merge is invoked.
///
/// \param [out] is_result_in_output - set to \p true after a successful sort;
/// not modified by the storage-size query or when an error is returned.
///
/// \returns cudaSuccess, or the error reported by cudaStreamSynchronize /
/// radix_sort_merge.
template<
    class Config,
    bool Descending,
    class KeysInputIterator,
    class KeysOutputIterator,
    class ValuesInputIterator,
    class ValuesOutputIterator
>
inline
cudaError_t radix_sort_merge_impl(void * temporary_storage,
                                  size_t& storage_size,
                                  KeysInputIterator keys_input,
                                  typename std::iterator_traits<KeysInputIterator>::value_type * keys_tmp,
                                  KeysOutputIterator keys_output,
                                  ValuesInputIterator values_input,
                                  typename std::iterator_traits<ValuesInputIterator>::value_type * values_tmp,
                                  ValuesOutputIterator values_output,
                                  unsigned int size,
                                  bool& is_result_in_output,
                                  unsigned int begin_bit,
                                  unsigned int end_bit,
                                  cudaStream_t stream,
                                  bool debug_synchronous)
{
    using key_type = typename std::iterator_traits<KeysInputIterator>::value_type;
    using value_type = typename std::iterator_traits<ValuesInputIterator>::value_type;

    using config = default_or_custom_config<
        Config,
        default_radix_sort_config<ROCPRIM_TARGET_ARCH, key_type, value_type>
    >;

    // A keys-only sort is expressed with empty_type values.
    constexpr bool with_values = !std::is_same<value_type, ::rocprim::empty_type>::value;
    // A non-null keys_tmp means the caller already owns the temporary buffers.
    const bool with_double_buffer = keys_tmp != nullptr;

    const size_t keys_bytes = ::rocprim::detail::align_size(size * sizeof(key_type));
    const size_t values_bytes = with_values ? ::rocprim::detail::align_size(size * sizeof(value_type)) : 0;
    const size_t minimum_bytes = ::rocprim::detail::align_size(1);

    // Size query only.
    if(temporary_storage == nullptr)
    {
        storage_size = with_double_buffer ? minimum_bytes : keys_bytes + values_bytes;
        return cudaSuccess;
    }

    if(debug_synchronous)
    {
        std::cout << "temporary_storage " << temporary_storage << '\n';
        const cudaError_t sync_error = cudaStreamSynchronize(stream);
        if(sync_error != cudaSuccess)
        {
            return sync_error;
        }
    }

    if(!with_double_buffer)
    {
        // Scratch layout: [ keys (keys_bytes) | values (values_bytes) ]
        char * const base = reinterpret_cast<char *>(temporary_storage);
        keys_tmp = reinterpret_cast<key_type *>(base);
        values_tmp = with_values ? reinterpret_cast<value_type *>(base + keys_bytes) : nullptr;
    }

    const cudaError_t sort_error = radix_sort_merge<config, Descending>(
        keys_input, keys_tmp, keys_output,
        values_input, values_tmp, values_output,
        size, begin_bit, end_bit,
        stream, debug_synchronous
    );
    if(sort_error == cudaSuccess)
    {
        is_result_in_output = true;
    }
    return sort_error;
}
/// Counter type used for a problem size of type \p Size:
/// <tt>unsigned int</tt> when \p Size occupies at most 4 bytes, otherwise \p size_t.
template<class Size>
using offset_type_t = typename std::conditional<
    (sizeof(Size) <= 4),
    unsigned int,
    size_t
>::type;
/// Sorts a large input with a sequence of radix_sort_iteration passes.
///
/// Two-phase protocol:
///   - \p temporary_storage == nullptr: writes the required scratch size to
///     \p storage_size (counter arrays, plus temporary key/value buffers when
///     the caller did not supply \p keys_tmp) and returns.
///   - otherwise: partitions the scratch memory and runs \c long_iterations
///     passes of \c config::long_radix_bits bits followed by
///     \c short_iterations passes of \c config::short_radix_bits bits.
///
/// Passes alternate between the temporary buffers and the output range.
///
/// \param [out] is_result_in_output - after each successful pass, records
/// whether that pass wrote to the output range. It is not modified by the
/// storage-size query, when \p size is zero, or when no pass runs.
///
/// \returns cudaSuccess, or the first error reported by a copy, a
/// synchronization or a pass.
template<
    class Config,
    bool Descending,
    class KeysInputIterator,
    class KeysOutputIterator,
    class ValuesInputIterator,
    class ValuesOutputIterator,
    class Size
>
inline
cudaError_t radix_sort_iterations_impl(void * temporary_storage,
                                       size_t& storage_size,
                                       KeysInputIterator keys_input,
                                       typename std::iterator_traits<KeysInputIterator>::value_type * keys_tmp,
                                       KeysOutputIterator keys_output,
                                       ValuesInputIterator values_input,
                                       typename std::iterator_traits<ValuesInputIterator>::value_type * values_tmp,
                                       ValuesOutputIterator values_output,
                                       Size size,
                                       bool& is_result_in_output,
                                       unsigned int begin_bit,
                                       unsigned int end_bit,
                                       cudaStream_t stream,
                                       bool debug_synchronous)
{
    using key_type = typename std::iterator_traits<KeysInputIterator>::value_type;
    using value_type = typename std::iterator_traits<ValuesInputIterator>::value_type;
    using offset_type = offset_type_t<Size>;

    using config = default_or_custom_config<
        Config,
        default_radix_sort_config<ROCPRIM_TARGET_ARCH, key_type, value_type>
    >;

    // A keys-only sort is expressed with empty_type values.
    constexpr bool with_values = !std::is_same<value_type, ::rocprim::empty_type>::value;

    constexpr unsigned int max_radix_size = 1 << config::long_radix_bits;
    constexpr unsigned int scan_size = config::scan::block_size * config::scan::items_per_thread;
    constexpr unsigned int sort_size = config::sort::block_size * config::sort::items_per_thread;

    // Work decomposition: `blocks` tiles of sort_size items spread over at most
    // scan_size batches. When the division is uneven, `blocks % scan_size`
    // batches are needed at the rounded-up block count
    // (NOTE(review): how the kernels use full_batches is not visible here).
    const unsigned int blocks = static_cast<unsigned int>(::rocprim::detail::ceiling_div(size, sort_size));
    const unsigned int blocks_per_full_batch = ::rocprim::detail::ceiling_div(blocks, scan_size);
    const unsigned int leftover_blocks = blocks % scan_size;
    const unsigned int full_batches = leftover_blocks != 0 ? leftover_blocks : scan_size;
    const unsigned int batches = (blocks_per_full_batch == 1 ? full_batches : scan_size);

    // A non-null keys_tmp means the caller already owns the temporary buffers.
    const bool with_double_buffer = keys_tmp != nullptr;

    // Pass schedule: enough long passes to cover the bit range, with as many of
    // them as possible replaced by short passes.
    const unsigned int bits = end_bit - begin_bit;
    const unsigned int iterations = ::rocprim::detail::ceiling_div(bits, config::long_radix_bits);
    const unsigned int radix_bits_diff = config::long_radix_bits - config::short_radix_bits;
    const unsigned int short_iterations = radix_bits_diff != 0
        ? ::rocprim::min(
              iterations,
              (config::long_radix_bits * iterations - bits) / std::max(1u, radix_bits_diff)
          )
        : 0;
    const unsigned int long_iterations = iterations - short_iterations;

    const size_t batch_digit_counts_bytes =
        ::rocprim::detail::align_size(batches * max_radix_size * sizeof(offset_type));
    const size_t digit_counts_bytes = ::rocprim::detail::align_size(max_radix_size * sizeof(offset_type));
    const size_t keys_bytes = ::rocprim::detail::align_size(size * sizeof(key_type));
    const size_t values_bytes = with_values ? ::rocprim::detail::align_size(size * sizeof(value_type)) : 0;

    // Size query only.
    if(temporary_storage == nullptr)
    {
        size_t required_bytes = batch_digit_counts_bytes + digit_counts_bytes;
        if(!with_double_buffer)
        {
            required_bytes += keys_bytes + values_bytes;
        }
        storage_size = required_bytes;
        return cudaSuccess;
    }

    if(size == 0u)
    {
        return cudaSuccess;
    }

    if(debug_synchronous)
    {
        std::cout << "scan_size " << scan_size << '\n'
                  << "sort_size " << sort_size << '\n'
                  << "blocks " << blocks << '\n'
                  << "blocks_per_full_batch " << blocks_per_full_batch << '\n'
                  << "full_batches " << full_batches << '\n'
                  << "batches " << batches << '\n'
                  << "iterations " << iterations << '\n'
                  << "long_iterations " << long_iterations << '\n'
                  << "short_iterations " << short_iterations << '\n';
        const cudaError_t sync_error = cudaStreamSynchronize(stream);
        if(sync_error != cudaSuccess)
        {
            return sync_error;
        }
    }

    // Scratch layout: [ batch digit counts | digit counts | keys tmp | values tmp ]
    // (the last two only when the caller did not supply temporary buffers).
    char * const base = reinterpret_cast<char *>(temporary_storage);
    offset_type * batch_digit_counts = reinterpret_cast<offset_type *>(base);
    offset_type * digit_counts = reinterpret_cast<offset_type *>(base + batch_digit_counts_bytes);
    if(!with_double_buffer)
    {
        char * const sort_buffers = base + batch_digit_counts_bytes + digit_counts_bytes;
        keys_tmp = reinterpret_cast<key_type *>(sort_buffers);
        values_tmp = with_values ? reinterpret_cast<value_type *>(sort_buffers + keys_bytes) : nullptr;
    }

    // The destination flips after every pass. Without caller-owned buffers the
    // first destination is chosen so that the last pass ends in the output range.
    bool to_output = with_double_buffer || (iterations - 1) % 2 == 0;
    bool from_input = true;

    if(!with_double_buffer && to_output)
    {
        // In-place sorting (input and output iterators are equal): the first pass
        // would overwrite its own source, so stage the input in the temporary
        // buffers and start from there instead.
        const bool keys_equal = ::rocprim::detail::are_iterators_equal(keys_input, keys_output);
        const bool values_equal =
            with_values && ::rocprim::detail::are_iterators_equal(values_input, values_output);
        if(keys_equal || values_equal)
        {
            const cudaError_t keys_copy_error = ::rocprim::transform(
                keys_input, keys_tmp, size,
                ::rocprim::identity<key_type>(), stream, debug_synchronous
            );
            if(keys_copy_error != cudaSuccess)
            {
                return keys_copy_error;
            }

            if(with_values)
            {
                const cudaError_t values_copy_error = ::rocprim::transform(
                    values_input, values_tmp, size,
                    ::rocprim::identity<value_type>(), stream, debug_synchronous
                );
                if(values_copy_error != cudaSuccess)
                {
                    return values_copy_error;
                }
            }

            from_input = false;
        }
    }

    const offset_type item_count = static_cast<offset_type>(size);
    unsigned int bit = begin_bit;

    // Bookkeeping shared by both loops, run after each successful pass.
    auto finish_pass = [&](unsigned int pass_bits)
    {
        is_result_in_output = to_output;
        from_input = false;
        to_output = !to_output;
        bit += pass_bits;
    };

    for(unsigned int pass = 0; pass < long_iterations; pass++)
    {
        const cudaError_t pass_error = radix_sort_iteration<config, config::long_radix_bits, Descending>(
            keys_input, keys_tmp, keys_output, values_input, values_tmp, values_output,
            item_count,
            batch_digit_counts, digit_counts,
            from_input, to_output,
            bit, end_bit,
            blocks_per_full_batch, full_batches, batches,
            stream, debug_synchronous
        );
        if(pass_error != cudaSuccess)
        {
            return pass_error;
        }
        finish_pass(config::long_radix_bits);
    }

    for(unsigned int pass = 0; pass < short_iterations; pass++)
    {
        const cudaError_t pass_error = radix_sort_iteration<config, config::short_radix_bits, Descending>(
            keys_input, keys_tmp, keys_output, values_input, values_tmp, values_output,
            item_count,
            batch_digit_counts, digit_counts,
            from_input, to_output,
            bit, end_bit,
            blocks_per_full_batch, full_batches, batches,
            stream, debug_synchronous
        );
        if(pass_error != cudaSuccess)
        {
            return pass_error;
        }
        finish_pass(config::short_radix_bits);
    }

    return cudaSuccess;
}
/// Common entry point behind the public radix sort functions: picks one of
/// three implementations according to the problem size.
///
///   - size <= single_sort_limit : radix_sort_single_impl
///   - size <= merge_sort_limit  : radix_sort_merge_impl
///   - otherwise                 : radix_sort_iterations_impl
///
/// Both limits are compile-time products of members of the resolved
/// configuration. All arguments are passed through to the selected
/// implementation; for the first two, \p size is narrowed to
/// <tt>unsigned int</tt>. The single-sort path takes no temporary buffers, so
/// \p keys_tmp and \p values_tmp are not forwarded to it.
///
/// Input and output iterators must share their value types; this is enforced
/// at compile time.
///
/// \returns whatever the selected implementation returns.
template<
    class Config,
    bool Descending,
    class KeysInputIterator,
    class KeysOutputIterator,
    class ValuesInputIterator,
    class ValuesOutputIterator,
    class Size
>
inline
cudaError_t radix_sort_impl(void * temporary_storage,
                            size_t& storage_size,
                            KeysInputIterator keys_input,
                            typename std::iterator_traits<KeysInputIterator>::value_type * keys_tmp,
                            KeysOutputIterator keys_output,
                            ValuesInputIterator values_input,
                            typename std::iterator_traits<ValuesInputIterator>::value_type * values_tmp,
                            ValuesOutputIterator values_output,
                            Size size,
                            bool& is_result_in_output,
                            unsigned int begin_bit,
                            unsigned int end_bit,
                            cudaStream_t stream,
                            bool debug_synchronous)
{
    using key_type = typename std::iterator_traits<KeysInputIterator>::value_type;
    using value_type = typename std::iterator_traits<ValuesInputIterator>::value_type;

    static_assert(
        std::is_same<key_type, typename std::iterator_traits<KeysOutputIterator>::value_type>::value,
        "KeysInputIterator and KeysOutputIterator must have the same value_type"
    );
    static_assert(
        std::is_same<value_type, typename std::iterator_traits<ValuesOutputIterator>::value_type>::value,
        "ValuesInputIterator and ValuesOutputIterator must have the same value_type"
    );

    using config = default_or_custom_config<
        Config,
        default_radix_sort_config<ROCPRIM_TARGET_ARCH, key_type, value_type>
    >;

    // Largest input handled by the single-sort path.
    constexpr unsigned int single_sort_limit =
        config::sort_single::block_size * config::sort_single::items_per_thread;
    // Largest input handled by the merge-based path.
    constexpr unsigned int merge_sort_limit =
        config::sort_merge::block_size * config::sort_merge::items_per_thread * config::merge_size_limit_blocks;

    if(size <= single_sort_limit)
    {
        return radix_sort_single_impl<Config, Descending>(
            temporary_storage, storage_size,
            keys_input, keys_output, values_input, values_output,
            static_cast<unsigned int>(size), is_result_in_output,
            begin_bit, end_bit, stream, debug_synchronous
        );
    }

    if(size <= merge_sort_limit)
    {
        return radix_sort_merge_impl<Config, Descending>(
            temporary_storage, storage_size,
            keys_input, keys_tmp, keys_output, values_input, values_tmp, values_output,
            static_cast<unsigned int>(size), is_result_in_output,
            begin_bit, end_bit, stream, debug_synchronous
        );
    }

    return radix_sort_iterations_impl<Config, Descending>(
        temporary_storage, storage_size,
        keys_input, keys_tmp, keys_output, values_input, values_tmp, values_output,
        size, is_result_in_output,
        begin_bit, end_bit, stream, debug_synchronous
    );
}
#undef ROCPRIM_DETAIL_HIP_SYNC_AND_RETURN_ON_ERROR
}
// end namespace detail
/// \brief Parallel ascending radix sort primitive for device level.
///
/// \p radix_sort_keys function performs a device-wide radix sort
/// of keys. Function sorts input keys in ascending order.
///
/// \par Overview
/// * The contents of the inputs are not altered by the sorting function.
/// * Returns the required size of \p temporary_storage in \p storage_size
/// if \p temporary_storage is a null pointer.
/// * \p Key type (a \p value_type of \p KeysInputIterator and \p KeysOutputIterator) must be
/// an arithmetic type (that is, an integral type or a floating-point type).
/// * Ranges specified by \p keys_input and \p keys_output must have at least \p size elements.
/// * If \p Key is an integer type and the range of keys is known in advance, the performance
/// can be improved by setting \p begin_bit and \p end_bit, for example if all keys are in range
/// [100, 10000], <tt>begin_bit = 0</tt> and <tt>end_bit = 14</tt> will cover the whole range.
///
/// \tparam Config - [optional] configuration of the primitive. It can be \p radix_sort_config or
/// a custom class with the same members.
/// \tparam KeysInputIterator - random-access iterator type of the input range. Must meet the
/// requirements of a C++ InputIterator concept. It can be a simple pointer type.
/// \tparam KeysOutputIterator - random-access iterator type of the output range. Must meet the
/// requirements of a C++ OutputIterator concept. It can be a simple pointer type.
/// \tparam Size - integral type that represents the problem size.
///
/// \param [in] temporary_storage - pointer to a device-accessible temporary storage. When
/// a null pointer is passed, the required allocation size (in bytes) is written to
/// \p storage_size and function returns without performing the sort operation.
/// \param [in,out] storage_size - reference to a size (in bytes) of \p temporary_storage.
/// \param [in] keys_input - pointer to the first element in the range to sort.
/// \param [out] keys_output - pointer to the first element in the output range.
/// \param [in] size - number of elements in the input range.
/// \param [in] begin_bit - [optional] index of the first (least significant) bit used in
/// key comparison. Must be in range <tt>[0; 8 * sizeof(Key))</tt>. Default value: \p 0.
/// Non-default value not supported for floating-point key-types.
/// \param [in] end_bit - [optional] past-the-end index (most significant) bit used in
/// key comparison. Must be in range <tt>(begin_bit; 8 * sizeof(Key)]</tt>. Default
/// value: \p <tt>8 * sizeof(Key)</tt>. Non-default value not supported for floating-point key-types.
/// \param [in] stream - [optional] HIP stream object. Default is \p 0 (default stream).
/// \param [in] debug_synchronous - [optional] If true, synchronization after every kernel
/// launch is forced in order to check for errors. Default value is \p false.
///
/// \returns \p cudaSuccess (\p 0) after successful sort; otherwise a HIP runtime error of
/// type \p cudaError_t.
///
/// \par Example
/// \parblock
/// In this example a device-level ascending radix sort is performed on an array of
/// \p float values.
///
/// \code{.cpp}
/// #include <rocprim/rocprim.hpp>
///
/// // Prepare input and output (declare pointers, allocate device memory etc.)
/// size_t input_size; // e.g., 8
/// float * input; // e.g., [0.6, 0.3, 0.65, 0.4, 0.2, 0.08, 1, 0.7]
/// float * output; // empty array of 8 elements
///
/// size_t temporary_storage_size_bytes;
/// void * temporary_storage_ptr = nullptr;
/// // Get required size of the temporary storage
/// rocprim::radix_sort_keys(
/// temporary_storage_ptr, temporary_storage_size_bytes,
/// input, output, input_size
/// );
///
/// // allocate temporary storage
/// cudaMalloc(&temporary_storage_ptr, temporary_storage_size_bytes);
///
/// // perform sort
/// rocprim::radix_sort_keys(
/// temporary_storage_ptr, temporary_storage_size_bytes,
/// input, output, input_size
/// );
/// // keys_output: [0.08, 0.2, 0.3, 0.4, 0.6, 0.65, 0.7, 1]
/// \endcode
/// \endparblock
template<
    class Config = default_config,
    class KeysInputIterator,
    class KeysOutputIterator,
    class Size,
    class Key = typename std::iterator_traits<KeysInputIterator>::value_type
>
inline
cudaError_t radix_sort_keys(void * temporary_storage,
                            size_t& storage_size,
                            KeysInputIterator keys_input,
                            KeysOutputIterator keys_output,
                            Size size,
                            unsigned int begin_bit = 0,
                            unsigned int end_bit = 8 * sizeof(Key),
                            cudaStream_t stream = 0,
                            bool debug_synchronous = false)
{
    static_assert(std::is_integral<Size>::value, "Size must be an integral type.");

    // Ascending order.
    constexpr bool descending = false;

    // Keys-only sort: the shared implementation is given null empty_type
    // pointers in place of the value ranges.
    empty_type * no_values = nullptr;

    // The implementation reports where the result ended up; this overload
    // does not use that information.
    bool result_location_unused;

    // The null keys_tmp / values_tmp arguments mean the caller supplies no
    // temporary buffers.
    return detail::radix_sort_impl<Config, descending>(
        temporary_storage, storage_size,
        keys_input, nullptr, keys_output,
        no_values, nullptr, no_values,
        size, result_location_unused,
        begin_bit, end_bit,
        stream, debug_synchronous
    );
}
/// \brief Parallel descending radix sort primitive for device level.
///
/// \p radix_sort_keys_desc function performs a device-wide radix sort
/// of keys. Function sorts input keys in descending order.
///
/// \par Overview
/// * The contents of the inputs are not altered by the sorting function.
/// * Returns the required size of \p temporary_storage in \p storage_size
/// if \p temporary_storage is a null pointer.
/// * \p Key type (a \p value_type of \p KeysInputIterator and \p KeysOutputIterator) must be
/// an arithmetic type (that is, an integral type or a floating-point type).
/// * Ranges specified by \p keys_input and \p keys_output must have at least \p size elements.
/// * If \p Key is an integer type and the range of keys is known in advance, the performance
/// can be improved by setting \p begin_bit and \p end_bit, for example if all keys are in range
/// [100, 10000], <tt>begin_bit = 0</tt> and <tt>end_bit = 14</tt> will cover the whole range.
///
/// \tparam Config - [optional] configuration of the primitive. It can be \p radix_sort_config or
/// a custom class with the same members.
/// \tparam KeysInputIterator - random-access iterator type of the input range. Must meet the
/// requirements of a C++ InputIterator concept. It can be a simple pointer type.
/// \tparam KeysOutputIterator - random-access iterator type of the output range. Must meet the
/// requirements of a C++ OutputIterator concept. It can be a simple pointer type.
/// \tparam Size - integral type that represents the problem size.
///
/// \param [in] temporary_storage - pointer to a device-accessible temporary storage. When
/// a null pointer is passed, the required allocation size (in bytes) is written to
/// \p storage_size and function returns without performing the sort operation.
/// \param [in,out] storage_size - reference to a size (in bytes) of \p temporary_storage.
/// \param [in] keys_input - pointer to the first element in the range to sort.
/// \param [out] keys_output - pointer to the first element in the output range.
/// \param [in] size - number of elements in the input range.
/// \param [in] begin_bit - [optional] index of the first (least significant) bit used in
/// key comparison. Must be in range <tt>[0; 8 * sizeof(Key))</tt>. Default value: \p 0.
/// Non-default value not supported for floating-point key-types.
/// \param [in] end_bit - [optional] past-the-end index (most significant) bit used in
/// key comparison. Must be in range <tt>(begin_bit; 8 * sizeof(Key)]</tt>. Default
/// value: \p <tt>8 * sizeof(Key)</tt>. Non-default value not supported for floating-point key-types.
/// \param [in] stream - [optional] HIP stream object. Default is \p 0 (default stream).
/// \param [in] debug_synchronous - [optional] If true, synchronization after every kernel
/// launch is forced in order to check for errors. Default value is \p false.
///
/// \returns \p cudaSuccess (\p 0) after successful sort; otherwise a HIP runtime error of
/// type \p cudaError_t.
///
/// \par Example
/// \parblock
/// In this example a device-level descending radix sort is performed on an array of
/// integer values.
///
/// \code{.cpp}
/// #include <rocprim/rocprim.hpp>
///
/// // Prepare input and output (declare pointers, allocate device memory etc.)
/// size_t input_size; // e.g., 8
/// int * input; // e.g., [6, 3, 5, 4, 2, 8, 1, 7]
/// int * output; // empty array of 8 elements
///
/// size_t temporary_storage_size_bytes;
/// void * temporary_storage_ptr = nullptr;
/// // Get required size of the temporary storage
/// rocprim::radix_sort_keys_desc(
/// temporary_storage_ptr, temporary_storage_size_bytes,
/// input, output, input_size
/// );
///
/// // allocate temporary storage
/// cudaMalloc(&temporary_storage_ptr, temporary_storage_size_bytes);
///
/// // perform sort
/// rocprim::radix_sort_keys_desc(
/// temporary_storage_ptr, temporary_storage_size_bytes,
/// input, output, input_size
/// );
/// // keys_output: [8, 7, 6, 5, 4, 3, 2, 1]
/// \endcode
/// \endparblock
template<
    class Config = default_config,
    class KeysInputIterator,
    class KeysOutputIterator,
    class Size,
    class Key = typename std::iterator_traits<KeysInputIterator>::value_type
>
inline
cudaError_t radix_sort_keys_desc(void * temporary_storage,
                                 size_t& storage_size,
                                 KeysInputIterator keys_input,
                                 KeysOutputIterator keys_output,
                                 Size size,
                                 unsigned int begin_bit = 0,
                                 unsigned int end_bit = 8 * sizeof(Key),
                                 cudaStream_t stream = 0,
                                 bool debug_synchronous = false)
{
    static_assert(std::is_integral<Size>::value, "Size must be an integral type.");

    // Descending order.
    constexpr bool descending = true;

    // Keys-only sort: the shared implementation is given null empty_type
    // pointers in place of the value ranges.
    empty_type * no_values = nullptr;

    // The implementation reports where the result ended up; this overload
    // does not use that information.
    bool result_location_unused;

    // The null keys_tmp / values_tmp arguments mean the caller supplies no
    // temporary buffers.
    return detail::radix_sort_impl<Config, descending>(
        temporary_storage, storage_size,
        keys_input, nullptr, keys_output,
        no_values, nullptr, no_values,
        size, result_location_unused,
        begin_bit, end_bit,
        stream, debug_synchronous
    );
}
/// \brief Parallel ascending radix sort-by-key primitive for device level.
///
/// \p radix_sort_pairs function performs a device-wide radix sort
/// of (key, value) pairs. Function sorts input pairs in ascending order of keys.
///
/// \par Overview
/// * The contents of the inputs are not altered by the sorting function.
/// * Returns the required size of \p temporary_storage in \p storage_size
/// if \p temporary_storage is a null pointer.
/// * \p Key type (a \p value_type of \p KeysInputIterator and \p KeysOutputIterator) must be
/// an arithmetic type (that is, an integral type or a floating-point type).
/// * Ranges specified by \p keys_input, \p keys_output, \p values_input and \p values_output must
/// have at least \p size elements.
/// * If \p Key is an integer type and the range of keys is known in advance, the performance
/// can be improved by setting \p begin_bit and \p end_bit, for example if all keys are in range
/// [100, 10000], <tt>begin_bit = 0</tt> and <tt>end_bit = 14</tt> will cover the whole range.
///
/// \tparam Config - [optional] configuration of the primitive. It can be \p radix_sort_config or
/// a custom class with the same members.
/// \tparam KeysInputIterator - random-access iterator type of the input range. Must meet the
/// requirements of a C++ InputIterator concept. It can be a simple pointer type.
/// \tparam KeysOutputIterator - random-access iterator type of the output range. Must meet the
/// requirements of a C++ OutputIterator concept. It can be a simple pointer type.
/// \tparam ValuesInputIterator - random-access iterator type of the input range. Must meet the
/// requirements of a C++ InputIterator concept. It can be a simple pointer type.
/// \tparam ValuesOutputIterator - random-access iterator type of the output range. Must meet the
/// requirements of a C++ OutputIterator concept. It can be a simple pointer type.
/// \tparam Size - integral type that represents the problem size.
///
/// \param [in] temporary_storage - pointer to a device-accessible temporary storage. When
/// a null pointer is passed, the required allocation size (in bytes) is written to
/// \p storage_size and function returns without performing the sort operation.
/// \param [in,out] storage_size - reference to a size (in bytes) of \p temporary_storage.
/// \param [in] keys_input - pointer to the first element in the range to sort.
/// \param [out] keys_output - pointer to the first element in the output range.
/// \param [in] values_input - pointer to the first element in the range to sort.
/// \param [out] values_output - pointer to the first element in the output range.
/// \param [in] size - number of elements in the input range.
/// \param [in] begin_bit - [optional] index of the first (least significant) bit used in
/// key comparison. Must be in range <tt>[0; 8 * sizeof(Key))</tt>. Default value: \p 0.
/// Non-default value not supported for floating-point key-types.
/// \param [in] end_bit - [optional] past-the-end index (most significant) bit used in
/// key comparison. Must be in range <tt>(begin_bit; 8 * sizeof(Key)]</tt>. Default
/// value: \p <tt>8 * sizeof(Key)</tt>. Non-default value not supported for floating-point key-types.
/// \param [in] stream - [optional] HIP stream object. Default is \p 0 (default stream).
/// \param [in] debug_synchronous - [optional] If true, synchronization after every kernel
/// launch is forced in order to check for errors. Default value is \p false.
///
/// \returns \p cudaSuccess (\p 0) after successful sort; otherwise a HIP runtime error of
/// type \p cudaError_t.
///
/// \par Example
/// \parblock
/// In this example a device-level ascending radix sort is performed where input keys are
/// represented by an array of unsigned integers and input values by an array of <tt>double</tt>s.
///
/// \code{.cpp}
/// #include <rocprim/rocprim.hpp>
///
/// // Prepare input and output (declare pointers, allocate device memory etc.)
/// size_t input_size; // e.g., 8
/// unsigned int * keys_input; // e.g., [ 6, 3, 5, 4, 1, 8, 1, 7]
/// double * values_input; // e.g., [-5, 2, -4, 3, -1, -8, -2, 7]
/// unsigned int * keys_output; // empty array of 8 elements
/// double * values_output; // empty array of 8 elements
///
/// // Keys are in range [0; 8], so we can limit compared bit to bits on indexes
/// // 0, 1, 2, 3, and 4. In order to do this begin_bit is set to 0 and end_bit
/// // is set to 5.
///
/// size_t temporary_storage_size_bytes;
/// void * temporary_storage_ptr = nullptr;
/// // Get required size of the temporary storage
/// rocprim::radix_sort_pairs(
/// temporary_storage_ptr, temporary_storage_size_bytes,
/// keys_input, keys_output, values_input, values_output,
/// input_size, 0, 5
/// );
///
/// // allocate temporary storage
/// cudaMalloc(&temporary_storage_ptr, temporary_storage_size_bytes);
///
/// // perform sort
/// rocprim::radix_sort_pairs(
/// temporary_storage_ptr, temporary_storage_size_bytes,
/// keys_input, keys_output, values_input, values_output,
/// input_size, 0, 5
/// );
/// // keys_output: [ 1, 1, 3, 4, 5, 6, 7, 8]
/// // values_output: [-1, -2, 2, 3, -4, -5, 7, -8]
/// \endcode
/// \endparblock
template<
    class Config = default_config,
    class KeysInputIterator,
    class KeysOutputIterator,
    class ValuesInputIterator,
    class ValuesOutputIterator,
    class Size,
    class Key = typename std::iterator_traits<KeysInputIterator>::value_type
>
inline
cudaError_t radix_sort_pairs(void * temporary_storage,
                             size_t& storage_size,
                             KeysInputIterator keys_input,
                             KeysOutputIterator keys_output,
                             ValuesInputIterator values_input,
                             ValuesOutputIterator values_output,
                             Size size,
                             unsigned int begin_bit = 0,
                             unsigned int end_bit = 8 * sizeof(Key),
                             cudaStream_t stream = 0,
                             bool debug_synchronous = false)
{
    static_assert(std::is_integral<Size>::value, "Size must be an integral type.");

    // This overload sorts in ascending order.
    constexpr bool descending = false;

    // radix_sort_impl takes a bool by reference; this overload has no use
    // for whatever is written there, so the variable is never read.
    bool unused_result_flag;

    const cudaError_t status = detail::radix_sort_impl<Config, descending>(
        temporary_storage, storage_size,
        keys_input, nullptr, keys_output,
        values_input, nullptr, values_output,
        size, unused_result_flag,
        begin_bit, end_bit,
        stream, debug_synchronous
    );
    return status;
}
/// \brief Parallel descending radix sort-by-key primitive for device level.
///
/// \p radix_sort_pairs_desc function performs a device-wide radix sort
/// of (key, value) pairs. Function sorts input pairs in descending order of keys.
///
/// \par Overview
/// * The contents of the inputs are not altered by the sorting function.
/// * Returns the required size of \p temporary_storage in \p storage_size
/// if \p temporary_storage is a null pointer.
/// * \p Key type (a \p value_type of \p KeysInputIterator and \p KeysOutputIterator) must be
/// an arithmetic type (that is, an integral type or a floating-point type).
/// * Ranges specified by \p keys_input, \p keys_output, \p values_input and \p values_output must
/// have at least \p size elements.
/// * If \p Key is an integer type and the range of keys is known in advance, the performance
/// can be improved by setting \p begin_bit and \p end_bit, for example if all keys are in range
/// [100, 10000], <tt>begin_bit = 0</tt> and <tt>end_bit = 14</tt> will cover the whole range.
///
/// \tparam Config - [optional] configuration of the primitive. It can be \p radix_sort_config or
/// a custom class with the same members.
/// \tparam KeysInputIterator - random-access iterator type of the input range. Must meet the
/// requirements of a C++ InputIterator concept. It can be a simple pointer type.
/// \tparam KeysOutputIterator - random-access iterator type of the output range. Must meet the
/// requirements of a C++ OutputIterator concept. It can be a simple pointer type.
/// \tparam ValuesInputIterator - random-access iterator type of the input range. Must meet the
/// requirements of a C++ InputIterator concept. It can be a simple pointer type.
/// \tparam ValuesOutputIterator - random-access iterator type of the output range. Must meet the
/// requirements of a C++ OutputIterator concept. It can be a simple pointer type.
/// \tparam Size - integral type that represents the problem size.
///
/// \param [in] temporary_storage - pointer to a device-accessible temporary storage. When
/// a null pointer is passed, the required allocation size (in bytes) is written to
/// \p storage_size and function returns without performing the sort operation.
/// \param [in,out] storage_size - reference to a size (in bytes) of \p temporary_storage.
/// \param [in] keys_input - pointer to the first element in the range to sort.
/// \param [out] keys_output - pointer to the first element in the output range.
/// \param [in] values_input - pointer to the first element in the range to sort.
/// \param [out] values_output - pointer to the first element in the output range.
/// \param [in] size - number of elements in the input range.
/// \param [in] begin_bit - [optional] index of the first (least significant) bit used in
/// key comparison. Must be in range <tt>[0; 8 * sizeof(Key))</tt>. Default value: \p 0.
/// Non-default value not supported for floating-point key-types.
/// \param [in] end_bit - [optional] past-the-end index (most significant) bit used in
/// key comparison. Must be in range <tt>(begin_bit; 8 * sizeof(Key)]</tt>. Default
/// value: \p <tt>8 * sizeof(Key)</tt>. Non-default value not supported for floating-point key-types.
/// \param [in] stream - [optional] HIP stream object. Default is \p 0 (default stream).
/// \param [in] debug_synchronous - [optional] If true, synchronization after every kernel
/// launch is forced in order to check for errors. Default value is \p false.
///
/// \returns \p cudaSuccess (\p 0) after successful sort; otherwise a HIP runtime error of
/// type \p cudaError_t.
///
/// \par Example
/// \parblock
/// In this example a device-level descending radix sort is performed where input keys are
/// represented by an array of integers and input values by an array of <tt>double</tt>s.
///
/// \code{.cpp}
/// #include <rocprim/rocprim.hpp>
///
/// // Prepare input and output (declare pointers, allocate device memory etc.)
/// size_t input_size; // e.g., 8
/// int * keys_input; // e.g., [ 6, 3, 5, 4, 1, 8, 1, 7]
/// double * values_input; // e.g., [-5, 2, -4, 3, -1, -8, -2, 7]
/// int * keys_output; // empty array of 8 elements
/// double * values_output; // empty array of 8 elements
///
/// size_t temporary_storage_size_bytes;
/// void * temporary_storage_ptr = nullptr;
/// // Get required size of the temporary storage
/// rocprim::radix_sort_pairs_desc(
/// temporary_storage_ptr, temporary_storage_size_bytes,
/// keys_input, keys_output, values_input, values_output,
/// input_size
/// );
///
/// // allocate temporary storage
/// cudaMalloc(&temporary_storage_ptr, temporary_storage_size_bytes);
///
/// // perform sort
/// rocprim::radix_sort_pairs_desc(
/// temporary_storage_ptr, temporary_storage_size_bytes,
/// keys_input, keys_output, values_input, values_output,
/// input_size
/// );
/// // keys_output: [ 8, 7, 6, 5, 4, 3, 1, 1]
/// // values_output: [-8, 7, -5, -4, 3, 2, -1, -2]
/// \endcode
/// \endparblock
template<
    class Config = default_config,
    class KeysInputIterator,
    class KeysOutputIterator,
    class ValuesInputIterator,
    class ValuesOutputIterator,
    class Size,
    class Key = typename std::iterator_traits<KeysInputIterator>::value_type
>
inline
cudaError_t radix_sort_pairs_desc(void * temporary_storage,
                                  size_t& storage_size,
                                  KeysInputIterator keys_input,
                                  KeysOutputIterator keys_output,
                                  ValuesInputIterator values_input,
                                  ValuesOutputIterator values_output,
                                  Size size,
                                  unsigned int begin_bit = 0,
                                  unsigned int end_bit = 8 * sizeof(Key),
                                  cudaStream_t stream = 0,
                                  bool debug_synchronous = false)
{
    static_assert(std::is_integral<Size>::value, "Size must be an integral type.");

    // This overload sorts in descending order.
    constexpr bool descending = true;

    // radix_sort_impl takes a bool by reference; this overload has no use
    // for whatever is written there, so the variable is never read.
    bool unused_result_flag;

    const cudaError_t status = detail::radix_sort_impl<Config, descending>(
        temporary_storage, storage_size,
        keys_input, nullptr, keys_output,
        values_input, nullptr, values_output,
        size, unused_result_flag,
        begin_bit, end_bit,
        stream, debug_synchronous
    );
    return status;
}
/// \brief Parallel ascending radix sort primitive for device level.
///
/// \p radix_sort_keys function performs a device-wide radix sort
/// of keys. Function sorts input keys in ascending order.
///
/// \par Overview
/// * The contents of both buffers of \p keys may be altered by the sorting function.
/// * \p current() of \p keys is used as the input.
/// * The function will update \p current() of \p keys to point to the buffer
/// that contains the output range.
/// * Returns the required size of \p temporary_storage in \p storage_size
/// if \p temporary_storage is a null pointer.
/// * The function requires small \p temporary_storage as it does not need
/// a temporary buffer of \p size elements.
/// * \p Key type must be an arithmetic type (that is, an integral type or a floating-point
/// type).
/// * Buffers of \p keys must have at least \p size elements.
/// * If \p Key is an integer type and the range of keys is known in advance, the performance
/// can be improved by setting \p begin_bit and \p end_bit, for example if all keys are in range
/// [100, 10000], <tt>begin_bit = 0</tt> and <tt>end_bit = 14</tt> will cover the whole range.
///
/// \tparam Config - [optional] configuration of the primitive. It can be \p radix_sort_config or
/// a custom class with the same members.
/// \tparam Key - key type. Must be an integral type or a floating-point type.
/// \tparam Size - integral type that represents the problem size.
///
/// \param [in] temporary_storage - pointer to a device-accessible temporary storage. When
/// a null pointer is passed, the required allocation size (in bytes) is written to
/// \p storage_size and function returns without performing the sort operation.
/// \param [in,out] storage_size - reference to a size (in bytes) of \p temporary_storage.
/// \param [in,out] keys - reference to the double-buffer of keys, its \p current()
/// contains the input range and will be updated to point to the output range.
/// \param [in] size - number of elements in the input range.
/// \param [in] begin_bit - [optional] index of the first (least significant) bit used in
/// key comparison. Must be in range <tt>[0; 8 * sizeof(Key))</tt>. Default value: \p 0.
/// Non-default value not supported for floating-point key-types.
/// \param [in] end_bit - [optional] past-the-end index (most significant) bit used in
/// key comparison. Must be in range <tt>(begin_bit; 8 * sizeof(Key)]</tt>. Default
/// value: \p <tt>8 * sizeof(Key)</tt>. Non-default value not supported for floating-point key-types.
/// \param [in] stream - [optional] HIP stream object. Default is \p 0 (default stream).
/// \param [in] debug_synchronous - [optional] If true, synchronization after every kernel
/// launch is forced in order to check for errors. Default value is \p false.
///
/// \returns \p cudaSuccess (\p 0) after successful sort; otherwise a HIP runtime error of
/// type \p cudaError_t.
///
/// \par Example
/// \parblock
/// In this example a device-level ascending radix sort is performed on an array of
/// \p float values.
///
/// \code{.cpp}
/// #include <rocprim/rocprim.hpp>
///
/// // Prepare input and tmp (declare pointers, allocate device memory etc.)
/// size_t input_size; // e.g., 8
/// float * input; // e.g., [0.6, 0.3, 0.65, 0.4, 0.2, 0.08, 1, 0.7]
/// float * tmp; // empty array of 8 elements
/// // Create double-buffer
/// rocprim::double_buffer<float> keys(input, tmp);
///
/// size_t temporary_storage_size_bytes;
/// void * temporary_storage_ptr = nullptr;
/// // Get required size of the temporary storage
/// rocprim::radix_sort_keys(
/// temporary_storage_ptr, temporary_storage_size_bytes,
/// keys, input_size
/// );
///
/// // allocate temporary storage
/// cudaMalloc(&temporary_storage_ptr, temporary_storage_size_bytes);
///
/// // perform sort
/// rocprim::radix_sort_keys(
/// temporary_storage_ptr, temporary_storage_size_bytes,
/// keys, input_size
/// );
/// // keys.current(): [0.08, 0.2, 0.3, 0.4, 0.6, 0.65, 0.7, 1]
/// \endcode
/// \endparblock
template<
    class Config = default_config,
    class Key,
    class Size
>
inline
cudaError_t radix_sort_keys(void * temporary_storage,
                            size_t& storage_size,
                            double_buffer<Key>& keys,
                            Size size,
                            unsigned int begin_bit = 0,
                            unsigned int end_bit = 8 * sizeof(Key),
                            cudaStream_t stream = 0,
                            bool debug_synchronous = false)
{
    static_assert(std::is_integral<Size>::value, "Size must be an integral type.");

    // Keys-only sort: a null empty_type pointer stands in for every
    // value argument of radix_sort_impl.
    empty_type * values = nullptr;

    // Initialized so the swap check below never reads an indeterminate value
    // should radix_sort_impl return before assigning the flag.
    bool is_result_in_output = false;

    // Ascending order (second template argument is false).
    cudaError_t error = detail::radix_sort_impl<Config, false>(
        temporary_storage, storage_size,
        keys.current(), keys.current(), keys.alternate(),
        values, values, values,
        size, is_result_in_output,
        begin_bit, end_bit,
        stream, debug_synchronous
    );

    // A null temporary_storage means this was only a storage-size query, so
    // the buffers are left as they are. Otherwise, when the flag is set, swap
    // so that keys.current() refers to the sorted data.
    if(temporary_storage != nullptr && is_result_in_output)
    {
        keys.swap();
    }
    return error;
}
/// \brief Parallel descending radix sort primitive for device level.
///
/// \p radix_sort_keys_desc function performs a device-wide radix sort
/// of keys. Function sorts input keys in descending order.
///
/// \par Overview
/// * The contents of both buffers of \p keys may be altered by the sorting function.
/// * \p current() of \p keys is used as the input.
/// * The function will update \p current() of \p keys to point to the buffer
/// that contains the output range.
/// * Returns the required size of \p temporary_storage in \p storage_size
/// if \p temporary_storage is a null pointer.
/// * The function requires small \p temporary_storage as it does not need
/// a temporary buffer of \p size elements.
/// * \p Key type must be an arithmetic type (that is, an integral type or a floating-point
/// type).
/// * Buffers of \p keys must have at least \p size elements.
/// * If \p Key is an integer type and the range of keys is known in advance, the performance
/// can be improved by setting \p begin_bit and \p end_bit, for example if all keys are in range
/// [100, 10000], <tt>begin_bit = 0</tt> and <tt>end_bit = 14</tt> will cover the whole range.
///
/// \tparam Config - [optional] configuration of the primitive. It can be \p radix_sort_config or
/// a custom class with the same members.
/// \tparam Key - key type. Must be an integral type or a floating-point type.
/// \tparam Size - integral type that represents the problem size.
///
/// \param [in] temporary_storage - pointer to a device-accessible temporary storage. When
/// a null pointer is passed, the required allocation size (in bytes) is written to
/// \p storage_size and function returns without performing the sort operation.
/// \param [in,out] storage_size - reference to a size (in bytes) of \p temporary_storage.
/// \param [in,out] keys - reference to the double-buffer of keys, its \p current()
/// contains the input range and will be updated to point to the output range.
/// \param [in] size - number of elements in the input range.
/// \param [in] begin_bit - [optional] index of the first (least significant) bit used in
/// key comparison. Must be in range <tt>[0; 8 * sizeof(Key))</tt>. Default value: \p 0.
/// Non-default value not supported for floating-point key-types.
/// \param [in] end_bit - [optional] past-the-end index (most significant) bit used in
/// key comparison. Must be in range <tt>(begin_bit; 8 * sizeof(Key)]</tt>. Default
/// value: \p <tt>8 * sizeof(Key)</tt>. Non-default value not supported for floating-point key-types.
/// \param [in] stream - [optional] HIP stream object. Default is \p 0 (default stream).
/// \param [in] debug_synchronous - [optional] If true, synchronization after every kernel
/// launch is forced in order to check for errors. Default value is \p false.
///
/// \returns \p cudaSuccess (\p 0) after successful sort; otherwise a HIP runtime error of
/// type \p cudaError_t.
///
/// \par Example
/// \parblock
/// In this example a device-level descending radix sort is performed on an array of
/// integer values.
///
/// \code{.cpp}
/// #include <rocprim/rocprim.hpp>
///
/// // Prepare input and tmp (declare pointers, allocate device memory etc.)
/// size_t input_size; // e.g., 8
/// int * input; // e.g., [6, 3, 5, 4, 2, 8, 1, 7]
/// int * tmp; // empty array of 8 elements
/// // Create double-buffer
/// rocprim::double_buffer<int> keys(input, tmp);
///
/// size_t temporary_storage_size_bytes;
/// void * temporary_storage_ptr = nullptr;
/// // Get required size of the temporary storage
/// rocprim::radix_sort_keys_desc(
/// temporary_storage_ptr, temporary_storage_size_bytes,
/// keys, input_size
/// );
///
/// // allocate temporary storage
/// cudaMalloc(&temporary_storage_ptr, temporary_storage_size_bytes);
///
/// // perform sort
/// rocprim::radix_sort_keys_desc(
/// temporary_storage_ptr, temporary_storage_size_bytes,
/// keys, input_size
/// );
/// // keys.current(): [8, 7, 6, 5, 4, 3, 2, 1]
/// \endcode
/// \endparblock
template<
    class Config = default_config,
    class Key,
    class Size
>
inline
cudaError_t radix_sort_keys_desc(void * temporary_storage,
                                 size_t& storage_size,
                                 double_buffer<Key>& keys,
                                 Size size,
                                 unsigned int begin_bit = 0,
                                 unsigned int end_bit = 8 * sizeof(Key),
                                 cudaStream_t stream = 0,
                                 bool debug_synchronous = false)
{
    static_assert(std::is_integral<Size>::value, "Size must be an integral type.");

    // Keys-only sort: a null empty_type pointer stands in for every
    // value argument of radix_sort_impl.
    empty_type * values = nullptr;

    // Initialized so the swap check below never reads an indeterminate value
    // should radix_sort_impl return before assigning the flag.
    bool is_result_in_output = false;

    // Descending order (second template argument is true).
    cudaError_t error = detail::radix_sort_impl<Config, true>(
        temporary_storage, storage_size,
        keys.current(), keys.current(), keys.alternate(),
        values, values, values,
        size, is_result_in_output,
        begin_bit, end_bit,
        stream, debug_synchronous
    );

    // A null temporary_storage means this was only a storage-size query, so
    // the buffers are left as they are. Otherwise, when the flag is set, swap
    // so that keys.current() refers to the sorted data.
    if(temporary_storage != nullptr && is_result_in_output)
    {
        keys.swap();
    }
    return error;
}
/// \brief Parallel ascending radix sort-by-key primitive for device level.
///
/// \p radix_sort_pairs function performs a device-wide radix sort
/// of (key, value) pairs. Function sorts input pairs in ascending order of keys.
///
/// \par Overview
/// * The contents of both buffers of \p keys and \p values may be altered by the sorting function.
/// * \p current() of \p keys and \p values are used as the input.
/// * The function will update \p current() of \p keys and \p values to point to buffers
/// that contain the output range.
/// * Returns the required size of \p temporary_storage in \p storage_size
/// if \p temporary_storage is a null pointer.
/// * The function requires small \p temporary_storage as it does not need
/// a temporary buffer of \p size elements.
/// * \p Key type must be an arithmetic type (that is, an integral type or a floating-point
/// type).
/// * Buffers of \p keys and \p values must have at least \p size elements.
/// * If \p Key is an integer type and the range of keys is known in advance, the performance
/// can be improved by setting \p begin_bit and \p end_bit, for example if all keys are in range
/// [100, 10000], <tt>begin_bit = 0</tt> and <tt>end_bit = 14</tt> will cover the whole range.
///
/// \tparam Config - [optional] configuration of the primitive. It can be \p radix_sort_config or
/// a custom class with the same members.
/// \tparam Key - key type. Must be an integral type or a floating-point type.
/// \tparam Value - value type.
/// \tparam Size - integral type that represents the problem size.
///
/// \param [in] temporary_storage - pointer to a device-accessible temporary storage. When
/// a null pointer is passed, the required allocation size (in bytes) is written to
/// \p storage_size and function returns without performing the sort operation.
/// \param [in,out] storage_size - reference to a size (in bytes) of \p temporary_storage.
/// \param [in,out] keys - reference to the double-buffer of keys, its \p current()
/// contains the input range and will be updated to point to the output range.
/// \param [in,out] values - reference to the double-buffer of values, its \p current()
/// contains the input range and will be updated to point to the output range.
/// \param [in] size - number of elements in the input range.
/// \param [in] begin_bit - [optional] index of the first (least significant) bit used in
/// key comparison. Must be in range <tt>[0; 8 * sizeof(Key))</tt>. Default value: \p 0.
/// Non-default value not supported for floating-point key-types.
/// \param [in] end_bit - [optional] past-the-end index (most significant) bit used in
/// key comparison. Must be in range <tt>(begin_bit; 8 * sizeof(Key)]</tt>. Default
/// value: \p <tt>8 * sizeof(Key)</tt>. Non-default value not supported for floating-point key-types.
/// \param [in] stream - [optional] HIP stream object. Default is \p 0 (default stream).
/// \param [in] debug_synchronous - [optional] If true, synchronization after every kernel
/// launch is forced in order to check for errors. Default value is \p false.
///
/// \returns \p cudaSuccess (\p 0) after successful sort; otherwise a HIP runtime error of
/// type \p cudaError_t.
///
/// \par Example
/// \parblock
/// In this example a device-level ascending radix sort is performed where input keys are
/// represented by an array of unsigned integers and input values by an array of <tt>double</tt>s.
///
/// \code{.cpp}
/// #include <rocprim/rocprim.hpp>
///
/// // Prepare input and tmp (declare pointers, allocate device memory etc.)
/// size_t input_size; // e.g., 8
/// unsigned int * keys_input; // e.g., [ 6, 3, 5, 4, 1, 8, 1, 7]
/// double * values_input; // e.g., [-5, 2, -4, 3, -1, -8, -2, 7]
/// unsigned int * keys_tmp; // empty array of 8 elements
/// double* values_tmp; // empty array of 8 elements
/// // Create double-buffers
/// rocprim::double_buffer<unsigned int> keys(keys_input, keys_tmp);
/// rocprim::double_buffer<double> values(values_input, values_tmp);
///
/// // Keys are in range [0; 8], so we can limit compared bit to bits on indexes
/// // 0, 1, 2, 3, and 4. In order to do this begin_bit is set to 0 and end_bit
/// // is set to 5.
///
/// size_t temporary_storage_size_bytes;
/// void * temporary_storage_ptr = nullptr;
/// // Get required size of the temporary storage
/// rocprim::radix_sort_pairs(
/// temporary_storage_ptr, temporary_storage_size_bytes,
/// keys, values, input_size,
/// 0, 5
/// );
///
/// // allocate temporary storage
/// cudaMalloc(&temporary_storage_ptr, temporary_storage_size_bytes);
///
/// // perform sort
/// rocprim::radix_sort_pairs(
/// temporary_storage_ptr, temporary_storage_size_bytes,
/// keys, values, input_size,
/// 0, 5
/// );
/// // keys.current(): [ 1, 1, 3, 4, 5, 6, 7, 8]
/// // values.current(): [-1, -2, 2, 3, -4, -5, 7, -8]
/// \endcode
/// \endparblock
template<
    class Config = default_config,
    class Key,
    class Value,
    class Size
>
inline
cudaError_t radix_sort_pairs(void * temporary_storage,
                             size_t& storage_size,
                             double_buffer<Key>& keys,
                             double_buffer<Value>& values,
                             Size size,
                             unsigned int begin_bit = 0,
                             unsigned int end_bit = 8 * sizeof(Key),
                             cudaStream_t stream = 0,
                             bool debug_synchronous = false)
{
    static_assert(std::is_integral<Size>::value, "Size must be an integral type.");

    // Initialized so the swap check below never reads an indeterminate value
    // should radix_sort_impl return before assigning the flag.
    bool is_result_in_output = false;

    // Ascending order (second template argument is false).
    cudaError_t error = detail::radix_sort_impl<Config, false>(
        temporary_storage, storage_size,
        keys.current(), keys.current(), keys.alternate(),
        values.current(), values.current(), values.alternate(),
        size, is_result_in_output,
        begin_bit, end_bit,
        stream, debug_synchronous
    );

    // A null temporary_storage means this was only a storage-size query, so
    // the buffers are left as they are. Otherwise, when the flag is set, swap
    // both double-buffers together so that keys.current() and values.current()
    // refer to the sorted data.
    if(temporary_storage != nullptr && is_result_in_output)
    {
        keys.swap();
        values.swap();
    }
    return error;
}
/// \brief Parallel descending radix sort-by-key primitive for device level.
///
/// \p radix_sort_pairs_desc function performs a device-wide radix sort
/// of (key, value) pairs. Function sorts input pairs in descending order of keys.
///
/// \par Overview
/// * The contents of both buffers of \p keys and \p values may be altered by the sorting function.
/// * \p current() of \p keys and \p values are used as the input.
/// * The function will update \p current() of \p keys and \p values to point to buffers
/// that contain the output range.
/// * Returns the required size of \p temporary_storage in \p storage_size
/// if \p temporary_storage is a null pointer.
/// * The function requires small \p temporary_storage as it does not need
/// a temporary buffer of \p size elements.
/// * \p Key type must be an arithmetic type (that is, an integral type or a floating-point
/// type).
/// * Buffers of \p keys and \p values must have at least \p size elements.
/// * If \p Key is an integer type and the range of keys is known in advance, the performance
/// can be improved by setting \p begin_bit and \p end_bit, for example if all keys are in range
/// [100, 10000], <tt>begin_bit = 0</tt> and <tt>end_bit = 14</tt> will cover the whole range.
///
/// \tparam Config - [optional] configuration of the primitive. It can be \p radix_sort_config or
/// a custom class with the same members.
/// \tparam Key - key type. Must be an integral type or a floating-point type.
/// \tparam Value - value type.
/// \tparam Size - integral type that represents the problem size.
///
/// \param [in] temporary_storage - pointer to a device-accessible temporary storage. When
/// a null pointer is passed, the required allocation size (in bytes) is written to
/// \p storage_size and function returns without performing the sort operation.
/// \param [in,out] storage_size - reference to a size (in bytes) of \p temporary_storage.
/// \param [in,out] keys - reference to the double-buffer of keys, its \p current()
/// contains the input range and will be updated to point to the output range.
/// \param [in,out] values - reference to the double-buffer of values, its \p current()
/// contains the input range and will be updated to point to the output range.
/// \param [in] size - number of elements in the input range.
/// \param [in] begin_bit - [optional] index of the first (least significant) bit used in
/// key comparison. Must be in range <tt>[0; 8 * sizeof(Key))</tt>. Default value: \p 0.
/// Non-default value not supported for floating-point key-types.
/// \param [in] end_bit - [optional] past-the-end index (most significant) bit used in
/// key comparison. Must be in range <tt>(begin_bit; 8 * sizeof(Key)]</tt>. Default
/// value: \p <tt>8 * sizeof(Key)</tt>. Non-default value not supported for floating-point key-types.
/// \param [in] stream - [optional] HIP stream object. Default is \p 0 (default stream).
/// \param [in] debug_synchronous - [optional] If true, synchronization after every kernel
/// launch is forced in order to check for errors. Default value is \p false.
///
/// \returns \p cudaSuccess (\p 0) after successful sort; otherwise a HIP runtime error of
/// type \p cudaError_t.
///
/// \par Example
/// \parblock
/// In this example a device-level descending radix sort is performed where input keys are
/// represented by an array of integers and input values by an array of <tt>double</tt>s.
///
/// \code{.cpp}
/// #include <rocprim/rocprim.hpp>
///
/// // Prepare input and tmp (declare pointers, allocate device memory etc.)
/// size_t input_size; // e.g., 8
/// int * keys_input; // e.g., [ 6, 3, 5, 4, 1, 8, 1, 7]
/// double * values_input; // e.g., [-5, 2, -4, 3, -1, -8, -2, 7]
/// int * keys_tmp; // empty array of 8 elements
/// double * values_tmp; // empty array of 8 elements
/// // Create double-buffers
/// rocprim::double_buffer<int> keys(keys_input, keys_tmp);
/// rocprim::double_buffer<double> values(values_input, values_tmp);
///
/// size_t temporary_storage_size_bytes;
/// void * temporary_storage_ptr = nullptr;
/// // Get required size of the temporary storage
/// rocprim::radix_sort_pairs_desc(
/// temporary_storage_ptr, temporary_storage_size_bytes,
/// keys, values, input_size
/// );
///
/// // allocate temporary storage
/// cudaMalloc(&temporary_storage_ptr, temporary_storage_size_bytes);
///
/// // perform sort
/// rocprim::radix_sort_pairs_desc(
/// temporary_storage_ptr, temporary_storage_size_bytes,
/// keys, values, input_size
/// );
/// // keys.current(): [ 8, 7, 6, 5, 4, 3, 1, 1]
/// // values.current(): [-8, 7, -5, -4, 3, 2, -1, -2]
/// \endcode
/// \endparblock
template<
    class Config = default_config,
    class Key,
    class Value,
    class Size
>
inline
cudaError_t radix_sort_pairs_desc(void * temporary_storage,
                                  size_t& storage_size,
                                  double_buffer<Key>& keys,
                                  double_buffer<Value>& values,
                                  Size size,
                                  unsigned int begin_bit = 0,
                                  unsigned int end_bit = 8 * sizeof(Key),
                                  cudaStream_t stream = 0,
                                  bool debug_synchronous = false)
{
    static_assert(std::is_integral<Size>::value, "Size must be an integral type.");

    // Initialized so the swap check below never reads an indeterminate value
    // should radix_sort_impl return before assigning the flag.
    bool is_result_in_output = false;

    // Descending order (second template argument is true).
    cudaError_t error = detail::radix_sort_impl<Config, true>(
        temporary_storage, storage_size,
        keys.current(), keys.current(), keys.alternate(),
        values.current(), values.current(), values.alternate(),
        size, is_result_in_output,
        begin_bit, end_bit,
        stream, debug_synchronous
    );

    // A null temporary_storage means this was only a storage-size query, so
    // the buffers are left as they are. Otherwise, when the flag is set, swap
    // both double-buffers together so that keys.current() and values.current()
    // refer to the sorted data.
    if(temporary_storage != nullptr && is_result_in_output)
    {
        keys.swap();
        values.swap();
    }
    return error;
}
END_ROCPRIM_NAMESPACE
/// @}
// end of group devicemodule
#endif // ROCPRIM_DEVICE_DEVICE_RADIX_SORT_HPP_
3rdparty/cub/rocprim/device/device_radix_sort_config.hpp
0 → 100644
View file @
f8a481f8
// Copyright (c) 2018-2021 Advanced Micro Devices, Inc. All rights reserved.
//
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to deal
// in the Software without restriction, including without limitation the rights
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
// copies of the Software, and to permit persons to whom the Software is
// furnished to do so, subject to the following conditions:
//
// The above copyright notice and this permission notice shall be included in
// all copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
// THE SOFTWARE.
#ifndef ROCPRIM_DEVICE_DEVICE_RADIX_SORT_CONFIG_HPP_
#define ROCPRIM_DEVICE_DEVICE_RADIX_SORT_CONFIG_HPP_
#include <type_traits>
#include "../config.hpp"
#include "../detail/various.hpp"
#include "config_types.hpp"
/// \addtogroup primitivesmodule_deviceconfigs
/// @{
BEGIN_ROCPRIM_NAMESPACE
/// \brief Configuration of device-level radix sort operation.
///
/// Radix sort is executed in a single tile (at size < BlocksPerItem) or
/// few iterations (passes) depending on total number of bits to be sorted
/// (\p begin_bit and \p end_bit), each iteration sorts either \p LongRadixBits or \p ShortRadixBits bits
/// chosen to cover the whole bit range in an optimal way.
///
/// For example, if \p LongRadixBits is 7, \p ShortRadixBits is 6, \p begin_bit is 0 and \p end_bit is 32
/// there will be 5 iterations: 7 + 7 + 6 + 6 + 6 = 32 bits.
///
/// \tparam LongRadixBits - number of bits in long iterations.
/// \tparam ShortRadixBits - number of bits in short iterations, must be equal to or less than \p LongRadixBits.
/// \tparam ScanConfig - configuration of digits scan kernel. Must be \p kernel_config.
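/// \tparam SortConfig - configuration of radix sort kernel. Must be \p kernel_config.
/// \tparam SortSingleConfig - [optional] configuration of radix sort single kernel. Default: <tt>kernel_config<256, 10></tt>.
/// \tparam SortMergeConfig - [optional] configuration of radix sort merge kernel. Default: <tt>kernel_config<1024, 1></tt>.
/// \tparam MergeSizeLimitBlocks - [optional] limit number of blocks to use merge kernel. Default: \p 1024.
/// \tparam ForceSingleKernelConfig - [optional] force use of the radix sort single kernel configuration. Default: \p false.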
template
<
unsigned
int
LongRadixBits
,
unsigned
int
ShortRadixBits
,
class
ScanConfig
,
class
SortConfig
,
class
SortSingleConfig
=
kernel_config
<
256
,
10
>,
class
SortMergeConfig
=
kernel_config
<
1024
,
1
>
,
unsigned
int
MergeSizeLimitBlocks
=
1024U
,
bool
ForceSingleKernelConfig
=
false
>
struct
radix_sort_config
{
/// \brief Number of bits in long iterations.
static
constexpr
unsigned
int
long_radix_bits
=
LongRadixBits
;
/// \brief Number of bits in short iterations.
static
constexpr
unsigned
int
short_radix_bits
=
ShortRadixBits
;
/// \brief Limit number of blocks to use merge kernel.
static
constexpr
unsigned
int
merge_size_limit_blocks
=
MergeSizeLimitBlocks
;
/// \brief Configuration of digits scan kernel.
using
scan
=
ScanConfig
;
/// \brief Configuration of radix sort kernel.
using
sort
=
SortConfig
;
/// \brief Configuration of radix sort single kernel.
using
sort_single
=
SortSingleConfig
;
/// \brief Configuration of radix sort merge kernel.
using
sort_merge
=
SortMergeConfig
;
/// \brief Force use radix sort single kernel configuration.
static
constexpr
bool
force_single_kernel_config
=
ForceSingleKernelConfig
;
};
namespace
detail
{
// Default radix sort tuning for target architecture 803 when sorting key/value pairs.
// The nested `type` picks a radix_sort_config by sizeof(Key) (1, 2, 4 or 8 bytes,
// with sizeof(Value) <= 8); everything else uses the last (fallback) entry.
// radix_sort_config arguments are, in order: long radix bits, short radix bits,
// scan config, sort config, sort-single config[, sort-merge config].
template<class Key, class Value>
struct radix_sort_config_803
{
    // ceiling_div of the larger of sizeof(Key) / sizeof(Value) by sizeof(int);
    // used to scale down the second kernel_config argument in the fallback entry.
    static constexpr unsigned int item_scale = ::rocprim::detail::ceiling_div<unsigned int>(
        ::rocprim::max(sizeof(Key), sizeof(Value)),
        sizeof(int));

    using scan = kernel_config<256, 2>;

    using type = select_type<
        select_type_case<
            (sizeof(Key) == 1 && sizeof(Value) <= 8),
            radix_sort_config<8, 7, scan, kernel_config<256, 10>, kernel_config<256, 19>>
        >,
        select_type_case<
            (sizeof(Key) == 2 && sizeof(Value) <= 8),
            radix_sort_config<8, 7, scan, kernel_config<256, 10>, kernel_config<256, 17>>
        >,
        select_type_case<
            (sizeof(Key) == 4 && sizeof(Value) <= 8),
            radix_sort_config<7, 6, scan, kernel_config<256, 15>, kernel_config<256, 13>>
        >,
        select_type_case<
            (sizeof(Key) == 8 && sizeof(Value) <= 8),
            radix_sort_config<7, 6, scan, kernel_config<256, 13>, kernel_config<256, 10>>
        >,
        // Fallback for all remaining key/value size combinations.
        radix_sort_config<
            6,
            4,
            scan,
            kernel_config<
                limit_block_size<256U, sizeof(Value), ROCPRIM_WARP_SIZE_64>::value,
                ::rocprim::max(1u, 15u / item_scale)
            >,
            kernel_config<
                limit_block_size<256U, sizeof(Value), ROCPRIM_WARP_SIZE_64>::value,
                ::rocprim::max(1u, 10u / item_scale)
            >,
            kernel_config<
                limit_block_size<256U, sizeof(Value), ROCPRIM_WARP_SIZE_64>::value,
                ::rocprim::max(1u, 10u / item_scale)
            >
        >
    >;
};
// Specialization of the architecture 803 tuning for Value == empty_type
// (presumably the keys-only sort; confirm against the callers).
// The configuration is selected purely by sizeof(Key); no fallback entry is listed.
template<class Key>
struct radix_sort_config_803<Key, empty_type>
    : select_type<
          select_type_case<
              sizeof(Key) == 1,
              radix_sort_config<8, 7, kernel_config<256, 2>, kernel_config<256, 10>, kernel_config<256, 19>>
          >,
          select_type_case<
              sizeof(Key) == 2,
              radix_sort_config<8, 7, kernel_config<256, 2>, kernel_config<256, 10>, kernel_config<256, 16>>
          >,
          select_type_case<
              sizeof(Key) == 4,
              radix_sort_config<7, 6, kernel_config<256, 2>, kernel_config<256, 9>, kernel_config<256, 15>>
          >,
          select_type_case<
              sizeof(Key) == 8,
              radix_sort_config<7, 6, kernel_config<256, 2>, kernel_config<256, 7>, kernel_config<256, 12>>
          >
      >
{
};
// Default radix sort tuning for target architecture 900 when sorting key/value pairs.
// This struct is also the fallback used by default_radix_sort_config for
// architectures without a dedicated entry.
// radix_sort_config arguments are, in order: long radix bits, short radix bits,
// scan config, sort config, sort-single config[, sort-merge config].
template<class Key, class Value>
struct radix_sort_config_900
{
    // ceiling_div of the larger of sizeof(Key) / sizeof(Value) by sizeof(int);
    // used to scale down the second kernel_config argument in the fallback entry.
    static constexpr unsigned int item_scale = ::rocprim::detail::ceiling_div<unsigned int>(
        ::rocprim::max(sizeof(Key), sizeof(Value)),
        sizeof(int));

    using scan = kernel_config<256, 2>;

    using type = select_type<
        select_type_case<
            (sizeof(Key) == 1 && sizeof(Value) <= 8),
            radix_sort_config<4, 4, scan, kernel_config<256, 10>, kernel_config<256, 19>>
        >,
        select_type_case<
            (sizeof(Key) == 2 && sizeof(Value) <= 8),
            radix_sort_config<6, 5, scan, kernel_config<256, 10>, kernel_config<256, 17>>
        >,
        select_type_case<
            (sizeof(Key) == 4 && sizeof(Value) <= 8),
            radix_sort_config<7, 6, scan, kernel_config<256, 15>, kernel_config<256, 15>>
        >,
        select_type_case<
            (sizeof(Key) == 8 && sizeof(Value) <= 8),
            radix_sort_config<7, 6, scan, kernel_config<256, 15>, kernel_config<256, 12>>
        >,
        // Fallback for all remaining key/value size combinations.
        radix_sort_config<
            6,
            4,
            scan,
            kernel_config<
                limit_block_size<256U, sizeof(Value), ROCPRIM_WARP_SIZE_64>::value,
                ::rocprim::max(1u, 15u / item_scale)
            >,
            kernel_config<
                limit_block_size<256U, sizeof(Value), ROCPRIM_WARP_SIZE_64>::value,
                ::rocprim::max(1u, 10u / item_scale)
            >,
            kernel_config<
                limit_block_size<256U, sizeof(Value), ROCPRIM_WARP_SIZE_64>::value,
                ::rocprim::max(1u, 10u / item_scale)
            >
        >
    >;
};
// Specialization of the architecture 900 tuning for Value == empty_type
// (presumably the keys-only sort; confirm against the callers).
// The configuration is selected purely by sizeof(Key); no fallback entry is listed.
template<class Key>
struct radix_sort_config_900<Key, empty_type>
    : select_type<
          select_type_case<
              sizeof(Key) == 1,
              radix_sort_config<4, 3, kernel_config<256, 2>, kernel_config<256, 10>, kernel_config<256, 19>>
          >,
          select_type_case<
              sizeof(Key) == 2,
              radix_sort_config<6, 5, kernel_config<256, 2>, kernel_config<256, 10>, kernel_config<256, 16>>
          >,
          select_type_case<
              sizeof(Key) == 4,
              radix_sort_config<7, 6, kernel_config<256, 2>, kernel_config<256, 17>, kernel_config<256, 15>>
          >,
          select_type_case<
              sizeof(Key) == 8,
              radix_sort_config<7, 6, kernel_config<256, 2>, kernel_config<256, 15>, kernel_config<256, 12>>
          >
      >
{
};
// Default radix sort tuning for target architecture 908 when sorting key/value pairs.
// radix_sort_config arguments are, in order: long radix bits, short radix bits,
// scan config, sort config, sort-single config[, sort-merge config].
template<class Key, class Value>
struct radix_sort_config_908
{
    // ceiling_div of the larger of sizeof(Key) / sizeof(Value) by sizeof(int);
    // used to scale down the second kernel_config argument in the fallback entry.
    static constexpr unsigned int item_scale = ::rocprim::detail::ceiling_div<unsigned int>(
        ::rocprim::max(sizeof(Key), sizeof(Value)),
        sizeof(int));

    using scan = kernel_config<256, 2>;

    using type = select_type<
        select_type_case<
            (sizeof(Key) == 1 && sizeof(Value) <= 8),
            radix_sort_config<4, 4, scan, kernel_config<256, 10>, kernel_config<256, 19>>
        >,
        select_type_case<
            (sizeof(Key) == 2 && sizeof(Value) <= 8),
            radix_sort_config<6, 5, scan, kernel_config<256, 10>, kernel_config<256, 17>>
        >,
        // The 4- and 8-byte key cases use their own scan config (256, 4) instead of `scan`.
        select_type_case<
            (sizeof(Key) == 4 && sizeof(Value) <= 8),
            radix_sort_config<7, 6, kernel_config<256, 4>, kernel_config<256, 15>, kernel_config<256, 15>>
        >,
        select_type_case<
            (sizeof(Key) == 8 && sizeof(Value) <= 8),
            radix_sort_config<7, 6, kernel_config<256, 4>, kernel_config<256, 14>, kernel_config<256, 12>>
        >,
        // Fallback for all remaining key/value size combinations.
        radix_sort_config<
            6,
            4,
            scan,
            kernel_config<
                limit_block_size<256U, sizeof(Value), ROCPRIM_WARP_SIZE_64>::value,
                ::rocprim::max(1u, 15u / item_scale)
            >,
            kernel_config<
                limit_block_size<256U, sizeof(Value), ROCPRIM_WARP_SIZE_64>::value,
                ::rocprim::max(1u, 10u / item_scale)
            >,
            kernel_config<
                limit_block_size<256U, sizeof(Value), ROCPRIM_WARP_SIZE_64>::value,
                ::rocprim::max(1u, 10u / item_scale)
            >
        >
    >;
};
// Specialization of the architecture 908 tuning for Value == empty_type
// (presumably the keys-only sort; confirm against the callers).
// The configuration is selected purely by sizeof(Key); no fallback entry is listed.
template<class Key>
struct radix_sort_config_908<Key, empty_type>
    : select_type<
          select_type_case<
              sizeof(Key) == 1,
              radix_sort_config<4, 3, kernel_config<256, 2>, kernel_config<256, 10>, kernel_config<256, 19>>
          >,
          select_type_case<
              sizeof(Key) == 2,
              radix_sort_config<6, 5, kernel_config<256, 2>, kernel_config<256, 10>, kernel_config<256, 17>>
          >,
          select_type_case<
              sizeof(Key) == 4,
              radix_sort_config<7, 6, kernel_config<256, 4>, kernel_config<256, 17>, kernel_config<256, 15>>
          >,
          select_type_case<
              sizeof(Key) == 8,
              radix_sort_config<7, 6, kernel_config<256, 4>, kernel_config<256, 15>, kernel_config<256, 12>>
          >
      >
{
};
// TODO: We need to update these parameters
//
// Default radix sort tuning for target architecture 90a when sorting key/value pairs.
// radix_sort_config arguments are, in order: long radix bits, short radix bits,
// scan config, sort config, sort-single config[, sort-merge config].
template<class Key, class Value>
struct radix_sort_config_90a
{
    // ceiling_div of the larger of sizeof(Key) / sizeof(Value) by sizeof(int);
    // used to scale down the second kernel_config argument in the fallback entry.
    static constexpr unsigned int item_scale = ::rocprim::detail::ceiling_div<unsigned int>(
        ::rocprim::max(sizeof(Key), sizeof(Value)),
        sizeof(int));

    using scan = kernel_config<256, 1>;

    using type = select_type<
        select_type_case<
            (sizeof(Key) == 1 && sizeof(Value) <= 8),
            radix_sort_config<4, 4, scan, kernel_config<256, 5>, kernel_config<256, 19>>
        >,
        select_type_case<
            (sizeof(Key) == 2 && sizeof(Value) <= 8),
            radix_sort_config<6, 5, scan, kernel_config<256, 5>, kernel_config<256, 17>>
        >,
        select_type_case<
            (sizeof(Key) == 4 && sizeof(Value) <= 8),
            radix_sort_config<7, 6, scan, kernel_config<256, 7>, kernel_config<256, 15>>
        >,
        select_type_case<
            (sizeof(Key) == 8 && sizeof(Value) <= 8),
            radix_sort_config<7, 6, scan, kernel_config<256, 7>, kernel_config<256, 14>>
        >,
        // Fallback for all remaining key/value size combinations.
        // NOTE(review): unlike the other architectures' fallbacks, only two kernel
        // configs follow `scan` here, so the sort-merge config takes the
        // radix_sort_config default - confirm this is intended.
        radix_sort_config<
            6,
            4,
            scan,
            kernel_config<
                limit_block_size<256U, sizeof(Value), ROCPRIM_WARP_SIZE_64>::value,
                ::rocprim::max(1u, 15u / item_scale)
            >,
            kernel_config<
                limit_block_size<256U, sizeof(Value), ROCPRIM_WARP_SIZE_64>::value,
                ::rocprim::max(1u, 10u / item_scale)
            >
        >
    >;
};
// Specialization of the architecture 90a tuning for Value == empty_type
// (presumably the keys-only sort; confirm against the callers).
// The configuration is selected purely by sizeof(Key); no fallback entry is listed.
template<class Key>
struct radix_sort_config_90a<Key, empty_type>
    : select_type<
          select_type_case<
              sizeof(Key) == 1,
              radix_sort_config<4, 3, kernel_config<256, 1>, kernel_config<256, 5>, kernel_config<256, 19>>
          >,
          select_type_case<
              sizeof(Key) == 2,
              radix_sort_config<6, 5, kernel_config<256, 1>, kernel_config<256, 5>, kernel_config<256, 17>>
          >,
          select_type_case<
              sizeof(Key) == 4,
              radix_sort_config<7, 6, kernel_config<256, 1>, kernel_config<256, 8>, kernel_config<256, 15>>
          >,
          select_type_case<
              sizeof(Key) == 8,
              radix_sort_config<7, 6, kernel_config<256, 1>, kernel_config<256, 7>, kernel_config<256, 14>>
          >
      >
{
};
// TODO: We need to update these parameters
//
// Default radix sort tuning for target architecture 1030 when sorting key/value pairs.
// radix_sort_config arguments are, in order: long radix bits, short radix bits,
// scan config, sort config, sort-single config[, sort-merge config].
template<class Key, class Value>
struct radix_sort_config_1030
{
    // ceiling_div of the larger of sizeof(Key) / sizeof(Value) by sizeof(int);
    // used to scale down the second kernel_config argument in the fallback entry.
    static constexpr unsigned int item_scale = ::rocprim::detail::ceiling_div<unsigned int>(
        ::rocprim::max(sizeof(Key), sizeof(Value)),
        sizeof(int));

    using scan = kernel_config<256, 2>;

    using type = select_type<
        select_type_case<
            (sizeof(Key) == 1 && sizeof(Value) <= 8),
            radix_sort_config<4, 4, scan, kernel_config<256, 10>, kernel_config<256, 19>>
        >,
        select_type_case<
            (sizeof(Key) == 2 && sizeof(Value) <= 8),
            radix_sort_config<6, 5, scan, kernel_config<256, 10>, kernel_config<256, 17>>
        >,
        select_type_case<
            (sizeof(Key) == 4 && sizeof(Value) <= 8),
            radix_sort_config<7, 6, scan, kernel_config<256, 15>, kernel_config<256, 15>>
        >,
        select_type_case<
            (sizeof(Key) == 8 && sizeof(Value) <= 8),
            radix_sort_config<7, 6, scan, kernel_config<256, 15>, kernel_config<256, 14>>
        >,
        // Fallback for all remaining key/value size combinations.
        // NOTE(review): the sort config limits its block size with
        // ROCPRIM_WARP_SIZE_32 while the two following configs use
        // ROCPRIM_WARP_SIZE_64 - confirm the mix is intended.
        radix_sort_config<
            6,
            4,
            scan,
            kernel_config<
                limit_block_size<256U, sizeof(Value), ROCPRIM_WARP_SIZE_32>::value,
                ::rocprim::max(1u, 15u / item_scale)
            >,
            kernel_config<
                limit_block_size<256U, sizeof(Value), ROCPRIM_WARP_SIZE_64>::value,
                ::rocprim::max(1u, 10u / item_scale)
            >,
            kernel_config<
                limit_block_size<256U, sizeof(Value), ROCPRIM_WARP_SIZE_64>::value,
                ::rocprim::max(1u, 10u / item_scale)
            >
        >
    >;
};
// Specialization of the architecture 1030 tuning for Value == empty_type
// (presumably the keys-only sort; confirm against the callers).
// The configuration is selected purely by sizeof(Key); no fallback entry is listed.
template<class Key>
struct radix_sort_config_1030<Key, empty_type>
    : select_type<
          select_type_case<
              sizeof(Key) == 1,
              radix_sort_config<4, 3, kernel_config<256, 2>, kernel_config<256, 10>, kernel_config<256, 19>>
          >,
          select_type_case<
              sizeof(Key) == 2,
              radix_sort_config<6, 5, kernel_config<256, 2>, kernel_config<256, 10>, kernel_config<256, 19>>
          >,
          select_type_case<
              sizeof(Key) == 4,
              radix_sort_config<7, 6, kernel_config<256, 2>, kernel_config<256, 17>, kernel_config<256, 17>>
          >,
          select_type_case<
              sizeof(Key) == 8,
              radix_sort_config<7, 6, kernel_config<256, 2>, kernel_config<256, 15>, kernel_config<256, 15>>
          >
      >
{
};
// Maps a target architecture id to its default radix sort configuration for
// the given Key/Value types. Architectures without a dedicated case use the
// architecture 900 configuration (the last select_arch argument).
template<unsigned int TargetArch, class Key, class Value>
struct default_radix_sort_config
    : select_arch<
          TargetArch,
          select_arch_case<803, radix_sort_config_803<Key, Value>>,
          select_arch_case<900, radix_sort_config_900<Key, Value>>,
          select_arch_case<908, radix_sort_config_908<Key, Value>>,
          select_arch_case<ROCPRIM_ARCH_90a, radix_sort_config_90a<Key, Value>>,
          select_arch_case<1030, radix_sort_config_1030<Key, Value>>,
          radix_sort_config_900<Key, Value>
      >
{
};
}
// end namespace detail
END_ROCPRIM_NAMESPACE
/// @}
// end of group primitivesmodule_deviceconfigs
#endif // ROCPRIM_DEVICE_DEVICE_RADIX_SORT_CONFIG_HPP_
3rdparty/cub/rocprim/device/device_reduce.hpp
0 → 100644
View file @
f8a481f8
// Copyright (c) 2017-2022 Advanced Micro Devices, Inc. All rights reserved.
//
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to deal
// in the Software without restriction, including without limitation the rights
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
// copies of the Software, and to permit persons to whom the Software is
// furnished to do so, subject to the following conditions:
//
// The above copyright notice and this permission notice shall be included in
// all copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
// THE SOFTWARE.
#ifndef ROCPRIM_DEVICE_DEVICE_REDUCE_HPP_
#define ROCPRIM_DEVICE_DEVICE_REDUCE_HPP_
#include <algorithm>
#include <chrono>
#include <iostream>
#include <iterator>
#include <type_traits>
#include "../config.hpp"
#include "../detail/various.hpp"
#include "../detail/match_result_type.hpp"
#include "device_reduce_config.hpp"
#include "detail/device_reduce.hpp"
BEGIN_ROCPRIM_NAMESPACE
/// \addtogroup devicemodule
/// @{
namespace
detail
{
// Kernel entry point for one level of the device-wide reduction.
// It only forwards its arguments to block_reduce_kernel_impl, instantiated with
// the same WithInitialValue / Config / ResultType template arguments.
template<
    bool WithInitialValue,
    class Config,
    class ResultType,
    class InputIterator,
    class OutputIterator,
    class InitValueType,
    class BinaryFunction
>
ROCPRIM_KERNEL
__launch_bounds__(ROCPRIM_DEFAULT_MAX_BLOCK_SIZE)
void block_reduce_kernel(InputIterator input,
                         const size_t size,
                         OutputIterator output,
                         InitValueType initial_value,
                         BinaryFunction reduce_op)
{
    block_reduce_kernel_impl<WithInitialValue, Config, ResultType>(
        input, size, output, initial_value, reduce_op
    );
}
// Debug helper: does nothing unless `debug_synchronous` is true. When it is,
// prints `name(size)`, waits on `stream` with cudaStreamSynchronize (returning
// the error from the enclosing function if that fails) and then prints the
// time elapsed since `start` in milliseconds.
// `debug_synchronous` and `stream` are not macro parameters; they must be in
// scope where the macro is expanded, and the enclosing function must return
// the runtime's error type.
#define ROCPRIM_DETAIL_HIP_SYNC(name, size, start) \
if(debug_synchronous) \
{ \
std::cout << name << "(" << size << ")"; \
auto _error = cudaStreamSynchronize(stream); \
if(_error != cudaSuccess) return _error; \
auto _end = std::chrono::high_resolution_clock::now(); \
auto _d = std::chrono::duration_cast<std::chrono::duration<double>>(_end - start); \
std::cout << " " << _d.count() * 1000 << " ms" << '\n'; \
}
// Post-launch check: always queries cudaGetLastError() and returns the error
// from the enclosing function if one is reported. In addition, when
// `debug_synchronous` is true, prints `name(size)`, waits on `stream` with
// cudaStreamSynchronize (again returning on failure) and prints the time
// elapsed since `start` in milliseconds.
// `debug_synchronous` and `stream` are not macro parameters; they must be in
// scope where the macro is expanded, and the enclosing function must return
// the runtime's error type.
#define ROCPRIM_DETAIL_HIP_SYNC_AND_RETURN_ON_ERROR(name, size, start) \
{ \
auto _error = cudaGetLastError(); \
if(_error != cudaSuccess) return _error; \
if(debug_synchronous) \
{ \
std::cout << name << "(" << size << ")"; \
auto __error = cudaStreamSynchronize(stream); \
if(__error != cudaSuccess) return __error; \
auto _end = std::chrono::high_resolution_clock::now(); \
auto _d = std::chrono::duration_cast<std::chrono::duration<double>>(_end - start); \
std::cout << " " << _d.count() * 1000 << " ms" << '\n'; \
} \
}
// Implementation shared by both rocprim::reduce overloads.
//
// * With a null temporary_storage it only writes the required temporary
//   storage size (at least 4 bytes) to storage_size and returns.
// * If the input fits into a single block, one block_reduce_kernel launch
//   writes the final result to output.
// * Otherwise every block writes a partial result into temporary_storage
//   (launched in batches of at most number_of_blocks_limit blocks), and
//   reduce_impl is called again on those partial results, using the rest of
//   temporary_storage as the nested call's temporary storage.
//
// Returns cudaSuccess, or the first error reported after a kernel launch,
// by a debug synchronization, or by the nested call.
template<
    bool WithInitialValue, // true when initial_value should be used in reduction
    class Config,
    class InputIterator,
    class OutputIterator,
    class InitValueType,
    class BinaryFunction
>
inline
cudaError_t reduce_impl(void * temporary_storage,
                        size_t& storage_size,
                        InputIterator input,
                        OutputIterator output,
                        const InitValueType initial_value,
                        const size_t size,
                        BinaryFunction reduce_op,
                        const cudaStream_t stream,
                        bool debug_synchronous)
{
    using value_type = typename std::iterator_traits<InputIterator>::value_type;
    using result_type = typename ::rocprim::detail::match_result_type<
        value_type, BinaryFunction
    >::type;

    // Get default config if Config is default_config
    using config = default_or_custom_config<
        Config,
        default_reduce_config<ROCPRIM_TARGET_ARCH, result_type>
    >;

    constexpr unsigned int block_size = config::block_size;
    constexpr unsigned int items_per_thread = config::items_per_thread;
    constexpr auto items_per_block = block_size * items_per_thread;

    // Size query: report the storage requirement and do no work.
    if(temporary_storage == nullptr)
    {
        storage_size = reduce_get_temporary_storage_bytes<result_type>(size, items_per_block);
        // Make sure user won't try to allocate 0 bytes memory
        if(storage_size == 0)
        {
            storage_size = 4;
        }
        return cudaSuccess;
    }

    // Start point for time measurements
    std::chrono::high_resolution_clock::time_point timer_start;

    static constexpr auto size_limit = config::size_limit;
    static constexpr auto number_of_blocks_limit
        = ::rocprim::max<size_t>(size_limit / items_per_block, 1);

    auto block_count = (size + items_per_block - 1) / items_per_block;
    if(debug_synchronous)
    {
        std::cout << "block_size " << block_size << '\n';
        std::cout << "number of blocks " << block_count << '\n';
        std::cout << "number of blocks limit " << number_of_blocks_limit << '\n';
        std::cout << "items_per_block " << items_per_block << '\n';
    }

    // Single-block case: reduce straight into the output.
    if(block_count <= 1)
    {
        if(debug_synchronous)
        {
            timer_start = std::chrono::high_resolution_clock::now();
        }
        detail::block_reduce_kernel<WithInitialValue, config, result_type>
            <<<dim3(1), dim3(block_size), 0, stream>>>(
                input, size, output, initial_value, reduce_op
            );
        ROCPRIM_DETAIL_HIP_SYNC_AND_RETURN_ON_ERROR("block_reduce_kernel", size, timer_start);
        return cudaSuccess;
    }

    // Multi-block case.
    // Pointer to array with block_prefixes
    result_type * block_prefixes = static_cast<result_type*>(temporary_storage);

    static constexpr auto aligned_size_limit = number_of_blocks_limit * items_per_block;

    // Launch number_of_blocks_limit blocks while there is still at least as many blocks left as the limit
    const auto launch_count = (size + aligned_size_limit - 1) / aligned_size_limit;
    for(size_t launch = 0, first_item = 0;
        launch < launch_count;
        ++launch, first_item += aligned_size_limit)
    {
        const auto current_size = std::min<size_t>(size - first_item, aligned_size_limit);
        const auto current_blocks = (current_size + items_per_block - 1) / items_per_block;

        if(debug_synchronous)
        {
            timer_start = std::chrono::high_resolution_clock::now();
        }
        // Partial reductions never apply the initial value; only the final level does.
        detail::block_reduce_kernel<false, config, result_type>
            <<<dim3(current_blocks), dim3(block_size), 0, stream>>>(
                input + first_item,
                current_size,
                block_prefixes + launch * number_of_blocks_limit,
                initial_value,
                reduce_op
            );
        ROCPRIM_DETAIL_HIP_SYNC_AND_RETURN_ON_ERROR("block_reduce_kernel", current_size, timer_start);
    }

    // The nested reduction uses the storage that follows the partial results.
    void * nested_temp_storage = static_cast<void*>(block_prefixes + block_count);
    auto nested_temp_storage_size = storage_size - (block_count * sizeof(result_type));

    if(debug_synchronous)
    {
        timer_start = std::chrono::high_resolution_clock::now();
    }
    auto nested_status = reduce_impl<WithInitialValue, config>(
        nested_temp_storage,
        nested_temp_storage_size,
        block_prefixes, // input
        output, // output
        initial_value,
        block_count, // input size
        reduce_op,
        stream,
        debug_synchronous
    );
    if(nested_status != cudaSuccess)
    {
        return nested_status;
    }
    ROCPRIM_DETAIL_HIP_SYNC("nested_device_reduce", block_count, timer_start);

    return cudaSuccess;
}
#undef ROCPRIM_DETAIL_HIP_SYNC_AND_RETURN_ON_ERROR
#undef ROCPRIM_DETAIL_HIP_SYNC
}
// end of detail namespace
/// \brief Parallel reduction primitive for device level.
///
/// reduce function performs a device-wide reduction operation
/// using binary \p reduce_op operator.
///
/// \par Overview
/// * Does not support non-commutative reduction operators. Reduction operator should also be
/// associative. When used with non-associative functions the results may be non-deterministic
/// and/or vary in precision.
/// * Returns the required size of \p temporary_storage in \p storage_size
/// if \p temporary_storage is a null pointer.
/// * Ranges specified by \p input must have at least \p size elements, while \p output
/// only needs one element.
/// * By default, the input type is used for accumulation. A custom type
/// can be specified using <tt>rocprim::transform_iterator</tt>, see the example below.
///
/// \tparam Config - [optional] configuration of the primitive. It can be \p reduce_config or
/// a custom class with the same members.
/// \tparam InputIterator - random-access iterator type of the input range. Must meet the
/// requirements of a C++ InputIterator concept. It can be a simple pointer type.
/// \tparam OutputIterator - random-access iterator type of the output range. Must meet the
/// requirements of a C++ OutputIterator concept. It can be a simple pointer type.
/// \tparam InitValueType - type of the initial value.
/// \tparam BinaryFunction - type of binary function used for reduction. Default type
/// is \p rocprim::plus<T>, where \p T is a \p value_type of \p InputIterator.
///
/// \param [in] temporary_storage - pointer to a device-accessible temporary storage. When
/// a null pointer is passed, the required allocation size (in bytes) is written to
/// \p storage_size and function returns without performing the reduction operation.
/// \param [in,out] storage_size - reference to a size (in bytes) of \p temporary_storage.
/// \param [in] input - iterator to the first element in the range to reduce.
/// \param [out] output - iterator to the first element in the output range. It can be
/// same as \p input.
/// \param [in] initial_value - initial value to start the reduction.
/// \param [in] size - number of elements in the input range.
/// \param [in] reduce_op - binary operation function object that will be used for reduction.
/// The signature of the function should be equivalent to the following:
/// <tt>T f(const T &a, const T &b);</tt>. The signature does not need to have
/// <tt>const &</tt>, but function object must not modify the objects passed to it.
/// The default value is \p BinaryFunction().
/// \param [in] stream - [optional] HIP stream object. The default is \p 0 (default stream).
/// \param [in] debug_synchronous - [optional] If true, synchronization after every kernel
/// launch is forced in order to check for errors. The default value is \p false.
///
/// \returns \p cudaSuccess (\p 0) after successful reduction; otherwise a HIP runtime error of
/// type \p cudaError_t.
///
/// \par Example
/// \parblock
/// In this example a device-level min-reduction operation is performed on an array of
/// integer values (<tt>short</tt>s are reduced into <tt>int</tt>s) using custom operator.
///
/// \code{.cpp}
/// #include <rocprim/rocprim.hpp>
///
/// // custom reduce function
/// auto min_op =
/// [] __device__ (int a, int b) -> int
/// {
/// return a < b ? a : b;
/// };
///
/// // Prepare input and output (declare pointers, allocate device memory etc.)
/// size_t input_size; // e.g., 8
/// short * input; // e.g., [4, 7, 6, 2, 5, 1, 3, 8]
/// int * output; // empty array of 1 element
/// int start_value; // e.g., 9
///
/// size_t temporary_storage_size_bytes;
/// void * temporary_storage_ptr = nullptr;
/// // Get required size of the temporary storage
/// rocprim::reduce(
/// temporary_storage_ptr, temporary_storage_size_bytes,
/// input, output, start_value, input_size, min_op
/// );
///
/// // allocate temporary storage
/// cudaMalloc(&temporary_storage_ptr, temporary_storage_size_bytes);
///
/// // perform reduce
/// rocprim::reduce(
/// temporary_storage_ptr, temporary_storage_size_bytes,
/// input, output, start_value, input_size, min_op
/// );
/// // output: [1]
/// \endcode
///
/// The same example as above, but now a custom accumulator type is specified.
///
/// \code{.cpp}
/// #include <rocprim/rocprim.hpp>
///
/// auto min_op =
/// [] __device__ (int a, int b) -> int
/// {
/// return a < b ? a : b;
/// };
///
/// size_t input_size;
/// short * input;
/// int * output;
/// int start_value;
///
/// // Use a transform iterator to specify a custom accumulator type
/// auto input_iterator = rocprim::make_transform_iterator(
/// input, [] __device__ (short in) { return static_cast<int>(in); });
///
/// size_t temporary_storage_size_bytes;
/// void * temporary_storage_ptr = nullptr;
/// // Use the transform iterator
/// rocprim::reduce(
/// temporary_storage_ptr, temporary_storage_size_bytes,
/// input_iterator, output, start_value, input_size, min_op
/// );
///
/// cudaMalloc(&temporary_storage_ptr, temporary_storage_size_bytes);
///
/// rocprim::reduce(
/// temporary_storage_ptr, temporary_storage_size_bytes,
/// input_iterator, output, start_value, input_size, min_op
/// );
/// \endcode
/// \endparblock
template<
    class Config = default_config,
    class InputIterator,
    class OutputIterator,
    class InitValueType,
    class BinaryFunction
        = ::rocprim::plus<typename std::iterator_traits<InputIterator>::value_type>
>
inline
cudaError_t reduce(void * temporary_storage,
                   size_t& storage_size,
                   InputIterator input,
                   OutputIterator output,
                   const InitValueType initial_value,
                   const size_t size,
                   BinaryFunction reduce_op = BinaryFunction(),
                   const cudaStream_t stream = 0,
                   bool debug_synchronous = false)
{
    // WithInitialValue = true: initial_value takes part in the reduction.
    return detail::reduce_impl<true, Config>(
        temporary_storage, storage_size,
        input, output, initial_value, size,
        reduce_op, stream, debug_synchronous
    );
}
/// \brief Parallel reduce primitive for device level.
///
/// reduce function performs a device-wide reduction operation
/// using binary \p reduce_op operator.
///
/// \par Overview
/// * Does not support non-commutative reduction operators. Reduction operator should also be
/// associative. When used with non-associative functions the results may be non-deterministic
/// and/or vary in precision.
/// * Returns the required size of \p temporary_storage in \p storage_size
/// if \p temporary_storage is a null pointer.
/// * Ranges specified by \p input must have at least \p size elements, while \p output
/// only needs one element.
/// * By default, the input type is used for accumulation. A custom type
/// can be specified using <tt>rocprim::transform_iterator</tt>, see the example below.
///
/// \tparam Config - [optional] configuration of the primitive. It can be \p reduce_config or
/// a custom class with the same members.
/// \tparam InputIterator - random-access iterator type of the input range. Must meet the
/// requirements of a C++ InputIterator concept. It can be a simple pointer type.
/// \tparam OutputIterator - random-access iterator type of the output range. Must meet the
/// requirements of a C++ OutputIterator concept. It can be a simple pointer type.
/// \tparam BinaryFunction - type of binary function used for reduction. Default type
/// is \p rocprim::plus<T>, where \p T is a \p value_type of \p InputIterator.
///
/// \param [in] temporary_storage - pointer to a device-accessible temporary storage. When
/// a null pointer is passed, the required allocation size (in bytes) is written to
/// \p storage_size and function returns without performing the reduction operation.
/// \param [in,out] storage_size - reference to a size (in bytes) of \p temporary_storage.
/// \param [in] input - iterator to the first element in the range to reduce.
/// \param [out] output - iterator to the first element in the output range. It can be
/// same as \p input.
/// \param [in] size - number of elements in the input range.
/// \param [in] reduce_op - binary operation function object that will be used for reduction.
/// The signature of the function should be equivalent to the following:
/// <tt>T f(const T &a, const T &b);</tt>. The signature does not need to have
/// <tt>const &</tt>, but function object must not modify the objects passed to it.
/// Default is BinaryFunction().
/// \param [in] stream - [optional] HIP stream object. Default is \p 0 (default stream).
/// \param [in] debug_synchronous - [optional] If true, synchronization after every kernel
/// launch is forced in order to check for errors. Default value is \p false.
///
/// \returns \p cudaSuccess (\p 0) after successful reduction; otherwise a HIP runtime error of
/// type \p cudaError_t.
///
/// \par Example
/// \parblock
/// In this example a device-level sum operation is performed on an array of
/// integer values (<tt>short</tt>s are reduced into <tt>int</tt>s).
///
/// \code{.cpp}
/// #include <rocprim/rocprim.hpp>
///
/// // Prepare input and output (declare pointers, allocate device memory etc.)
/// size_t input_size; // e.g., 8
/// short * input; // e.g., [1, 2, 3, 4, 5, 6, 7, 8]
/// int * output; // empty array of 1 element
///
/// size_t temporary_storage_size_bytes;
/// void * temporary_storage_ptr = nullptr;
/// // Get required size of the temporary storage
/// rocprim::reduce(
/// temporary_storage_ptr, temporary_storage_size_bytes,
/// input, output, input_size, rocprim::plus<int>()
/// );
///
/// // allocate temporary storage
/// cudaMalloc(&temporary_storage_ptr, temporary_storage_size_bytes);
///
/// // perform reduce
/// rocprim::reduce(
/// temporary_storage_ptr, temporary_storage_size_bytes,
/// input, output, input_size, rocprim::plus<int>()
/// );
/// // output: [36]
/// \endcode
///
/// The same example as above, but now a custom accumulator type is specified.
///
/// \code{.cpp}
/// #include <rocprim/rocprim.hpp>
///
/// size_t input_size;
/// short * input;
/// int * output;
///
/// // Use a transform iterator to specify a custom accumulator type
/// auto input_iterator = rocprim::make_transform_iterator(
/// input, [] __device__ (short in) { return static_cast<int>(in); });
///
/// size_t temporary_storage_size_bytes;
/// void * temporary_storage_ptr = nullptr;
/// // Use the transform iterator
/// rocprim::reduce(
/// temporary_storage_ptr, temporary_storage_size_bytes,
/// input_iterator, output, input_size, rocprim::plus<int>()
/// );
///
/// cudaMalloc(&temporary_storage_ptr, temporary_storage_size_bytes);
///
/// rocprim::reduce(
/// temporary_storage_ptr, temporary_storage_size_bytes,
/// input_iterator, output, input_size, rocprim::plus<int>()
/// );
/// \endcode
/// \endparblock
template<
    class Config = default_config,
    class InputIterator,
    class OutputIterator,
    class BinaryFunction
        = ::rocprim::plus<typename std::iterator_traits<InputIterator>::value_type>
>
inline
cudaError_t reduce(void * temporary_storage,
                   size_t& storage_size,
                   InputIterator input,
                   OutputIterator output,
                   const size_t size,
                   BinaryFunction reduce_op = BinaryFunction(),
                   const cudaStream_t stream = 0,
                   bool debug_synchronous = false)
{
    using value_type = typename std::iterator_traits<InputIterator>::value_type;

    // WithInitialValue = false: a value-initialized value_type is passed only
    // to fill the initial_value parameter slot.
    return detail::reduce_impl<false, Config>(
        temporary_storage, storage_size,
        input, output, value_type(), size,
        reduce_op, stream, debug_synchronous
    );
}
/// @}
// end of group devicemodule
END_ROCPRIM_NAMESPACE
#endif // ROCPRIM_DEVICE_DEVICE_REDUCE_HPP_
3rdparty/cub/rocprim/device/device_reduce_by_key.hpp
0 → 100644
View file @
f8a481f8
// Copyright (c) 2017-2022 Advanced Micro Devices, Inc. All rights reserved.
//
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to deal
// in the Software without restriction, including without limitation the rights
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
// copies of the Software, and to permit persons to whom the Software is
// furnished to do so, subject to the following conditions:
//
// The above copyright notice and this permission notice shall be included in
// all copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
// THE SOFTWARE.
#ifndef ROCPRIM_DEVICE_DEVICE_REDUCE_BY_KEY_HPP_
#define ROCPRIM_DEVICE_DEVICE_REDUCE_BY_KEY_HPP_
#include <iterator>
#include <iostream>
#include <chrono>
#include "../config.hpp"
#include "../detail/various.hpp"
#include "../detail/match_result_type.hpp"
#include "../functional.hpp"
#include "device_reduce_by_key_config.hpp"
#include "detail/device_reduce_by_key.hpp"
BEGIN_ROCPRIM_NAMESPACE
/// \addtogroup devicemodule
/// @{
namespace
detail
{
// Kernel entry point that reduce_by_key_impl launches first.
// It does no work of its own: every argument is forwarded unchanged to
// fill_unique_counts<BlockSize, ItemsPerThread>. BlockSize is also used as the
// __launch_bounds__ value of the kernel.
template<
    unsigned int BlockSize,
    unsigned int ItemsPerThread,
    class KeysInputIterator,
    class KeyCompareFunction
>
ROCPRIM_KERNEL
__launch_bounds__(BlockSize)
void fill_unique_counts_kernel(KeysInputIterator  keys_input,
                               unsigned int       size,
                               unsigned int*      unique_counts,
                               KeyCompareFunction key_compare_op,
                               unsigned int       blocks_per_full_batch,
                               unsigned int       full_batches)
{
    fill_unique_counts<BlockSize, ItemsPerThread>(
        keys_input, size,
        unique_counts,
        key_compare_op,
        blocks_per_full_batch, full_batches
    );
}
// Kernel entry point that reduce_by_key_impl launches second, with a single block.
// Forwards its three arguments unchanged to
// scan_unique_counts<BlockSize, ItemsPerThread>; BlockSize doubles as the
// __launch_bounds__ value.
template<
    unsigned int BlockSize,
    unsigned int ItemsPerThread,
    class UniqueCountOutputIterator
>
ROCPRIM_KERNEL
__launch_bounds__(BlockSize)
void scan_unique_counts_kernel(unsigned int*             unique_counts,
                               UniqueCountOutputIterator unique_count_output,
                               unsigned int              batches)
{
    scan_unique_counts<BlockSize, ItemsPerThread>(
        unique_counts, unique_count_output, batches
    );
}
// Kernel entry point that reduce_by_key_impl launches third.
// All arguments are passed straight through, in the same order, to
// reduce_by_key<BlockSize, ItemsPerThread>. The remaining template parameters are
// deduced from the call arguments; Result is the element type of the carry_outs
// and leading_aggregates arrays. BlockSize is also the __launch_bounds__ value.
template<
    unsigned int BlockSize,
    unsigned int ItemsPerThread,
    class KeysInputIterator,
    class ValuesInputIterator,
    class Result,
    class UniqueOutputIterator,
    class AggregatesOutputIterator,
    class KeyCompareFunction,
    class BinaryFunction
>
ROCPRIM_KERNEL
__launch_bounds__(BlockSize)
void reduce_by_key_kernel(KeysInputIterator        keys_input,
                          ValuesInputIterator      values_input,
                          unsigned int             size,
                          const unsigned int*      unique_starts,
                          carry_out<Result>*       carry_outs,
                          Result*                  leading_aggregates,
                          UniqueOutputIterator     unique_output,
                          AggregatesOutputIterator aggregates_output,
                          KeyCompareFunction       key_compare_op,
                          BinaryFunction           reduce_op,
                          unsigned int             blocks_per_full_batch,
                          unsigned int             full_batches)
{
    reduce_by_key<BlockSize, ItemsPerThread>(
        keys_input, values_input, size,
        unique_starts, carry_outs, leading_aggregates,
        unique_output, aggregates_output,
        key_compare_op, reduce_op,
        blocks_per_full_batch, full_batches
    );
}
// Kernel entry point that reduce_by_key_impl launches last, with a single block.
// Forwards its arguments unchanged to
// scan_and_scatter_carry_outs<BlockSize, ItemsPerThread>. Both scratch arrays
// (carry_outs, leading_aggregates) are taken through pointers-to-const here.
// BlockSize is also the __launch_bounds__ value.
template<
    unsigned int BlockSize,
    unsigned int ItemsPerThread,
    class Result,
    class AggregatesOutputIterator,
    class BinaryFunction
>
ROCPRIM_KERNEL
__launch_bounds__(BlockSize)
void scan_and_scatter_carry_outs_kernel(const carry_out<Result>* carry_outs,
                                        const Result*            leading_aggregates,
                                        AggregatesOutputIterator aggregates_output,
                                        BinaryFunction           reduce_op,
                                        unsigned int             batches)
{
    scan_and_scatter_carry_outs<BlockSize, ItemsPerThread>(
        carry_outs, leading_aggregates,
        aggregates_output,
        reduce_op,
        batches
    );
}
// Post-launch check used after each kernel launch in reduce_by_key_impl.
//
// Expands to a block that:
//   * calls cudaGetLastError() and returns that error from the ENCLOSING function
//     if it is not cudaSuccess;
//   * when `debug_synchronous` is true, prints `name(size)`, synchronizes `stream`
//     (returning on failure), and prints the time elapsed since `start` in ms.
//
// Parameters:
//   name  - label streamed to std::cout.
//   size  - value streamed to std::cout inside the parentheses after the label.
//   start - a std::chrono::high_resolution_clock time point taken before the launch.
//
// The macro is not self-contained: `debug_synchronous` and `stream` must be visible
// at the expansion site, and it may only be used in a function returning the
// runtime error type because it contains `return` statements.
#define ROCPRIM_DETAIL_HIP_SYNC_AND_RETURN_ON_ERROR(name, size, start) \
{ \
auto _error = cudaGetLastError(); \
if(_error != cudaSuccess) return _error; \
if(debug_synchronous) \
{ \
std::cout << name << "(" << size << ")"; \
auto __error = cudaStreamSynchronize(stream); \
if(__error != cudaSuccess) return __error; \
auto _end = std::chrono::high_resolution_clock::now(); \
auto _d = std::chrono::duration_cast<std::chrono::duration<double>>(_end - start); \
std::cout << " " << _d.count() * 1000 << " ms" << '\n'; \
} \
}
// Host-side driver behind rocprim::reduce_by_key.
//
// When temporary_storage is null, only the required scratch size is written to
// storage_size and cudaSuccess is returned. Otherwise the scratch buffer is split
// into three consecutive align_size()-padded arrays of `batches` elements each
// (unsigned int unique counts, carry_out<result_type> carry-outs, result_type
// leading aggregates) and four kernels are enqueued on `stream`, in this order:
//   1. fill_unique_counts_kernel           - `batches` blocks, reduce configuration
//   2. scan_unique_counts_kernel           - 1 block,          scan configuration
//   3. reduce_by_key_kernel                - `batches` blocks, reduce configuration
//   4. scan_and_scatter_carry_outs_kernel  - 1 block,          scan configuration
// After each launch ROCPRIM_DETAIL_HIP_SYNC_AND_RETURN_ON_ERROR returns early on a
// launch error and, if debug_synchronous is set, synchronizes and prints timing.
template<
    class Config,
    class KeysInputIterator,
    class ValuesInputIterator,
    class UniqueOutputIterator,
    class AggregatesOutputIterator,
    class UniqueCountOutputIterator,
    class BinaryFunction,
    class KeyCompareFunction
>
inline cudaError_t reduce_by_key_impl(void*                     temporary_storage,
                                      size_t&                   storage_size,
                                      KeysInputIterator         keys_input,
                                      ValuesInputIterator       values_input,
                                      const unsigned int        size,
                                      UniqueOutputIterator      unique_output,
                                      AggregatesOutputIterator  aggregates_output,
                                      UniqueCountOutputIterator unique_count_output,
                                      BinaryFunction            reduce_op,
                                      KeyCompareFunction        key_compare_op,
                                      const cudaStream_t        stream,
                                      const bool                debug_synchronous)
{
    using key_type          = typename std::iterator_traits<KeysInputIterator>::value_type;
    using values_value_type = typename std::iterator_traits<ValuesInputIterator>::value_type;
    using result_type =
        typename ::rocprim::detail::match_result_type<values_value_type, BinaryFunction>::type;
    using carry_out_type = carry_out<result_type>;

    // User-provided Config, or the per-architecture default for these key/result types.
    using config = default_or_custom_config<
        Config,
        default_reduce_by_key_config<ROCPRIM_TARGET_ARCH, key_type, result_type>
    >;
    using reduce_params = typename config::reduce;
    using scan_params   = typename config::scan;

    constexpr unsigned int items_per_block
        = reduce_params::block_size * reduce_params::items_per_thread;
    constexpr unsigned int scan_items_per_block
        = scan_params::block_size * scan_params::items_per_thread;

    // At least one block is always scheduled, even for size == 0.
    const unsigned int blocks
        = std::max(1u, ::rocprim::detail::ceiling_div(size, items_per_block));
    const unsigned int blocks_per_full_batch
        = ::rocprim::detail::ceiling_div(blocks, scan_items_per_block);
    const unsigned int leftover_blocks = blocks % scan_items_per_block;
    const unsigned int full_batches
        = leftover_blocks != 0 ? leftover_blocks : scan_items_per_block;
    const unsigned int batches
        = blocks_per_full_batch == 1 ? full_batches : scan_items_per_block;

    // Byte sizes of the three scratch arrays (one element per batch in each).
    const size_t unique_counts_bytes
        = ::rocprim::detail::align_size(batches * sizeof(unsigned int));
    const size_t carry_outs_bytes
        = ::rocprim::detail::align_size(batches * sizeof(carry_out_type));
    const size_t leading_aggregates_bytes
        = ::rocprim::detail::align_size(batches * sizeof(result_type));

    // Size-query mode: report the scratch requirement and do nothing else.
    if(temporary_storage == nullptr)
    {
        storage_size = unique_counts_bytes + carry_outs_bytes + leading_aggregates_bytes;
        return cudaSuccess;
    }

    if(debug_synchronous)
    {
        std::cout << "blocks " << blocks << '\n'
                  << "blocks_per_full_batch " << blocks_per_full_batch << '\n'
                  << "full_batches " << full_batches << '\n'
                  << "batches " << batches << '\n'
                  << "storage_size " << storage_size << '\n';
        const cudaError_t sync_status = cudaStreamSynchronize(stream);
        if(sync_status != cudaSuccess)
        {
            return sync_status;
        }
    }

    // Carve the scratch buffer: [unique_counts | carry_outs | leading_aggregates].
    char* const storage_base = reinterpret_cast<char*>(temporary_storage);
    unsigned int* unique_counts = reinterpret_cast<unsigned int*>(storage_base);
    carry_out_type* carry_outs
        = reinterpret_cast<carry_out_type*>(storage_base + unique_counts_bytes);
    result_type* leading_aggregates = reinterpret_cast<result_type*>(
        storage_base + unique_counts_bytes + carry_outs_bytes);

    // Timestamp taken before each launch; only assigned/read in debug mode.
    std::chrono::high_resolution_clock::time_point launch_time;

    if(debug_synchronous)
    {
        launch_time = std::chrono::high_resolution_clock::now();
    }
    fill_unique_counts_kernel<reduce_params::block_size, reduce_params::items_per_thread>
        <<<dim3(batches), dim3(reduce_params::block_size), 0, stream>>>(
            keys_input, size, unique_counts, key_compare_op,
            blocks_per_full_batch, full_batches
        );
    ROCPRIM_DETAIL_HIP_SYNC_AND_RETURN_ON_ERROR("fill_unique_counts", size, launch_time)

    if(debug_synchronous)
    {
        launch_time = std::chrono::high_resolution_clock::now();
    }
    scan_unique_counts_kernel<scan_params::block_size, scan_params::items_per_thread>
        <<<dim3(1), dim3(scan_params::block_size), 0, stream>>>(
            unique_counts, unique_count_output, batches
        );
    ROCPRIM_DETAIL_HIP_SYNC_AND_RETURN_ON_ERROR("scan_unique_counts", scan_params::block_size, launch_time)

    if(debug_synchronous)
    {
        launch_time = std::chrono::high_resolution_clock::now();
    }
    reduce_by_key_kernel<reduce_params::block_size, reduce_params::items_per_thread>
        <<<dim3(batches), dim3(reduce_params::block_size), 0, stream>>>(
            keys_input, values_input, size,
            const_cast<const unsigned int*>(unique_counts),
            carry_outs, leading_aggregates,
            unique_output, aggregates_output,
            key_compare_op, reduce_op,
            blocks_per_full_batch, full_batches
        );
    ROCPRIM_DETAIL_HIP_SYNC_AND_RETURN_ON_ERROR("reduce_by_key", size, launch_time)

    if(debug_synchronous)
    {
        launch_time = std::chrono::high_resolution_clock::now();
    }
    scan_and_scatter_carry_outs_kernel<scan_params::block_size, scan_params::items_per_thread>
        <<<dim3(1), dim3(scan_params::block_size), 0, stream>>>(
            const_cast<const carry_out_type*>(carry_outs),
            const_cast<const result_type*>(leading_aggregates),
            aggregates_output,
            reduce_op,
            batches
        );
    ROCPRIM_DETAIL_HIP_SYNC_AND_RETURN_ON_ERROR("scan_and_scatter_carry_outs", scan_params::block_size, launch_time)

    return cudaSuccess;
}
#undef ROCPRIM_DETAIL_HIP_SYNC_AND_RETURN_ON_ERROR
}
// end of detail namespace
/// \brief Parallel reduce-by-key primitive for device level.
///
/// reduce_by_key function performs a device-wide reduction operation of groups
/// of consecutive values having the same key using binary \p reduce_op operator. The first key of each group
/// is copied to \p unique_output and reduction of the group is written to \p aggregates_output.
/// The total number of groups is written to \p unique_count_output.
///
/// \par Overview
/// * Supports non-commutative reduction operators. However, a reduction operator should be
/// associative. When used with non-associative functions the results may be non-deterministic
/// and/or vary in precision.
/// * Returns the required size of \p temporary_storage in \p storage_size
/// if \p temporary_storage is a null pointer.
/// * Ranges specified by \p keys_input and \p values_input must have at least \p size elements.
/// * Range specified by \p unique_count_output must have at least 1 element.
/// * Ranges specified by \p unique_output and \p aggregates_output must have at least
/// <tt>*unique_count_output</tt> (i.e. the number of unique keys) elements.
///
/// \tparam Config - [optional] configuration of the primitive. It can be \p reduce_by_key_config or
/// a custom class with the same members.
/// \tparam KeysInputIterator - random-access iterator type of the input range. Must meet the
/// requirements of a C++ InputIterator concept. It can be a simple pointer type.
/// \tparam ValuesInputIterator - random-access iterator type of the input range. Must meet the
/// requirements of a C++ InputIterator concept. It can be a simple pointer type.
/// \tparam UniqueOutputIterator - random-access iterator type of the output range. Must meet the
/// requirements of a C++ OutputIterator concept. It can be a simple pointer type.
/// \tparam AggregatesOutputIterator - random-access iterator type of the output range. Must meet the
/// requirements of a C++ OutputIterator concept. It can be a simple pointer type.
/// \tparam UniqueCountOutputIterator - random-access iterator type of the output range. Must meet the
/// requirements of a C++ OutputIterator concept. It can be a simple pointer type.
/// \tparam BinaryFunction - type of binary function used for reduction. Default type
/// is \p rocprim::plus<T>, where \p T is a \p value_type of \p ValuesInputIterator.
/// \tparam KeyCompareFunction - type of binary function used to determine keys equality. Default type
/// is \p rocprim::equal_to<T>, where \p T is a \p value_type of \p KeysInputIterator.
///
/// \param [in] temporary_storage - pointer to a device-accessible temporary storage. When
/// a null pointer is passed, the required allocation size (in bytes) is written to
/// \p storage_size and function returns without performing the reduction operation.
/// \param [in,out] storage_size - reference to a size (in bytes) of \p temporary_storage.
/// \param [in] keys_input - iterator to the first element in the range of keys.
/// \param [in] values_input - iterator to the first element in the range of values to reduce.
/// \param [in] size - number of elements in the input range.
/// \param [out] unique_output - iterator to the first element in the output range of unique keys.
/// \param [out] aggregates_output - iterator to the first element in the output range of reductions.
/// \param [out] unique_count_output - iterator to total number of groups.
/// \param [in] reduce_op - binary operation function object that will be used for reduction.
/// The signature of the function should be equivalent to the following:
/// <tt>T f(const T &a, const T &b);</tt>. The signature does not need to have
/// <tt>const &</tt>, but function object must not modify the objects passed to it.
/// Default is BinaryFunction().
/// \param [in] key_compare_op - binary operation function object that will be used to determine keys equality.
/// The signature of the function should be equivalent to the following:
/// <tt>bool f(const T &a, const T &b);</tt>. The signature does not need to have
/// <tt>const &</tt>, but function object must not modify the objects passed to it.
/// Default is KeyCompareFunction().
/// \param [in] stream - [optional] HIP stream object. Default is \p 0 (default stream).
/// \param [in] debug_synchronous - [optional] If true, synchronization after every kernel
/// launch is forced in order to check for errors. Default value is \p false.
///
/// \returns \p cudaSuccess (\p 0) after successful reduction; otherwise a HIP runtime error of
/// type \p cudaError_t.
///
/// \par Example
/// \parblock
/// In this example a device-level sum operation is performed on an array of
/// integer values and integer keys.
///
/// \code{.cpp}
/// #include <rocprim/rocprim.hpp>
///
/// // Prepare input and output (declare pointers, allocate device memory etc.)
/// size_t input_size; // e.g., 8
/// int * keys_input; // e.g., [1, 1, 1, 2, 10, 10, 10, 88]
/// int * values_input; // e.g., [1, 2, 3, 4, 5, 6, 7, 8]
/// int * unique_output; // empty array of at least 4 elements
/// int * aggregates_output; // empty array of at least 4 elements
/// int * unique_count_output; // empty array of 1 element
///
/// size_t temporary_storage_size_bytes;
/// void * temporary_storage_ptr = nullptr;
/// // Get required size of the temporary storage
/// rocprim::reduce_by_key(
/// temporary_storage_ptr, temporary_storage_size_bytes,
/// keys_input, values_input, input_size,
/// unique_output, aggregates_output, unique_count_output
/// );
///
/// // allocate temporary storage
/// cudaMalloc(&temporary_storage_ptr, temporary_storage_size_bytes);
///
/// // perform reduction
/// rocprim::reduce_by_key(
/// temporary_storage_ptr, temporary_storage_size_bytes,
/// keys_input, values_input, input_size,
/// unique_output, aggregates_output, unique_count_output
/// );
/// // unique_output: [1, 2, 10, 88]
/// // aggregates_output: [6, 4, 18, 8]
/// // unique_count_output: [4]
/// \endcode
/// \endparblock
template<
    class Config = default_config,
    class KeysInputIterator,
    class ValuesInputIterator,
    class UniqueOutputIterator,
    class AggregatesOutputIterator,
    class UniqueCountOutputIterator,
    class BinaryFunction
        = ::rocprim::plus<typename std::iterator_traits<ValuesInputIterator>::value_type>,
    class KeyCompareFunction
        = ::rocprim::equal_to<typename std::iterator_traits<KeysInputIterator>::value_type>
>
inline cudaError_t reduce_by_key(void*                     temporary_storage,
                                 size_t&                   storage_size,
                                 KeysInputIterator         keys_input,
                                 ValuesInputIterator       values_input,
                                 unsigned int              size,
                                 UniqueOutputIterator      unique_output,
                                 AggregatesOutputIterator  aggregates_output,
                                 UniqueCountOutputIterator unique_count_output,
                                 BinaryFunction            reduce_op         = BinaryFunction(),
                                 KeyCompareFunction        key_compare_op    = KeyCompareFunction(),
                                 cudaStream_t              stream            = 0,
                                 bool                      debug_synchronous = false)
{
    // Public entry point: all work (size query as well as the kernel launches)
    // is delegated to detail::reduce_by_key_impl; only Config is passed
    // explicitly, every other template argument is deduced from the call.
    return detail::reduce_by_key_impl<Config>(
        temporary_storage, storage_size,
        keys_input, values_input, size,
        unique_output, aggregates_output, unique_count_output,
        reduce_op, key_compare_op,
        stream, debug_synchronous
    );
}
/// @}
// end of group devicemodule
END_ROCPRIM_NAMESPACE
#endif // ROCPRIM_DEVICE_DEVICE_REDUCE_BY_KEY_HPP_
3rdparty/cub/rocprim/device/device_reduce_by_key_config.hpp
0 → 100644
View file @
f8a481f8
// Copyright (c) 2018-2019 Advanced Micro Devices, Inc. All rights reserved.
//
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to deal
// in the Software without restriction, including without limitation the rights
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
// copies of the Software, and to permit persons to whom the Software is
// furnished to do so, subject to the following conditions:
//
// The above copyright notice and this permission notice shall be included in
// all copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
// THE SOFTWARE.
#ifndef ROCPRIM_DEVICE_DEVICE_REDUCE_BY_KEY_CONFIG_HPP_
#define ROCPRIM_DEVICE_DEVICE_REDUCE_BY_KEY_CONFIG_HPP_
#include <type_traits>
#include "../config.hpp"
#include "../detail/various.hpp"
#include "config_types.hpp"
/// \addtogroup primitivesmodule_deviceconfigs
/// @{
BEGIN_ROCPRIM_NAMESPACE
/// \brief Configuration of device-level reduce-by-key operation.
///
/// \tparam ScanConfig - configuration of carry-outs scan kernel. Must be \p kernel_config.
/// \tparam ReduceConfig - configuration of the main reduce-by-key kernel. Must be \p kernel_config.
template<class ScanConfig, class ReduceConfig>
struct reduce_by_key_config
{
    /// \brief Configuration of carry-outs scan kernel.
    using scan = ScanConfig;

    /// \brief Configuration of the main reduce-by-key kernel.
    using reduce = ReduceConfig;
};
namespace
detail
{
// reduce-by-key tuning selected by default_reduce_by_key_config for architecture
// id 803.
//   item_scale - ceiling_div of the combined key+value size by 2 * sizeof(int).
//   scan       - kernel_config<256, 4> for the scan kernels.
//   type       - if both Key and Value are at most 8 bytes: 256 threads with
//                7 items per thread; otherwise the block size comes from
//                limit_block_size (64-wide warp constant) and the items per
//                thread are max(1, 15 / item_scale).
template<class Key, class Value>
struct reduce_by_key_config_803
{
    static constexpr unsigned int item_scale
        = ::rocprim::detail::ceiling_div<unsigned int>(sizeof(Key) + sizeof(Value),
                                                       2 * sizeof(int));

    using scan = kernel_config<256, 4>;

    using type = select_type<
        select_type_case<
            (sizeof(Key) <= 8 && sizeof(Value) <= 8),
            reduce_by_key_config<scan, kernel_config<256, 7> >
        >,
        reduce_by_key_config<
            scan,
            kernel_config<
                limit_block_size<256U, sizeof(Key) + sizeof(Value), ROCPRIM_WARP_SIZE_64>::value,
                ::rocprim::max(1u, 15u / item_scale)
            >
        >
    >;
};
// reduce-by-key tuning selected by default_reduce_by_key_config for architecture
// id 900; it is also the last (non-case) argument given to select_arch there.
//   item_scale - ceiling_div of the combined key+value size by 2 * sizeof(int).
//   scan       - kernel_config<256, 2> for the scan kernels.
//   type       - if both Key and Value are at most 8 bytes: 256 threads with
//                10 items per thread; otherwise the block size comes from
//                limit_block_size (64-wide warp constant) and the items per
//                thread are max(1, 15 / item_scale).
template<class Key, class Value>
struct reduce_by_key_config_900
{
    static constexpr unsigned int item_scale
        = ::rocprim::detail::ceiling_div<unsigned int>(sizeof(Key) + sizeof(Value),
                                                       2 * sizeof(int));

    using scan = kernel_config<256, 2>;

    using type = select_type<
        select_type_case<
            (sizeof(Key) <= 8 && sizeof(Value) <= 8),
            reduce_by_key_config<scan, kernel_config<256, 10> >
        >,
        reduce_by_key_config<
            scan,
            kernel_config<
                limit_block_size<256U, sizeof(Key) + sizeof(Value), ROCPRIM_WARP_SIZE_64>::value,
                ::rocprim::max(1u, 15u / item_scale)
            >
        >
    >;
};
// TODO: We need to update these parameters
// reduce-by-key tuning selected by default_reduce_by_key_config for
// ROCPRIM_ARCH_90a. The values are currently the same as in
// reduce_by_key_config_900:
//   item_scale - ceiling_div of the combined key+value size by 2 * sizeof(int).
//   scan       - kernel_config<256, 2> for the scan kernels.
//   type       - if both Key and Value are at most 8 bytes: 256 threads with
//                10 items per thread; otherwise the block size comes from
//                limit_block_size (64-wide warp constant) and the items per
//                thread are max(1, 15 / item_scale).
template<class Key, class Value>
struct reduce_by_key_config_90a
{
    static constexpr unsigned int item_scale
        = ::rocprim::detail::ceiling_div<unsigned int>(sizeof(Key) + sizeof(Value),
                                                       2 * sizeof(int));

    using scan = kernel_config<256, 2>;

    using type = select_type<
        select_type_case<
            (sizeof(Key) <= 8 && sizeof(Value) <= 8),
            reduce_by_key_config<scan, kernel_config<256, 10> >
        >,
        reduce_by_key_config<
            scan,
            kernel_config<
                limit_block_size<256U, sizeof(Key) + sizeof(Value), ROCPRIM_WARP_SIZE_64>::value,
                ::rocprim::max(1u, 15u / item_scale)
            >
        >
    >;
};
// TODO: We need to update these parameters
// reduce-by-key tuning selected by default_reduce_by_key_config for architecture
// id 1030. Differs from the 900/90a variants only in passing
// ROCPRIM_WARP_SIZE_32 to limit_block_size:
//   item_scale - ceiling_div of the combined key+value size by 2 * sizeof(int).
//   scan       - kernel_config<256, 2> for the scan kernels.
//   type       - if both Key and Value are at most 8 bytes: 256 threads with
//                10 items per thread; otherwise the block size comes from
//                limit_block_size (32-wide warp constant) and the items per
//                thread are max(1, 15 / item_scale).
template<class Key, class Value>
struct reduce_by_key_config_1030
{
    static constexpr unsigned int item_scale
        = ::rocprim::detail::ceiling_div<unsigned int>(sizeof(Key) + sizeof(Value),
                                                       2 * sizeof(int));

    using scan = kernel_config<256, 2>;

    using type = select_type<
        select_type_case<
            (sizeof(Key) <= 8 && sizeof(Value) <= 8),
            reduce_by_key_config<scan, kernel_config<256, 10> >
        >,
        reduce_by_key_config<
            scan,
            kernel_config<
                limit_block_size<256U, sizeof(Key) + sizeof(Value), ROCPRIM_WARP_SIZE_32>::value,
                ::rocprim::max(1u, 15u / item_scale)
            >
        >
    >;
};
// Maps a target architecture id to its reduce-by-key tuning struct by deriving
// from select_arch. Explicit cases exist for 803, 900, ROCPRIM_ARCH_90a and 1030;
// the trailing reduce_by_key_config_900 argument is presumably the fallback used
// when no case matches (confirm against select_arch).
template<unsigned int TargetArch, class Key, class Value>
struct default_reduce_by_key_config
    : select_arch<
          TargetArch,
          select_arch_case<803, reduce_by_key_config_803<Key, Value> >,
          select_arch_case<900, reduce_by_key_config_900<Key, Value> >,
          select_arch_case<ROCPRIM_ARCH_90a, reduce_by_key_config_90a<Key, Value> >,
          select_arch_case<1030, reduce_by_key_config_1030<Key, Value> >,
          reduce_by_key_config_900<Key, Value>
      >
{
};
}
// end namespace detail
END_ROCPRIM_NAMESPACE
/// @}
// end of group primitivesmodule_deviceconfigs
#endif // ROCPRIM_DEVICE_DEVICE_REDUCE_BY_KEY_CONFIG_HPP_
3rdparty/cub/rocprim/device/device_reduce_config.hpp
0 → 100644
View file @
f8a481f8
// Copyright (c) 2018-2019 Advanced Micro Devices, Inc. All rights reserved.
//
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to deal
// in the Software without restriction, including without limitation the rights
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
// copies of the Software, and to permit persons to whom the Software is
// furnished to do so, subject to the following conditions:
//
// The above copyright notice and this permission notice shall be included in
// all copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
// THE SOFTWARE.
#ifndef ROCPRIM_DEVICE_DEVICE_REDUCE_CONFIG_HPP_
#define ROCPRIM_DEVICE_DEVICE_REDUCE_CONFIG_HPP_
#include <type_traits>
#include "../config.hpp"
#include "../detail/various.hpp"
#include "../block/block_reduce.hpp"
#include "config_types.hpp"
#include "detail/device_config_helper.hpp"
/// \addtogroup primitivesmodule_deviceconfigs
/// @{
BEGIN_ROCPRIM_NAMESPACE
namespace
detail
{
// Device reduce tuning selected by default_reduce_config for architecture id 803.
//   item_scale - ceiling_div of sizeof(Value) by sizeof(int).
//   type       - reduce_config whose block size comes from limit_block_size
//                (starting from 256, 64-wide warp constant), with
//                max(1, 16 / item_scale) items per thread and the
//                using_warp_reduce block reduction algorithm.
template<class Value>
struct reduce_config_803
{
    static constexpr unsigned int item_scale
        = ::rocprim::detail::ceiling_div<unsigned int>(sizeof(Value), sizeof(int));

    using type = reduce_config<
        limit_block_size<256U, sizeof(Value), ROCPRIM_WARP_SIZE_64>::value,
        ::rocprim::max(1u, 16u / item_scale),
        ::rocprim::block_reduce_algorithm::using_warp_reduce
    >;
};
// Device reduce tuning selected by default_reduce_config for architecture id 900;
// it is also the last (non-case) argument given to select_arch there.
//   item_scale - ceiling_div of sizeof(Value) by sizeof(int).
//   type       - reduce_config whose block size comes from limit_block_size
//                (starting from 256, 64-wide warp constant), with
//                max(1, 16 / item_scale) items per thread and the
//                using_warp_reduce block reduction algorithm.
template<class Value>
struct reduce_config_900
{
    static constexpr unsigned int item_scale
        = ::rocprim::detail::ceiling_div<unsigned int>(sizeof(Value), sizeof(int));

    using type = reduce_config<
        limit_block_size<256U, sizeof(Value), ROCPRIM_WARP_SIZE_64>::value,
        ::rocprim::max(1u, 16u / item_scale),
        ::rocprim::block_reduce_algorithm::using_warp_reduce
    >;
};
// TODO: We need to update these parameters
// Device reduce tuning selected by default_reduce_config for ROCPRIM_ARCH_90a.
// The values are currently the same as in reduce_config_900:
//   item_scale - ceiling_div of sizeof(Value) by sizeof(int).
//   type       - reduce_config whose block size comes from limit_block_size
//                (starting from 256, 64-wide warp constant), with
//                max(1, 16 / item_scale) items per thread and the
//                using_warp_reduce block reduction algorithm.
template<class Value>
struct reduce_config_90a
{
    static constexpr unsigned int item_scale
        = ::rocprim::detail::ceiling_div<unsigned int>(sizeof(Value), sizeof(int));

    using type = reduce_config<
        limit_block_size<256U, sizeof(Value), ROCPRIM_WARP_SIZE_64>::value,
        ::rocprim::max(1u, 16u / item_scale),
        ::rocprim::block_reduce_algorithm::using_warp_reduce
    >;
};
// TODO: We need to update these parameters
// Device reduce tuning selected by default_reduce_config for architecture id 1030.
// Differs from the 803/900/90a variants only in passing ROCPRIM_WARP_SIZE_32 to
// limit_block_size:
//   item_scale - ceiling_div of sizeof(Value) by sizeof(int).
//   type       - reduce_config whose block size comes from limit_block_size
//                (starting from 256, 32-wide warp constant), with
//                max(1, 16 / item_scale) items per thread and the
//                using_warp_reduce block reduction algorithm.
template<class Value>
struct reduce_config_1030
{
    static constexpr unsigned int item_scale
        = ::rocprim::detail::ceiling_div<unsigned int>(sizeof(Value), sizeof(int));

    using type = reduce_config<
        limit_block_size<256U, sizeof(Value), ROCPRIM_WARP_SIZE_32>::value,
        ::rocprim::max(1u, 16u / item_scale),
        ::rocprim::block_reduce_algorithm::using_warp_reduce
    >;
};
// Maps a target architecture id to its device reduce tuning struct by deriving
// from select_arch. Explicit cases exist for 803, 900, ROCPRIM_ARCH_90a and 1030;
// the trailing reduce_config_900 argument is presumably the fallback used when no
// case matches (confirm against select_arch).
template<unsigned int TargetArch, class Value>
struct default_reduce_config
    : select_arch<
          TargetArch,
          select_arch_case<803, reduce_config_803<Value> >,
          select_arch_case<900, reduce_config_900<Value> >,
          select_arch_case<ROCPRIM_ARCH_90a, reduce_config_90a<Value> >,
          select_arch_case<1030, reduce_config_1030<Value> >,
          reduce_config_900<Value>
      >
{
};
}
// end namespace detail
END_ROCPRIM_NAMESPACE
/// @}
// end of group primitivesmodule_deviceconfigs
#endif // ROCPRIM_DEVICE_DEVICE_REDUCE_CONFIG_HPP_
3rdparty/cub/rocprim/device/device_run_length_encode.hpp
0 → 100644
View file @
f8a481f8
// Copyright (c) 2018-2020 Advanced Micro Devices, Inc. All rights reserved.
//
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to deal
// in the Software without restriction, including without limitation the rights
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
// copies of the Software, and to permit persons to whom the Software is
// furnished to do so, subject to the following conditions:
//
// The above copyright notice and this permission notice shall be included in
// all copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
// THE SOFTWARE.
#ifndef ROCPRIM_DEVICE_DEVICE_RUN_LENGTH_ENCODE_HPP_
#define ROCPRIM_DEVICE_DEVICE_RUN_LENGTH_ENCODE_HPP_
#include <type_traits>
#include <iterator>
#include "../config.hpp"
#include "../detail/various.hpp"
#include "../iterator/constant_iterator.hpp"
#include "../iterator/counting_iterator.hpp"
#include "../iterator/discard_iterator.hpp"
#include "../iterator/zip_iterator.hpp"
#include "device_run_length_encode_config.hpp"
#include "device_reduce_by_key.hpp"
#include "device_select.hpp"
BEGIN_ROCPRIM_NAMESPACE
/// \addtogroup devicemodule
/// @{
namespace
detail
{
// Error-check helper for the run-length-encode host functions in this header.
//
// Unlike the variant in device_reduce_by_key.hpp, this one does not query the
// runtime for the last error itself: it tests a variable named `error` that must
// already exist at the expansion site, and returns it from the ENCLOSING function
// if it is not cudaSuccess.
// When `debug_synchronous` is true it additionally prints `name(size)`,
// synchronizes `stream` (returning on failure) and prints the time elapsed since
// `start` in ms. Inside that branch a new local `error` is declared, shadowing the
// outer one for the rest of the branch.
//
// Parameters:
//   name  - label streamed to std::cout.
//   size  - value streamed to std::cout inside the parentheses after the label.
//   start - a std::chrono::high_resolution_clock time point taken earlier.
//
// Requires `error`, `debug_synchronous` and `stream` to be visible where expanded.
#define ROCPRIM_DETAIL_HIP_SYNC_AND_RETURN_ON_ERROR(name, size, start) \
{ \
if(error != cudaSuccess) return error; \
if(debug_synchronous) \
{ \
std::cout << name << "(" << size << ")"; \
auto error = cudaStreamSynchronize(stream); \
if(error != cudaSuccess) return error; \
auto end = std::chrono::high_resolution_clock::now(); \
auto d = std::chrono::duration_cast<std::chrono::duration<double>>(end - start); \
std::cout << " " << d.count() * 1000 << " ms" << '\n'; \
} \
}
}
// end detail namespace
/// \brief Parallel run-length encoding for device level.
///
/// run_length_encode function performs a device-wide run-length encoding of runs (groups)
/// of consecutive values. The first value of each run is copied to \p unique_output and
/// the length of the run is written to \p counts_output.
/// The total number of runs is written to \p runs_count_output.
///
/// \par Overview
/// * Returns the required size of \p temporary_storage in \p storage_size
/// if \p temporary_storage is a null pointer.
/// * Range specified by \p input must have at least \p size elements.
/// * Range specified by \p runs_count_output must have at least 1 element.
/// * Ranges specified by \p unique_output and \p counts_output must have at least
/// <tt>*runs_count_output</tt> (i.e. the number of runs) elements.
///
/// \tparam Config - [optional] configuration of the primitive. It can be \p run_length_encode_config or
/// a custom class with the same members.
/// \tparam InputIterator - random-access iterator type of the input range. Must meet the
/// requirements of a C++ InputIterator concept. It can be a simple pointer type.
/// \tparam UniqueOutputIterator - random-access iterator type of the output range. Must meet the
/// requirements of a C++ OutputIterator concept. It can be a simple pointer type.
/// \tparam CountsOutputIterator - random-access iterator type of the output range. Must meet the
/// requirements of a C++ OutputIterator concept. It can be a simple pointer type.
/// \tparam RunsCountOutputIterator - random-access iterator type of the output range. Must meet the
/// requirements of a C++ OutputIterator concept. It can be a simple pointer type.
///
/// \param [in] temporary_storage - pointer to a device-accessible temporary storage. When
/// a null pointer is passed, the required allocation size (in bytes) is written to
/// \p storage_size and function returns without performing the operation.
/// \param [in,out] storage_size - reference to a size (in bytes) of \p temporary_storage.
/// \param [in] input - iterator to the first element in the range of values.
/// \param [in] size - number of elements in the input range.
/// \param [out] unique_output - iterator to the first element in the output range of unique values.
/// \param [out] counts_output - iterator to the first element in the output range of lengths.
/// \param [out] runs_count_output - iterator to total number of runs.
/// \param [in] stream - [optional] HIP stream object. Default is \p 0 (default stream).
/// \param [in] debug_synchronous - [optional] If true, synchronization after every kernel
/// launch is forced in order to check for errors. Default value is \p false.
///
/// \returns \p cudaSuccess (\p 0) after successful operation; otherwise a HIP runtime error of
/// type \p cudaError_t.
///
/// \par Example
/// \parblock
/// In this example a device-level run-length encoding operation is performed on an array of
/// integer values.
///
/// \code{.cpp}
/// #include <rocprim/rocprim.hpp>
///
/// // Prepare input and output (declare pointers, allocate device memory etc.)
/// size_t input_size; // e.g., 8
/// int * input; // e.g., [1, 1, 1, 2, 10, 10, 10, 88]
/// int * unique_output; // empty array of at least 4 elements
/// int * counts_output; // empty array of at least 4 elements
/// int * runs_count_output; // empty array of 1 element
///
/// size_t temporary_storage_size_bytes;
/// void * temporary_storage_ptr = nullptr;
/// // Get required size of the temporary storage
/// rocprim::run_length_encode(
/// temporary_storage_ptr, temporary_storage_size_bytes,
/// input, input_size,
/// unique_output, counts_output, runs_count_output
/// );
///
/// // allocate temporary storage
/// cudaMalloc(&temporary_storage_ptr, temporary_storage_size_bytes);
///
/// // perform encoding
/// rocprim::run_length_encode(
/// temporary_storage_ptr, temporary_storage_size_bytes,
/// input, input_size,
/// unique_output, counts_output, runs_count_output
/// );
/// // unique_output: [1, 2, 10, 88]
/// // counts_output: [3, 1, 3, 1]
/// // runs_count_output: [4]
/// \endcode
/// \endparblock
template<
    class Config = default_config,
    class InputIterator,
    class UniqueOutputIterator,
    class CountsOutputIterator,
    class RunsCountOutputIterator
>
inline
cudaError_t run_length_encode(void * temporary_storage,
                              size_t& storage_size,
                              InputIterator input,
                              unsigned int size,
                              UniqueOutputIterator unique_output,
                              CountsOutputIterator counts_output,
                              RunsCountOutputIterator runs_count_output,
                              cudaStream_t stream = 0,
                              bool debug_synchronous = false)
{
    // Keys are the input values themselves; every element contributes a count of one.
    using key_type        = typename std::iterator_traits<InputIterator>::value_type;
    using run_length_type = unsigned int;

    // Resolve the user-supplied configuration (or fall back to the library default).
    using selected_config
        = detail::default_or_custom_config<Config, detail::default_run_length_encode_config>;

    // Run-length encoding expressed as a reduce-by-key: equal consecutive keys form
    // one segment, and summing a constant 1 per element yields the segment length.
    auto ones        = make_constant_iterator<run_length_type>(1);
    auto sum_lengths = ::rocprim::plus<run_length_type>();
    auto same_key    = ::rocprim::equal_to<key_type>();

    return ::rocprim::reduce_by_key<typename selected_config::reduce_by_key>(
        temporary_storage, storage_size,
        input, ones, size,
        unique_output, counts_output, runs_count_output,
        sum_lengths, same_key,
        stream, debug_synchronous
    );
}
/// \brief Parallel run-length encoding of non-trivial runs for device level.
///
/// run_length_encode_non_trivial_runs function performs a device-wide run-length encoding of
/// non-trivial runs (groups) of consecutive values (groups of more than one element).
/// The offset of the first value of each non-trivial run is copied to \p offsets_output and
/// the length of the run (the count of elements) is written to \p counts_output.
/// The total number of non-trivial runs is written to \p runs_count_output.
///
/// \par Overview
/// * Returns the required size of \p temporary_storage in \p storage_size
/// if \p temporary_storage is a null pointer.
/// * Range specified by \p input must have at least \p size elements.
/// * Range specified by \p runs_count_output must have at least 1 element.
/// * Ranges specified by \p offsets_output and \p counts_output must have at least
/// <tt>*runs_count_output</tt> (i.e. the number of non-trivial runs) elements.
///
/// \tparam Config - [optional] configuration of the primitive. It can be \p run_length_encode_config or
/// a custom class with the same members.
/// \tparam InputIterator - random-access iterator type of the input range. Must meet the
/// requirements of a C++ InputIterator concept. It can be a simple pointer type.
/// \tparam OffsetsOutputIterator - random-access iterator type of the output range. Must meet the
/// requirements of a C++ OutputIterator concept. It can be a simple pointer type.
/// \tparam CountsOutputIterator - random-access iterator type of the output range. Must meet the
/// requirements of a C++ OutputIterator concept. It can be a simple pointer type.
/// \tparam RunsCountOutputIterator - random-access iterator type of the output range. Must meet the
/// requirements of a C++ OutputIterator concept. It can be a simple pointer type.
///
/// \param [in] temporary_storage - pointer to a device-accessible temporary storage. When
/// a null pointer is passed, the required allocation size (in bytes) is written to
/// \p storage_size and function returns without performing the operation.
/// \param [in,out] storage_size - reference to a size (in bytes) of \p temporary_storage.
/// \param [in] input - iterator to the first element in the range of values.
/// \param [in] size - number of elements in the input range.
/// \param [out] offsets_output - iterator to the first element in the output range of offsets.
/// \param [out] counts_output - iterator to the first element in the output range of lengths.
/// \param [out] runs_count_output - iterator to total number of runs.
/// \param [in] stream - [optional] HIP stream object. Default is \p 0 (default stream).
/// \param [in] debug_synchronous - [optional] If true, synchronization after every kernel
/// launch is forced in order to check for errors. Default value is \p false.
///
/// \returns \p cudaSuccess (\p 0) after successful operation; otherwise a HIP runtime error of
/// type \p cudaError_t.
///
/// \par Example
/// \parblock
/// In this example a device-level run-length encoding of non-trivial runs is performed on an array of
/// integer values.
///
/// \code{.cpp}
/// #include <rocprim/rocprim.hpp>
///
/// // Prepare input and output (declare pointers, allocate device memory etc.)
/// size_t input_size; // e.g., 8
/// int * input; // e.g., [1, 1, 1, 2, 10, 10, 10, 88]
/// int * offsets_output; // empty array of at least 2 elements
/// int * counts_output; // empty array of at least 2 elements
/// int * runs_count_output; // empty array of 1 element
///
/// size_t temporary_storage_size_bytes;
/// void * temporary_storage_ptr = nullptr;
/// // Get required size of the temporary storage
/// rocprim::run_length_encode_non_trivial_runs(
/// temporary_storage_ptr, temporary_storage_size_bytes,
/// input, input_size,
/// offsets_output, counts_output, runs_count_output
/// );
///
/// // allocate temporary storage
/// cudaMalloc(&temporary_storage_ptr, temporary_storage_size_bytes);
///
/// // perform encoding
/// rocprim::run_length_encode_non_trivial_runs(
/// temporary_storage_ptr, temporary_storage_size_bytes,
/// input, input_size,
/// offsets_output, counts_output, runs_count_output
/// );
/// // offsets_output: [0, 4]
/// // counts_output: [3, 3]
/// // runs_count_output: [2]
/// \endcode
/// \endparblock
template<
    class Config = default_config,
    class InputIterator,
    class OffsetsOutputIterator,
    class CountsOutputIterator,
    class RunsCountOutputIterator
>
inline
cudaError_t run_length_encode_non_trivial_runs(void * temporary_storage,
                                               size_t& storage_size,
                                               InputIterator input,
                                               unsigned int size,
                                               OffsetsOutputIterator offsets_output,
                                               CountsOutputIterator counts_output,
                                               RunsCountOutputIterator runs_count_output,
                                               cudaStream_t stream = 0,
                                               bool debug_synchronous = false)
{
    using input_type = typename std::iterator_traits<InputIterator>::value_type;
    using offset_type = unsigned int;
    using count_type = unsigned int;
    using offset_count_pair = typename ::rocprim::tuple<offset_type, count_type>;

    using config = detail::default_or_custom_config<Config, detail::default_run_length_encode_config>;

    cudaError_t error;

    // Combines two (offset, count) pairs that belong to the same run.
    auto reduce_op = [] __device__ (const offset_count_pair& a, const offset_count_pair& b)
    {
        return offset_count_pair(
            ::rocprim::get<0>(a), // Always use offset of the first item of the run
            ::rocprim::get<1>(a) + ::rocprim::get<1>(b) // Number of items in the run
        );
    };
    // Keeps only runs that contain more than one element.
    auto non_trivial_runs_select_op = [] __device__ (const offset_count_pair& a)
    {
        return ::rocprim::get<1>(a) > 1;
    };

    // Intermediate arrays carved out of temporary_storage further below. They are
    // still null while the two size queries run.
    offset_type * offsets_tmp = nullptr;
    count_type * counts_tmp = nullptr;
    count_type * all_runs_count_tmp = nullptr;

    // Value stream fed to reduce_by_key: (index of the element, constant 1).
    const auto values_input = ::rocprim::make_zip_iterator(
        ::rocprim::make_tuple(
            ::rocprim::make_counting_iterator<offset_type>(0),
            ::rocprim::make_constant_iterator<count_type>(1)
        )
    );

    // Calculate size of temporary storage for reduce_by_key operation
    size_t reduce_by_key_bytes;
    error = ::rocprim::reduce_by_key<typename config::reduce_by_key>(
        nullptr, reduce_by_key_bytes,
        input,
        values_input,
        size,
        ::rocprim::make_discard_iterator(),
        ::rocprim::make_zip_iterator(::rocprim::make_tuple(offsets_tmp, counts_tmp)),
        all_runs_count_tmp,
        reduce_op, ::rocprim::equal_to<input_type>(),
        stream, debug_synchronous
    );
    if(error != cudaSuccess) return error;
    reduce_by_key_bytes = ::rocprim::detail::align_size(reduce_by_key_bytes);

    // Calculate size of temporary storage for select operation
    size_t select_bytes;
    error = ::rocprim::select<typename config::select>(
        nullptr, select_bytes,
        ::rocprim::make_zip_iterator(::rocprim::make_tuple(offsets_tmp, counts_tmp)),
        ::rocprim::make_zip_iterator(::rocprim::make_tuple(offsets_output, counts_output)),
        runs_count_output,
        size,
        non_trivial_runs_select_op,
        stream, debug_synchronous
    );
    if(error != cudaSuccess) return error;
    select_bytes = ::rocprim::detail::align_size(select_bytes);

    // Worst case: every element is its own run, so both arrays need `size` entries.
    const size_t offsets_tmp_bytes = ::rocprim::detail::align_size(size * sizeof(offset_type));
    const size_t counts_tmp_bytes = ::rocprim::detail::align_size(size * sizeof(count_type));
    const size_t all_runs_count_tmp_bytes = sizeof(count_type);

    // reduce_by_key and select run one after the other, so they share one scratch region.
    const size_t shared_scratch_bytes = ::rocprim::max(reduce_by_key_bytes, select_bytes);

    if(temporary_storage == nullptr)
    {
        storage_size = shared_scratch_bytes
            + offsets_tmp_bytes + counts_tmp_bytes + all_runs_count_tmp_bytes;
        return cudaSuccess;
    }

    // Layout: [ shared scratch | offsets_tmp | counts_tmp | all_runs_count_tmp ]
    char * ptr = reinterpret_cast<char *>(temporary_storage);
    ptr += shared_scratch_bytes;
    offsets_tmp = reinterpret_cast<offset_type *>(ptr);
    ptr += offsets_tmp_bytes;
    counts_tmp = reinterpret_cast<count_type *>(ptr);
    ptr += counts_tmp_bytes;
    all_runs_count_tmp = reinterpret_cast<count_type *>(ptr);

    std::chrono::high_resolution_clock::time_point start;

    // Step 1: encode all runs (trivial ones included) into the temporary arrays.
    if(debug_synchronous) start = std::chrono::high_resolution_clock::now();
    error = ::rocprim::reduce_by_key<typename config::reduce_by_key>(
        temporary_storage, reduce_by_key_bytes,
        input,
        values_input,
        size,
        ::rocprim::make_discard_iterator(), // Ignore unique output
        ::rocprim::make_zip_iterator(::rocprim::make_tuple(offsets_tmp, counts_tmp)),
        all_runs_count_tmp,
        reduce_op, ::rocprim::equal_to<input_type>(),
        stream, debug_synchronous
    );
    // Propagate the status returned by the primitive itself; the macro below only
    // inspects cudaGetLastError().
    if(error != cudaSuccess) return error;
    ROCPRIM_DETAIL_HIP_SYNC_AND_RETURN_ON_ERROR("rocprim::reduce_by_key", size, start)

    // Read count of all runs (including trivial runs)
    count_type all_runs_count;
    // cudaMemcpyWithStream is only supported on rocm 3.1 and above
    error = cudaMemcpyAsync(&all_runs_count, all_runs_count_tmp, sizeof(count_type), cudaMemcpyDeviceToHost, stream);
    if(error != cudaSuccess) return error;
    error = cudaStreamSynchronize(stream);
    // all_runs_count is only valid once the copy has completed successfully; without
    // this check an indeterminate value would be used as the select input size.
    if(error != cudaSuccess) return error;

    // Step 2: select non-trivial runs
    if(debug_synchronous) start = std::chrono::high_resolution_clock::now();
    error = ::rocprim::select<typename config::select>(
        temporary_storage, select_bytes,
        ::rocprim::make_zip_iterator(::rocprim::make_tuple(offsets_tmp, counts_tmp)),
        ::rocprim::make_zip_iterator(::rocprim::make_tuple(offsets_output, counts_output)),
        runs_count_output,
        all_runs_count,
        non_trivial_runs_select_op,
        stream, debug_synchronous
    );
    if(error != cudaSuccess) return error;
    ROCPRIM_DETAIL_HIP_SYNC_AND_RETURN_ON_ERROR("rocprim::select", all_runs_count, start)

    return cudaSuccess;
}
#undef ROCPRIM_DETAIL_HIP_SYNC_AND_RETURN_ON_ERROR
/// @}
// end of group devicemodule
END_ROCPRIM_NAMESPACE
#endif // ROCPRIM_DEVICE_DEVICE_RUN_LENGTH_ENCODE_HPP_
3rdparty/cub/rocprim/device/device_run_length_encode_config.hpp
0 → 100644
View file @
f8a481f8
// Copyright (c) 2018-2022 Advanced Micro Devices, Inc. All rights reserved.
//
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to deal
// in the Software without restriction, including without limitation the rights
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
// copies of the Software, and to permit persons to whom the Software is
// furnished to do so, subject to the following conditions:
//
// The above copyright notice and this permission notice shall be included in
// all copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
// THE SOFTWARE.
#ifndef ROCPRIM_DEVICE_DEVICE_RUN_LENGTH_ENCODE_CONFIG_HPP_
#define ROCPRIM_DEVICE_DEVICE_RUN_LENGTH_ENCODE_CONFIG_HPP_
#include <type_traits>
#include "../config.hpp"
#include "../detail/various.hpp"
#include "config_types.hpp"
/// \addtogroup primitivesmodule_deviceconfigs
/// @{
BEGIN_ROCPRIM_NAMESPACE
/// \brief Configuration of the device-level run-length encoding operation.
///
/// Bundles the configurations of the two device-level primitives the run-length
/// encoding functions are built on.
///
/// \tparam ReduceByKeyConfig - configuration of the device-level reduce-by-key operation.
/// Must be \p reduce_by_key_config or \p default_config.
/// \tparam SelectConfig - configuration of the device-level select operation.
/// Must be \p select_config or \p default_config.
template<
    typename ReduceByKeyConfig,
    typename SelectConfig = default_config
>
struct run_length_encode_config
{
    /// \brief Configuration of the device-level reduce-by-key operation.
    using reduce_by_key = ReduceByKeyConfig;

    /// \brief Configuration of the device-level select operation.
    using select = SelectConfig;
};
namespace detail
{

// Configuration used when the caller does not supply one: both underlying
// primitives fall back to their own default configuration.
using default_run_length_encode_config
    = run_length_encode_config<default_config, default_config>;

} // end namespace detail
END_ROCPRIM_NAMESPACE
/// @}
// end of group primitivesmodule_deviceconfigs
#endif // ROCPRIM_DEVICE_DEVICE_RUN_LENGTH_ENCODE_CONFIG_HPP_
3rdparty/cub/rocprim/device/device_scan.hpp
0 → 100644
View file @
f8a481f8
// Copyright (c) 2017-2022 Advanced Micro Devices, Inc. All rights reserved.
//
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to deal
// in the Software without restriction, including without limitation the rights
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
// copies of the Software, and to permit persons to whom the Software is
// furnished to do so, subject to the following conditions:
//
// The above copyright notice and this permission notice shall be included in
// all copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
// THE SOFTWARE.
#ifndef ROCPRIM_DEVICE_DEVICE_SCAN_HPP_
#define ROCPRIM_DEVICE_DEVICE_SCAN_HPP_
#include <type_traits>
#include <iterator>
#include "../config.hpp"
#include "../functional.hpp"
#include "../type_traits.hpp"
#include "../types/future_value.hpp"
#include "../detail/various.hpp"
#include "device_scan_config.hpp"
#include "device_transform.hpp"
#include "detail/device_scan_common.hpp"
#include "detail/device_scan_lookback.hpp"
#include "detail/device_scan_reduce_then_scan.hpp"
BEGIN_ROCPRIM_NAMESPACE
/// \addtogroup devicemodule
/// @{
namespace
detail
{
// Single kernel scan (performs scan on one thread block only).
// Thin kernel entry point: unwraps the initial value with get_input_value() and
// forwards everything to single_scan_kernel_impl.
template<
    bool     Exclusive,
    typename Config,
    typename InputIterator,
    typename OutputIterator,
    typename BinaryFunction,
    typename InitValueType
>
ROCPRIM_KERNEL
__launch_bounds__(ROCPRIM_DEFAULT_MAX_BLOCK_SIZE)
void single_scan_kernel(InputIterator       input,
                        const size_t        size,
                        const InitValueType initial_value,
                        OutputIterator      output,
                        BinaryFunction      scan_op)
{
    single_scan_kernel_impl<Exclusive, Config>(
        input,
        size,
        get_input_value(initial_value),
        output,
        scan_op
    );
}
// Reduce-then-scan kernels

// Calculates block prefixes that will be used in final_scan_kernel
// when performing block scan operations.
// Thin kernel entry point that forwards to block_reduce_kernel_impl.
template<
    typename Config,
    typename InputIterator,
    typename BinaryFunction,
    typename ResultType
>
ROCPRIM_KERNEL
__launch_bounds__(ROCPRIM_DEFAULT_MAX_BLOCK_SIZE)
void block_reduce_kernel(InputIterator  input,
                         BinaryFunction scan_op,
                         ResultType *   block_prefixes)
{
    block_reduce_kernel_impl<Config>(
        input,
        scan_op,
        block_prefixes
    );
}
// Final pass of the reduce-then-scan approach: scans the input using the block
// prefixes computed beforehand. Thin kernel entry point that unwraps the initial
// value with get_input_value() and forwards to final_scan_kernel_impl.
// NOTE(review): previous_last_element / new_last_element / override_first_value /
// save_last_value appear to carry the running result between successive chunked
// launches issued by the host code - confirm against final_scan_kernel_impl.
template<
    bool     Exclusive,
    typename Config,
    typename InputIterator,
    typename OutputIterator,
    typename BinaryFunction,
    typename InitValueType
>
ROCPRIM_KERNEL
__launch_bounds__(ROCPRIM_DEFAULT_MAX_BLOCK_SIZE)
void final_scan_kernel(InputIterator                input,
                       const size_t                 size,
                       OutputIterator               output,
                       const InitValueType          initial_value,
                       BinaryFunction               scan_op,
                       input_type_t<InitValueType>* block_prefixes,
                       input_type_t<InitValueType>* previous_last_element = nullptr,
                       input_type_t<InitValueType>* new_last_element = nullptr,
                       bool                         override_first_value = false,
                       bool                         save_last_value = false)
{
    final_scan_kernel_impl<Exclusive, Config>(
        input,
        size,
        output,
        get_input_value(initial_value),
        scan_op,
        block_prefixes,
        previous_last_element,
        new_last_element,
        override_first_value,
        save_last_value
    );
}
// Single pass (look-back kernels)

// Thin kernel entry point for the look-back scan: unwraps the initial value with
// get_input_value() and forwards every argument to lookback_scan_kernel_impl.
// NOTE(review): previous_last_element / new_last_element / override_first_value /
// save_last_value appear to carry the running result between successive chunked
// launches issued by the host code - confirm against lookback_scan_kernel_impl.
template<
    bool     Exclusive,
    typename Config,
    typename InputIterator,
    typename OutputIterator,
    typename BinaryFunction,
    typename InitValueType,
    typename LookBackScanState
>
ROCPRIM_KERNEL
__launch_bounds__(ROCPRIM_DEFAULT_MAX_BLOCK_SIZE)
void lookback_scan_kernel(InputIterator                  input,
                          OutputIterator                 output,
                          const size_t                   size,
                          const InitValueType            initial_value,
                          BinaryFunction                 scan_op,
                          LookBackScanState              lookback_scan_state,
                          const unsigned int             number_of_blocks,
                          ordered_block_id<unsigned int> ordered_bid,
                          input_type_t<InitValueType>*   previous_last_element = nullptr,
                          input_type_t<InitValueType>*   new_last_element = nullptr,
                          bool                           override_first_value = false,
                          bool                           save_last_value = false)
{
    lookback_scan_kernel_impl<Exclusive, Config>(
        input,
        output,
        size,
        get_input_value(initial_value),
        scan_op,
        lookback_scan_state,
        number_of_blocks,
        ordered_bid,
        previous_last_element,
        new_last_element,
        override_first_value,
        save_last_value
    );
}
// Debug helper. When the enclosing function's `debug_synchronous` flag is set, prints
// `name(size)`, synchronizes `stream`, returns the synchronization error from the
// enclosing function if there is one, and otherwise prints the time elapsed since
// `start` in milliseconds.
// Expects `debug_synchronous` and `stream` to be in scope at the point of use.
#define ROCPRIM_DETAIL_HIP_SYNC(name, size, start) \
    if(debug_synchronous) \
    { \
        std::cout << name << "(" << size << ")"; \
        auto error = cudaStreamSynchronize(stream); \
        if(error != cudaSuccess) return error; \
        auto end = std::chrono::high_resolution_clock::now(); \
        auto d = std::chrono::duration_cast<std::chrono::duration<double>>(end - start); \
        std::cout << " " << d.count() * 1000 << " ms" << '\n'; \
    }

// Same as ROCPRIM_DETAIL_HIP_SYNC, but first returns from the enclosing function when
// cudaGetLastError() reports an error (intended to follow a kernel launch).
// The locals avoid double underscores, which are reserved identifiers in C++.
#define ROCPRIM_DETAIL_HIP_SYNC_AND_RETURN_ON_ERROR(name, size, start) \
    { \
        auto _error = cudaGetLastError(); \
        if(_error != cudaSuccess) return _error; \
        if(debug_synchronous) \
        { \
            std::cout << name << "(" << size << ")"; \
            auto _sync_error = cudaStreamSynchronize(stream); \
            if(_sync_error != cudaSuccess) return _sync_error; \
            auto _end = std::chrono::high_resolution_clock::now(); \
            auto _d = std::chrono::duration_cast<std::chrono::duration<double>>(_end - start); \
            std::cout << " " << _d.count() * 1000 << " ms" << '\n'; \
        } \
    }
// Host-side scan driver using the reduce-then-scan approach. Selected when
// Config::use_lookback is false.
//
// When temporary_storage is null, only writes the required size to storage_size.
// Otherwise the input is processed in chunks of at most aligned_size_limit items;
// for each chunk: block_reduce_kernel computes per-block reductions, a nested
// scan_impl call scans those reductions in place, and final_scan_kernel produces
// the output. Inputs that fit into one block go through single_scan_kernel.
// Returns cudaSuccess or the first runtime error encountered.
template<
    bool Exclusive,
    class Config,
    class InputIterator,
    class OutputIterator,
    class InitValueType,
    class BinaryFunction
>
inline
auto scan_impl(void * temporary_storage,
               size_t& storage_size,
               InputIterator input,
               OutputIterator output,
               const InitValueType initial_value,
               const size_t size,
               BinaryFunction scan_op,
               const cudaStream_t stream,
               bool debug_synchronous)
    -> typename std::enable_if<!Config::use_lookback, cudaError_t>::type
{
    using config = Config;
    using real_init_value_type = input_type_t<InitValueType>;

    constexpr unsigned int block_size = config::block_size;
    constexpr unsigned int items_per_thread = config::items_per_thread;
    constexpr auto items_per_block = block_size * items_per_thread;

    // Largest chunk handled per launch: size_limit rounded down to a whole number
    // of blocks, but never less than one block.
    static constexpr size_t size_limit = config::size_limit;
    static constexpr size_t aligned_size_limit
        = ::rocprim::max<size_t>(size_limit - size_limit % items_per_block, items_per_block);
    size_t limited_size = std::min<size_t>(size, aligned_size_limit);
    const bool use_limited_size = limited_size == aligned_size_limit;

    size_t nested_prefixes_size_bytes
        = scan_get_temporary_storage_bytes<real_init_value_type>(limited_size, items_per_block);

    // Calculate required temporary storage
    if(temporary_storage == nullptr)
    {
        storage_size = nested_prefixes_size_bytes;

        if(use_limited_size)
            storage_size += 4 * sizeof(real_init_value_type);

        // Make sure user won't try to allocate 0 bytes memory, because
        // cudaMalloc will return nullptr when size is zero.
        storage_size = storage_size == 0 ? 4 : storage_size;

        return cudaSuccess;
    }

    // Start point for time measurements
    std::chrono::high_resolution_clock::time_point start;

    auto number_of_blocks = (size + items_per_block - 1) / items_per_block;
    if(number_of_blocks == 0u)
        return cudaSuccess;

    if(number_of_blocks > 1)
    {
        // size_t (as in the look-back overload) avoids narrowing the quotient.
        const size_t number_of_launch = (size + limited_size - 1) / limited_size;
        for(size_t i = 0, offset = 0; i < number_of_launch; i++, offset += limited_size)
        {
            size_t current_size = std::min<size_t>(size - offset, limited_size);
            number_of_blocks = (current_size + items_per_block - 1) / items_per_block;

            if(debug_synchronous)
            {
                std::cout << "use_limited_size " << use_limited_size << '\n';
                std::cout << "number_of_launch " << number_of_launch << '\n';
                std::cout << "index " << i << '\n';
                std::cout << "aligned_size_limit " << aligned_size_limit << '\n';
                std::cout << "size " << current_size << '\n';
                std::cout << "block_size " << block_size << '\n';
                std::cout << "number of blocks " << number_of_blocks << '\n';
                std::cout << "items_per_block " << items_per_block << '\n';
                std::cout.flush();
            }

            // Pointer to array with block_prefixes
            char * ptr = reinterpret_cast<char *>(temporary_storage);
            real_init_value_type * block_prefixes = reinterpret_cast<real_init_value_type *>(ptr);

            // Slots that pass the last scanned value from one launch to the next;
            // only set up when the input is processed in size-limited chunks.
            real_init_value_type * previous_last_element = nullptr;
            real_init_value_type * new_last_element = nullptr;
            if(use_limited_size)
            {
                ptr += nested_prefixes_size_bytes;
                previous_last_element = reinterpret_cast<real_init_value_type *>(ptr);
                ptr += sizeof(real_init_value_type);
                new_last_element = reinterpret_cast<real_init_value_type *>(ptr);
            }

            // Grid size for block_reduce_kernel, we don't need to calculate reduction
            // of the last block as it will never be used as prefix for other blocks
            auto grid_size = number_of_blocks - 1;
            if(grid_size != 0)
            {
                if(debug_synchronous) start = std::chrono::high_resolution_clock::now();
                detail::block_reduce_kernel<
                    config, InputIterator, BinaryFunction, real_init_value_type
                >
                <<<dim3(grid_size), dim3(block_size), 0, stream>>>(
                    input + offset, scan_op, block_prefixes
                );
                ROCPRIM_DETAIL_HIP_SYNC_AND_RETURN_ON_ERROR("block_reduce_kernel", current_size, start)

                // For inclusive scans of later chunks, fold the previous chunk's last
                // value into the first block prefix.
                if(!Exclusive && i > 0)
                {
                    cudaError_t error = ::rocprim::transform(
                        previous_last_element, block_prefixes, block_prefixes, 1,
                        scan_op, stream, debug_synchronous
                    );
                    if(error != cudaSuccess) return error;
                }

                // TODO: Performance may increase if for (number_of_blocks < 8192) (or some other
                // threshold) we would just use CPU to calculate prefixes.

                // Calculate size of temporary storage for nested device scan operation
                void * nested_temp_storage = static_cast<void *>(block_prefixes + number_of_blocks);
                auto nested_temp_storage_size
                    = storage_size - (number_of_blocks * sizeof(real_init_value_type));

                if(debug_synchronous) start = std::chrono::high_resolution_clock::now();
                auto error = scan_impl<false, config>(
                    nested_temp_storage,
                    nested_temp_storage_size,
                    block_prefixes, // input
                    block_prefixes, // output
                    real_init_value_type(), // dummy initial value
                    number_of_blocks, // input size
                    scan_op,
                    stream,
                    debug_synchronous
                );
                if(error != cudaSuccess) return error;
                ROCPRIM_DETAIL_HIP_SYNC("nested_device_scan", number_of_blocks, start);
            }

            // Grid size for final_scan_kernel
            grid_size = number_of_blocks;

            if(debug_synchronous) start = std::chrono::high_resolution_clock::now();
            detail::final_scan_kernel<
                Exclusive, // flag for exclusive scan operation
                config, // kernel configuration (block size, ipt)
                InputIterator, OutputIterator,
                BinaryFunction, InitValueType
            >
            <<<dim3(grid_size), dim3(block_size), 0, stream>>>(
                input + offset,
                current_size,
                output + offset,
                initial_value,
                scan_op,
                block_prefixes,
                previous_last_element,
                new_last_element,
                i != size_t(0) && ((!Exclusive && number_of_blocks == 1) || Exclusive),
                number_of_launch > 1
            );
            // Report the size of the chunk this launch processed, matching the
            // other per-launch debug lines.
            ROCPRIM_DETAIL_HIP_SYNC_AND_RETURN_ON_ERROR("final_scan_kernel", current_size, start);

            // Swap the last_elements if it's necessary
            if(number_of_launch > 1)
            {
                cudaError_t error = ::rocprim::transform(
                    new_last_element, previous_last_element, 1,
                    ::rocprim::identity<real_init_value_type>(),
                    stream, debug_synchronous
                );
                if(error != cudaSuccess) return error;
            }
        }
    }
    else
    {
        if(debug_synchronous)
        {
            std::cout << "block_size " << block_size << '\n';
            std::cout << "number of blocks " << number_of_blocks << '\n';
            std::cout << "items_per_block " << items_per_block << '\n';
        }

        if(debug_synchronous) start = std::chrono::high_resolution_clock::now();
        detail::single_scan_kernel<
            Exclusive, // flag for exclusive scan operation
            config, // kernel configuration (block size, ipt)
            InputIterator, OutputIterator, BinaryFunction
        >
        <<<dim3(1), dim3(block_size), 0, stream>>>(
            input, size, initial_value, output, scan_op
        );
        ROCPRIM_DETAIL_HIP_SYNC_AND_RETURN_ON_ERROR("single_scan_kernel", size, start);
    }
    return cudaSuccess;
}
// Host-side scan driver using the single-pass look-back approach. Selected when
// Config::use_lookback is true.
//
// When temporary_storage is null, only writes the required size to storage_size.
// Otherwise the input is processed in chunks of at most aligned_size_limit items;
// for each chunk the look-back state is (re)initialized by
// init_lookback_scan_state_kernel and the chunk is scanned by lookback_scan_kernel.
// Inputs that fit into one block go through single_scan_kernel.
// Returns cudaSuccess or the first runtime error encountered.
template<
    bool Exclusive,
    class Config,
    class InputIterator,
    class OutputIterator,
    class InitValueType,
    class BinaryFunction
>
inline
auto scan_impl(void * temporary_storage,
               size_t& storage_size,
               InputIterator input,
               OutputIterator output,
               const InitValueType initial_value,
               const size_t size,
               BinaryFunction scan_op,
               const cudaStream_t stream,
               bool debug_synchronous)
    -> typename std::enable_if<Config::use_lookback, cudaError_t>::type
{
    using config = Config;
    using real_init_value_type = input_type_t<InitValueType>;

    using scan_state_type = detail::lookback_scan_state<real_init_value_type>;
    using ordered_block_id_type = detail::ordered_block_id<unsigned int>;

    constexpr unsigned int block_size = config::block_size;
    constexpr unsigned int items_per_thread = config::items_per_thread;
    constexpr auto items_per_block = block_size * items_per_thread;

    // Largest chunk handled per launch: size_limit rounded down to a whole number
    // of blocks, but never less than one block.
    static constexpr size_t size_limit = config::size_limit;
    static constexpr size_t aligned_size_limit
        = ::rocprim::max<size_t>(size_limit - size_limit % items_per_block, items_per_block);
    size_t limited_size = std::min<size_t>(size, aligned_size_limit);
    const bool use_limited_size = limited_size == aligned_size_limit;

    unsigned int number_of_blocks = (limited_size + items_per_block - 1) / items_per_block;

    // Calculate required temporary storage
    size_t scan_state_bytes = ::rocprim::detail::align_size(
        scan_state_type::get_storage_size(number_of_blocks)
    );
    size_t ordered_block_id_bytes = ordered_block_id_type::get_storage_size();
    if(temporary_storage == nullptr)
    {
        // storage_size is never zero
        storage_size = scan_state_bytes + ordered_block_id_bytes;
        if(use_limited_size)
            storage_size += 2 * sizeof(real_init_value_type);

        return cudaSuccess;
    }

    // Start point for time measurements
    std::chrono::high_resolution_clock::time_point start;

    if(number_of_blocks == 0u)
        return cudaSuccess;

    if(number_of_blocks > 1 || use_limited_size)
    {
        // Create and initialize lookback_scan_state obj
        auto scan_state = scan_state_type::create(temporary_storage, number_of_blocks);

        // Create and initialize ordered_block_id obj
        auto ptr = reinterpret_cast<char *>(temporary_storage);
        auto ordered_bid = ordered_block_id_type::create(
            reinterpret_cast<ordered_block_id_type::id_type *>(ptr + scan_state_bytes)
        );

        // The last element: two slots at the very end of the temporary storage that
        // pass the last scanned value from one launch to the next.
        real_init_value_type * previous_last_element = nullptr;
        real_init_value_type * new_last_element = nullptr;
        if(use_limited_size)
        {
            ptr += storage_size - sizeof(real_init_value_type);
            new_last_element = reinterpret_cast<real_init_value_type *>(ptr);
            ptr -= sizeof(real_init_value_type);
            previous_last_element = reinterpret_cast<real_init_value_type *>(ptr);
        }

        // The device-property query and the sleep-variant scan state that used to be
        // created here were never read, so they have been removed.

        if(debug_synchronous) start = std::chrono::high_resolution_clock::now();

        size_t number_of_launch = (size + limited_size - 1) / limited_size;
        for(size_t i = 0, offset = 0; i < number_of_launch; i++, offset += limited_size)
        {
            size_t current_size = std::min<size_t>(size - offset, limited_size);
            number_of_blocks = (current_size + items_per_block - 1) / items_per_block;
            // The init kernel is launched with enough threads to cover number_of_blocks.
            auto grid_size = (number_of_blocks + block_size - 1) / block_size;

            if(debug_synchronous)
            {
                std::cout << "use_limited_size " << use_limited_size << '\n';
                std::cout << "aligned_size_limit " << aligned_size_limit << '\n';
                std::cout << "number_of_launch " << number_of_launch << '\n';
                std::cout << "index " << i << '\n';
                std::cout << "size " << current_size << '\n';
                std::cout << "block_size " << block_size << '\n';
                std::cout << "number of blocks " << number_of_blocks << '\n';
                std::cout << "items_per_block " << items_per_block << '\n';
            }

            init_lookback_scan_state_kernel<scan_state_type>
            <<<dim3(grid_size), dim3(block_size), 0, stream>>>(
                scan_state, number_of_blocks, ordered_bid
            );
            ROCPRIM_DETAIL_HIP_SYNC_AND_RETURN_ON_ERROR("init_lookback_scan_state_kernel", number_of_blocks, start)

            if(debug_synchronous) start = std::chrono::high_resolution_clock::now();
            grid_size = number_of_blocks;

            if(debug_synchronous)
            {
                std::cout << "use_limited_size " << use_limited_size << '\n';
                std::cout << "aligned_size_limit " << aligned_size_limit << '\n';
                std::cout << "size " << current_size << '\n';
                std::cout << "block_size " << block_size << '\n';
                std::cout << "number of blocks " << number_of_blocks << '\n';
                std::cout << "items_per_block " << items_per_block << '\n';
            }

            lookback_scan_kernel<
                Exclusive, // flag for exclusive scan operation
                config, // kernel configuration (block size, ipt)
                InputIterator, OutputIterator,
                BinaryFunction, InitValueType, scan_state_type
            >
            <<<dim3(grid_size), dim3(block_size), 0, stream>>>(
                input + offset,
                output + offset,
                current_size,
                initial_value,
                scan_op,
                scan_state,
                number_of_blocks,
                ordered_bid,
                previous_last_element,
                new_last_element,
                i != size_t(0),
                number_of_launch > 1
            );
            ROCPRIM_DETAIL_HIP_SYNC_AND_RETURN_ON_ERROR("lookback_scan_kernel", current_size, start)

            // Swap the last_elements
            if(number_of_launch > 1)
            {
                cudaError_t error = ::rocprim::transform(
                    new_last_element, previous_last_element, 1,
                    ::rocprim::identity<real_init_value_type>(),
                    stream, debug_synchronous
                );
                if(error != cudaSuccess) return error;
            }
        }
    }
    else
    {
        if(debug_synchronous)
        {
            std::cout << "size " << size << '\n';
            std::cout << "block_size " << block_size << '\n';
            std::cout << "number of blocks " << number_of_blocks << '\n';
            std::cout << "items_per_block " << items_per_block << '\n';
        }

        if(debug_synchronous) start = std::chrono::high_resolution_clock::now();
        single_scan_kernel<
            Exclusive, // flag for exclusive scan operation
            config, // kernel configuration (block size, ipt)
            InputIterator, OutputIterator, BinaryFunction
        >
        <<<dim3(1), dim3(block_size), 0, stream>>>(
            input, size, initial_value, output, scan_op
        );
        ROCPRIM_DETAIL_HIP_SYNC_AND_RETURN_ON_ERROR("single_scan_kernel", size, start);
    }
    return cudaSuccess;
}
#undef ROCPRIM_DETAIL_HIP_SYNC_AND_RETURN_ON_ERROR
#undef ROCPRIM_DETAIL_HIP_SYNC
}
// end of detail namespace
/// \brief Parallel inclusive scan primitive for device level.
///
/// inclusive_scan function performs a device-wide inclusive prefix scan operation
/// using binary \p scan_op operator.
///
/// \par Overview
/// * Supports non-commutative scan operators. However, a scan operator should be
/// associative. When used with non-associative functions the results may be non-deterministic
/// and/or vary in precision.
/// * Returns the required size of \p temporary_storage in \p storage_size
/// if \p temporary_storage is a null pointer.
/// * Ranges specified by \p input and \p output must have at least \p size elements.
/// * By default, the input type is used for accumulation. A custom type
/// can be specified using <tt>rocprim::transform_iterator</tt>, see the example below.
///
/// \tparam Config - [optional] configuration of the primitive. It can be \p scan_config or
/// a custom class with the same members.
/// \tparam InputIterator - random-access iterator type of the input range. Must meet the
/// requirements of a C++ InputIterator concept. It can be a simple pointer type.
/// \tparam OutputIterator - random-access iterator type of the output range. Must meet the
/// requirements of a C++ OutputIterator concept. It can be a simple pointer type.
/// \tparam BinaryFunction - type of binary function used for scan. Default type
/// is \p rocprim::plus<T>, where \p T is a \p value_type of \p InputIterator.
///
/// \param [in] temporary_storage - pointer to a device-accessible temporary storage. When
/// a null pointer is passed, the required allocation size (in bytes) is written to
/// \p storage_size and function returns without performing the scan operation.
/// \param [in,out] storage_size - reference to a size (in bytes) of \p temporary_storage.
/// \param [in] input - iterator to the first element in the range to scan.
/// \param [out] output - iterator to the first element in the output range. It can be
/// same as \p input.
/// \param [in] size - number of elements in the input range.
/// \param [in] scan_op - binary operation function object that will be used for scan.
/// The signature of the function should be equivalent to the following:
/// <tt>T f(const T &a, const T &b);</tt>. The signature does not need to have
/// <tt>const &</tt>, but function object must not modify the objects passed to it.
/// Default is BinaryFunction().
/// \param [in] stream - [optional] HIP stream object. Default is \p 0 (default stream).
/// \param [in] debug_synchronous - [optional] If true, synchronization after every kernel
/// launch is forced in order to check for errors. Default value is \p false.
///
/// \returns \p cudaSuccess (\p 0) after successful scan; otherwise a HIP runtime error of
/// type \p cudaError_t.
///
/// \par Example
/// \parblock
/// In this example a device-level inclusive sum operation is performed on an array of
/// integer values (<tt>short</tt>s are scanned into <tt>int</tt>s).
///
/// \code{.cpp}
/// #include <rocprim/rocprim.hpp>
///
/// // Prepare input and output (declare pointers, allocate device memory etc.)
/// size_t input_size; // e.g., 8
/// short * input; // e.g., [1, 2, 3, 4, 5, 6, 7, 8]
/// int * output; // empty array of 8 elements
///
/// size_t temporary_storage_size_bytes;
/// void * temporary_storage_ptr = nullptr;
/// // Get required size of the temporary storage
/// rocprim::inclusive_scan(
/// temporary_storage_ptr, temporary_storage_size_bytes,
/// input, output, input_size, rocprim::plus<int>()
/// );
///
/// // allocate temporary storage
/// cudaMalloc(&temporary_storage_ptr, temporary_storage_size_bytes);
///
/// // perform scan
/// rocprim::inclusive_scan(
/// temporary_storage_ptr, temporary_storage_size_bytes,
/// input, output, input_size, rocprim::plus<int>()
/// );
/// // output: [1, 3, 6, 10, 15, 21, 28, 36]
/// \endcode
///
/// The same example as above, but now a custom accumulator type is specified.
///
/// \code{.cpp}
/// #include <rocprim/rocprim.hpp>
///
/// size_t input_size;
/// short * input;
/// int * output;
///
/// // Use a transform iterator to specify a custom accumulator type
/// auto input_iterator = rocprim::make_transform_iterator(
/// input, [] __device__ (short in) { return static_cast<int>(in); });
///
/// size_t temporary_storage_size_bytes;
/// void * temporary_storage_ptr = nullptr;
/// // Use the transform iterator
/// rocprim::inclusive_scan(
/// temporary_storage_ptr, temporary_storage_size_bytes,
/// input_iterator, output, input_size, rocprim::plus<int>()
/// );
///
/// cudaMalloc(&temporary_storage_ptr, temporary_storage_size_bytes);
///
/// rocprim::inclusive_scan(
/// temporary_storage_ptr, temporary_storage_size_bytes,
/// input_iterator, output, input_size, rocprim::plus<int>()
/// );
/// \endcode
/// \endparblock
template<typename Config = default_config,
         typename InputIterator,
         typename OutputIterator,
         typename BinaryFunction
         = ::rocprim::plus<typename std::iterator_traits<InputIterator>::value_type>>
inline cudaError_t inclusive_scan(void*              temporary_storage,
                                  size_t&            storage_size,
                                  InputIterator      input,
                                  OutputIterator     output,
                                  const size_t       size,
                                  BinaryFunction     scan_op           = BinaryFunction(),
                                  const cudaStream_t stream            = 0,
                                  bool               debug_synchronous = false)
{
    // Element type of the input range: it picks the default tuning and also
    // provides the placeholder initial value passed further down.
    using element_type = typename std::iterator_traits<InputIterator>::value_type;

    // Tuning that applies when the caller leaves Config as default_config.
    using fallback_config = detail::default_scan_config<ROCPRIM_TARGET_ARCH, element_type>;
    using selected_config = detail::default_or_custom_config<Config, fallback_config>;

    // The inclusive variant is scan_impl with its Exclusive flag switched off.
    constexpr bool exclusive = false;

    // scan_impl always takes an initial value; a value-initialized element is
    // handed over as a dummy because the inclusive scan does not use it.
    return detail::scan_impl<exclusive, selected_config>(temporary_storage,
                                                         storage_size,
                                                         input,
                                                         output,
                                                         element_type(),
                                                         size,
                                                         scan_op,
                                                         stream,
                                                         debug_synchronous);
}
/// \brief Parallel exclusive scan primitive for device level.
///
/// exclusive_scan function performs a device-wide exclusive prefix scan operation
/// using binary \p scan_op operator.
///
/// \par Overview
/// * Supports non-commutative scan operators. However, a scan operator should be
/// associative. When used with non-associative functions the results may be non-deterministic
/// and/or vary in precision.
/// * Returns the required size of \p temporary_storage in \p storage_size
/// if \p temporary_storage is a null pointer.
/// * Ranges specified by \p input and \p output must have at least \p size elements.
///
/// \tparam Config - [optional] configuration of the primitive. It can be \p scan_config or
/// a custom class with the same members.
/// \tparam InputIterator - random-access iterator type of the input range. Must meet the
/// requirements of a C++ InputIterator concept. It can be a simple pointer type.
/// \tparam OutputIterator - random-access iterator type of the output range. Must meet the
/// requirements of a C++ OutputIterator concept. It can be a simple pointer type.
/// \tparam InitValueType - type of the initial value.
/// \tparam BinaryFunction - type of binary function used for scan. Default type
/// is \p rocprim::plus<T>, where \p T is a \p value_type of \p InputIterator.
///
/// \param [in] temporary_storage - pointer to a device-accessible temporary storage. When
/// a null pointer is passed, the required allocation size (in bytes) is written to
/// \p storage_size and function returns without performing the scan operation.
/// \param [in,out] storage_size - reference to a size (in bytes) of \p temporary_storage.
/// \param [in] input - iterator to the first element in the range to scan.
/// \param [out] output - iterator to the first element in the output range. It can be
/// same as \p input.
/// \param [in] initial_value - initial value to start the scan.
/// A rocprim::future_value may be passed to use a value that will be later computed.
/// \param [in] size - number of elements in the input range.
/// \param [in] scan_op - binary operation function object that will be used for scan.
/// The signature of the function should be equivalent to the following:
/// <tt>T f(const T &a, const T &b);</tt>. The signature does not need to have
/// <tt>const &</tt>, but function object must not modify the objects passed to it.
/// The default value is \p BinaryFunction().
/// \param [in] stream - [optional] HIP stream object. The default is \p 0 (default stream).
/// \param [in] debug_synchronous - [optional] If true, synchronization after every kernel
/// launch is forced in order to check for errors. The default value is \p false.
///
/// \returns \p cudaSuccess (\p 0) after successful scan; otherwise a HIP runtime error of
/// type \p cudaError_t.
///
/// \par Example
/// \parblock
/// In this example a device-level exclusive min-scan operation is performed on an array of
/// integer values (<tt>short</tt>s are scanned into <tt>int</tt>s) using custom operator.
///
/// \code{.cpp}
/// #include <rocprim/rocprim.hpp>
///
/// // custom scan function
/// auto min_op =
/// [] __device__ (int a, int b) -> int
/// {
/// return a < b ? a : b;
/// };
///
/// // Prepare input and output (declare pointers, allocate device memory etc.)
/// size_t input_size; // e.g., 8
/// short * input; // e.g., [4, 7, 6, 2, 5, 1, 3, 8]
/// int * output; // empty array of 8 elements
/// int start_value; // e.g., 9
///
/// size_t temporary_storage_size_bytes;
/// void * temporary_storage_ptr = nullptr;
/// // Get required size of the temporary storage
/// rocprim::exclusive_scan(
/// temporary_storage_ptr, temporary_storage_size_bytes,
/// input, output, start_value, input_size, min_op
/// );
///
/// // allocate temporary storage
/// cudaMalloc(&temporary_storage_ptr, temporary_storage_size_bytes);
///
/// // perform scan
/// rocprim::exclusive_scan(
/// temporary_storage_ptr, temporary_storage_size_bytes,
/// input, output, start_value, input_size, min_op
/// );
/// // output: [9, 4, 4, 4, 2, 2, 1, 1]
/// \endcode
/// \endparblock
template<typename Config = default_config,
         typename InputIterator,
         typename OutputIterator,
         typename InitValueType,
         typename BinaryFunction
         = ::rocprim::plus<typename std::iterator_traits<InputIterator>::value_type>>
inline cudaError_t exclusive_scan(void*               temporary_storage,
                                  size_t&             storage_size,
                                  InputIterator       input,
                                  OutputIterator      output,
                                  const InitValueType initial_value,
                                  const size_t        size,
                                  BinaryFunction      scan_op           = BinaryFunction(),
                                  const cudaStream_t  stream            = 0,
                                  bool                debug_synchronous = false)
{
    // Type obtained by passing InitValueType through detail::input_type_t;
    // the default tuning is keyed on this type.
    using unwrapped_init_type = detail::input_type_t<InitValueType>;

    // Tuning that applies when the caller leaves Config as default_config.
    using fallback_config
        = detail::default_scan_config<ROCPRIM_TARGET_ARCH, unwrapped_init_type>;
    using selected_config = detail::default_or_custom_config<Config, fallback_config>;

    // The exclusive variant is scan_impl with its Exclusive flag switched on.
    constexpr bool exclusive = true;

    // initial_value is forwarded exactly as received.
    return detail::scan_impl<exclusive, selected_config>(temporary_storage,
                                                         storage_size,
                                                         input,
                                                         output,
                                                         initial_value,
                                                         size,
                                                         scan_op,
                                                         stream,
                                                         debug_synchronous);
}
/// @}
// end of group devicemodule
END_ROCPRIM_NAMESPACE
#endif // ROCPRIM_DEVICE_DEVICE_SCAN_HPP_
3rdparty/cub/rocprim/device/device_scan_by_key.hpp
0 → 100644
View file @
f8a481f8
// Copyright (c) 2017-2022 Advanced Micro Devices, Inc. All rights reserved.
//
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to deal
// in the Software without restriction, including without limitation the rights
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
// copies of the Software, and to permit persons to whom the Software is
// furnished to do so, subject to the following conditions:
//
// The above copyright notice and this permission notice shall be included in
// all copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
// THE SOFTWARE.
#ifndef ROCPRIM_DEVICE_DEVICE_SCAN_BY_KEY_HPP_
#define ROCPRIM_DEVICE_DEVICE_SCAN_BY_KEY_HPP_
#include "detail/device_scan_by_key.hpp"
#include "detail/lookback_scan_state.hpp"
#include "detail/ordered_block_id.hpp"
#include "config_types.hpp"
#include "device_scan_by_key_config.hpp"
#include "../config.hpp"
#include "../detail/various.hpp"
#include "../functional.hpp"
#include "../types/future_value.hpp"
#include "../types/tuple.hpp"
#include <cuda_runtime.h>
#include <chrono>
#include <iostream>
#include <iterator>
#include <type_traits>
BEGIN_ROCPRIM_NAMESPACE
namespace
detail
{
template
<
bool
Exclusive
,
typename
Config
,
typename
KeyInputIterator
,
typename
InputIterator
,
typename
OutputIterator
,
typename
InitialValueType
,
typename
CompareFunction
,
typename
BinaryFunction
,
typename
LookbackScanState
,
typename
ResultType
>
void
__global__
__launch_bounds__
(
Config
::
block_size
)
device_scan_by_key_kernel
(
const
KeyInputIterator
keys
,
const
InputIterator
values
,
const
OutputIterator
output
,
const
InitialValueType
initial_value
,
const
CompareFunction
compare
,
const
BinaryFunction
scan_op
,
const
LookbackScanState
scan_state
,
const
size_t
size
,
const
size_t
starting_block
,
const
size_t
number_of_blocks
,
const
ordered_block_id
<
unsigned
int
>
ordered_bid
,
const
::
rocprim
::
tuple
<
ResultType
,
bool
>*
const
previous_last_value
)
{
device_scan_by_key_kernel_impl
<
Exclusive
,
Config
>
(
keys
,
values
,
output
,
get_input_value
(
initial_value
),
compare
,
scan_op
,
scan_state
,
size
,
starting_block
,
number_of_blocks
,
ordered_bid
,
previous_last_value
);
}
// Post-launch error check, meant to be expanded right after a kernel launch.
//
// Reads cudaGetLastError() and, if it is not cudaSuccess, returns it from the
// enclosing function. When `debug_synchronous` is true it additionally prints
// `name(size)`, synchronizes `stream` (returning any synchronization error),
// and prints the time elapsed since `start` in milliseconds.
//
// `debug_synchronous` and `stream` are not macro parameters: they must be
// visible at the expansion site. Locals use a single leading underscore;
// identifiers containing a double underscore are reserved for the implementation.
#define ROCPRIM_DETAIL_HIP_SYNC_AND_RETURN_ON_ERROR(name, size, start)                          \
    do                                                                                          \
    {                                                                                           \
        auto _error = cudaGetLastError();                                                       \
        if(_error != cudaSuccess)                                                               \
            return _error;                                                                      \
        if(debug_synchronous)                                                                   \
        {                                                                                       \
            std::cout << name << "(" << size << ")";                                            \
            auto _sync_error = cudaStreamSynchronize(stream);                                   \
            if(_sync_error != cudaSuccess)                                                      \
                return _sync_error;                                                             \
            auto _end = std::chrono::high_resolution_clock::now();                              \
            auto _d = std::chrono::duration_cast<std::chrono::duration<double>>(_end - start);  \
            std::cout << " " << _d.count() * 1000 << " ms" << '\n';                             \
        }                                                                                       \
    } while(false)
// Host-side driver for the device-wide scan-by-key.
//
// Size query: when temporary_storage is null, only the required byte count is
// written to storage_size and cudaSuccess is returned.
//
// Execution: the input is processed in launches of at most aligned_size_limit
// items (config::size_limit rounded down to a multiple of items_per_block, but
// never less than one block). Each launch runs init_lookback_scan_state_kernel
// followed by device_scan_by_key_kernel on `stream`. The first CUDA error that
// is observed is returned.
//
// Temporary storage layout:
//   [ scan state        : scan_state_bytes                                    ]
//   [ ordered block id  : ordered_block_id_bytes                              ]
//   [ wrapped_type slot : sizeof(wrapped_type), only when use_limited_size    ]
// The trailing slot is handed to the kernels as previous_last_value from the
// second launch on (presumably carrying the last value across launches; the
// kernels that consume it are defined elsewhere).
template<bool Exclusive,
         typename Config,
         typename KeysInputIterator,
         typename InputIterator,
         typename OutputIterator,
         typename InitValueType,
         typename BinaryFunction,
         typename CompareFunction>
inline cudaError_t scan_by_key_impl(void* const           temporary_storage,
                                    size_t&               storage_size,
                                    KeysInputIterator     keys,
                                    InputIterator         input,
                                    OutputIterator        output,
                                    const InitValueType   initial_value,
                                    const size_t          size,
                                    const BinaryFunction  scan_op,
                                    const CompareFunction compare,
                                    const cudaStream_t    stream,
                                    const bool            debug_synchronous)
{
    using config               = Config;
    using real_init_value_type = input_type_t<InitValueType>;

    using wrapped_type = ::rocprim::tuple<real_init_value_type, bool>;

    using scan_state_type            = detail::lookback_scan_state<wrapped_type>;
    using scan_state_with_sleep_type = detail::lookback_scan_state<wrapped_type, true>;

    using ordered_block_id_type = detail::ordered_block_id<unsigned int>;

    constexpr unsigned int block_size       = config::block_size;
    constexpr unsigned int items_per_thread = config::items_per_thread;
    constexpr auto         items_per_block  = block_size * items_per_thread;

    static constexpr size_t size_limit = config::size_limit;
    static constexpr size_t aligned_size_limit
        = ::rocprim::max<size_t>(size_limit - size_limit % items_per_block, items_per_block);

    const size_t limited_size     = std::min<size_t>(size, aligned_size_limit);
    const bool   use_limited_size = limited_size == aligned_size_limit;

    // Number of blocks in a single launch (or the only launch if it fits)
    const unsigned int number_of_blocks = ceiling_div(limited_size, items_per_block);

    // Calculate required temporary storage, this is valid even with scan_state_with_sleep_type
    const size_t scan_state_bytes
        = align_size(scan_state_type::get_storage_size(number_of_blocks));
    // Computed up front because both the size query and the layout of the
    // trailing wrapped_type slot depend on it.
    const size_t ordered_block_id_bytes
        = align_size(ordered_block_id_type::get_storage_size(), alignof(wrapped_type));

    if(temporary_storage == nullptr)
    {
        // storage_size is never zero
        storage_size = scan_state_bytes + ordered_block_id_bytes
                       + (use_limited_size ? sizeof(wrapped_type) : 0);

        return cudaSuccess;
    }

    if(number_of_blocks == 0u)
    {
        return cudaSuccess;
    }

    bool use_sleep;
    if(const cudaError_t error = is_sleep_scan_state_used(use_sleep))
    {
        return error;
    }

    // Call the provided function with either scan_state or scan_state_with_sleep based on
    // the value of use_sleep
    auto with_scan_state
        = [use_sleep,
           scan_state = scan_state_type::create(temporary_storage, number_of_blocks),
           scan_state_with_sleep
           = scan_state_with_sleep_type::create(temporary_storage, number_of_blocks)](
              auto&& func) mutable -> decltype(auto)
    {
        if(use_sleep)
        {
            return func(scan_state_with_sleep);
        }
        else
        {
            return func(scan_state);
        }
    };

    // Create and initialize ordered_block_id obj
    auto* const ptr         = static_cast<char*>(temporary_storage);
    const auto  ordered_bid = ordered_block_id_type::create(
        reinterpret_cast<ordered_block_id_type::id_type*>(ptr + scan_state_bytes));

    // The last element.
    // Its slot is located from the fixed layout rather than from the end of the
    // caller-supplied storage_size: the two coincide when the caller passes the
    // exact size returned by the query, but this way a larger buffer size cannot
    // move the slot or leave it misaligned.
    auto* const previous_last_value
        = use_limited_size ? reinterpret_cast<wrapped_type*>(ptr + scan_state_bytes
                                                             + ordered_block_id_bytes)
                           : nullptr;

    // Total number of blocks in all launches
    const auto   total_number_of_blocks = ceiling_div(size, items_per_block);
    const size_t number_of_launch       = ceiling_div(size, limited_size);

    if(debug_synchronous)
    {
        std::cout << "----------------------------------\n";
        std::cout << "size: " << size << '\n';
        std::cout << "aligned_size_limit: " << aligned_size_limit << '\n';
        std::cout << "use_limited_size: " << std::boolalpha << use_limited_size << '\n';
        std::cout << "number_of_launch: " << number_of_launch << '\n';
        std::cout << "block_size: " << block_size << '\n';
        std::cout << "items_per_block: " << items_per_block << '\n';
        std::cout << "----------------------------------\n";
    }

    for(size_t i = 0, offset = 0; i < number_of_launch; i++, offset += limited_size)
    {
        const size_t current_size   = std::min<size_t>(size - offset, limited_size);
        const auto   scan_blocks    = ceiling_div(current_size, items_per_block);
        const auto   init_grid_size = ceiling_div(scan_blocks, block_size);

        // Start point for time measurements
        std::chrono::high_resolution_clock::time_point start;
        if(debug_synchronous)
        {
            std::cout << "index: " << i << '\n';
            std::cout << "current_size: " << current_size << '\n';
            std::cout << "number of blocks: " << scan_blocks << '\n';

            start = std::chrono::high_resolution_clock::now();
        }

        // Reset the look-back state for this launch. previous_last_value is only
        // passed from the second launch on; the first launch gets nullptr.
        with_scan_state(
            [&](const auto scan_state)
            {
                init_lookback_scan_state_kernel<<<dim3(init_grid_size),
                                                  dim3(block_size),
                                                  0,
                                                  stream>>>(
                    scan_state,
                    scan_blocks,
                    ordered_bid,
                    number_of_blocks - 1,
                    i > 0 ? previous_last_value : nullptr);
            });
        ROCPRIM_DETAIL_HIP_SYNC_AND_RETURN_ON_ERROR("init_lookback_scan_state_kernel",
                                                    scan_blocks,
                                                    start);

        if(debug_synchronous)
        {
            start = std::chrono::high_resolution_clock::now();
        }

        // Scan this slice. The iterators are advanced by `offset`, while the
        // kernel still receives the total size, the index of the first block of
        // this launch and the total block count over all launches.
        with_scan_state(
            [&](auto& scan_state)
            {
                device_scan_by_key_kernel<Exclusive, config>
                    <<<dim3(scan_blocks), dim3(block_size), 0, stream>>>(
                        keys + offset,
                        input + offset,
                        output + offset,
                        initial_value,
                        compare,
                        scan_op,
                        scan_state,
                        size,
                        i * number_of_blocks,
                        total_number_of_blocks,
                        ordered_bid,
                        i > 0 ? previous_last_value : nullptr);
            });
        ROCPRIM_DETAIL_HIP_SYNC_AND_RETURN_ON_ERROR("device_scan_by_key_kernel",
                                                    current_size,
                                                    start);
    }
    return cudaSuccess;
}
#undef ROCPRIM_DETAIL_HIP_SYNC_AND_RETURN_ON_ERROR
}
/// \addtogroup devicemodule
/// @{
/// \brief Parallel inclusive scan-by-key primitive for device level.
///
/// inclusive_scan_by_key function performs a device-wide inclusive prefix scan-by-key
/// operation using binary \p scan_op operator.
///
/// \par Overview
/// * Supports non-commutative scan operators. However, a scan operator should be
/// associative. When used with non-associative functions the results may be non-deterministic
/// and/or vary in precision.
/// * Returns the required size of \p temporary_storage in \p storage_size
/// if \p temporary_storage is a null pointer.
/// * Ranges specified by \p keys_input, \p values_input, and \p values_output must have
/// at least \p size elements.
///
/// \tparam Config - [optional] configuration of the primitive. It can be \p scan_config or
/// a custom class with the same members.
/// \tparam KeysInputIterator - random-access iterator type of the input range. It can be
/// a simple pointer type.
/// \tparam ValuesInputIterator - random-access iterator type of the input range. It can be
/// a simple pointer type.
/// \tparam ValuesOutputIterator - random-access iterator type of the output range. It can be
/// a simple pointer type.
/// \tparam BinaryFunction - type of binary function used for scan. Default type
/// is \p rocprim::plus<T>, where \p T is a \p value_type of \p ValuesInputIterator.
/// \tparam KeyCompareFunction - type of binary function used to determine keys equality. Default type
/// is \p rocprim::equal_to<T>, where \p T is a \p value_type of \p KeysInputIterator.
///
/// \param [in] temporary_storage - pointer to a device-accessible temporary storage. When
/// a null pointer is passed, the required allocation size (in bytes) is written to
/// \p storage_size and function returns without performing the scan operation.
/// \param [in,out] storage_size - reference to a size (in bytes) of \p temporary_storage.
/// \param [in] keys_input - iterator to the first element in the range of keys.
/// \param [in] values_input - iterator to the first element in the range of values to scan.
/// \param [out] values_output - iterator to the first element in the output value range.
/// \param [in] size - number of elements in the input range.
/// \param [in] scan_op - binary operation function object that will be used for scanning
/// input values.
/// The signature of the function should be equivalent to the following:
/// <tt>T f(const T &a, const T &b);</tt>. The signature does not need to have
/// <tt>const &</tt>, but function object must not modify the objects passed to it.
/// Default is BinaryFunction().
/// \param [in] key_compare_op - binary operation function object that will be used to determine keys equality.
/// The signature of the function should be equivalent to the following:
/// <tt>bool f(const T &a, const T &b);</tt>. The signature does not need to have
/// <tt>const &</tt>, but function object must not modify the objects passed to it.
/// Default is KeyCompareFunction().
/// \param [in] stream - [optional] HIP stream object. Default is \p 0 (default stream).
/// \param [in] debug_synchronous - [optional] If true, synchronization after every kernel
/// launch is forced in order to check for errors. Default value is \p false.
///
/// \returns \p cudaSuccess (\p 0) after successful scan; otherwise a HIP runtime error of
/// type \p cudaError_t.
///
/// \par Example
/// \parblock
/// In this example a device-level inclusive sum-by-key operation is performed on an array of
/// integer values (<tt>short</tt>s are scanned into <tt>int</tt>s).
///
/// \code{.cpp}
/// #include <rocprim/rocprim.hpp>
///
/// // Prepare input and output (declare pointers, allocate device memory etc.)
/// size_t size; // e.g., 8
/// int * keys_input; // e.g., [1, 1, 2, 2, 3, 3, 3, 5]
/// short * values_input; // e.g., [1, 2, 3, 4, 5, 6, 7, 8]
/// int * values_output; // empty array of 8 elements
///
/// size_t temporary_storage_size_bytes;
/// void * temporary_storage_ptr = nullptr;
/// // Get required size of the temporary storage
/// rocprim::inclusive_scan_by_key(
/// temporary_storage_ptr, temporary_storage_size_bytes,
/// keys_input, values_input,
/// values_output, size,
/// rocprim::plus<int>()
/// );
///
/// // allocate temporary storage
/// cudaMalloc(&temporary_storage_ptr, temporary_storage_size_bytes);
///
/// // perform scan-by-key
/// rocprim::inclusive_scan_by_key(
/// temporary_storage_ptr, temporary_storage_size_bytes,
/// keys_input, values_input,
/// values_output, size,
/// rocprim::plus<int>()
/// );
/// // values_output: [1, 3, 3, 7, 5, 11, 18, 8]
/// \endcode
/// \endparblock
template<class Config = default_config,
         class KeysInputIterator,
         class ValuesInputIterator,
         class ValuesOutputIterator,
         class BinaryFunction
         = ::rocprim::plus<typename std::iterator_traits<ValuesInputIterator>::value_type>,
         class KeyCompareFunction
         = ::rocprim::equal_to<typename std::iterator_traits<KeysInputIterator>::value_type>>
inline cudaError_t inclusive_scan_by_key(void* const                temporary_storage,
                                         size_t&                    storage_size,
                                         const KeysInputIterator    keys_input,
                                         const ValuesInputIterator  values_input,
                                         const ValuesOutputIterator values_output,
                                         const size_t               size,
                                         const BinaryFunction       scan_op = BinaryFunction(),
                                         const KeyCompareFunction   key_compare_op
                                         = KeyCompareFunction(),
                                         const cudaStream_t stream            = 0,
                                         const bool         debug_synchronous = false)
{
    // Element types of the key and value ranges; together they select the
    // default tuning.
    using key_element   = typename std::iterator_traits<KeysInputIterator>::value_type;
    using value_element = typename std::iterator_traits<ValuesInputIterator>::value_type;

    // Tuning that applies when the caller leaves Config as default_config.
    using fallback_config
        = detail::default_scan_by_key_config<ROCPRIM_TARGET_ARCH, key_element, value_element>;
    using selected_config = detail::default_or_custom_config<Config, fallback_config>;

    // The inclusive variant is scan_by_key_impl with its Exclusive flag off.
    constexpr bool exclusive = false;

    // scan_by_key_impl always takes an initial value; the inclusive entry point
    // supplies a value-initialized element in that position.
    return detail::scan_by_key_impl<exclusive, selected_config>(temporary_storage,
                                                                storage_size,
                                                                keys_input,
                                                                values_input,
                                                                values_output,
                                                                value_element(),
                                                                size,
                                                                scan_op,
                                                                key_compare_op,
                                                                stream,
                                                                debug_synchronous);
}
/// \brief Parallel exclusive scan-by-key primitive for device level.
///
/// exclusive_scan_by_key function performs a device-wide exclusive prefix scan-by-key
/// operation using binary \p scan_op operator.
///
/// \par Overview
/// * Supports non-commutative scan operators. However, a scan operator should be
/// associative. When used with non-associative functions the results may be non-deterministic
/// and/or vary in precision.
/// * Returns the required size of \p temporary_storage in \p storage_size
/// if \p temporary_storage is a null pointer.
/// * Ranges specified by \p keys_input, \p values_input, and \p values_output must have
/// at least \p size elements.
///
/// \tparam Config - [optional] configuration of the primitive. It can be \p scan_config or
/// a custom class with the same members.
/// \tparam KeysInputIterator - random-access iterator type of the input range. It can be
/// a simple pointer type.
/// \tparam ValuesInputIterator - random-access iterator type of the input range. It can be
/// a simple pointer type.
/// \tparam ValuesOutputIterator - random-access iterator type of the output range. It can be
/// a simple pointer type.
/// \tparam InitValueType - type of the initial value.
/// \tparam BinaryFunction - type of binary function used for scan. Default type
/// is \p rocprim::plus<T>, where \p T is a \p value_type of \p ValuesInputIterator.
/// \tparam KeyCompareFunction - type of binary function used to determine keys equality. Default type
/// is \p rocprim::equal_to<T>, where \p T is a \p value_type of \p KeysInputIterator.
///
/// \param [in] temporary_storage - pointer to a device-accessible temporary storage. When
/// a null pointer is passed, the required allocation size (in bytes) is written to
/// \p storage_size and function returns without performing the scan operation.
/// \param [in,out] storage_size - reference to a size (in bytes) of \p temporary_storage.
/// \param [in] keys_input - iterator to the first element in the range of keys.
/// \param [in] values_input - iterator to the first element in the range of values to scan.
/// \param [out] values_output - iterator to the first element in the output value range.
/// \param [in] initial_value - initial value to start the scan.
/// A rocprim::future_value may be passed to use a value that will be later computed.
/// \param [in] size - number of elements in the input range.
/// \param [in] scan_op - binary operation function object that will be used for scanning
/// input values.
/// The signature of the function should be equivalent to the following:
/// <tt>T f(const T &a, const T &b);</tt>. The signature does not need to have
/// <tt>const &</tt>, but function object must not modify the objects passed to it.
/// Default is BinaryFunction().
/// \param [in] key_compare_op - binary operation function object that will be used to determine keys equality.
/// The signature of the function should be equivalent to the following:
/// <tt>bool f(const T &a, const T &b);</tt>. The signature does not need to have
/// <tt>const &</tt>, but function object must not modify the objects passed to it.
/// Default is KeyCompareFunction().
/// \param [in] stream - [optional] HIP stream object. Default is \p 0 (default stream).
/// \param [in] debug_synchronous - [optional] If true, synchronization after every kernel
/// launch is forced in order to check for errors. Default value is \p false.
///
/// \returns \p cudaSuccess (\p 0) after successful scan; otherwise a HIP runtime error of
/// type \p cudaError_t.
///
/// \par Example
/// \parblock
/// In this example a device-level exclusive sum-by-key operation is performed on an array of
/// integer values (<tt>short</tt>s are scanned into <tt>int</tt>s).
///
/// \code{.cpp}
/// #include <rocprim/rocprim.hpp>
///
/// // Prepare input and output (declare pointers, allocate device memory etc.)
/// size_t size; // e.g., 8
/// int * keys_input; // e.g., [1, 1, 1, 2, 2, 3, 3, 4]
/// short * values_input; // e.g., [1, 2, 3, 4, 5, 6, 7, 8]
/// int start_value; // e.g., 9
/// int * values_output; // empty array of 8 elements
///
/// size_t temporary_storage_size_bytes;
/// void * temporary_storage_ptr = nullptr;
/// // Get required size of the temporary storage
/// rocprim::exclusive_scan_by_key(
/// temporary_storage_ptr, temporary_storage_size_bytes,
/// keys_input, values_input,
/// values_output, start_value,
/// size,rocprim::plus<int>()
/// );
///
/// // allocate temporary storage
/// cudaMalloc(&temporary_storage_ptr, temporary_storage_size_bytes);
///
/// // perform scan-by-key
/// rocprim::exclusive_scan_by_key(
/// temporary_storage_ptr, temporary_storage_size_bytes,
/// keys_input, values_input,
/// values_output, start_value,
/// size,rocprim::plus<int>()
/// );
/// // values_output: [9, 10, 12, 9, 13, 9, 15, 9]
/// \endcode
/// \endparblock
template<class Config = default_config,
         class KeysInputIterator,
         class ValuesInputIterator,
         class ValuesOutputIterator,
         class InitialValueType,
         class BinaryFunction
         = ::rocprim::plus<typename std::iterator_traits<ValuesInputIterator>::value_type>,
         class KeyCompareFunction
         = ::rocprim::equal_to<typename std::iterator_traits<KeysInputIterator>::value_type>>
inline cudaError_t exclusive_scan_by_key(void* const                temporary_storage,
                                         size_t&                    storage_size,
                                         const KeysInputIterator    keys_input,
                                         const ValuesInputIterator  values_input,
                                         const ValuesOutputIterator values_output,
                                         const InitialValueType     initial_value,
                                         const size_t               size,
                                         const BinaryFunction       scan_op = BinaryFunction(),
                                         const KeyCompareFunction   key_compare_op
                                         = KeyCompareFunction(),
                                         const cudaStream_t stream            = 0,
                                         const bool         debug_synchronous = false)
{
    // Key element type and the type obtained by passing InitialValueType through
    // detail::input_type_t; together they select the default tuning.
    using key_element         = typename std::iterator_traits<KeysInputIterator>::value_type;
    using unwrapped_init_type = detail::input_type_t<InitialValueType>;

    // Tuning that applies when the caller leaves Config as default_config.
    using fallback_config = detail::
        default_scan_by_key_config<ROCPRIM_TARGET_ARCH, key_element, unwrapped_init_type>;
    using selected_config = detail::default_or_custom_config<Config, fallback_config>;

    // The exclusive variant is scan_by_key_impl with its Exclusive flag on.
    constexpr bool exclusive = true;

    // initial_value is forwarded exactly as received.
    return detail::scan_by_key_impl<exclusive, selected_config>(temporary_storage,
                                                                storage_size,
                                                                keys_input,
                                                                values_input,
                                                                values_output,
                                                                initial_value,
                                                                size,
                                                                scan_op,
                                                                key_compare_op,
                                                                stream,
                                                                debug_synchronous);
}
/// @}
// end of group devicemodule
END_ROCPRIM_NAMESPACE
#endif // ROCPRIM_DEVICE_DEVICE_SCAN_BY_KEY_HPP_
3rdparty/cub/rocprim/device/device_scan_by_key_config.hpp
0 → 100644
View file @
f8a481f8
// Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved.
//
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to deal
// in the Software without restriction, including without limitation the rights
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
// copies of the Software, and to permit persons to whom the Software is
// furnished to do so, subject to the following conditions:
//
// The above copyright notice and this permission notice shall be included in
// all copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
// THE SOFTWARE.
#ifndef ROCPRIM_DEVICE_DEVICE_SCAN_BY_KEY_CONFIG_HPP_
#define ROCPRIM_DEVICE_DEVICE_SCAN_BY_KEY_CONFIG_HPP_
#include <type_traits>
#include "../config.hpp"
#include "../detail/various.hpp"
#include "config_types.hpp"
/// \addtogroup primitivesmodule_deviceconfigs
/// @{
BEGIN_ROCPRIM_NAMESPACE
/// \brief Configuration of device-level scan-by-key operation.
///
/// \tparam BlockSize - number of threads in a block.
/// \tparam ItemsPerThread - number of items processed by each thread.
/// \tparam UseLookback - whether to use lookback scan or reduce-then-scan algorithm.
/// \tparam BlockLoadMethod - method for loading input values.
/// \tparam BlockStoreMethod - method for storing values.
/// \tparam BlockScanMethod - algorithm for block scan.
/// \tparam SizeLimit - limit on the number of items for a single scan kernel launch.
template<unsigned int                    BlockSize,
         unsigned int                    ItemsPerThread,
         bool                            UseLookback,
         ::rocprim::block_load_method    BlockLoadMethod,
         ::rocprim::block_store_method   BlockStoreMethod,
         ::rocprim::block_scan_algorithm BlockScanMethod,
         unsigned int                    SizeLimit = ROCPRIM_GRID_SIZE_LIMIT>
struct scan_by_key_config
{
    /// \brief Threads per block (value of \p BlockSize).
    static constexpr unsigned int block_size = BlockSize;

    /// \brief Items handled by each thread (value of \p ItemsPerThread).
    static constexpr unsigned int items_per_thread = ItemsPerThread;

    /// \brief Selects lookback scan versus reduce-then-scan (value of \p UseLookback).
    static constexpr bool use_lookback = UseLookback;

    /// \brief How input values are loaded (value of \p BlockLoadMethod).
    static constexpr ::rocprim::block_load_method block_load_method = BlockLoadMethod;

    /// \brief How results are stored (value of \p BlockStoreMethod).
    static constexpr ::rocprim::block_store_method block_store_method = BlockStoreMethod;

    /// \brief Block-level scan algorithm (value of \p BlockScanMethod).
    static constexpr ::rocprim::block_scan_algorithm block_scan_method = BlockScanMethod;

    /// \brief Upper bound on items per single scan kernel launch (value of \p SizeLimit).
    static constexpr unsigned int size_limit = SizeLimit;
};
namespace
detail
{
template
<
class
Key
,
class
Value
>
struct
scan_by_key_config_900
{
static
constexpr
unsigned
int
item_scale
=
::
rocprim
::
detail
::
ceiling_div
<
unsigned
int
>
(
sizeof
(
Key
)
+
sizeof
(
Value
),
2
*
sizeof
(
int
));
using
type
=
scan_config
<
limit_block_size
<
256U
,
sizeof
(
Key
)
+
sizeof
(
Value
),
ROCPRIM_WARP_SIZE_64
>::
value
,
::
rocprim
::
max
(
1u
,
16u
/
item_scale
),
ROCPRIM_DETAIL_USE_LOOKBACK_SCAN
,
::
rocprim
::
block_load_method
::
block_load_transpose
,
::
rocprim
::
block_store_method
::
block_store_transpose
,
::
rocprim
::
block_scan_algorithm
::
using_warp_scan
>
;
};
template
<
class
Key
,
class
Value
>
struct
scan_by_key_config_90a
{
static
constexpr
unsigned
int
item_scale
=
::
rocprim
::
detail
::
ceiling_div
<
unsigned
int
>
(
sizeof
(
Key
)
+
sizeof
(
Value
),
2
*
sizeof
(
int
));
using
type
=
scan_config
<
limit_block_size
<
256U
,
sizeof
(
Key
)
+
sizeof
(
Value
),
ROCPRIM_WARP_SIZE_64
>::
value
,
::
rocprim
::
max
(
1u
,
16u
/
item_scale
),
ROCPRIM_DETAIL_USE_LOOKBACK_SCAN
,
::
rocprim
::
block_load_method
::
block_load_transpose
,
::
rocprim
::
block_store_method
::
block_store_transpose
,
::
rocprim
::
block_scan_algorithm
::
using_warp_scan
>
;
};
template
<
class
Key
,
class
Value
>
struct
scan_by_key_config_908
{
static
constexpr
unsigned
int
item_scale
=
::
rocprim
::
detail
::
ceiling_div
<
unsigned
int
>
(
sizeof
(
Key
)
+
sizeof
(
Value
),
2
*
sizeof
(
int
));
using
type
=
scan_config
<
limit_block_size
<
256U
,
sizeof
(
Key
)
+
sizeof
(
Value
),
ROCPRIM_WARP_SIZE_64
>::
value
,
::
rocprim
::
max
(
1u
,
20u
/
item_scale
),
ROCPRIM_DETAIL_USE_LOOKBACK_SCAN
,
::
rocprim
::
block_load_method
::
block_load_transpose
,
::
rocprim
::
block_store_method
::
block_store_transpose
,
::
rocprim
::
block_scan_algorithm
::
using_warp_scan
>
;
};
// TODO: We need to update these parameters
template
<
class
Key
,
class
Value
>
struct
scan_by_key_config_1030
{
static
constexpr
unsigned
int
item_scale
=
::
rocprim
::
detail
::
ceiling_div
<
unsigned
int
>
(
sizeof
(
Key
)
+
sizeof
(
Value
),
2
*
sizeof
(
int
));
using
type
=
scan_config
<
limit_block_size
<
256U
,
sizeof
(
Key
)
+
sizeof
(
Value
),
ROCPRIM_WARP_SIZE_32
>::
value
,
::
rocprim
::
max
(
1u
,
15u
/
item_scale
),
ROCPRIM_DETAIL_USE_LOOKBACK_SCAN
,
::
rocprim
::
block_load_method
::
block_load_transpose
,
::
rocprim
::
block_store_method
::
block_store_transpose
,
::
rocprim
::
block_scan_algorithm
::
using_warp_scan
>
;
};
// Maps a target architecture id to one of the scan-by-key tunings above.
// Listed cases: 900, ROCPRIM_ARCH_90a, 908 and 1030; the trailing
// scan_by_key_config_900 entry is the case passed last, without an architecture id.
template<unsigned int TargetArch, class Key, class Value>
struct default_scan_by_key_config
    : select_arch<TargetArch,
                  select_arch_case<900, scan_by_key_config_900<Key, Value>>,
                  select_arch_case<ROCPRIM_ARCH_90a, scan_by_key_config_90a<Key, Value>>,
                  select_arch_case<908, scan_by_key_config_908<Key, Value>>,
                  select_arch_case<1030, scan_by_key_config_1030<Key, Value>>,
                  scan_by_key_config_900<Key, Value>>
{};
}
// end namespace detail
END_ROCPRIM_NAMESPACE
/// @}
// end of group primitivesmodule_deviceconfigs
#endif // ROCPRIM_DEVICE_DEVICE_SCAN_BY_KEY_CONFIG_HPP_
3rdparty/cub/rocprim/device/device_scan_config.hpp
0 → 100644
View file @
f8a481f8
// Copyright (c) 2018-2022 Advanced Micro Devices, Inc. All rights reserved.
//
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to deal
// in the Software without restriction, including without limitation the rights
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
// copies of the Software, and to permit persons to whom the Software is
// furnished to do so, subject to the following conditions:
//
// The above copyright notice and this permission notice shall be included in
// all copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
// THE SOFTWARE.
#ifndef ROCPRIM_DEVICE_DEVICE_SCAN_CONFIG_HPP_
#define ROCPRIM_DEVICE_DEVICE_SCAN_CONFIG_HPP_
#include <type_traits>
#include "../config.hpp"
#include "../detail/various.hpp"
#include "../block/block_load.hpp"
#include "../block/block_store.hpp"
#include "../block/block_scan.hpp"
#include "config_types.hpp"
/// \addtogroup primitivesmodule_deviceconfigs
/// @{
BEGIN_ROCPRIM_NAMESPACE
/// \brief Configuration of device-level scan primitives.
///
/// \tparam BlockSize - number of threads in a block.
/// \tparam ItemsPerThread - number of items processed by each thread.
/// \tparam UseLookback - whether to use lookback scan or reduce-then-scan algorithm.
/// \tparam BlockLoadMethod - method for loading input values.
/// \tparam BlockStoreMethod - method for storing values.
/// \tparam BlockScanMethod - algorithm for block scan.
/// \tparam SizeLimit - limit on the number of items for a single scan kernel launch.
template<unsigned int                    BlockSize,
         unsigned int                    ItemsPerThread,
         bool                            UseLookback,
         ::rocprim::block_load_method    BlockLoadMethod,
         ::rocprim::block_store_method   BlockStoreMethod,
         ::rocprim::block_scan_algorithm BlockScanMethod,
         unsigned int                    SizeLimit = ROCPRIM_GRID_SIZE_LIMIT>
struct scan_config
{
    /// \brief Threads per block (value of \p BlockSize).
    static constexpr unsigned int block_size = BlockSize;

    /// \brief Items handled by each thread (value of \p ItemsPerThread).
    static constexpr unsigned int items_per_thread = ItemsPerThread;

    /// \brief Selects lookback scan versus reduce-then-scan (value of \p UseLookback).
    static constexpr bool use_lookback = UseLookback;

    /// \brief How input values are loaded (value of \p BlockLoadMethod).
    static constexpr ::rocprim::block_load_method block_load_method = BlockLoadMethod;

    /// \brief How results are stored (value of \p BlockStoreMethod).
    static constexpr ::rocprim::block_store_method block_store_method = BlockStoreMethod;

    /// \brief Block-level scan algorithm (value of \p BlockScanMethod).
    static constexpr ::rocprim::block_scan_algorithm block_scan_method = BlockScanMethod;

    /// \brief Upper bound on items per single scan kernel launch (value of \p SizeLimit).
    static constexpr unsigned int size_limit = SizeLimit;
};
namespace
detail
{
// Default scan tuning registered for architecture id 803 by default_scan_config.
template<class Value>
struct scan_config_803
{
    // sizeof(Value) expressed in units of sizeof(int), via ceiling_div.
    static constexpr unsigned int item_scale
        = ::rocprim::detail::ceiling_div<unsigned int>(sizeof(Value), sizeof(int));

    // 16 items per thread for int-sized values, scaled down for wider ones (minimum 1).
    using type
        = scan_config<limit_block_size<256U, sizeof(Value), ROCPRIM_WARP_SIZE_64>::value,
                      ::rocprim::max(1u, 16u / item_scale),
                      ROCPRIM_DETAIL_USE_LOOKBACK_SCAN,
                      ::rocprim::block_load_method::block_load_transpose,
                      ::rocprim::block_store_method::block_store_transpose,
                      ::rocprim::block_scan_algorithm::using_warp_scan>;
};
// Default scan tuning registered for architecture id 900 (and passed as the last,
// id-less case) by default_scan_config.
template<class Value>
struct scan_config_900
{
    // sizeof(Value) expressed in units of sizeof(int), via ceiling_div.
    static constexpr unsigned int item_scale
        = ::rocprim::detail::ceiling_div<unsigned int>(sizeof(Value), sizeof(int));

    // 16 items per thread for int-sized values, scaled down for wider ones (minimum 1).
    using type
        = scan_config<limit_block_size<256U, sizeof(Value), ROCPRIM_WARP_SIZE_64>::value,
                      ::rocprim::max(1u, 16u / item_scale),
                      ROCPRIM_DETAIL_USE_LOOKBACK_SCAN,
                      ::rocprim::block_load_method::block_load_transpose,
                      ::rocprim::block_store_method::block_store_transpose,
                      ::rocprim::block_scan_algorithm::using_warp_scan>;
};
// TODO: We need to update these parameters
// Default scan tuning registered for ROCPRIM_ARCH_90a by default_scan_config.
template<class Value>
struct scan_config_90a
{
    // sizeof(Value) expressed in units of sizeof(int), via ceiling_div.
    static constexpr unsigned int item_scale
        = ::rocprim::detail::ceiling_div<unsigned int>(sizeof(Value), sizeof(int));

    // 16 items per thread for int-sized values, scaled down for wider ones (minimum 1).
    using type
        = scan_config<limit_block_size<256U, sizeof(Value), ROCPRIM_WARP_SIZE_64>::value,
                      ::rocprim::max(1u, 16u / item_scale),
                      ROCPRIM_DETAIL_USE_LOOKBACK_SCAN,
                      ::rocprim::block_load_method::block_load_transpose,
                      ::rocprim::block_store_method::block_store_transpose,
                      ::rocprim::block_scan_algorithm::using_warp_scan>;
};
// Default scan tuning registered for architecture id 908 by default_scan_config.
template<class Value>
struct scan_config_908
{
    // sizeof(Value) expressed in units of sizeof(int), via ceiling_div.
    static constexpr unsigned int item_scale
        = ::rocprim::detail::ceiling_div<unsigned int>(sizeof(Value), sizeof(int));

    // 20 items per thread for int-sized values, scaled down for wider ones (minimum 1).
    using type
        = scan_config<limit_block_size<256U, sizeof(Value), ROCPRIM_WARP_SIZE_64>::value,
                      ::rocprim::max(1u, 20u / item_scale),
                      ROCPRIM_DETAIL_USE_LOOKBACK_SCAN,
                      ::rocprim::block_load_method::block_load_transpose,
                      ::rocprim::block_store_method::block_store_transpose,
                      ::rocprim::block_scan_algorithm::using_warp_scan>;
};
// TODO: We need to update these parameters
// Default scan tuning registered for architecture id 1030 by default_scan_config.
// Unlike the other cases in this file it limits the block size with ROCPRIM_WARP_SIZE_32.
template<class Value>
struct scan_config_1030
{
    // sizeof(Value) expressed in units of sizeof(int), via ceiling_div.
    static constexpr unsigned int item_scale
        = ::rocprim::detail::ceiling_div<unsigned int>(sizeof(Value), sizeof(int));

    // 15 items per thread for int-sized values, scaled down for wider ones (minimum 1).
    using type
        = scan_config<limit_block_size<256U, sizeof(Value), ROCPRIM_WARP_SIZE_32>::value,
                      ::rocprim::max(1u, 15u / item_scale),
                      ROCPRIM_DETAIL_USE_LOOKBACK_SCAN,
                      ::rocprim::block_load_method::block_load_transpose,
                      ::rocprim::block_store_method::block_store_transpose,
                      ::rocprim::block_scan_algorithm::using_warp_scan>;
};
// Maps a target architecture id to one of the scan tunings above.
// Listed cases: 803, 900, ROCPRIM_ARCH_90a, 908 and 1030; the trailing
// scan_config_900 entry is the case passed last, without an architecture id.
template<unsigned int TargetArch, class Value>
struct default_scan_config
    : select_arch<TargetArch,
                  select_arch_case<803, scan_config_803<Value>>,
                  select_arch_case<900, scan_config_900<Value>>,
                  select_arch_case<ROCPRIM_ARCH_90a, scan_config_90a<Value>>,
                  select_arch_case<908, scan_config_908<Value>>,
                  select_arch_case<1030, scan_config_1030<Value>>,
                  scan_config_900<Value>>
{};
}
// end namespace detail
END_ROCPRIM_NAMESPACE
/// @}
// end of group primitivesmodule_deviceconfigs
#endif // ROCPRIM_DEVICE_DEVICE_SCAN_CONFIG_HPP_
3rdparty/cub/rocprim/device/device_segmented_radix_sort.hpp
0 → 100644
View file @
f8a481f8
// Copyright (c) 2017-2022 Advanced Micro Devices, Inc. All rights reserved.
//
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to deal
// in the Software without restriction, including without limitation the rights
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
// copies of the Software, and to permit persons to whom the Software is
// furnished to do so, subject to the following conditions:
//
// The above copyright notice and this permission notice shall be included in
// all copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
// THE SOFTWARE.
#ifndef ROCPRIM_DEVICE_DEVICE_SEGMENTED_RADIX_SORT_HPP_
#define ROCPRIM_DEVICE_DEVICE_SEGMENTED_RADIX_SORT_HPP_
#include <iostream>
#include <iterator>
#include <type_traits>
#include <utility>
#include "../config.hpp"
#include "../detail/various.hpp"
#include "../detail/radix_sort.hpp"
#include "../intrinsics.hpp"
#include "../functional.hpp"
#include "../types.hpp"
#include "../block/block_load.hpp"
#include "../iterator/counting_iterator.hpp"
#include "../iterator/reverse_iterator.hpp"
#include "detail/device_segmented_radix_sort.hpp"
#include "device_partition.hpp"
#include "device_segmented_radix_sort_config.hpp"
/// \addtogroup devicemodule
/// @{
BEGIN_ROCPRIM_NAMESPACE
namespace
detail
{
// Kernel wrapper (declared with ROCPRIM_KERNEL) that hands every argument, unchanged and
// in order, to segmented_sort<Config, Descending>. BlockSize is used only for the
// __launch_bounds__ annotation.
template<class Config,
         bool Descending,
         unsigned int BlockSize,
         class KeysInputIterator,
         class KeysOutputIterator,
         class ValuesInputIterator,
         class ValuesOutputIterator,
         class OffsetIterator>
ROCPRIM_KERNEL __launch_bounds__(BlockSize) void segmented_sort_kernel(
    KeysInputIterator                                               keys_input,
    typename std::iterator_traits<KeysInputIterator>::value_type*   keys_tmp,
    KeysOutputIterator                                              keys_output,
    ValuesInputIterator                                             values_input,
    typename std::iterator_traits<ValuesInputIterator>::value_type* values_tmp,
    ValuesOutputIterator                                            values_output,
    bool                                                            to_output,
    OffsetIterator                                                  begin_offsets,
    OffsetIterator                                                  end_offsets,
    unsigned int                                                    long_iterations,
    unsigned int                                                    short_iterations,
    unsigned int                                                    begin_bit,
    unsigned int                                                    end_bit)
{
    segmented_sort<Config, Descending>(keys_input,
                                       keys_tmp,
                                       keys_output,
                                       values_input,
                                       values_tmp,
                                       values_output,
                                       to_output,
                                       begin_offsets,
                                       end_offsets,
                                       long_iterations,
                                       short_iterations,
                                       begin_bit,
                                       end_bit);
}
// Kernel wrapper (declared with ROCPRIM_KERNEL) that hands every argument, unchanged and
// in order, to segmented_sort_large<Config, Descending>. Compared with
// segmented_sort_kernel it additionally receives segment_indices. BlockSize is used only
// for the __launch_bounds__ annotation.
template<class Config,
         bool Descending,
         unsigned int BlockSize,
         class KeysInputIterator,
         class KeysOutputIterator,
         class ValuesInputIterator,
         class ValuesOutputIterator,
         class SegmentIndexIterator,
         class OffsetIterator>
ROCPRIM_KERNEL __launch_bounds__(BlockSize) void segmented_sort_large_kernel(
    KeysInputIterator                                               keys_input,
    typename std::iterator_traits<KeysInputIterator>::value_type*   keys_tmp,
    KeysOutputIterator                                              keys_output,
    ValuesInputIterator                                             values_input,
    typename std::iterator_traits<ValuesInputIterator>::value_type* values_tmp,
    ValuesOutputIterator                                            values_output,
    bool                                                            to_output,
    SegmentIndexIterator                                            segment_indices,
    OffsetIterator                                                  begin_offsets,
    OffsetIterator                                                  end_offsets,
    unsigned int                                                    long_iterations,
    unsigned int                                                    short_iterations,
    unsigned int                                                    begin_bit,
    unsigned int                                                    end_bit)
{
    segmented_sort_large<Config, Descending>(keys_input,
                                             keys_tmp,
                                             keys_output,
                                             values_input,
                                             values_tmp,
                                             values_output,
                                             to_output,
                                             segment_indices,
                                             begin_offsets,
                                             end_offsets,
                                             long_iterations,
                                             short_iterations,
                                             begin_bit,
                                             end_bit);
}
// Kernel wrapper (declared with ROCPRIM_KERNEL) shared by the small-segment and
// medium-segment launches; which one it is depends on the Config passed by the caller.
// Every argument is handed, unchanged and in order, to
// segmented_sort_small<Config, Descending>. BlockSize is used only for the
// __launch_bounds__ annotation.
template<class Config,
         bool Descending,
         unsigned int BlockSize,
         class KeysInputIterator,
         class KeysOutputIterator,
         class ValuesInputIterator,
         class ValuesOutputIterator,
         class SegmentIndexIterator,
         class OffsetIterator>
ROCPRIM_KERNEL __launch_bounds__(BlockSize) void segmented_sort_small_or_medium_kernel(
    KeysInputIterator                                               keys_input,
    typename std::iterator_traits<KeysInputIterator>::value_type*   keys_tmp,
    KeysOutputIterator                                              keys_output,
    ValuesInputIterator                                             values_input,
    typename std::iterator_traits<ValuesInputIterator>::value_type* values_tmp,
    ValuesOutputIterator                                            values_output,
    bool                                                            to_output,
    unsigned int                                                    num_segments,
    SegmentIndexIterator                                            segment_indices,
    OffsetIterator                                                  begin_offsets,
    OffsetIterator                                                  end_offsets,
    unsigned int                                                    begin_bit,
    unsigned int                                                    end_bit)
{
    segmented_sort_small<Config, Descending>(keys_input,
                                             keys_tmp,
                                             keys_output,
                                             values_input,
                                             values_tmp,
                                             values_output,
                                             to_output,
                                             num_segments,
                                             segment_indices,
                                             begin_offsets,
                                             end_offsets,
                                             begin_bit,
                                             end_bit);
}
// Post-launch error check. Expands to a braced block that:
//   1. returns the value of cudaGetLastError() from the enclosing function when it is
//      not cudaSuccess;
//   2. when `debug_synchronous` is true, prints `name(size)`, synchronizes `stream`
//      (returning its error on failure) and prints the time elapsed since `start`
//      in milliseconds.
// `debug_synchronous` and `stream` are not macro parameters: they must be visible at the
// expansion site. `start` is subtracted from a std::chrono::high_resolution_clock
// time point. Because the expansion is a complete block, call sites need no trailing
// semicolon.
#define ROCPRIM_DETAIL_HIP_SYNC_AND_RETURN_ON_ERROR(name, size, start) \
{ \
auto _error = cudaGetLastError(); \
if(_error != cudaSuccess) return _error; \
if(debug_synchronous) \
{ \
std::cout << name << "(" << size << ")"; \
auto __error = cudaStreamSynchronize(stream); \
if(__error != cudaSuccess) return __error; \
auto _end = std::chrono::high_resolution_clock::now(); \
auto _d = std::chrono::duration_cast<std::chrono::duration<double>>(_end - start); \
std::cout << " " << _d.count() * 1000 << " ms" << '\n'; \
} \
}
// Callable with the same parameter list as ThreeWayPartitioner::operator(), so the two
// are interchangeable at the call site. The second output, the unselected output and the
// second predicate are accepted but ignored; everything else is forwarded to partition().
struct TwoWayPartitioner
{
    template<typename InputIterator,
             typename FirstOutputIterator,
             typename SecondOutputIterator,
             typename UnselectedOutputIterator,
             typename SelectedCountOutputIterator,
             typename FirstUnaryPredicate,
             typename SecondUnaryPredicate>
    cudaError_t operator()(void*                       temporary_storage,
                           size_t&                     storage_size,
                           InputIterator               input,
                           FirstOutputIterator         output_first_part,
                           SecondOutputIterator        /*output_second_part*/,
                           UnselectedOutputIterator    /*output_unselected*/,
                           SelectedCountOutputIterator selected_count_output,
                           const size_t                size,
                           FirstUnaryPredicate         select_first_part_op,
                           SecondUnaryPredicate        /*select_second_part_op*/,
                           const cudaStream_t          stream,
                           const bool                  debug_synchronous)
    {
        return partition(temporary_storage,
                         storage_size,
                         input,
                         output_first_part,
                         selected_count_output,
                         size,
                         select_first_part_op,
                         stream,
                         debug_synchronous);
    }
};
// Callable with the same parameter list as TwoWayPartitioner::operator(); forwards every
// argument, unchanged and in order, to partition_three_way().
struct ThreeWayPartitioner
{
    template<typename InputIterator,
             typename FirstOutputIterator,
             typename SecondOutputIterator,
             typename UnselectedOutputIterator,
             typename SelectedCountOutputIterator,
             typename FirstUnaryPredicate,
             typename SecondUnaryPredicate>
    cudaError_t operator()(void*                       temporary_storage,
                           size_t&                     storage_size,
                           InputIterator               input,
                           FirstOutputIterator         output_first_part,
                           SecondOutputIterator        output_second_part,
                           UnselectedOutputIterator    output_unselected,
                           SelectedCountOutputIterator selected_count_output,
                           const size_t                size,
                           FirstUnaryPredicate         select_first_part_op,
                           SecondUnaryPredicate        select_second_part_op,
                           const cudaStream_t          stream,
                           const bool                  debug_synchronous)
    {
        return partition_three_way(temporary_storage,
                                   storage_size,
                                   input,
                                   output_first_part,
                                   output_second_part,
                                   output_unselected,
                                   selected_count_output,
                                   size,
                                   select_first_part_op,
                                   select_second_part_op,
                                   stream,
                                   debug_synchronous);
    }
};
// Host-side driver for the segmented radix sort.
//
// Two modes, selected by temporary_storage:
//   * temporary_storage == nullptr: only computes the required scratch size into
//     storage_size (at least 4 bytes, never 0) and returns without launching any sort kernel.
//   * otherwise: carves the scratch buffer into sub-buffers and launches the sort.
//
// Parameters:
//   temporary_storage   - scratch buffer, or nullptr for the size query.
//   storage_size        - receives the required size in the query mode; printed in debug mode.
//   keys_input/output   - key ranges; both iterators must share one value_type (static_assert).
//   keys_tmp/values_tmp - if keys_tmp is non-null the caller provides both buffers ("double
//                         buffer") and no key/value scratch is reserved; otherwise both
//                         pointers are re-pointed into temporary_storage.
//   values_input/output - value ranges; must share one value_type. With value_type ==
//                         ::rocprim::empty_type no value scratch is reserved.
//   size                - number of items, used only to size the key/value scratch.
//   is_result_in_output - out-parameter, set to (iterations % 2 == 0) != to_output.
//   segments            - number of segments; 0 returns cudaSuccess immediately
//                         (after the size query).
//   begin_offsets/end_offsets - per-segment offsets, indexed by segment index.
//   begin_bit/end_bit   - bit range forwarded to the kernels; end_bit - begin_bit drives the
//                         iteration counts.
//   stream              - stream used for all launches, copies and synchronizations.
//   debug_synchronous   - prints parameters/timings and synchronizes after each launch.
//
// Returns cudaSuccess, or the first error reported by the partitioner, a memcpy, a stream
// synchronization or a kernel launch.
//
// When the configuration enables warp sort and `segments` reaches its partitioning
// threshold, segment indices are first partitioned by length (large / medium / small) and
// each group is sorted by its own kernel; this path synchronizes the stream once to read
// the group counts back to the host. Otherwise a single kernel handles all segments.
template<class Config,
         bool Descending,
         class KeysInputIterator,
         class KeysOutputIterator,
         class ValuesInputIterator,
         class ValuesOutputIterator,
         class OffsetIterator>
inline cudaError_t segmented_radix_sort_impl(
    void*                                                           temporary_storage,
    size_t&                                                         storage_size,
    KeysInputIterator                                               keys_input,
    typename std::iterator_traits<KeysInputIterator>::value_type*   keys_tmp,
    KeysOutputIterator                                              keys_output,
    ValuesInputIterator                                             values_input,
    typename std::iterator_traits<ValuesInputIterator>::value_type* values_tmp,
    ValuesOutputIterator                                            values_output,
    unsigned int                                                    size,
    bool&                                                           is_result_in_output,
    unsigned int                                                    segments,
    OffsetIterator                                                  begin_offsets,
    OffsetIterator                                                  end_offsets,
    unsigned int                                                    begin_bit,
    unsigned int                                                    end_bit,
    cudaStream_t                                                    stream,
    bool                                                            debug_synchronous)
{
    using key_type               = typename std::iterator_traits<KeysInputIterator>::value_type;
    using value_type             = typename std::iterator_traits<ValuesInputIterator>::value_type;
    using segment_index_type     = unsigned int;
    using segment_index_iterator = counting_iterator<segment_index_type>;

    static_assert(
        std::is_same<key_type,
                     typename std::iterator_traits<KeysOutputIterator>::value_type>::value,
        "KeysInputIterator and KeysOutputIterator must have the same value_type");
    static_assert(
        std::is_same<value_type,
                     typename std::iterator_traits<ValuesOutputIterator>::value_type>::value,
        "ValuesInputIterator and ValuesOutputIterator must have the same value_type");

    using config = default_or_custom_config<
        Config,
        default_segmented_radix_sort_config<ROCPRIM_TARGET_ARCH, key_type, value_type>>;

    static constexpr bool with_values
        = !std::is_same<value_type, ::rocprim::empty_type>::value;
    static constexpr bool partitioning_allowed
        = !std::is_same<typename config::warp_sort_config, DisabledWarpSortConfig>::value;

    // Longest segment each warp-sort variant accepts, and how many segments fit in a block.
    static constexpr unsigned int max_small_segment_length
        = config::warp_sort_config::items_per_thread_small
          * config::warp_sort_config::logical_warp_size_small;
    static constexpr unsigned int small_segments_per_block
        = config::warp_sort_config::block_size_small
          / config::warp_sort_config::logical_warp_size_small;
    static constexpr unsigned int max_medium_segment_length
        = config::warp_sort_config::items_per_thread_medium
          * config::warp_sort_config::logical_warp_size_medium;
    static constexpr unsigned int medium_segments_per_block
        = config::warp_sort_config::block_size_medium
          / config::warp_sort_config::logical_warp_size_medium;
    static_assert(
        max_small_segment_length <= max_medium_segment_length,
        "The max length of small segments cannot be higher than the max length of medium segments");

    // Don't waste cycles on 3-way partitioning, if the small and medium segments are equal length
    static constexpr bool three_way_partitioning
        = max_small_segment_length < max_medium_segment_length;
    using partitioner_type
        = std::conditional_t<three_way_partitioning, ThreeWayPartitioner, TwoWayPartitioner>;
    partitioner_type partitioner;

    // Predicates over segment indices used by the partitioner. They capture the offset
    // iterators by value.
    const auto large_segment_selector = [=](const unsigned int segment_index) mutable -> bool
    {
        const unsigned int segment_length
            = end_offsets[segment_index] - begin_offsets[segment_index];
        return segment_length > max_medium_segment_length;
    };
    const auto medium_segment_selector = [=](const unsigned int segment_index) mutable -> bool
    {
        const unsigned int segment_length
            = end_offsets[segment_index] - begin_offsets[segment_index];
        return segment_length > max_small_segment_length;
    };

    const bool         with_double_buffer = keys_tmp != nullptr;
    const unsigned int bits               = end_bit - begin_bit;
    const unsigned int iterations
        = ::rocprim::detail::ceiling_div(bits, config::long_radix_bits);
    const bool to_output = with_double_buffer || (iterations - 1) % 2 == 0;
    is_result_in_output  = (iterations % 2 == 0) != to_output;

    // Split the passes into "long" and "short" radix passes so that together they cover
    // exactly `bits` bits.
    const unsigned int radix_bits_diff = config::long_radix_bits - config::short_radix_bits;
    const unsigned int short_iterations
        = radix_bits_diff != 0
              ? ::rocprim::min(iterations,
                               (config::long_radix_bits * iterations - bits) / radix_bits_diff)
              : 0;
    const unsigned int long_iterations = iterations - short_iterations;

    const bool do_partitioning
        = partitioning_allowed
          && segments >= config::warp_sort_config::partitioning_threshold;

    // Sizes of the sub-buffers carved out of temporary_storage (each passed through
    // align_size()).
    const size_t keys_bytes = ::rocprim::detail::align_size(size * sizeof(key_type));
    const size_t values_bytes
        = with_values ? ::rocprim::detail::align_size(size * sizeof(value_type)) : 0;
    const size_t large_and_small_segment_indices_bytes
        = ::rocprim::detail::align_size(segments * sizeof(segment_index_type));
    const size_t medium_segment_indices_bytes
        = three_way_partitioning
              ? ::rocprim::detail::align_size(segments * sizeof(segment_index_type))
              : 0;
    static constexpr size_t segment_count_output_size = three_way_partitioning ? 2 : 1;
    const size_t            segment_count_output_bytes
        = ::rocprim::detail::align_size(segment_count_output_size * sizeof(segment_index_type));

    segment_index_type* large_segment_indices_output{};
    // The total number of large and small segments is not above the number of segments
    // The same buffer is filled with the large and small indices from both directions
    auto small_segment_indices_output
        = make_reverse_iterator(large_segment_indices_output + segments);
    segment_index_type* medium_segment_indices_output{};
    segment_index_type* segment_count_output{};
    size_t              partition_storage_size{};
    void*               partition_temporary_storage{};

    // Size query: report how much scratch memory the real call needs and stop.
    if(temporary_storage == nullptr)
    {
        storage_size = with_double_buffer ? 0 : (keys_bytes + values_bytes);
        if(do_partitioning)
        {
            storage_size += large_and_small_segment_indices_bytes;
            storage_size += medium_segment_indices_bytes;
            storage_size += segment_count_output_bytes;

            // partition_temporary_storage is still null here, so this call is itself a
            // size query that fills partition_storage_size.
            const auto partition_result = partitioner(partition_temporary_storage,
                                                      partition_storage_size,
                                                      segment_index_iterator{},
                                                      large_segment_indices_output,
                                                      medium_segment_indices_output,
                                                      small_segment_indices_output,
                                                      segment_count_output,
                                                      segments,
                                                      large_segment_selector,
                                                      medium_segment_selector,
                                                      stream,
                                                      debug_synchronous);
            if(cudaSuccess != partition_result)
            {
                return partition_result;
            }
            storage_size += partition_storage_size;
        }

        // Make sure user won't try to allocate 0 bytes memory, otherwise
        // user may again pass nullptr as temporary_storage
        storage_size = storage_size == 0 ? 4 : storage_size;
        return cudaSuccess;
    }

    if(segments == 0u)
    {
        return cudaSuccess;
    }

    if(debug_synchronous)
    {
        std::cout << "begin_bit " << begin_bit << '\n';
        std::cout << "end_bit " << end_bit << '\n';
        std::cout << "bits " << bits << '\n';
        std::cout << "segments " << segments << '\n';
        std::cout << "radix_bits_diff " << radix_bits_diff << '\n';
        std::cout << "storage_size " << storage_size << '\n';
        std::cout << "iterations " << iterations << '\n';
        std::cout << "long_iterations " << long_iterations << '\n';
        std::cout << "short_iterations " << short_iterations << '\n';
        std::cout << "do_partitioning " << do_partitioning << '\n';
        std::cout << "config::sort::block_size: " << config::sort::block_size << '\n';
        std::cout << "config::sort::items_per_thread: " << config::sort::items_per_thread
                  << '\n';
        cudaError_t error = cudaStreamSynchronize(stream);
        if(error != cudaSuccess) return error;
    }

    // Carve temporary_storage in the same order the size query summed the pieces.
    char* ptr = reinterpret_cast<char*>(temporary_storage);
    if(!with_double_buffer)
    {
        keys_tmp = reinterpret_cast<key_type*>(ptr);
        ptr += keys_bytes;
        values_tmp = with_values ? reinterpret_cast<value_type*>(ptr) : nullptr;
        ptr += values_bytes;
    }
    large_segment_indices_output = reinterpret_cast<segment_index_type*>(ptr);
    ptr += large_and_small_segment_indices_bytes;
    medium_segment_indices_output = reinterpret_cast<segment_index_type*>(ptr);
    ptr += medium_segment_indices_bytes;
    // Small indices share the large-indices buffer and are written from its end backwards.
    small_segment_indices_output
        = make_reverse_iterator(large_segment_indices_output + segments);
    segment_count_output = reinterpret_cast<segment_index_type*>(ptr);
    ptr += segment_count_output_bytes;
    partition_temporary_storage = ptr;
    ptr += partition_storage_size;

    if(do_partitioning)
    {
        cudaError_t result = partitioner(partition_temporary_storage,
                                         partition_storage_size,
                                         segment_index_iterator{},
                                         large_segment_indices_output,
                                         medium_segment_indices_output,
                                         small_segment_indices_output,
                                         segment_count_output,
                                         segments,
                                         large_segment_selector,
                                         medium_segment_selector,
                                         stream,
                                         debug_synchronous);
        if(cudaSuccess != result)
        {
            return result;
        }

        // Read the group counts back to the host. Copy exactly sizeof(segment_counts):
        // segment_count_output_bytes is the align_size()-rounded footprint reserved in
        // temporary storage and may be larger than this 1- or 2-element host array, in
        // which case copying that many bytes would write past the end of it.
        segment_index_type segment_counts[segment_count_output_size]{};
        result = cudaMemcpyAsync(segment_counts,
                                 segment_count_output,
                                 sizeof(segment_counts),
                                 cudaMemcpyDeviceToHost,
                                 stream);
        if(cudaSuccess != result)
        {
            return result;
        }
        // The counts are needed on the host to size the launches below.
        result = cudaStreamSynchronize(stream);
        if(cudaSuccess != result)
        {
            return result;
        }

        const auto large_segment_count  = segment_counts[0];
        const auto medium_segment_count = three_way_partitioning ? segment_counts[1] : 0;
        const auto small_segment_count
            = segments - large_segment_count - medium_segment_count;
        if(debug_synchronous)
        {
            std::cout << "large_segment_count " << large_segment_count << '\n';
            std::cout << "medium_segment_count " << medium_segment_count << '\n';
            std::cout << "small_segment_count " << small_segment_count << '\n';
        }

        // Large segments: one block per segment.
        if(large_segment_count > 0)
        {
            std::chrono::high_resolution_clock::time_point start;
            if(debug_synchronous) start = std::chrono::high_resolution_clock::now();
            segmented_sort_large_kernel<config, Descending, config::sort::block_size>
                <<<dim3(large_segment_count), dim3(config::sort::block_size), 0, stream>>>(
                    keys_input,
                    keys_tmp,
                    keys_output,
                    values_input,
                    values_tmp,
                    values_output,
                    to_output,
                    large_segment_indices_output,
                    begin_offsets,
                    end_offsets,
                    long_iterations,
                    short_iterations,
                    begin_bit,
                    end_bit);
            ROCPRIM_DETAIL_HIP_SYNC_AND_RETURN_ON_ERROR("segmented_sort:large_segments",
                                                        large_segment_count,
                                                        start)
        }

        // Medium segments: several segments per block.
        if(three_way_partitioning && medium_segment_count > 0)
        {
            const auto medium_segment_grid_size
                = ::rocprim::detail::ceiling_div(medium_segment_count,
                                                 medium_segments_per_block);
            std::chrono::high_resolution_clock::time_point start;
            if(debug_synchronous) start = std::chrono::high_resolution_clock::now();
            segmented_sort_small_or_medium_kernel<
                select_warp_sort_helper_config_medium_t<typename config::warp_sort_config>,
                Descending,
                config::warp_sort_config::block_size_medium>
                <<<dim3(medium_segment_grid_size),
                   dim3(config::warp_sort_config::block_size_medium),
                   0,
                   stream>>>(keys_input,
                             keys_tmp,
                             keys_output,
                             values_input,
                             values_tmp,
                             values_output,
                             is_result_in_output,
                             medium_segment_count,
                             medium_segment_indices_output,
                             begin_offsets,
                             end_offsets,
                             begin_bit,
                             end_bit);
            ROCPRIM_DETAIL_HIP_SYNC_AND_RETURN_ON_ERROR("segmented_sort:medium_segments",
                                                        medium_segment_count,
                                                        start)
        }

        // Small segments: several segments per block.
        if(small_segment_count > 0)
        {
            const auto small_segment_grid_size
                = ::rocprim::detail::ceiling_div(small_segment_count,
                                                 small_segments_per_block);
            std::chrono::high_resolution_clock::time_point start;
            if(debug_synchronous) start = std::chrono::high_resolution_clock::now();
            segmented_sort_small_or_medium_kernel<
                select_warp_sort_helper_config_small_t<typename config::warp_sort_config>,
                Descending,
                config::warp_sort_config::block_size_small>
                <<<dim3(small_segment_grid_size),
                   dim3(config::warp_sort_config::block_size_small),
                   0,
                   stream>>>(keys_input,
                             keys_tmp,
                             keys_output,
                             values_input,
                             values_tmp,
                             values_output,
                             is_result_in_output,
                             small_segment_count,
                             small_segment_indices_output,
                             begin_offsets,
                             end_offsets,
                             begin_bit,
                             end_bit);
            ROCPRIM_DETAIL_HIP_SYNC_AND_RETURN_ON_ERROR("segmented_sort:small_segments",
                                                        small_segment_count,
                                                        start)
        }
    }
    else
    {
        // No partitioning: one block per segment, all segments in a single launch.
        std::chrono::high_resolution_clock::time_point start;
        if(debug_synchronous) start = std::chrono::high_resolution_clock::now();
        segmented_sort_kernel<config, Descending, config::sort::block_size>
            <<<dim3(segments), dim3(config::sort::block_size), 0, stream>>>(keys_input,
                                                                            keys_tmp,
                                                                            keys_output,
                                                                            values_input,
                                                                            values_tmp,
                                                                            values_output,
                                                                            to_output,
                                                                            begin_offsets,
                                                                            end_offsets,
                                                                            long_iterations,
                                                                            short_iterations,
                                                                            begin_bit,
                                                                            end_bit);
        ROCPRIM_DETAIL_HIP_SYNC_AND_RETURN_ON_ERROR("segmented_sort", segments, start)
    }
    return cudaSuccess;
}
#undef ROCPRIM_DETAIL_HIP_SYNC_AND_RETURN_ON_ERROR
}
// end namespace detail
/// \brief Parallel ascending radix sort primitive for device level.
///
/// \p segmented_radix_sort_keys function performs a device-wide radix sort across multiple,
/// non-overlapping sequences of keys. Function sorts input keys in ascending order.
///
/// \par Overview
/// * The contents of the inputs are not altered by the sorting function.
/// * Returns the required size of \p temporary_storage in \p storage_size
/// if \p temporary_storage is a null pointer.
/// * \p Key type (a \p value_type of \p KeysInputIterator and \p KeysOutputIterator) must be
/// an arithmetic type (that is, an integral type or a floating-point type).
/// * Ranges specified by \p keys_input and \p keys_output must have at least \p size elements.
/// * Ranges specified by \p begin_offsets and \p end_offsets must have
/// at least \p segments elements. They may use the same sequence <tt>offsets</tt> of at least
/// <tt>segments + 1</tt> elements: <tt>offsets</tt> for \p begin_offsets and
/// <tt>offsets + 1</tt> for \p end_offsets.
/// * If \p Key is an integer type and the range of keys is known in advance, the performance
/// can be improved by setting \p begin_bit and \p end_bit, for example if all keys are in range
/// [100, 10000], <tt>begin_bit = 0</tt> and <tt>end_bit = 14</tt> will cover the whole range.
///
/// \tparam Config - [optional] configuration of the primitive. It can be
/// \p segmented_radix_sort_config or a custom class with the same members.
/// \tparam KeysInputIterator - random-access iterator type of the input range. Must meet the
/// requirements of a C++ InputIterator concept. It can be a simple pointer type.
/// \tparam KeysOutputIterator - random-access iterator type of the output range. Must meet the
/// requirements of a C++ OutputIterator concept. It can be a simple pointer type.
/// \tparam OffsetIterator - random-access iterator type of segment offsets. Must meet the
/// requirements of a C++ InputIterator concept. It can be a simple pointer type.
///
/// \param [in] temporary_storage - pointer to a device-accessible temporary storage. When
/// a null pointer is passed, the required allocation size (in bytes) is written to
/// \p storage_size and function returns without performing the sort operation.
/// \param [in,out] storage_size - reference to a size (in bytes) of \p temporary_storage.
/// \param [in] keys_input - pointer to the first element in the range to sort.
/// \param [out] keys_output - pointer to the first element in the output range.
/// \param [in] size - number of elements in the input range.
/// \param [in] segments - number of segments in the input range.
/// \param [in] begin_offsets - iterator to the first element in the range of beginning offsets.
/// \param [in] end_offsets - iterator to the first element in the range of ending offsets.
/// \param [in] begin_bit - [optional] index of the first (least significant) bit used in
/// key comparison. Must be in range <tt>[0; 8 * sizeof(Key))</tt>. Default value: \p 0.
/// Non-default value not supported for floating-point key-types.
/// \param [in] end_bit - [optional] past-the-end index (most significant) bit used in
/// key comparison. Must be in range <tt>(begin_bit; 8 * sizeof(Key)]</tt>. Default
/// value: \p <tt>8 * sizeof(Key)</tt>. Non-default value not supported for floating-point key-types.
/// \param [in] stream - [optional] HIP stream object. Default is \p 0 (default stream).
/// \param [in] debug_synchronous - [optional] If true, synchronization after every kernel
/// launch is forced in order to check for errors. Default value is \p false.
///
/// \returns \p cudaSuccess (\p 0) after successful sort; otherwise a HIP runtime error of
/// type \p cudaError_t.
///
/// \par Example
/// \parblock
/// In this example a device-level ascending radix sort is performed on an array of
/// \p float values.
///
/// \code{.cpp}
/// #include <rocprim/rocprim.hpp>
///
/// // Prepare input and output (declare pointers, allocate device memory etc.)
/// size_t input_size; // e.g., 8
/// float * input; // e.g., [0.6, 0.3, 0.65, 0.4, 0.2, 0.08, 1, 0.7]
/// float * output; // empty array of 8 elements
/// unsigned int segments; // e.g., 3
/// int * offsets; // e.g. [0, 2, 3, 8]
///
/// size_t temporary_storage_size_bytes;
/// void * temporary_storage_ptr = nullptr;
/// // Get required size of the temporary storage
/// rocprim::segmented_radix_sort_keys(
/// temporary_storage_ptr, temporary_storage_size_bytes,
/// input, output, input_size,
/// segments, offsets, offsets + 1
/// );
///
/// // allocate temporary storage
/// cudaMalloc(&temporary_storage_ptr, temporary_storage_size_bytes);
///
/// // perform sort
/// rocprim::segmented_radix_sort_keys(
/// temporary_storage_ptr, temporary_storage_size_bytes,
/// input, output, input_size,
/// segments, offsets, offsets + 1
/// );
/// // keys_output: [0.3, 0.6, 0.65, 0.08, 0.2, 0.4, 0.7, 1]
/// \endcode
/// \endparblock
template<class Config = default_config,
         class KeysInputIterator,
         class KeysOutputIterator,
         class OffsetIterator,
         class Key = typename std::iterator_traits<KeysInputIterator>::value_type>
inline cudaError_t segmented_radix_sort_keys(void*              temporary_storage,
                                             size_t&            storage_size,
                                             KeysInputIterator  keys_input,
                                             KeysOutputIterator keys_output,
                                             unsigned int       size,
                                             unsigned int       segments,
                                             OffsetIterator     begin_offsets,
                                             OffsetIterator     end_offsets,
                                             unsigned int       begin_bit = 0,
                                             unsigned int       end_bit   = 8 * sizeof(Key),
                                             cudaStream_t       stream    = 0,
                                             bool               debug_synchronous = false)
{
    // Keys-only sort: a null empty_type pointer is passed wherever the shared
    // implementation expects a value range.
    empty_type* no_values = nullptr;

    // Flag the implementation takes by reference; this overload does not use it.
    bool result_location_unused;

    // Ascending order is selected by the `false` template argument. No temporary
    // key/value buffers are supplied (nullptr in the *_tmp positions).
    const cudaError_t status
        = detail::segmented_radix_sort_impl<Config, false>(temporary_storage,
                                                           storage_size,
                                                           keys_input,
                                                           nullptr,
                                                           keys_output,
                                                           no_values,
                                                           nullptr,
                                                           no_values,
                                                           size,
                                                           result_location_unused,
                                                           segments,
                                                           begin_offsets,
                                                           end_offsets,
                                                           begin_bit,
                                                           end_bit,
                                                           stream,
                                                           debug_synchronous);
    return status;
}
/// \brief Parallel descending radix sort primitive for device level.
///
/// \p segmented_radix_sort_keys_desc function performs a device-wide radix sort across multiple,
/// non-overlapping sequences of keys. Function sorts input keys in descending order.
///
/// \par Overview
/// * The contents of the inputs are not altered by the sorting function.
/// * Returns the required size of \p temporary_storage in \p storage_size
/// if \p temporary_storage is a null pointer.
/// * \p Key type (a \p value_type of \p KeysInputIterator and \p KeysOutputIterator) must be
/// an arithmetic type (that is, an integral type or a floating-point type).
/// * Ranges specified by \p keys_input and \p keys_output must have at least \p size elements.
/// * Ranges specified by \p begin_offsets and \p end_offsets must have
/// at least \p segments elements. They may use the same sequence <tt>offsets</tt> of at least
/// <tt>segments + 1</tt> elements: <tt>offsets</tt> for \p begin_offsets and
/// <tt>offsets + 1</tt> for \p end_offsets.
/// * If \p Key is an integer type and the range of keys is known in advance, the performance
/// can be improved by setting \p begin_bit and \p end_bit, for example if all keys are in range
/// [100, 10000], <tt>begin_bit = 0</tt> and <tt>end_bit = 14</tt> will cover the whole range.
///
/// \tparam Config - [optional] configuration of the primitive. It can be
/// \p segmented_radix_sort_config or a custom class with the same members.
/// \tparam KeysInputIterator - random-access iterator type of the input range. Must meet the
/// requirements of a C++ InputIterator concept. It can be a simple pointer type.
/// \tparam KeysOutputIterator - random-access iterator type of the output range. Must meet the
/// requirements of a C++ OutputIterator concept. It can be a simple pointer type.
/// \tparam OffsetIterator - random-access iterator type of segment offsets. Must meet the
/// requirements of a C++ InputIterator concept. It can be a simple pointer type.
///
/// \param [in] temporary_storage - pointer to a device-accessible temporary storage. When
/// a null pointer is passed, the required allocation size (in bytes) is written to
/// \p storage_size and function returns without performing the sort operation.
/// \param [in,out] storage_size - reference to a size (in bytes) of \p temporary_storage.
/// \param [in] keys_input - pointer to the first element in the range to sort.
/// \param [out] keys_output - pointer to the first element in the output range.
/// \param [in] size - number of elements in the input range.
/// \param [in] segments - number of segments in the input range.
/// \param [in] begin_offsets - iterator to the first element in the range of beginning offsets.
/// \param [in] end_offsets - iterator to the first element in the range of ending offsets.
/// \param [in] begin_bit - [optional] index of the first (least significant) bit used in
/// key comparison. Must be in range <tt>[0; 8 * sizeof(Key))</tt>. Default value: \p 0.
/// Non-default value not supported for floating-point key-types.
/// \param [in] end_bit - [optional] past-the-end index (most significant) bit used in
/// key comparison. Must be in range <tt>(begin_bit; 8 * sizeof(Key)]</tt>. Default
/// value: \p <tt>8 * sizeof(Key)</tt>. Non-default value not supported for floating-point key-types.
/// \param [in] stream - [optional] HIP stream object. Default is \p 0 (default stream).
/// \param [in] debug_synchronous - [optional] If true, synchronization after every kernel
/// launch is forced in order to check for errors. Default value is \p false.
///
/// \returns \p cudaSuccess (\p 0) after successful sort; otherwise a HIP runtime error of
/// type \p cudaError_t.
///
/// \par Example
/// \parblock
/// In this example a device-level descending radix sort is performed on an array of
/// integer values.
///
/// \code{.cpp}
/// #include <rocprim/rocprim.hpp>
///
/// // Prepare input and output (declare pointers, allocate device memory etc.)
/// size_t input_size; // e.g., 8
/// int * input; // e.g., [6, 3, 5, 4, 2, 8, 1, 7]
/// int * output; // empty array of 8 elements
/// unsigned int segments; // e.g., 3
/// int * offsets; // e.g. [0, 2, 3, 8]
///
/// size_t temporary_storage_size_bytes;
/// void * temporary_storage_ptr = nullptr;
/// // Get required size of the temporary storage
/// rocprim::segmented_radix_sort_keys_desc(
/// temporary_storage_ptr, temporary_storage_size_bytes,
/// input, output, input_size,
/// segments, offsets, offsets + 1
/// );
///
/// // allocate temporary storage
/// cudaMalloc(&temporary_storage_ptr, temporary_storage_size_bytes);
///
/// // perform sort
/// rocprim::segmented_radix_sort_keys_desc(
/// temporary_storage_ptr, temporary_storage_size_bytes,
/// input, output, input_size,
/// segments, offsets, offsets + 1
/// );
/// // keys_output: [6, 3, 5, 8, 7, 4, 2, 1]
/// \endcode
/// \endparblock
template<class Config = default_config,
         class KeysInputIterator,
         class KeysOutputIterator,
         class OffsetIterator,
         class Key = typename std::iterator_traits<KeysInputIterator>::value_type>
inline cudaError_t segmented_radix_sort_keys_desc(void*              temporary_storage,
                                                  size_t&            storage_size,
                                                  KeysInputIterator  keys_input,
                                                  KeysOutputIterator keys_output,
                                                  unsigned int       size,
                                                  unsigned int       segments,
                                                  OffsetIterator     begin_offsets,
                                                  OffsetIterator     end_offsets,
                                                  unsigned int       begin_bit = 0,
                                                  unsigned int       end_bit   = 8 * sizeof(Key),
                                                  cudaStream_t       stream    = 0,
                                                  bool               debug_synchronous = false)
{
    // Keys-only sort: a null empty_type pointer is passed wherever the shared
    // implementation expects a value range.
    empty_type* no_values = nullptr;

    // Flag the implementation takes by reference; this overload does not use it.
    bool result_location_unused;

    // Descending order is selected by the `true` template argument. No temporary
    // key/value buffers are supplied (nullptr in the *_tmp positions).
    const cudaError_t status
        = detail::segmented_radix_sort_impl<Config, true>(temporary_storage,
                                                          storage_size,
                                                          keys_input,
                                                          nullptr,
                                                          keys_output,
                                                          no_values,
                                                          nullptr,
                                                          no_values,
                                                          size,
                                                          result_location_unused,
                                                          segments,
                                                          begin_offsets,
                                                          end_offsets,
                                                          begin_bit,
                                                          end_bit,
                                                          stream,
                                                          debug_synchronous);
    return status;
}
/// \brief Parallel ascending radix sort-by-key primitive for device level.
///
/// \p segmented_radix_sort_pairs function performs a device-wide radix sort across multiple,
/// non-overlapping sequences of (key, value) pairs. Function sorts input pairs in ascending order of keys.
///
/// \par Overview
/// * The contents of the inputs are not altered by the sorting function.
/// * Returns the required size of \p temporary_storage in \p storage_size
/// if \p temporary_storage is a null pointer.
/// * \p Key type (a \p value_type of \p KeysInputIterator and \p KeysOutputIterator) must be
/// an arithmetic type (that is, an integral type or a floating-point type).
/// * Ranges specified by \p keys_input, \p keys_output, \p values_input and \p values_output must
/// have at least \p size elements.
/// * Ranges specified by \p begin_offsets and \p end_offsets must have
/// at least \p segments elements. They may use the same sequence <tt>offsets</tt> of at least
/// <tt>segments + 1</tt> elements: <tt>offsets</tt> for \p begin_offsets and
/// <tt>offsets + 1</tt> for \p end_offsets.
/// * If \p Key is an integer type and the range of keys is known in advance, the performance
/// can be improved by setting \p begin_bit and \p end_bit, for example if all keys are in range
/// [100, 10000], <tt>begin_bit = 0</tt> and <tt>end_bit = 14</tt> will cover the whole range.
///
/// \tparam Config - [optional] configuration of the primitive. It can be
/// \p segmented_radix_sort_config or a custom class with the same members.
/// \tparam KeysInputIterator - random-access iterator type of the input range. Must meet the
/// requirements of a C++ InputIterator concept. It can be a simple pointer type.
/// \tparam KeysOutputIterator - random-access iterator type of the output range. Must meet the
/// requirements of a C++ OutputIterator concept. It can be a simple pointer type.
/// \tparam ValuesInputIterator - random-access iterator type of the input range. Must meet the
/// requirements of a C++ InputIterator concept. It can be a simple pointer type.
/// \tparam ValuesOutputIterator - random-access iterator type of the output range. Must meet the
/// requirements of a C++ OutputIterator concept. It can be a simple pointer type.
/// \tparam OffsetIterator - random-access iterator type of segment offsets. Must meet the
/// requirements of a C++ InputIterator concept. It can be a simple pointer type.
///
/// \param [in] temporary_storage - pointer to a device-accessible temporary storage. When
/// a null pointer is passed, the required allocation size (in bytes) is written to
/// \p storage_size and function returns without performing the sort operation.
/// \param [in,out] storage_size - reference to a size (in bytes) of \p temporary_storage.
/// \param [in] keys_input - pointer to the first element in the range to sort.
/// \param [out] keys_output - pointer to the first element in the output range.
/// \param [in] values_input - pointer to the first element in the range to sort.
/// \param [out] values_output - pointer to the first element in the output range.
/// \param [in] size - number of elements in the input range.
/// \param [in] segments - number of segments in the input range.
/// \param [in] begin_offsets - iterator to the first element in the range of beginning offsets.
/// \param [in] end_offsets - iterator to the first element in the range of ending offsets.
/// \param [in] begin_bit - [optional] index of the first (least significant) bit used in
/// key comparison. Must be in range <tt>[0; 8 * sizeof(Key))</tt>. Default value: \p 0.
/// Non-default value not supported for floating-point key-types.
/// \param [in] end_bit - [optional] past-the-end index (most significant) bit used in
/// key comparison. Must be in range <tt>(begin_bit; 8 * sizeof(Key)]</tt>. Default
/// value: \p <tt>8 * sizeof(Key)</tt>. Non-default value not supported for floating-point key-types.
/// \param [in] stream - [optional] HIP stream object. Default is \p 0 (default stream).
/// \param [in] debug_synchronous - [optional] If true, synchronization after every kernel
/// launch is forced in order to check for errors. Default value is \p false.
///
/// \returns \p cudaSuccess (\p 0) after successful sort; otherwise a HIP runtime error of
/// type \p cudaError_t.
///
/// \par Example
/// \parblock
/// In this example a device-level ascending radix sort is performed where input keys are
/// represented by an array of unsigned integers and input values by an array of <tt>double</tt>s.
///
/// \code{.cpp}
/// #include <rocprim/rocprim.hpp>
///
/// // Prepare input and output (declare pointers, allocate device memory etc.)
/// size_t input_size; // e.g., 8
/// unsigned int * keys_input; // e.g., [ 6, 3, 5, 4, 1, 8, 1, 7]
/// double * values_input; // e.g., [-5, 2, -4, 3, -1, -8, -2, 7]
/// unsigned int * keys_output; // empty array of 8 elements
/// double * values_output; // empty array of 8 elements
/// unsigned int segments; // e.g., 3
/// int * offsets; // e.g. [0, 2, 3, 8]
///
/// // Keys are in range [0; 8], so we can limit compared bit to bits on indexes
/// // 0, 1, 2, 3, and 4. In order to do this begin_bit is set to 0 and end_bit
/// // is set to 5.
///
/// size_t temporary_storage_size_bytes;
/// void * temporary_storage_ptr = nullptr;
/// // Get required size of the temporary storage
/// rocprim::segmented_radix_sort_pairs(
/// temporary_storage_ptr, temporary_storage_size_bytes,
/// keys_input, keys_output, values_input, values_output, input_size,
/// segments, offsets, offsets + 1,
/// 0, 5
/// );
///
/// // allocate temporary storage
/// cudaMalloc(&temporary_storage_ptr, temporary_storage_size_bytes);
///
/// // perform sort
/// rocprim::segmented_radix_sort_pairs(
/// temporary_storage_ptr, temporary_storage_size_bytes,
/// keys_input, keys_output, values_input, values_output, input_size,
/// segments, offsets, offsets + 1,
/// 0, 5
/// );
/// // keys_output: [3, 6, 5, 1, 1, 4, 7, 8]
/// // values_output: [2, -5, -4, -1, -2, 3, 7, -8]
/// \endcode
/// \endparblock
template<class Config = default_config,
         class KeysInputIterator,
         class KeysOutputIterator,
         class ValuesInputIterator,
         class ValuesOutputIterator,
         class OffsetIterator,
         class Key = typename std::iterator_traits<KeysInputIterator>::value_type>
inline cudaError_t segmented_radix_sort_pairs(void*                temporary_storage,
                                              size_t&              storage_size,
                                              KeysInputIterator    keys_input,
                                              KeysOutputIterator   keys_output,
                                              ValuesInputIterator  values_input,
                                              ValuesOutputIterator values_output,
                                              unsigned int         size,
                                              unsigned int         segments,
                                              OffsetIterator       begin_offsets,
                                              OffsetIterator       end_offsets,
                                              unsigned int         begin_bit = 0,
                                              unsigned int         end_bit   = 8 * sizeof(Key),
                                              cudaStream_t         stream    = 0,
                                              bool                 debug_synchronous = false)
{
    // Flag the implementation takes by reference; this overload does not use it.
    bool result_location_unused;

    // Ascending order is selected by the `false` template argument. No temporary
    // key/value buffers are supplied (nullptr in the *_tmp positions).
    const cudaError_t status
        = detail::segmented_radix_sort_impl<Config, false>(temporary_storage,
                                                           storage_size,
                                                           keys_input,
                                                           nullptr,
                                                           keys_output,
                                                           values_input,
                                                           nullptr,
                                                           values_output,
                                                           size,
                                                           result_location_unused,
                                                           segments,
                                                           begin_offsets,
                                                           end_offsets,
                                                           begin_bit,
                                                           end_bit,
                                                           stream,
                                                           debug_synchronous);
    return status;
}
/// \brief Parallel descending radix sort-by-key primitive for device level.
///
/// \p segmented_radix_sort_pairs_desc function performs a device-wide radix sort across multiple,
/// non-overlapping sequences of (key, value) pairs. Function sorts input pairs in descending order of keys.
///
/// \par Overview
/// * The contents of the inputs are not altered by the sorting function.
/// * Returns the required size of \p temporary_storage in \p storage_size
/// if \p temporary_storage is a null pointer.
/// * \p Key type (a \p value_type of \p KeysInputIterator and \p KeysOutputIterator) must be
/// an arithmetic type (that is, an integral type or a floating-point type).
/// * Ranges specified by \p keys_input, \p keys_output, \p values_input and \p values_output must
/// have at least \p size elements.
/// * Ranges specified by \p begin_offsets and \p end_offsets must have
/// at least \p segments elements. They may use the same sequence <tt>offsets</tt> of at least
/// <tt>segments + 1</tt> elements: <tt>offsets</tt> for \p begin_offsets and
/// <tt>offsets + 1</tt> for \p end_offsets.
/// * If \p Key is an integer type and the range of keys is known in advance, the performance
/// can be improved by setting \p begin_bit and \p end_bit, for example if all keys are in range
/// [100, 10000], <tt>begin_bit = 0</tt> and <tt>end_bit = 14</tt> will cover the whole range.
///
/// \tparam Config - [optional] configuration of the primitive. It can be
/// \p segmented_radix_sort_config or a custom class with the same members.
/// \tparam KeysInputIterator - random-access iterator type of the input range. Must meet the
/// requirements of a C++ InputIterator concept. It can be a simple pointer type.
/// \tparam KeysOutputIterator - random-access iterator type of the output range. Must meet the
/// requirements of a C++ OutputIterator concept. It can be a simple pointer type.
/// \tparam ValuesInputIterator - random-access iterator type of the input range. Must meet the
/// requirements of a C++ InputIterator concept. It can be a simple pointer type.
/// \tparam ValuesOutputIterator - random-access iterator type of the output range. Must meet the
/// requirements of a C++ OutputIterator concept. It can be a simple pointer type.
/// \tparam OffsetIterator - random-access iterator type of segment offsets. Must meet the
/// requirements of a C++ InputIterator concept. It can be a simple pointer type.
///
/// \param [in] temporary_storage - pointer to a device-accessible temporary storage. When
/// a null pointer is passed, the required allocation size (in bytes) is written to
/// \p storage_size and function returns without performing the sort operation.
/// \param [in,out] storage_size - reference to a size (in bytes) of \p temporary_storage.
/// \param [in] keys_input - pointer to the first element in the range to sort.
/// \param [out] keys_output - pointer to the first element in the output range.
/// \param [in] values_input - pointer to the first element in the range to sort.
/// \param [out] values_output - pointer to the first element in the output range.
/// \param [in] size - number of elements in the input range.
/// \param [in] segments - number of segments in the input range.
/// \param [in] begin_offsets - iterator to the first element in the range of beginning offsets.
/// \param [in] end_offsets - iterator to the first element in the range of ending offsets.
/// \param [in] begin_bit - [optional] index of the first (least significant) bit used in
/// key comparison. Must be in range <tt>[0; 8 * sizeof(Key))</tt>. Default value: \p 0.
/// Non-default value not supported for floating-point key-types.
/// \param [in] end_bit - [optional] past-the-end index (most significant) bit used in
/// key comparison. Must be in range <tt>(begin_bit; 8 * sizeof(Key)]</tt>. Default
/// value: \p <tt>8 * sizeof(Key)</tt>. Non-default value not supported for floating-point key-types.
/// \param [in] stream - [optional] HIP stream object. Default is \p 0 (default stream).
/// \param [in] debug_synchronous - [optional] If true, synchronization after every kernel
/// launch is forced in order to check for errors. Default value is \p false.
///
/// \returns \p cudaSuccess (\p 0) after successful sort; otherwise a HIP runtime error of
/// type \p cudaError_t.
///
/// \par Example
/// \parblock
/// In this example a device-level descending radix sort is performed where input keys are
/// represented by an array of integers and input values by an array of <tt>double</tt>s.
///
/// \code{.cpp}
/// #include <rocprim/rocprim.hpp>
///
/// // Prepare input and output (declare pointers, allocate device memory etc.)
/// size_t input_size; // e.g., 8
/// int * keys_input; // e.g., [ 6, 3, 5, 4, 1, 8, 1, 7]
/// double * values_input; // e.g., [-5, 2, -4, 3, -1, -8, -2, 7]
/// int * keys_output; // empty array of 8 elements
/// double * values_output; // empty array of 8 elements
/// unsigned int segments; // e.g., 3
/// int * offsets; // e.g. [0, 2, 3, 8]
///
/// size_t temporary_storage_size_bytes;
/// void * temporary_storage_ptr = nullptr;
/// // Get required size of the temporary storage
/// rocprim::segmented_radix_sort_pairs_desc(
/// temporary_storage_ptr, temporary_storage_size_bytes,
/// keys_input, keys_output, values_input, values_output,
/// input_size,
/// segments, offsets, offsets + 1
/// );
///
/// // allocate temporary storage
/// cudaMalloc(&temporary_storage_ptr, temporary_storage_size_bytes);
///
/// // perform sort
/// rocprim::segmented_radix_sort_pairs_desc(
/// temporary_storage_ptr, temporary_storage_size_bytes,
/// keys_input, keys_output, values_input, values_output,
/// input_size,
/// segments, offsets, offsets + 1
/// );
/// // keys_output: [ 6, 3, 5, 8, 7, 4, 1, 1]
/// // values_output: [-5, 2, -4, -8, 7, 3, -1, -2]
/// \endcode
/// \endparblock
template<class Config = default_config,
         class KeysInputIterator,
         class KeysOutputIterator,
         class ValuesInputIterator,
         class ValuesOutputIterator,
         class OffsetIterator,
         class Key = typename std::iterator_traits<KeysInputIterator>::value_type>
inline cudaError_t segmented_radix_sort_pairs_desc(void*                temporary_storage,
                                                   size_t&              storage_size,
                                                   KeysInputIterator    keys_input,
                                                   KeysOutputIterator   keys_output,
                                                   ValuesInputIterator  values_input,
                                                   ValuesOutputIterator values_output,
                                                   unsigned int         size,
                                                   unsigned int         segments,
                                                   OffsetIterator       begin_offsets,
                                                   OffsetIterator       end_offsets,
                                                   unsigned int         begin_bit = 0,
                                                   unsigned int         end_bit = 8 * sizeof(Key),
                                                   cudaStream_t         stream  = 0,
                                                   bool                 debug_synchronous = false)
{
    // Flag the implementation takes by reference; this overload does not use it.
    bool result_location_unused;

    // Descending order is selected by the `true` template argument. No temporary
    // key/value buffers are supplied (nullptr in the *_tmp positions).
    const cudaError_t status
        = detail::segmented_radix_sort_impl<Config, true>(temporary_storage,
                                                          storage_size,
                                                          keys_input,
                                                          nullptr,
                                                          keys_output,
                                                          values_input,
                                                          nullptr,
                                                          values_output,
                                                          size,
                                                          result_location_unused,
                                                          segments,
                                                          begin_offsets,
                                                          end_offsets,
                                                          begin_bit,
                                                          end_bit,
                                                          stream,
                                                          debug_synchronous);
    return status;
}
/// \brief Parallel ascending radix sort primitive for device level.
///
/// \p segmented_radix_sort_keys function performs a device-wide radix sort across multiple,
/// non-overlapping sequences of keys. Function sorts input keys in ascending order.
///
/// \par Overview
/// * The contents of both buffers of \p keys may be altered by the sorting function.
/// * \p current() of \p keys is used as the input.
/// * The function will update \p current() of \p keys to point to the buffer
/// that contains the output range.
/// * Returns the required size of \p temporary_storage in \p storage_size
/// if \p temporary_storage is a null pointer.
/// * The function requires small \p temporary_storage as it does not need
/// a temporary buffer of \p size elements.
/// * \p Key type must be an arithmetic type (that is, an integral type or a floating-point
/// type).
/// * Buffers of \p keys must have at least \p size elements.
/// * Ranges specified by \p begin_offsets and \p end_offsets must have
/// at least \p segments elements. They may use the same sequence <tt>offsets</tt> of at least
/// <tt>segments + 1</tt> elements: <tt>offsets</tt> for \p begin_offsets and
/// <tt>offsets + 1</tt> for \p end_offsets.
/// * If \p Key is an integer type and the range of keys is known in advance, the performance
/// can be improved by setting \p begin_bit and \p end_bit, for example if all keys are in range
/// [100, 10000], <tt>begin_bit = 0</tt> and <tt>end_bit = 14</tt> will cover the whole range.
///
/// \tparam Config - [optional] configuration of the primitive. It can be
/// \p segmented_radix_sort_config or a custom class with the same members.
/// \tparam Key - key type. Must be an integral type or a floating-point type.
/// \tparam OffsetIterator - random-access iterator type of segment offsets. Must meet the
/// requirements of a C++ InputIterator concept. It can be a simple pointer type.
///
/// \param [in] temporary_storage - pointer to a device-accessible temporary storage. When
/// a null pointer is passed, the required allocation size (in bytes) is written to
/// \p storage_size and function returns without performing the sort operation.
/// \param [in,out] storage_size - reference to a size (in bytes) of \p temporary_storage.
/// \param [in,out] keys - reference to the double-buffer of keys, its \p current()
/// contains the input range and will be updated to point to the output range.
/// \param [in] size - number of elements in the input range.
/// \param [in] segments - number of segments in the input range.
/// \param [in] begin_offsets - iterator to the first element in the range of beginning offsets.
/// \param [in] end_offsets - iterator to the first element in the range of ending offsets.
/// \param [in] begin_bit - [optional] index of the first (least significant) bit used in
/// key comparison. Must be in range <tt>[0; 8 * sizeof(Key))</tt>. Default value: \p 0.
/// Non-default value not supported for floating-point key-types.
/// \param [in] end_bit - [optional] past-the-end index (most significant) bit used in
/// key comparison. Must be in range <tt>(begin_bit; 8 * sizeof(Key)]</tt>. Default
/// value: \p <tt>8 * sizeof(Key)</tt>. Non-default value not supported for floating-point key-types.
/// \param [in] stream - [optional] HIP stream object. Default is \p 0 (default stream).
/// \param [in] debug_synchronous - [optional] If true, synchronization after every kernel
/// launch is forced in order to check for errors. Default value is \p false.
///
/// \returns \p cudaSuccess (\p 0) after successful sort; otherwise a HIP runtime error of
/// type \p cudaError_t.
///
/// \par Example
/// \parblock
/// In this example a device-level ascending radix sort is performed on an array of
/// \p float values.
///
/// \code{.cpp}
/// #include <rocprim/rocprim.hpp>
///
/// // Prepare input and tmp (declare pointers, allocate device memory etc.)
/// size_t input_size; // e.g., 8
/// float * input; // e.g., [0.6, 0.3, 0.65, 0.4, 0.2, 0.08, 1, 0.7]
/// float * tmp; // empty array of 8 elements
/// unsigned int segments; // e.g., 3
/// int * offsets; // e.g. [0, 2, 3, 8]
/// // Create double-buffer
/// rocprim::double_buffer<float> keys(input, tmp);
///
/// size_t temporary_storage_size_bytes;
/// void * temporary_storage_ptr = nullptr;
/// // Get required size of the temporary storage
/// rocprim::segmented_radix_sort_keys(
/// temporary_storage_ptr, temporary_storage_size_bytes,
/// keys, input_size,
/// segments, offsets, offsets + 1
/// );
///
/// // allocate temporary storage
/// cudaMalloc(&temporary_storage_ptr, temporary_storage_size_bytes);
///
/// // perform sort
/// rocprim::segmented_radix_sort_keys(
/// temporary_storage_ptr, temporary_storage_size_bytes,
/// keys, input_size,
/// segments, offsets, offsets + 1
/// );
/// // keys.current(): [0.3, 0.6, 0.65, 0.08, 0.2, 0.4, 0.7, 1]
/// \endcode
/// \endparblock
template<class Config = default_config, class Key, class OffsetIterator>
inline cudaError_t segmented_radix_sort_keys(void*               temporary_storage,
                                             size_t&             storage_size,
                                             double_buffer<Key>& keys,
                                             unsigned int        size,
                                             unsigned int        segments,
                                             OffsetIterator      begin_offsets,
                                             OffsetIterator      end_offsets,
                                             unsigned int        begin_bit = 0,
                                             unsigned int        end_bit   = 8 * sizeof(Key),
                                             cudaStream_t        stream    = 0,
                                             bool                debug_synchronous = false)
{
    // Keys-only sort: a null empty_type pointer is passed for the value input,
    // temporary and output ranges of the shared implementation.
    empty_type* values = nullptr;

    // Written by the implementation through a reference. It is initialized here so
    // that the check below never reads an indeterminate value should the
    // implementation return before assigning it; in that case no swap is performed.
    bool is_result_in_output = false;

    // keys.current() serves as both the input and the temporary key buffer,
    // keys.alternate() as the output buffer. `false` selects ascending order.
    const cudaError_t error
        = detail::segmented_radix_sort_impl<Config, false>(temporary_storage,
                                                           storage_size,
                                                           keys.current(),
                                                           keys.current(),
                                                           keys.alternate(),
                                                           values,
                                                           values,
                                                           values,
                                                           size,
                                                           is_result_in_output,
                                                           segments,
                                                           begin_offsets,
                                                           end_offsets,
                                                           begin_bit,
                                                           end_bit,
                                                           stream,
                                                           debug_synchronous);

    // Only a real sort call (non-null temporary storage) can move the result; when it
    // ended up in the alternate buffer, flip the double buffer so that current()
    // refers to the sorted keys.
    if(temporary_storage != nullptr && is_result_in_output)
    {
        keys.swap();
    }
    return error;
}
/// \brief Parallel descending radix sort primitive for device level.
///
/// \p segmented_radix_sort_keys_desc function performs a device-wide radix sort across multiple,
/// non-overlapping sequences of keys. Function sorts input keys in descending order.
///
/// \par Overview
/// * The contents of both buffers of \p keys may be altered by the sorting function.
/// * \p current() of \p keys is used as the input.
/// * The function will update \p current() of \p keys to point to the buffer
/// that contains the output range.
/// * Returns the required size of \p temporary_storage in \p storage_size
/// if \p temporary_storage is a null pointer.
/// * The function requires small \p temporary_storage as it does not need
/// a temporary buffer of \p size elements.
/// * \p Key type must be an arithmetic type (that is, an integral type or a floating-point
/// type).
/// * Buffers of \p keys must have at least \p size elements.
/// * Ranges specified by \p begin_offsets and \p end_offsets must have
/// at least \p segments elements. They may use the same sequence <tt>offsets</tt> of at least
/// <tt>segments + 1</tt> elements: <tt>offsets</tt> for \p begin_offsets and
/// <tt>offsets + 1</tt> for \p end_offsets.
/// * If \p Key is an integer type and the range of keys is known in advance, the performance
/// can be improved by setting \p begin_bit and \p end_bit, for example if all keys are in range
/// [100, 10000], <tt>begin_bit = 0</tt> and <tt>end_bit = 14</tt> will cover the whole range.
///
/// \tparam Config - [optional] configuration of the primitive. It can be
/// \p segmented_radix_sort_config or a custom class with the same members.
/// \tparam Key - key type. Must be an integral type or a floating-point type.
/// \tparam OffsetIterator - random-access iterator type of segment offsets. Must meet the
/// requirements of a C++ RandomAccessIterator concept. It can be a simple pointer type.
///
/// \param [in] temporary_storage - pointer to a device-accessible temporary storage. When
/// a null pointer is passed, the required allocation size (in bytes) is written to
/// \p storage_size and function returns without performing the sort operation.
/// \param [in,out] storage_size - reference to a size (in bytes) of \p temporary_storage.
/// \param [in,out] keys - reference to the double-buffer of keys, its \p current()
/// contains the input range and will be updated to point to the output range.
/// \param [in] size - number of elements in the input range.
/// \param [in] segments - number of segments in the input range.
/// \param [in] begin_offsets - iterator to the first element in the range of beginning offsets.
/// \param [in] end_offsets - iterator to the first element in the range of ending offsets.
/// \param [in] begin_bit - [optional] index of the first (least significant) bit used in
/// key comparison. Must be in range <tt>[0; 8 * sizeof(Key))</tt>. Default value: \p 0.
/// Non-default value not supported for floating-point key-types.
/// \param [in] end_bit - [optional] past-the-end index (most significant) bit used in
/// key comparison. Must be in range <tt>(begin_bit; 8 * sizeof(Key)]</tt>. Default
/// value: \p <tt>8 * sizeof(Key)</tt>. Non-default value not supported for floating-point key-types.
/// \param [in] stream - [optional] HIP stream object. Default is \p 0 (default stream).
/// \param [in] debug_synchronous - [optional] If true, synchronization after every kernel
/// launch is forced in order to check for errors. Default value is \p false.
///
/// \returns \p cudaSuccess (\p 0) after successful sort; otherwise a HIP runtime error of
/// type \p cudaError_t.
///
/// \par Example
/// \parblock
/// In this example a device-level descending radix sort is performed on an array of
/// integer values.
///
/// \code{.cpp}
/// #include <rocprim/rocprim.hpp>
///
/// // Prepare input and tmp (declare pointers, allocate device memory etc.)
/// size_t input_size; // e.g., 8
/// int * input; // e.g., [6, 3, 5, 4, 2, 8, 1, 7]
/// int * tmp; // empty array of 8 elements
/// unsigned int segments; // e.g., 3
/// int * offsets; // e.g. [0, 2, 3, 8]
/// // Create double-buffer
/// rocprim::double_buffer<int> keys(input, tmp);
///
/// size_t temporary_storage_size_bytes;
/// void * temporary_storage_ptr = nullptr;
/// // Get required size of the temporary storage
/// rocprim::segmented_radix_sort_keys_desc(
/// temporary_storage_ptr, temporary_storage_size_bytes,
/// keys, input_size,
/// segments, offsets, offsets + 1
/// );
///
/// // allocate temporary storage
/// cudaMalloc(&temporary_storage_ptr, temporary_storage_size_bytes);
///
/// // perform sort
/// rocprim::segmented_radix_sort_keys_desc(
/// temporary_storage_ptr, temporary_storage_size_bytes,
/// keys, input_size,
/// segments, offsets, offsets + 1
/// );
/// // keys.current(): [6, 3, 5, 8, 7, 4, 2, 1]
/// \endcode
/// \endparblock
template<class Config = default_config, class Key, class OffsetIterator>
inline cudaError_t segmented_radix_sort_keys_desc(void*               temporary_storage,
                                                  size_t&             storage_size,
                                                  double_buffer<Key>& keys,
                                                  unsigned int        size,
                                                  unsigned int        segments,
                                                  OffsetIterator      begin_offsets,
                                                  OffsetIterator      end_offsets,
                                                  unsigned int        begin_bit         = 0,
                                                  unsigned int        end_bit           = 8 * sizeof(Key),
                                                  cudaStream_t        stream            = 0,
                                                  bool                debug_synchronous = false)
{
    // Keys-only sort: a null empty_type pointer is passed for every values argument.
    empty_type* values = nullptr;

    // Start from a defined value: the flag is read below, and nothing visible at this
    // call site guarantees that the implementation assigns it on every return path
    // (e.g. an early exit). Reading it uninitialized would be undefined behaviour.
    bool is_result_in_output = false;

    // The current buffer is passed for the first two key arguments and the alternate
    // buffer for the third (presumably input / temporary / output - confirm against
    // detail::segmented_radix_sort_impl). The second template argument is `true` here
    // and `false` in the ascending entry point.
    cudaError_t error = detail::segmented_radix_sort_impl<Config, true>(temporary_storage,
                                                                        storage_size,
                                                                        keys.current(),
                                                                        keys.current(),
                                                                        keys.alternate(),
                                                                        values,
                                                                        values,
                                                                        values,
                                                                        size,
                                                                        is_result_in_output,
                                                                        segments,
                                                                        begin_offsets,
                                                                        end_offsets,
                                                                        begin_bit,
                                                                        end_bit,
                                                                        stream,
                                                                        debug_synchronous);

    // A null temporary_storage is the size-query call: no sort ran, so never swap then.
    // Otherwise swap when the implementation reports the result outside the current
    // buffer, so that keys.current() refers to the output range.
    if(temporary_storage != nullptr && is_result_in_output)
    {
        keys.swap();
    }
    return error;
}
/// \brief Parallel ascending radix sort-by-key primitive for device level.
///
/// \p segmented_radix_sort_pairs function performs a device-wide radix sort across multiple,
/// non-overlapping sequences of (key, value) pairs. Function sorts input pairs in ascending order of keys.
///
/// \par Overview
/// * The contents of both buffers of \p keys and \p values may be altered by the sorting function.
/// * \p current() of \p keys and \p values are used as the input.
/// * The function will update \p current() of \p keys and \p values to point to buffers
/// that contain the output range.
/// * Returns the required size of \p temporary_storage in \p storage_size
/// if \p temporary_storage is a null pointer.
/// * The function requires small \p temporary_storage as it does not need
/// a temporary buffer of \p size elements.
/// * \p Key type must be an arithmetic type (that is, an integral type or a floating-point
/// type).
/// * Buffers of \p keys must have at least \p size elements.
/// * Ranges specified by \p begin_offsets and \p end_offsets must have
/// at least \p segments elements. They may use the same sequence <tt>offsets</tt> of at least
/// <tt>segments + 1</tt> elements: <tt>offsets</tt> for \p begin_offsets and
/// <tt>offsets + 1</tt> for \p end_offsets.
/// * If \p Key is an integer type and the range of keys is known in advance, the performance
/// can be improved by setting \p begin_bit and \p end_bit, for example if all keys are in range
/// [100, 10000], <tt>begin_bit = 0</tt> and <tt>end_bit = 14</tt> will cover the whole range.
///
/// \tparam Config - [optional] configuration of the primitive. It can be
/// \p segmented_radix_sort_config or a custom class with the same members.
/// \tparam Key - key type. Must be an integral type or a floating-point type.
/// \tparam Value - value type.
/// \tparam OffsetIterator - random-access iterator type of segment offsets. Must meet the
/// requirements of a C++ RandomAccessIterator concept. It can be a simple pointer type.
///
/// \param [in] temporary_storage - pointer to a device-accessible temporary storage. When
/// a null pointer is passed, the required allocation size (in bytes) is written to
/// \p storage_size and function returns without performing the sort operation.
/// \param [in,out] storage_size - reference to a size (in bytes) of \p temporary_storage.
/// \param [in,out] keys - reference to the double-buffer of keys, its \p current()
/// contains the input range and will be updated to point to the output range.
/// \param [in,out] values - reference to the double-buffer of values, its \p current()
/// contains the input range and will be updated to point to the output range.
/// \param [in] size - number of elements in the input range.
/// \param [in] segments - number of segments in the input range.
/// \param [in] begin_offsets - iterator to the first element in the range of beginning offsets.
/// \param [in] end_offsets - iterator to the first element in the range of ending offsets.
/// \param [in] begin_bit - [optional] index of the first (least significant) bit used in
/// key comparison. Must be in range <tt>[0; 8 * sizeof(Key))</tt>. Default value: \p 0.
/// Non-default value not supported for floating-point key-types.
/// \param [in] end_bit - [optional] past-the-end index (most significant) bit used in
/// key comparison. Must be in range <tt>(begin_bit; 8 * sizeof(Key)]</tt>. Default
/// value: \p <tt>8 * sizeof(Key)</tt>. Non-default value not supported for floating-point key-types.
/// \param [in] stream - [optional] HIP stream object. Default is \p 0 (default stream).
/// \param [in] debug_synchronous - [optional] If true, synchronization after every kernel
/// launch is forced in order to check for errors. Default value is \p false.
///
/// \returns \p cudaSuccess (\p 0) after successful sort; otherwise a HIP runtime error of
/// type \p cudaError_t.
///
/// \par Example
/// \parblock
/// In this example a device-level ascending radix sort is performed where input keys are
/// represented by an array of unsigned integers and input values by an array of <tt>double</tt>s.
///
/// \code{.cpp}
/// #include <rocprim/rocprim.hpp>
///
/// // Prepare input and tmp (declare pointers, allocate device memory etc.)
/// size_t input_size; // e.g., 8
/// unsigned int * keys_input; // e.g., [ 6, 3, 5, 4, 1, 8, 1, 7]
/// double * values_input; // e.g., [-5, 2, -4, 3, -1, -8, -2, 7]
/// unsigned int * keys_tmp; // empty array of 8 elements
/// double* values_tmp; // empty array of 8 elements
/// unsigned int segments; // e.g., 3
/// int * offsets; // e.g. [0, 2, 3, 8]
/// // Create double-buffers
/// rocprim::double_buffer<unsigned int> keys(keys_input, keys_tmp);
/// rocprim::double_buffer<double> values(values_input, values_tmp);
///
/// // Keys are in range [0; 8], so we can limit compared bit to bits on indexes
/// // 0, 1, 2, 3, and 4. In order to do this begin_bit is set to 0 and end_bit
/// // is set to 5.
///
/// size_t temporary_storage_size_bytes;
/// void * temporary_storage_ptr = nullptr;
/// // Get required size of the temporary storage
/// rocprim::segmented_radix_sort_pairs(
/// temporary_storage_ptr, temporary_storage_size_bytes,
/// keys, values, input_size,
/// segments, offsets, offsets + 1,
/// 0, 5
/// );
///
/// // allocate temporary storage
/// cudaMalloc(&temporary_storage_ptr, temporary_storage_size_bytes);
///
/// // perform sort
/// rocprim::segmented_radix_sort_pairs(
/// temporary_storage_ptr, temporary_storage_size_bytes,
/// keys, values, input_size,
/// segments, offsets, offsets + 1,
/// 0, 5
/// );
/// // keys.current(): [3, 6, 5, 1, 1, 4, 7, 8]
/// // values.current(): [2, -5, -4, -1, -2, 3, 7, -8]
/// \endcode
/// \endparblock
template<class Config = default_config, class Key, class Value, class OffsetIterator>
inline cudaError_t segmented_radix_sort_pairs(void*                 temporary_storage,
                                              size_t&               storage_size,
                                              double_buffer<Key>&   keys,
                                              double_buffer<Value>& values,
                                              unsigned int          size,
                                              unsigned int          segments,
                                              OffsetIterator        begin_offsets,
                                              OffsetIterator        end_offsets,
                                              unsigned int          begin_bit         = 0,
                                              unsigned int          end_bit           = 8 * sizeof(Key),
                                              cudaStream_t          stream            = 0,
                                              bool                  debug_synchronous = false)
{
    // Start from a defined value: the flag is read below, and nothing visible at this
    // call site guarantees that the implementation assigns it on every return path
    // (e.g. an early exit). Reading it uninitialized would be undefined behaviour.
    bool is_result_in_output = false;

    // For keys and values alike, the current buffer is passed for the first two
    // arguments and the alternate buffer for the third (presumably input / temporary /
    // output - confirm against detail::segmented_radix_sort_impl). The second template
    // argument is `false` here and `true` in the descending entry point.
    cudaError_t error = detail::segmented_radix_sort_impl<Config, false>(temporary_storage,
                                                                         storage_size,
                                                                         keys.current(),
                                                                         keys.current(),
                                                                         keys.alternate(),
                                                                         values.current(),
                                                                         values.current(),
                                                                         values.alternate(),
                                                                         size,
                                                                         is_result_in_output,
                                                                         segments,
                                                                         begin_offsets,
                                                                         end_offsets,
                                                                         begin_bit,
                                                                         end_bit,
                                                                         stream,
                                                                         debug_synchronous);

    // A null temporary_storage is the size-query call: no sort ran, so never swap then.
    // Otherwise swap both double-buffers together when the implementation reports the
    // result outside the current buffers, so that current() refers to the output range.
    if(temporary_storage != nullptr && is_result_in_output)
    {
        keys.swap();
        values.swap();
    }
    return error;
}
/// \brief Parallel descending radix sort-by-key primitive for device level.
///
/// \p segmented_radix_sort_pairs_desc function performs a device-wide radix sort across multiple,
/// non-overlapping sequences of (key, value) pairs. Function sorts input pairs in descending order of keys.
///
/// \par Overview
/// * The contents of both buffers of \p keys and \p values may be altered by the sorting function.
/// * \p current() of \p keys and \p values are used as the input.
/// * The function will update \p current() of \p keys and \p values to point to buffers
/// that contain the output range.
/// * Returns the required size of \p temporary_storage in \p storage_size
/// if \p temporary_storage is a null pointer.
/// * The function requires small \p temporary_storage as it does not need
/// a temporary buffer of \p size elements.
/// * \p Key type must be an arithmetic type (that is, an integral type or a floating-point
/// type).
/// * Buffers of \p keys must have at least \p size elements.
/// * Ranges specified by \p begin_offsets and \p end_offsets must have
/// at least \p segments elements. They may use the same sequence <tt>offsets</tt> of at least
/// <tt>segments + 1</tt> elements: <tt>offsets</tt> for \p begin_offsets and
/// <tt>offsets + 1</tt> for \p end_offsets.
/// * If \p Key is an integer type and the range of keys is known in advance, the performance
/// can be improved by setting \p begin_bit and \p end_bit, for example if all keys are in range
/// [100, 10000], <tt>begin_bit = 0</tt> and <tt>end_bit = 14</tt> will cover the whole range.
///
/// \tparam Config - [optional] configuration of the primitive. It can be
/// \p segmented_radix_sort_config or a custom class with the same members.
/// \tparam Key - key type. Must be an integral type or a floating-point type.
/// \tparam Value - value type.
/// \tparam OffsetIterator - random-access iterator type of segment offsets. Must meet the
/// requirements of a C++ RandomAccessIterator concept. It can be a simple pointer type.
///
/// \param [in] temporary_storage - pointer to a device-accessible temporary storage. When
/// a null pointer is passed, the required allocation size (in bytes) is written to
/// \p storage_size and function returns without performing the sort operation.
/// \param [in,out] storage_size - reference to a size (in bytes) of \p temporary_storage.
/// \param [in,out] keys - reference to the double-buffer of keys, its \p current()
/// contains the input range and will be updated to point to the output range.
/// \param [in,out] values - reference to the double-buffer of values, its \p current()
/// contains the input range and will be updated to point to the output range.
/// \param [in] size - number of elements in the input range.
/// \param [in] segments - number of segments in the input range.
/// \param [in] begin_offsets - iterator to the first element in the range of beginning offsets.
/// \param [in] end_offsets - iterator to the first element in the range of ending offsets.
/// \param [in] begin_bit - [optional] index of the first (least significant) bit used in
/// key comparison. Must be in range <tt>[0; 8 * sizeof(Key))</tt>. Default value: \p 0.
/// Non-default value not supported for floating-point key-types.
/// \param [in] end_bit - [optional] past-the-end index (most significant) bit used in
/// key comparison. Must be in range <tt>(begin_bit; 8 * sizeof(Key)]</tt>. Default
/// value: \p <tt>8 * sizeof(Key)</tt>. Non-default value not supported for floating-point key-types.
/// \param [in] stream - [optional] HIP stream object. Default is \p 0 (default stream).
/// \param [in] debug_synchronous - [optional] If true, synchronization after every kernel
/// launch is forced in order to check for errors. Default value is \p false.
///
/// \returns \p cudaSuccess (\p 0) after successful sort; otherwise a HIP runtime error of
/// type \p cudaError_t.
///
/// \par Example
/// \parblock
/// In this example a device-level descending radix sort is performed where input keys are
/// represented by an array of integers and input values by an array of <tt>double</tt>s.
///
/// \code{.cpp}
/// #include <rocprim/rocprim.hpp>
///
/// // Prepare input and tmp (declare pointers, allocate device memory etc.)
/// size_t input_size; // e.g., 8
/// int * keys_input; // e.g., [ 6, 3, 5, 4, 1, 8, 1, 7]
/// double * values_input; // e.g., [-5, 2, -4, 3, -1, -8, -2, 7]
/// int * keys_tmp; // empty array of 8 elements
/// double * values_tmp; // empty array of 8 elements
/// unsigned int segments; // e.g., 3
/// int * offsets; // e.g. [0, 2, 3, 8]
/// // Create double-buffers
/// rocprim::double_buffer<int> keys(keys_input, keys_tmp);
/// rocprim::double_buffer<double> values(values_input, values_tmp);
///
/// size_t temporary_storage_size_bytes;
/// void * temporary_storage_ptr = nullptr;
/// // Get required size of the temporary storage
/// rocprim::segmented_radix_sort_pairs_desc(
/// temporary_storage_ptr, temporary_storage_size_bytes,
/// keys, values, input_size,
/// segments, offsets, offsets + 1
/// );
///
/// // allocate temporary storage
/// cudaMalloc(&temporary_storage_ptr, temporary_storage_size_bytes);
///
/// // perform sort
/// rocprim::segmented_radix_sort_pairs_desc(
/// temporary_storage_ptr, temporary_storage_size_bytes,
/// keys, values, input_size,
/// segments, offsets, offsets + 1
/// );
/// // keys.current(): [ 6, 3, 5, 8, 7, 4, 1, 1]
/// // values.current(): [-5, 2, -4, -8, 7, 3, -1, -2]
/// \endcode
/// \endparblock
template<class Config = default_config, class Key, class Value, class OffsetIterator>
inline cudaError_t segmented_radix_sort_pairs_desc(void*                 temporary_storage,
                                                   size_t&               storage_size,
                                                   double_buffer<Key>&   keys,
                                                   double_buffer<Value>& values,
                                                   unsigned int          size,
                                                   unsigned int          segments,
                                                   OffsetIterator        begin_offsets,
                                                   OffsetIterator        end_offsets,
                                                   unsigned int          begin_bit         = 0,
                                                   unsigned int          end_bit           = 8 * sizeof(Key),
                                                   cudaStream_t          stream            = 0,
                                                   bool                  debug_synchronous = false)
{
    // Start from a defined value: the flag is read below, and nothing visible at this
    // call site guarantees that the implementation assigns it on every return path
    // (e.g. an early exit). Reading it uninitialized would be undefined behaviour.
    bool is_result_in_output = false;

    // For keys and values alike, the current buffer is passed for the first two
    // arguments and the alternate buffer for the third (presumably input / temporary /
    // output - confirm against detail::segmented_radix_sort_impl). The second template
    // argument is `true` here and `false` in the ascending entry point.
    cudaError_t error = detail::segmented_radix_sort_impl<Config, true>(temporary_storage,
                                                                        storage_size,
                                                                        keys.current(),
                                                                        keys.current(),
                                                                        keys.alternate(),
                                                                        values.current(),
                                                                        values.current(),
                                                                        values.alternate(),
                                                                        size,
                                                                        is_result_in_output,
                                                                        segments,
                                                                        begin_offsets,
                                                                        end_offsets,
                                                                        begin_bit,
                                                                        end_bit,
                                                                        stream,
                                                                        debug_synchronous);

    // A null temporary_storage is the size-query call: no sort ran, so never swap then.
    // Otherwise swap both double-buffers together when the implementation reports the
    // result outside the current buffers, so that current() refers to the output range.
    if(temporary_storage != nullptr && is_result_in_output)
    {
        keys.swap();
        values.swap();
    }
    return error;
}
END_ROCPRIM_NAMESPACE
/// @}
// end of group devicemodule
#endif // ROCPRIM_DEVICE_DEVICE_SEGMENTED_RADIX_SORT_HPP_
3rdparty/cub/rocprim/device/device_segmented_radix_sort_config.hpp
0 → 100644
View file @
f8a481f8
// Copyright (c) 2018-2020 Advanced Micro Devices, Inc. All rights reserved.
//
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to deal
// in the Software without restriction, including without limitation the rights
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
// copies of the Software, and to permit persons to whom the Software is
// furnished to do so, subject to the following conditions:
//
// The above copyright notice and this permission notice shall be included in
// all copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
// THE SOFTWARE.
#ifndef ROCPRIM_DEVICE_DEVICE_SEGMENTED_RADIX_SORT_CONFIG_HPP_
#define ROCPRIM_DEVICE_DEVICE_SEGMENTED_RADIX_SORT_CONFIG_HPP_
#include <algorithm>
#include <type_traits>
#include "../config.hpp"
#include "../detail/various.hpp"
#include "config_types.hpp"
/// \addtogroup primitivesmodule_deviceconfigs
/// @{
BEGIN_ROCPRIM_NAMESPACE
/// \brief Warp-sort parameters of the device segmented radix sort operation.
/// Segments that are short enough are processed on warp level.
///
/// \tparam LogicalWarpSizeSmall - number of threads in the logical warp of the kernel
/// that processes small segments.
/// \tparam ItemsPerThreadSmall - number of items each thread processes in the kernel that
/// handles small segments.
/// \tparam BlockSizeSmall - number of threads per block in the kernel that handles small segments.
/// \tparam PartitioningThreshold - if the number of segments is at least this threshold, the
/// segments are partitioned into a small, a medium and a large segment collection, and each
/// collection is sorted by a different kernel. Otherwise, all segments are sorted by a single kernel.
/// \tparam EnableUnpartitionedWarpSort - if \p true, warp sort can be used to sort the small
/// segments even when the total number of segments is below \p PartitioningThreshold.
/// \tparam LogicalWarpSizeMedium - number of threads in the logical warp of the kernel
/// that processes medium segments. Defaults to the larger of 32 and \p LogicalWarpSizeSmall.
/// \tparam ItemsPerThreadMedium - number of items each thread processes in the kernel that
/// handles medium segments. Defaults to the larger of 4 and \p ItemsPerThreadSmall.
/// \tparam BlockSizeMedium - number of threads per block in the kernel that handles medium segments.
template<unsigned int LogicalWarpSizeSmall,
         unsigned int ItemsPerThreadSmall,
         unsigned int BlockSizeSmall              = 256,
         unsigned int PartitioningThreshold       = 3000,
         bool         EnableUnpartitionedWarpSort = true,
         unsigned int LogicalWarpSizeMedium       = std::max(32u, LogicalWarpSizeSmall),
         unsigned int ItemsPerThreadMedium        = std::max(4u, ItemsPerThreadSmall),
         unsigned int BlockSizeMedium             = 256>
struct WarpSortConfig
{
    // A small warp must never be asked to handle more items than a medium warp.
    static_assert(!(LogicalWarpSizeSmall * ItemsPerThreadSmall
                    > LogicalWarpSizeMedium * ItemsPerThreadMedium),
                  "The number of items processed by a small warp cannot be larger than the number "
                  "of items processed by a medium warp");

    // --- Small segment kernel ---

    /// \brief The number of threads in the logical warp in the small segment processing kernel.
    static constexpr unsigned int logical_warp_size_small = LogicalWarpSizeSmall;
    /// \brief The number of items processed by a thread in the small segment processing kernel.
    static constexpr unsigned int items_per_thread_small = ItemsPerThreadSmall;
    /// \brief The number of threads per block in the small segment processing kernel.
    static constexpr unsigned int block_size_small = BlockSizeSmall;

    // --- Medium segment kernel ---

    /// \brief The number of threads in the logical warp in the medium segment processing kernel.
    static constexpr unsigned int logical_warp_size_medium = LogicalWarpSizeMedium;
    /// \brief The number of items processed by a thread in the medium segment processing kernel.
    static constexpr unsigned int items_per_thread_medium = ItemsPerThreadMedium;
    /// \brief The number of threads per block in the medium segment processing kernel.
    static constexpr unsigned int block_size_medium = BlockSizeMedium;

    // --- Partitioning policy ---

    /// \brief If the number of segments is at least \p partitioning_threshold, then the segments
    /// are partitioned into groups by size, and each group is handled by a different,
    /// specialized kernel.
    static constexpr unsigned int partitioning_threshold = PartitioningThreshold;
    /// \brief If set to \p true, warp sort can be used to sort the small segments, even if the
    /// total number of segments is below \p PartitioningThreshold.
    static constexpr bool enable_unpartitioned_warp_sort = EnableUnpartitionedWarpSort;
};
/// \brief Marker configuration indicating that warp level sorting is disabled in the
/// device segmented radix sort configuration.
///
/// It exposes the same member names as \p WarpSortConfig, with every size set to \p 1,
/// the partitioning threshold set to \p 0 and unpartitioned warp sort switched off.
struct DisabledWarpSortConfig
{
    // --- Small segment kernel ---

    /// \brief The number of threads in the logical warp in the small segment processing kernel.
    static constexpr unsigned int logical_warp_size_small = 1;
    /// \brief The number of items processed by a thread in the small segment processing kernel.
    static constexpr unsigned int items_per_thread_small = 1;
    /// \brief The number of threads per block in the small segment processing kernel.
    static constexpr unsigned int block_size_small = 1;

    // --- Medium segment kernel ---

    /// \brief The number of threads in the logical warp in the medium segment processing kernel.
    static constexpr unsigned int logical_warp_size_medium = 1;
    /// \brief The number of items processed by a thread in the medium segment processing kernel.
    static constexpr unsigned int items_per_thread_medium = 1;
    /// \brief The number of threads per block in the medium segment processing kernel.
    static constexpr unsigned int block_size_medium = 1;

    // --- Partitioning policy ---

    /// \brief If the number of segments is at least \p partitioning_threshold, then the segments
    /// are partitioned into groups by size, and each group is handled by a different,
    /// specialized kernel.
    static constexpr unsigned int partitioning_threshold = 0;
    /// \brief If set to \p true, warp sort can be used to sort the small segments, even if the
    /// total number of segments is below \p PartitioningThreshold.
    static constexpr bool enable_unpartitioned_warp_sort = false;
};
/// \brief Picks the warp sort configuration that matches the size of the key type.
///
/// Keys smaller than 2 bytes get \p DisabledWarpSortConfig. All other keys get a
/// \p WarpSortConfig in which unpartitioned warp sort is enabled only for keys
/// larger than 2 bytes.
///
/// \tparam Key - the type of the sorted keys.
/// \tparam MediumWarpSize - the logical warp size of the medium segment processing kernel.
template<class Key, unsigned int MediumWarpSize = ROCPRIM_WARP_SIZE_32>
using select_warp_sort_config_t
    = std::conditional_t<(sizeof(Key) >= 2),
                         WarpSortConfig<32,                // logical warp size - small kernel
                                        4,                 // items per thread - small kernel
                                        256,               // block size - small kernel
                                        3000,              // partitioning threshold
                                        (sizeof(Key) > 2), // enable unpartitioned warp sort
                                        MediumWarpSize,    // logical warp size - medium kernel
                                        4,                 // items per thread - medium kernel
                                        256                // block size - medium kernel
                                        >,
                         DisabledWarpSortConfig>;
/// \brief Configuration of device-level segmented radix sort operation.
///
/// Radix sort is executed in a few iterations (passes) depending on the total number of bits to
/// be sorted (\p begin_bit and \p end_bit). Each iteration sorts either \p LongRadixBits or
/// \p ShortRadixBits bits, chosen to cover the whole bit range in an optimal way.
///
/// For example, if \p LongRadixBits is 7, \p ShortRadixBits is 6, \p begin_bit is 0 and
/// \p end_bit is 32, there will be 5 iterations: 7 + 7 + 6 + 6 + 6 = 32 bits.
///
/// If a segment's element count is low
/// ( <= warp_sort_config::items_per_thread * warp_sort_config::logical_warp_size ),
/// it is sorted by a special warp-level sorting method.
///
/// \tparam LongRadixBits - number of bits in long iterations.
/// \tparam ShortRadixBits - number of bits in short iterations, must be equal to or less than
/// \p LongRadixBits.
/// \tparam SortConfig - configuration of radix sort kernel. Must be \p kernel_config.
/// \tparam WarpSortConfig - configuration of the warp sort that is used on the short segments.
/// Defaults to \p DisabledWarpSortConfig.
template<unsigned int LongRadixBits,
         unsigned int ShortRadixBits,
         class SortConfig,
         class WarpSortConfig = DisabledWarpSortConfig>
struct segmented_radix_sort_config
{
    /// \brief Configuration of radix sort kernel.
    using sort = SortConfig;
    /// \brief Configuration of the warp sort method.
    using warp_sort_config = WarpSortConfig;

    /// \brief Number of bits in long iterations.
    static constexpr unsigned int long_radix_bits = LongRadixBits;
    /// \brief Number of bits in short iterations
    static constexpr unsigned int short_radix_bits = ShortRadixBits;
};
namespace
detail
{
template
<
class
Key
,
class
Value
>
struct
segmented_radix_sort_config_803
{
static
constexpr
unsigned
int
item_scale
=
::
rocprim
::
detail
::
ceiling_div
<
unsigned
int
>
(
::
rocprim
::
max
(
sizeof
(
Key
),
sizeof
(
Value
)),
sizeof
(
int
));
using
type
=
select_type
<
select_type_case
<
(
sizeof
(
Key
)
==
1
&&
sizeof
(
Value
)
<=
8
),
segmented_radix_sort_config
<
8
,
7
,
kernel_config
<
256
,
10
>
,
select_warp_sort_config_t
<
Key
>
>
>
,
select_type_case
<
(
sizeof
(
Key
)
==
2
&&
sizeof
(
Value
)
<=
8
),
segmented_radix_sort_config
<
8
,
7
,
kernel_config
<
256
,
10
>
,
select_warp_sort_config_t
<
Key
>
>
>
,
select_type_case
<
(
sizeof
(
Key
)
==
4
&&
sizeof
(
Value
)
<=
8
),
segmented_radix_sort_config
<
7
,
6
,
kernel_config
<
256
,
15
>
,
select_warp_sort_config_t
<
Key
>
>
>
,
select_type_case
<
(
sizeof
(
Key
)
==
8
&&
sizeof
(
Value
)
<=
8
),
segmented_radix_sort_config
<
7
,
6
,
kernel_config
<
256
,
13
>
,
select_warp_sort_config_t
<
Key
>
>
>
,
segmented_radix_sort_config
<
7
,
6
,
kernel_config
<
256
,
::
rocprim
::
max
(
1u
,
15u
/
item_scale
)
>
,
select_warp_sort_config_t
<
Key
>
>
>
;
};
// Specialization for Value == empty_type (the keys-only entry points pass empty_type
// values). The struct derives from the configuration that select_type picks by
// sizeof(Key); unlike the primary template, no fallback entry is listed.
template<class Key>
struct segmented_radix_sort_config_803<Key, empty_type>
    : select_type<
          select_type_case<(sizeof(Key) == 1),
                           segmented_radix_sort_config<8,
                                                       7,
                                                       kernel_config<256, 10>,
                                                       select_warp_sort_config_t<Key>>>,
          select_type_case<(sizeof(Key) == 2),
                           segmented_radix_sort_config<8,
                                                       7,
                                                       kernel_config<256, 10>,
                                                       select_warp_sort_config_t<Key>>>,
          select_type_case<(sizeof(Key) == 4),
                           segmented_radix_sort_config<7,
                                                       6,
                                                       kernel_config<256, 9>,
                                                       select_warp_sort_config_t<Key>>>,
          select_type_case<(sizeof(Key) == 8),
                           segmented_radix_sort_config<7,
                                                       6,
                                                       kernel_config<256, 7>,
                                                       select_warp_sort_config_t<Key>>>>
{};
template
<
class
Key
,
class
Value
>
struct
segmented_radix_sort_config_900
{
static
constexpr
unsigned
int
item_scale
=
::
rocprim
::
detail
::
ceiling_div
<
unsigned
int
>
(
::
rocprim
::
max
(
sizeof
(
Key
),
sizeof
(
Value
)),
sizeof
(
int
));
using
type
=
select_type
<
select_type_case
<
(
sizeof
(
Key
)
==
1
&&
sizeof
(
Value
)
<=
8
),
segmented_radix_sort_config
<
4
,
4
,
kernel_config
<
256
,
10
>
,
select_warp_sort_config_t
<
Key
>
>
>
,
select_type_case
<
(
sizeof
(
Key
)
==
2
&&
sizeof
(
Value
)
<=
8
),
segmented_radix_sort_config
<
6
,
5
,
kernel_config
<
256
,
10
>
,
select_warp_sort_config_t
<
Key
>
>
>
,
select_type_case
<
(
sizeof
(
Key
)
==
4
&&
sizeof
(
Value
)
<=
8
),
segmented_radix_sort_config
<
7
,
6
,
kernel_config
<
256
,
15
>
,
select_warp_sort_config_t
<
Key
>
>
>
,
select_type_case
<
(
sizeof
(
Key
)
==
8
&&
sizeof
(
Value
)
<=
8
),
segmented_radix_sort_config
<
7
,
6
,
kernel_config
<
256
,
15
>
,
select_warp_sort_config_t
<
Key
>
>
>
,
segmented_radix_sort_config
<
7
,
6
,
kernel_config
<
256
,
::
rocprim
::
max
(
1u
,
15u
/
item_scale
)
>
,
select_warp_sort_config_t
<
Key
>
>
>
;
};
// Specialization for Value == empty_type (the keys-only entry points pass empty_type
// values). The struct derives from the configuration that select_type picks by
// sizeof(Key); unlike the primary template, no fallback entry is listed.
template<class Key>
struct segmented_radix_sort_config_900<Key, empty_type>
    : select_type<
          select_type_case<(sizeof(Key) == 1),
                           segmented_radix_sort_config<4,
                                                       3,
                                                       kernel_config<256, 10>,
                                                       select_warp_sort_config_t<Key>>>,
          select_type_case<(sizeof(Key) == 2),
                           segmented_radix_sort_config<6,
                                                       5,
                                                       kernel_config<256, 10>,
                                                       select_warp_sort_config_t<Key>>>,
          select_type_case<(sizeof(Key) == 4),
                           segmented_radix_sort_config<7,
                                                       6,
                                                       kernel_config<256, 17>,
                                                       select_warp_sort_config_t<Key>>>,
          select_type_case<(sizeof(Key) == 8),
                           segmented_radix_sort_config<7,
                                                       6,
                                                       kernel_config<256, 15>,
                                                       select_warp_sort_config_t<Key>>>>
{};
template
<
class
Key
,
class
Value
>
struct
segmented_radix_sort_config_90a
{
static
constexpr
unsigned
int
item_scale
=
::
rocprim
::
detail
::
ceiling_div
<
unsigned
int
>
(
::
rocprim
::
max
(
sizeof
(
Key
),
sizeof
(
Value
)),
sizeof
(
int
));
using
type
=
select_type
<
select_type_case
<
(
sizeof
(
Key
)
==
1
&&
sizeof
(
Value
)
<=
8
),
segmented_radix_sort_config
<
4
,
4
,
kernel_config
<
256
,
10
>
,
select_warp_sort_config_t
<
Key
,
ROCPRIM_WARP_SIZE_64
>>>
,
select_type_case
<
(
sizeof
(
Key
)
==
2
&&
sizeof
(
Value
)
<=
8
),
segmented_radix_sort_config
<
6
,
5
,
kernel_config
<
256
,
10
>
,
select_warp_sort_config_t
<
Key
,
ROCPRIM_WARP_SIZE_64
>>>
,
select_type_case
<
(
sizeof
(
Key
)
==
4
&&
sizeof
(
Value
)
<=
8
),
segmented_radix_sort_config
<
7
,
6
,
kernel_config
<
256
,
15
>
,
select_warp_sort_config_t
<
Key
,
ROCPRIM_WARP_SIZE_64
>>>
,
select_type_case
<
(
sizeof
(
Key
)
==
8
&&
sizeof
(
Value
)
<=
8
),
segmented_radix_sort_config
<
7
,
6
,
kernel_config
<
256
,
15
>
,
select_warp_sort_config_t
<
Key
,
ROCPRIM_WARP_SIZE_64
>>>
,
segmented_radix_sort_config
<
7
,
6
,
kernel_config
<
256
,
::
rocprim
::
max
(
1u
,
15u
/
item_scale
)
>
,
select_warp_sort_config_t
<
Key
,
ROCPRIM_WARP_SIZE_64
>>>
;
};
// Keys-only variant (Value == empty_type) of the 90a tuning table.
// The struct derives directly from whichever entry select_type picks for
// sizeof(Key); only key widths of 1, 2, 4 and 8 bytes are listed and there is
// no trailing fallback entry. Every entry passes ROCPRIM_WARP_SIZE_64 to the
// warp-sort configuration selector.
template<class Key>
struct segmented_radix_sort_config_90a<Key, empty_type>
    : select_type<
          // 1-byte keys
          select_type_case<
              (sizeof(Key) == 1),
              segmented_radix_sort_config<4,
                                          3,
                                          kernel_config<256, 10>,
                                          select_warp_sort_config_t<Key, ROCPRIM_WARP_SIZE_64>>>,
          // 2-byte keys
          select_type_case<
              (sizeof(Key) == 2),
              segmented_radix_sort_config<6,
                                          5,
                                          kernel_config<256, 10>,
                                          select_warp_sort_config_t<Key, ROCPRIM_WARP_SIZE_64>>>,
          // 4-byte keys
          select_type_case<
              (sizeof(Key) == 4),
              segmented_radix_sort_config<7,
                                          6,
                                          kernel_config<256, 17>,
                                          select_warp_sort_config_t<Key, ROCPRIM_WARP_SIZE_64>>>,
          // 8-byte keys
          select_type_case<
              (sizeof(Key) == 8),
              segmented_radix_sort_config<7,
                                          6,
                                          kernel_config<256, 15>,
                                          select_warp_sort_config_t<Key, ROCPRIM_WARP_SIZE_64>>>>
{
};
template
<
class
Key
,
class
Value
>
struct
segmented_radix_sort_config_1030
{
static
constexpr
unsigned
int
item_scale
=
::
rocprim
::
detail
::
ceiling_div
<
unsigned
int
>
(
::
rocprim
::
max
(
sizeof
(
Key
),
sizeof
(
Value
)),
sizeof
(
int
));
using
type
=
select_type
<
select_type_case
<
(
sizeof
(
Key
)
==
1
&&
sizeof
(
Value
)
<=
8
),
segmented_radix_sort_config
<
4
,
4
,
kernel_config
<
256
,
10
>
,
select_warp_sort_config_t
<
Key
>
>
>
,
select_type_case
<
(
sizeof
(
Key
)
==
2
&&
sizeof
(
Value
)
<=
8
),
segmented_radix_sort_config
<
6
,
5
,
kernel_config
<
256
,
10
>
,
select_warp_sort_config_t
<
Key
>
>
>
,
select_type_case
<
(
sizeof
(
Key
)
==
4
&&
sizeof
(
Value
)
<=
8
),
segmented_radix_sort_config
<
7
,
6
,
kernel_config
<
256
,
15
>
,
select_warp_sort_config_t
<
Key
>
>
>
,
select_type_case
<
(
sizeof
(
Key
)
==
8
&&
sizeof
(
Value
)
<=
8
),
segmented_radix_sort_config
<
7
,
6
,
kernel_config
<
256
,
15
>
,
select_warp_sort_config_t
<
Key
>
>
>
,
segmented_radix_sort_config
<
7
,
6
,
kernel_config
<
256
,
::
rocprim
::
max
(
1u
,
15u
/
item_scale
)
>
,
select_warp_sort_config_t
<
Key
>
>
>
;
};
// Keys-only variant (Value == empty_type) of the architecture-1030 tuning table.
// The struct derives directly from whichever entry select_type picks for
// sizeof(Key); only key widths of 1, 2, 4 and 8 bytes are listed and there is
// no trailing fallback entry.
template<class Key>
struct segmented_radix_sort_config_1030<Key, empty_type>
    : select_type<
          // 1-byte keys
          select_type_case<
              (sizeof(Key) == 1),
              segmented_radix_sort_config<4,
                                          3,
                                          kernel_config<256, 10>,
                                          select_warp_sort_config_t<Key>>>,
          // 2-byte keys
          select_type_case<
              (sizeof(Key) == 2),
              segmented_radix_sort_config<6,
                                          5,
                                          kernel_config<256, 10>,
                                          select_warp_sort_config_t<Key>>>,
          // 4-byte keys
          select_type_case<
              (sizeof(Key) == 4),
              segmented_radix_sort_config<7,
                                          6,
                                          kernel_config<256, 17>,
                                          select_warp_sort_config_t<Key>>>,
          // 8-byte keys
          select_type_case<
              (sizeof(Key) == 8),
              segmented_radix_sort_config<7,
                                          6,
                                          kernel_config<256, 15>,
                                          select_warp_sort_config_t<Key>>>>
{
};
// Picks the segmented radix sort tuning table for a given target architecture
// by deriving from select_arch.
//
//   TargetArch            table
//   803                   segmented_radix_sort_config_803
//   900                   segmented_radix_sort_config_900
//   906, 908              segmented_radix_sort_config_90a
//   ROCPRIM_ARCH_90a      segmented_radix_sort_config_90a
//   1030                  segmented_radix_sort_config_1030
//
// The trailing segmented_radix_sort_config_900 argument is not wrapped in a
// select_arch_case; presumably select_arch uses it when no case matches --
// confirm against the definition of select_arch.
template<unsigned int TargetArch, class Key, class Value>
struct default_segmented_radix_sort_config
    : select_arch<
          TargetArch,
          select_arch_case<803, detail::segmented_radix_sort_config_803<Key, Value>>,
          select_arch_case<900, detail::segmented_radix_sort_config_900<Key, Value>>,
          // 906 and 908 reuse the 90a table.
          select_arch_case<906, detail::segmented_radix_sort_config_90a<Key, Value>>,
          select_arch_case<908, detail::segmented_radix_sort_config_90a<Key, Value>>,
          select_arch_case<ROCPRIM_ARCH_90a, detail::segmented_radix_sort_config_90a<Key, Value>>,
          select_arch_case<1030, detail::segmented_radix_sort_config_1030<Key, Value>>,
          detail::segmented_radix_sort_config_900<Key, Value>>
{
};
}
// end namespace detail
END_ROCPRIM_NAMESPACE
/// @}
// end of group primitivesmodule_deviceconfigs
#endif // ROCPRIM_DEVICE_DEVICE_SEGMENTED_RADIX_SORT_CONFIG_HPP_
Prev
1
2
3
4
5
6
7
8
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment