Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
ModelZoo
GLM-130B_fastertransformer
Commits
f8a481f8
Commit
f8a481f8
authored
Oct 13, 2023
by
zhouxiang
Browse files
添加dtk中的cub头文件
parent
7b7c64c5
Changes
147
Hide whitespace changes
Inline
Side-by-side
Showing
20 changed files
with
5435 additions
and
0 deletions
+5435
-0
3rdparty/cub/rocprim/device/device_segmented_reduce.hpp
3rdparty/cub/rocprim/device/device_segmented_reduce.hpp
+276
-0
3rdparty/cub/rocprim/device/device_segmented_scan.hpp
3rdparty/cub/rocprim/device/device_segmented_scan.hpp
+643
-0
3rdparty/cub/rocprim/device/device_select.hpp
3rdparty/cub/rocprim/device/device_select.hpp
+490
-0
3rdparty/cub/rocprim/device/device_select_config.hpp
3rdparty/cub/rocprim/device/device_select_config.hpp
+161
-0
3rdparty/cub/rocprim/device/device_transform.hpp
3rdparty/cub/rocprim/device/device_transform.hpp
+295
-0
3rdparty/cub/rocprim/device/device_transform_config.hpp
3rdparty/cub/rocprim/device/device_transform_config.hpp
+100
-0
3rdparty/cub/rocprim/device/specialization/device_radix_merge_sort.hpp
...rocprim/device/specialization/device_radix_merge_sort.hpp
+184
-0
3rdparty/cub/rocprim/device/specialization/device_radix_single_sort.hpp
...ocprim/device/specialization/device_radix_single_sort.hpp
+1010
-0
3rdparty/cub/rocprim/functional.hpp
3rdparty/cub/rocprim/functional.hpp
+384
-0
3rdparty/cub/rocprim/intrinsics.hpp
3rdparty/cub/rocprim/intrinsics.hpp
+33
-0
3rdparty/cub/rocprim/intrinsics/atomic.hpp
3rdparty/cub/rocprim/intrinsics/atomic.hpp
+75
-0
3rdparty/cub/rocprim/intrinsics/bit.hpp
3rdparty/cub/rocprim/intrinsics/bit.hpp
+61
-0
3rdparty/cub/rocprim/intrinsics/thread.hpp
3rdparty/cub/rocprim/intrinsics/thread.hpp
+344
-0
3rdparty/cub/rocprim/intrinsics/warp.hpp
3rdparty/cub/rocprim/intrinsics/warp.hpp
+151
-0
3rdparty/cub/rocprim/intrinsics/warp_shuffle.hpp
3rdparty/cub/rocprim/intrinsics/warp_shuffle.hpp
+262
-0
3rdparty/cub/rocprim/iterator.hpp
3rdparty/cub/rocprim/iterator.hpp
+37
-0
3rdparty/cub/rocprim/iterator/arg_index_iterator.hpp
3rdparty/cub/rocprim/iterator/arg_index_iterator.hpp
+266
-0
3rdparty/cub/rocprim/iterator/constant_iterator.hpp
3rdparty/cub/rocprim/iterator/constant_iterator.hpp
+261
-0
3rdparty/cub/rocprim/iterator/counting_iterator.hpp
3rdparty/cub/rocprim/iterator/counting_iterator.hpp
+269
-0
3rdparty/cub/rocprim/iterator/detail/replace_first_iterator.hpp
...ty/cub/rocprim/iterator/detail/replace_first_iterator.hpp
+133
-0
No files found.
Too many changes to show.
To preserve performance only
147 of 147+
files are displayed.
Plain diff
Email patch
3rdparty/cub/rocprim/device/device_segmented_reduce.hpp
0 → 100644
View file @
f8a481f8
// Copyright (c) 2017-2021 Advanced Micro Devices, Inc. All rights reserved.
//
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to deal
// in the Software without restriction, including without limitation the rights
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
// copies of the Software, and to permit persons to whom the Software is
// furnished to do so, subject to the following conditions:
//
// The above copyright notice and this permission notice shall be included in
// all copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
// THE SOFTWARE.
#ifndef ROCPRIM_DEVICE_DEVICE_SEGMENTED_REDUCE_HPP_
#define ROCPRIM_DEVICE_DEVICE_SEGMENTED_REDUCE_HPP_
#include <type_traits>
#include <iterator>
#include <chrono>
#include "device_reduce_config.hpp"
#include "../config.hpp"
#include "../functional.hpp"
#include "../detail/various.hpp"
#include "../detail/match_result_type.hpp"
#include "detail/device_segmented_reduce.hpp"
BEGIN_ROCPRIM_NAMESPACE
/// \addtogroup devicemodule
/// @{
namespace
detail
{
// Kernel entry point of the segmented reduction. It forwards all of its
// arguments, unchanged and in the same order, to the device-side
// segmented_reduce<Config>() implementation.
template<
    class Config,
    class InputIterator,
    class OutputIterator,
    class OffsetIterator,
    class ResultType,
    class BinaryFunction
>
ROCPRIM_KERNEL
__launch_bounds__(ROCPRIM_DEFAULT_MAX_BLOCK_SIZE)
void segmented_reduce_kernel(InputIterator input,
                             OutputIterator output,
                             OffsetIterator begin_offsets,
                             OffsetIterator end_offsets,
                             BinaryFunction reduce_op,
                             ResultType initial_value)
{
    segmented_reduce<Config>(
        input, output,
        begin_offsets, end_offsets,
        reduce_op, initial_value
    );
}
// Post-launch check used by the host-side launcher in this file.
//
// Queries the last runtime error and, if it is not cudaSuccess, returns it
// from the enclosing function. When the enclosing function's
// `debug_synchronous` flag is true, the macro additionally prints
// `name(size)`, synchronizes `stream`, returns the synchronization error if
// there is one, and prints the time elapsed since `start` in milliseconds.
//
// The expansion site must provide `stream` and `debug_synchronous`.
// Local names avoid double underscores, which are reserved identifiers.
#define ROCPRIM_DETAIL_HIP_SYNC_AND_RETURN_ON_ERROR(name, size, start) \
    { \
        auto _error = cudaGetLastError(); \
        if(_error != cudaSuccess) return _error; \
        if(debug_synchronous) \
        { \
            std::cout << name << "(" << size << ")"; \
            auto _sync_error = cudaStreamSynchronize(stream); \
            if(_sync_error != cudaSuccess) return _sync_error; \
            auto _end = std::chrono::high_resolution_clock::now(); \
            auto _d = std::chrono::duration_cast<std::chrono::duration<double>>(_end - start); \
            std::cout << " " << _d.count() * 1000 << " ms" << '\n'; \
        } \
    }
// Host-side launcher for the segmented reduction.
//
// When temporary_storage is null, only storage_size is written (a small
// non-zero placeholder) and the function returns cudaSuccess without
// launching anything. When segments is zero it also returns cudaSuccess
// immediately. Otherwise it launches segmented_reduce_kernel on `stream`
// with a grid of `segments` blocks of `block_size` threads (taken from the
// selected config), then runs the post-launch error / debug-timing check.
//
// Returns cudaSuccess, or the error code returned by that check.
template<
    class Config,
    class InputIterator,
    class OutputIterator,
    class OffsetIterator,
    class InitValueType,
    class BinaryFunction
>
inline
cudaError_t segmented_reduce_impl(void * temporary_storage,
                                  size_t& storage_size,
                                  InputIterator input,
                                  OutputIterator output,
                                  unsigned int segments,
                                  OffsetIterator begin_offsets,
                                  OffsetIterator end_offsets,
                                  BinaryFunction reduce_op,
                                  InitValueType initial_value,
                                  cudaStream_t stream,
                                  bool debug_synchronous)
{
    using timer = std::chrono::high_resolution_clock;

    using value_t = typename std::iterator_traits<InputIterator>::value_type;
    using reduced_t = typename ::rocprim::detail::match_result_type<
        value_t, BinaryFunction
    >::type;

    // Get default config if Config is default_config
    using selected_config = default_or_custom_config<
        Config,
        default_reduce_config<ROCPRIM_TARGET_ARCH, reduced_t>
    >;

    constexpr unsigned int threads_per_block = selected_config::block_size;

    // Size query: report the required storage and do no work.
    if(temporary_storage == nullptr)
    {
        // Make sure user won't try to allocate 0 bytes memory, because
        // cudaMalloc will return nullptr when size is zero.
        storage_size = 4;
        return cudaSuccess;
    }

    // No segments, nothing to launch.
    if(segments == 0u)
    {
        return cudaSuccess;
    }

    // Only sample the clock when timing output was requested; otherwise the
    // time point stays default-constructed and is never read.
    const timer::time_point launch_start
        = debug_synchronous ? timer::now() : timer::time_point();

    segmented_reduce_kernel<selected_config>
        <<<dim3(segments), dim3(threads_per_block), 0, stream>>>(
            input, output,
            begin_offsets, end_offsets,
            reduce_op, static_cast<reduced_t>(initial_value)
        );
    ROCPRIM_DETAIL_HIP_SYNC_AND_RETURN_ON_ERROR("segmented_reduce", segments, launch_start);

    return cudaSuccess;
}
#undef ROCPRIM_DETAIL_HIP_SYNC_AND_RETURN_ON_ERROR
}
// end of detail namespace
/// \brief Parallel segmented reduction primitive for device level.
///
/// segmented_reduce function performs a device-wide reduction operation across multiple sequences
/// using binary \p reduce_op operator.
///
/// \par Overview
/// * Returns the required size of \p temporary_storage in \p storage_size
/// if \p temporary_storage is a null pointer.
/// * Ranges specified by \p input must have at least \p size elements, \p output must have
/// \p segments elements.
/// * Ranges specified by \p begin_offsets and \p end_offsets must have
/// at least \p segments elements. They may use the same sequence <tt>offsets</tt> of at least
/// <tt>segments + 1</tt> elements: <tt>offsets</tt> for \p begin_offsets and
/// <tt>offsets + 1</tt> for \p end_offsets.
///
/// \tparam Config - [optional] configuration of the primitive. It can be \p reduce_config or
/// a custom class with the same members.
/// \tparam InputIterator - random-access iterator type of the input range. Must meet the
/// requirements of a C++ InputIterator concept. It can be a simple pointer type.
/// \tparam OutputIterator - random-access iterator type of the output range. Must meet the
/// requirements of a C++ OutputIterator concept. It can be a simple pointer type.
/// \tparam OffsetIterator - random-access iterator type of segment offsets. Must meet the
/// requirements of a C++ InputIterator concept. It can be a simple pointer type.
/// \tparam BinaryFunction - type of binary function used for reduction. Default type
/// is \p rocprim::plus<T>, where \p T is a \p value_type of \p InputIterator.
/// \tparam InitValueType - type of the initial value.
///
/// \param [in] temporary_storage - pointer to a device-accessible temporary storage. When
/// a null pointer is passed, the required allocation size (in bytes) is written to
/// \p storage_size and function returns without performing the reduction operation.
/// \param [in,out] storage_size - reference to a size (in bytes) of \p temporary_storage.
/// \param [in] input - iterator to the first element in the range to reduce.
/// \param [out] output - iterator to the first element in the output range.
/// \param [in] segments - number of segments in the input range.
/// \param [in] begin_offsets - iterator to the first element in the range of beginning offsets.
/// \param [in] end_offsets - iterator to the first element in the range of ending offsets.
/// \param [in] initial_value - initial value to start the reduction.
/// \param [in] reduce_op - binary operation function object that will be used for reduction.
/// The signature of the function should be equivalent to the following:
/// <tt>T f(const T &a, const T &b);</tt>. The signature does not need to have
/// <tt>const &</tt>, but function object must not modify the objects passed to it.
/// The default value is \p BinaryFunction().
/// \param [in] stream - [optional] HIP stream object. The default is \p 0 (default stream).
/// \param [in] debug_synchronous - [optional] If true, synchronization after every kernel
/// launch is forced in order to check for errors. The default value is \p false.
///
/// \returns \p cudaSuccess (\p 0) after successful reduction; otherwise a HIP runtime error of
/// type \p cudaError_t.
///
/// \par Example
/// \parblock
/// In this example a device-level segmented min-reduction operation is performed on an array of
/// integer values (<tt>short</tt>s are reduced into <tt>int</tt>s) using custom operator.
///
/// \code{.cpp}
/// #include <rocprim/rocprim.hpp>
///
/// // custom reduce function
/// auto min_op =
/// [] __device__ (int a, int b) -> int
/// {
/// return a < b ? a : b;
/// };
///
/// // Prepare input and output (declare pointers, allocate device memory etc.)
/// unsigned int segments; // e.g., 3
/// short * input; // e.g., [4, 7, 6, 2, 5, 1, 3, 8]
/// int * output; // empty array of 3 elements
/// int * offsets; // e.g. [0, 2, 3, 8]
/// int init_value; // e.g., 9
///
/// size_t temporary_storage_size_bytes;
/// void * temporary_storage_ptr = nullptr;
/// // Get required size of the temporary storage
/// rocprim::segmented_reduce(
/// temporary_storage_ptr, temporary_storage_size_bytes,
/// input, output,
/// segments, offsets, offsets + 1,
/// min_op, init_value
/// );
///
/// // allocate temporary storage
/// cudaMalloc(&temporary_storage_ptr, temporary_storage_size_bytes);
///
/// // perform segmented reduction
/// rocprim::segmented_reduce(
/// temporary_storage_ptr, temporary_storage_size_bytes,
/// input, output,
/// segments, offsets, offsets + 1,
/// min_op, init_value
/// );
/// // output: [4, 6, 1]
/// \endcode
/// \endparblock
template<
    class Config = default_config,
    class InputIterator,
    class OutputIterator,
    class OffsetIterator,
    class BinaryFunction = ::rocprim::plus<typename std::iterator_traits<InputIterator>::value_type>,
    class InitValueType = typename std::iterator_traits<InputIterator>::value_type
>
inline
cudaError_t segmented_reduce(void * temporary_storage,
                             size_t& storage_size,
                             InputIterator input,
                             OutputIterator output,
                             unsigned int segments,
                             OffsetIterator begin_offsets,
                             OffsetIterator end_offsets,
                             BinaryFunction reduce_op = BinaryFunction(),
                             InitValueType initial_value = InitValueType(),
                             cudaStream_t stream = 0,
                             bool debug_synchronous = false)
{
    // Public entry point: pass everything through to the implementation in
    // the detail namespace, which receives Config as its explicit template
    // argument.
    return detail::segmented_reduce_impl<Config>(
        temporary_storage, storage_size,
        input, output,
        segments, begin_offsets, end_offsets,
        reduce_op, initial_value,
        stream, debug_synchronous
    );
}
/// @}
// end of group devicemodule
END_ROCPRIM_NAMESPACE
#endif // ROCPRIM_DEVICE_DEVICE_SEGMENTED_REDUCE_HPP_
3rdparty/cub/rocprim/device/device_segmented_scan.hpp
0 → 100644
View file @
f8a481f8
// Copyright (c) 2017-2021 Advanced Micro Devices, Inc. All rights reserved.
//
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to deal
// in the Software without restriction, including without limitation the rights
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
// copies of the Software, and to permit persons to whom the Software is
// furnished to do so, subject to the following conditions:
//
// The above copyright notice and this permission notice shall be included in
// all copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
// THE SOFTWARE.
#ifndef ROCPRIM_DEVICE_DEVICE_SEGMENTED_SCAN_HPP_
#define ROCPRIM_DEVICE_DEVICE_SEGMENTED_SCAN_HPP_
#include <type_traits>
#include <iterator>
#include "../config.hpp"
#include "../detail/various.hpp"
#include "../detail/match_result_type.hpp"
#include "../iterator/zip_iterator.hpp"
#include "../iterator/discard_iterator.hpp"
#include "../iterator/transform_iterator.hpp"
#include "../iterator/counting_iterator.hpp"
#include "../types/tuple.hpp"
#include "device_scan_config.hpp"
#include "device_scan.hpp"
#include "detail/device_segmented_scan.hpp"
BEGIN_ROCPRIM_NAMESPACE
/// \addtogroup devicemodule
/// @{
namespace
detail
{
// Kernel entry point of the segmented scan. It converts initial_value to
// ResultType and forwards it, together with the remaining arguments, to the
// device-side segmented_scan<Exclusive, Config, ResultType>() implementation.
template<
    bool Exclusive,
    class Config,
    class ResultType,
    class InputIterator,
    class OutputIterator,
    class OffsetIterator,
    class InitValueType,
    class BinaryFunction
>
ROCPRIM_KERNEL
__launch_bounds__(ROCPRIM_DEFAULT_MAX_BLOCK_SIZE)
void segmented_scan_kernel(InputIterator input,
                           OutputIterator output,
                           OffsetIterator begin_offsets,
                           OffsetIterator end_offsets,
                           InitValueType initial_value,
                           BinaryFunction scan_op)
{
    segmented_scan<Exclusive, Config, ResultType>(
        input, output,
        begin_offsets, end_offsets,
        static_cast<ResultType>(initial_value),
        scan_op
    );
}
// Post-launch check used by the host-side launcher in this file.
//
// Queries the last runtime error and, if it is not cudaSuccess, returns it
// from the enclosing function. When the enclosing function's
// `debug_synchronous` flag is true, the macro additionally prints
// `name(size)`, synchronizes `stream`, returns the synchronization error if
// there is one, and prints the time elapsed since `start` in milliseconds.
//
// The expansion site must provide `stream` and `debug_synchronous`.
// Local names avoid double underscores, which are reserved identifiers.
#define ROCPRIM_DETAIL_HIP_SYNC_AND_RETURN_ON_ERROR(name, size, start) \
    { \
        auto _error = cudaGetLastError(); \
        if(_error != cudaSuccess) return _error; \
        if(debug_synchronous) \
        { \
            std::cout << name << "(" << size << ")"; \
            auto _sync_error = cudaStreamSynchronize(stream); \
            if(_sync_error != cudaSuccess) return _sync_error; \
            auto _end = std::chrono::high_resolution_clock::now(); \
            auto _d = std::chrono::duration_cast<std::chrono::duration<double>>(_end - start); \
            std::cout << " " << _d.count() * 1000 << " ms" << '\n'; \
        } \
    }
// Host-side launcher shared by the offset-based segmented scans.
//
// The scan's result type is InitValueType for an exclusive scan and the
// input iterator's value_type for an inclusive one. When temporary_storage
// is null, only storage_size is written (a small non-zero placeholder) and
// the function returns cudaSuccess. When segments is zero it also returns
// cudaSuccess immediately. Otherwise it launches segmented_scan_kernel on
// `stream` with a grid of `segments` blocks of `block_size` threads (taken
// from the selected config), then runs the post-launch error / debug-timing
// check.
//
// Returns cudaSuccess, or the error code returned by that check.
template<
    bool Exclusive,
    class Config,
    class InputIterator,
    class OutputIterator,
    class OffsetIterator,
    class InitValueType,
    class BinaryFunction
>
inline
cudaError_t segmented_scan_impl(void * temporary_storage,
                                size_t& storage_size,
                                InputIterator input,
                                OutputIterator output,
                                unsigned int segments,
                                OffsetIterator begin_offsets,
                                OffsetIterator end_offsets,
                                const InitValueType initial_value,
                                BinaryFunction scan_op,
                                cudaStream_t stream,
                                bool debug_synchronous)
{
    using timer = std::chrono::high_resolution_clock;

    using value_t = typename std::iterator_traits<InputIterator>::value_type;
    using scanned_t = typename std::conditional<
        Exclusive, InitValueType, value_t
    >::type;

    // Get default config if Config is default_config
    using selected_config = default_or_custom_config<
        Config,
        default_scan_config<ROCPRIM_TARGET_ARCH, scanned_t>
    >;

    constexpr unsigned int threads_per_block = selected_config::block_size;

    // Size query: report the required storage and do no work.
    if(temporary_storage == nullptr)
    {
        // Make sure user won't try to allocate 0 bytes memory, because
        // cudaMalloc will return nullptr when size is zero.
        storage_size = 4;
        return cudaSuccess;
    }

    // No segments, nothing to launch.
    if(segments == 0u)
    {
        return cudaSuccess;
    }

    // Only sample the clock when timing output was requested; otherwise the
    // time point stays default-constructed and is never read.
    const timer::time_point launch_start
        = debug_synchronous ? timer::now() : timer::time_point();

    segmented_scan_kernel<Exclusive, selected_config, scanned_t>
        <<<dim3(segments), dim3(threads_per_block), 0, stream>>>(
            input, output,
            begin_offsets, end_offsets,
            initial_value, scan_op
        );
    ROCPRIM_DETAIL_HIP_SYNC_AND_RETURN_ON_ERROR("segmented_scan", segments, launch_start);

    return cudaSuccess;
}
#undef ROCPRIM_DETAIL_HIP_SYNC_AND_RETURN_ON_ERROR
}
// end of detail namespace
/// \brief Parallel segmented inclusive scan primitive for device level.
///
/// segmented_inclusive_scan function performs a device-wide inclusive scan operation
/// across multiple sequences from \p input using binary \p scan_op operator.
///
/// \par Overview
/// * Returns the required size of \p temporary_storage in \p storage_size
/// if \p temporary_storage is a null pointer.
/// * Ranges specified by \p input and \p output must have at least \p size elements.
/// * Ranges specified by \p begin_offsets and \p end_offsets must have
/// at least \p segments elements. They may use the same sequence <tt>offsets</tt> of at least
/// <tt>segments + 1</tt> elements: <tt>offsets</tt> for \p begin_offsets and
/// <tt>offsets + 1</tt> for \p end_offsets.
///
/// \tparam Config - [optional] configuration of the primitive. It can be \p scan_config or
/// a custom class with the same members.
/// \tparam InputIterator - random-access iterator type of the input range. Must meet the
/// requirements of a C++ RandomAccessIterator concept. It can be a simple pointer type.
/// \tparam OutputIterator - random-access iterator type of the output range. Must meet the
/// requirements of a C++ RandomAccessIterator concept. It can be a simple pointer type.
/// \tparam OffsetIterator - random-access iterator type of segment offsets. Must meet the
/// requirements of a C++ RandomAccessIterator concept. It can be a simple pointer type.
/// \tparam BinaryFunction - type of binary function used for scan operation. Default type
/// is \p rocprim::plus<T>, where \p T is a \p value_type of \p InputIterator.
///
/// \param [in] temporary_storage - pointer to a device-accessible temporary storage. When
/// a null pointer is passed, the required allocation size (in bytes) is written to
/// \p storage_size and function returns without performing the scan operation.
/// \param [in,out] storage_size - reference to a size (in bytes) of \p temporary_storage.
/// \param [in] input - iterator to the first element in the range to scan.
/// \param [out] output - iterator to the first element in the output range.
/// \param [in] segments - number of segments in the input range.
/// \param [in] begin_offsets - iterator to the first element in the range of beginning offsets.
/// \param [in] end_offsets - iterator to the first element in the range of ending offsets.
/// \param [in] scan_op - binary operation function object that will be used for scan.
/// The signature of the function should be equivalent to the following:
/// <tt>T f(const T &a, const T &b);</tt>. The signature does not need to have
/// <tt>const &</tt>, but function object must not modify the objects passed to it.
/// The default value is \p BinaryFunction().
/// \param [in] stream - [optional] HIP stream object. The default is \p 0 (default stream).
/// \param [in] debug_synchronous - [optional] If true, synchronization after every kernel
/// launch is forced in order to check for errors. The default value is \p false.
///
/// \returns \p cudaSuccess (\p 0) after successful scan; otherwise a HIP runtime error of
/// type \p cudaError_t.
///
/// \par Example
/// \parblock
/// In this example a device-level segmented inclusive min-scan operation is performed on
/// an array of integer values (<tt>short</tt>s are scanned into <tt>int</tt>s) using custom operator.
///
/// \code{.cpp}
/// #include <rocprim/rocprim.hpp>
///
/// // custom scan function
/// auto min_op =
/// [] __device__ (int a, int b) -> int
/// {
/// return a < b ? a : b;
/// };
///
/// // Prepare input and output (declare pointers, allocate device memory etc.)
/// short * input; // e.g., [4, 7, 6, 2, 5, 1, 3, 8]
/// int * output; // empty array of 8 elements
/// size_t segments; // e.g., 3
/// int * offsets; // e.g. [0, 2, 4, 8]
///
/// size_t temporary_storage_size_bytes;
/// void * temporary_storage_ptr = nullptr;
/// // Get required size of the temporary storage
/// rocprim::segmented_inclusive_scan(
/// temporary_storage_ptr, temporary_storage_size_bytes,
/// input, output, segments, offsets, offsets + 1, min_op
/// );
///
/// // allocate temporary storage
/// cudaMalloc(&temporary_storage_ptr, temporary_storage_size_bytes);
///
/// // perform scan
/// rocprim::segmented_inclusive_scan(
/// temporary_storage_ptr, temporary_storage_size_bytes,
/// input, output, segments, offsets, offsets + 1, min_op
/// );
/// // output: [4, 4, 6, 2, 5, 1, 1, 1]
/// \endcode
/// \endparblock
template<
    class Config = default_config,
    class InputIterator,
    class OutputIterator,
    class OffsetIterator,
    class BinaryFunction = ::rocprim::plus<typename std::iterator_traits<InputIterator>::value_type>
>
inline
cudaError_t segmented_inclusive_scan(void * temporary_storage,
                                     size_t& storage_size,
                                     InputIterator input,
                                     OutputIterator output,
                                     unsigned int segments,
                                     OffsetIterator begin_offsets,
                                     OffsetIterator end_offsets,
                                     BinaryFunction scan_op = BinaryFunction(),
                                     cudaStream_t stream = 0,
                                     bool debug_synchronous = false)
{
    // The inclusive scan has no user-supplied initial value. A
    // value-initialized object of the input's value_type is passed so the
    // shared implementation can deduce its InitValueType.
    using value_t = typename std::iterator_traits<InputIterator>::value_type;

    return detail::segmented_scan_impl<false, Config>(
        temporary_storage, storage_size,
        input, output,
        segments, begin_offsets, end_offsets,
        value_t(), scan_op,
        stream, debug_synchronous
    );
}
/// \brief Parallel segmented exclusive scan primitive for device level.
///
/// segmented_exclusive_scan function performs a device-wide exclusive scan operation
/// across multiple sequences from \p input using binary \p scan_op operator.
///
/// \par Overview
/// * Returns the required size of \p temporary_storage in \p storage_size
/// if \p temporary_storage is a null pointer.
/// * Ranges specified by \p input and \p output must have at least \p size elements.
/// * Ranges specified by \p begin_offsets and \p end_offsets must have
/// at least \p segments elements. They may use the same sequence <tt>offsets</tt> of at least
/// <tt>segments + 1</tt> elements: <tt>offsets</tt> for \p begin_offsets and
/// <tt>offsets + 1</tt> for \p end_offsets.
///
/// \tparam Config - [optional] configuration of the primitive. It can be \p scan_config or
/// a custom class with the same members.
/// \tparam InputIterator - random-access iterator type of the input range. Must meet the
/// requirements of a C++ RandomAccessIterator concept. It can be a simple pointer type.
/// \tparam OutputIterator - random-access iterator type of the output range. Must meet the
/// requirements of a C++ RandomAccessIterator concept. It can be a simple pointer type.
/// \tparam OffsetIterator - random-access iterator type of segment offsets. Must meet the
/// requirements of a C++ RandomAccessIterator concept. It can be a simple pointer type.
/// \tparam InitValueType - type of the initial value.
/// \tparam BinaryFunction - type of binary function used for scan operation. Default type
/// is \p rocprim::plus<T>, where \p T is a \p value_type of \p InputIterator.
///
/// \param [in] temporary_storage - pointer to a device-accessible temporary storage. When
/// a null pointer is passed, the required allocation size (in bytes) is written to
/// \p storage_size and function returns without performing the scan operation.
/// \param [in,out] storage_size - reference to a size (in bytes) of \p temporary_storage.
/// \param [in] input - iterator to the first element in the range to scan.
/// \param [out] output - iterator to the first element in the output range.
/// \param [in] segments - number of segments in the input range.
/// \param [in] begin_offsets - iterator to the first element in the range of beginning offsets.
/// \param [in] end_offsets - iterator to the first element in the range of ending offsets.
/// \param [in] initial_value - initial value to start the scan.
/// \param [in] scan_op - binary operation function object that will be used for scan.
/// The signature of the function should be equivalent to the following:
/// <tt>T f(const T &a, const T &b);</tt>. The signature does not need to have
/// <tt>const &</tt>, but function object must not modify the objects passed to it.
/// The default value is \p BinaryFunction().
/// \param [in] stream - [optional] HIP stream object. The default is \p 0 (default stream).
/// \param [in] debug_synchronous - [optional] If true, synchronization after every kernel
/// launch is forced in order to check for errors. The default value is \p false.
///
/// \returns \p cudaSuccess (\p 0) after successful scan; otherwise a HIP runtime error of
/// type \p cudaError_t.
///
/// \par Example
/// \parblock
/// In this example a device-level segmented exclusive min-scan operation is performed on
/// an array of integer values (<tt>short</tt>s are scanned into <tt>int</tt>s) using custom operator.
///
/// \code{.cpp}
/// #include <rocprim/rocprim.hpp>
///
/// // custom scan function
/// auto min_op =
/// [] __device__ (int a, int b) -> int
/// {
/// return a < b ? a : b;
/// };
///
/// // Prepare input and output (declare pointers, allocate device memory etc.)
/// int start_value; // e.g., 9
/// short * input; // e.g., [4, 7, 6, 2, 5, 1, 3, 8]
/// int * output; // empty array of 8 elements
/// size_t segments; // e.g., 3
/// int * offsets; // e.g. [0, 2, 4, 8]
///
/// size_t temporary_storage_size_bytes;
/// void * temporary_storage_ptr = nullptr;
/// // Get required size of the temporary storage
/// rocprim::segmented_exclusive_scan(
/// temporary_storage_ptr, temporary_storage_size_bytes,
/// input, output, segments, offsets, offsets + 1,
/// start_value, min_op
/// );
///
/// // allocate temporary storage
/// cudaMalloc(&temporary_storage_ptr, temporary_storage_size_bytes);
///
/// // perform scan
/// rocprim::segmented_exclusive_scan(
/// temporary_storage_ptr, temporary_storage_size_bytes,
/// input, output, segments, offsets, offsets + 1,
/// start_value, min_op
/// );
/// // output: [9, 4, 9, 6, 9, 5, 1, 1]
/// \endcode
/// \endparblock
template<
    class Config = default_config,
    class InputIterator,
    class OutputIterator,
    class OffsetIterator,
    class InitValueType,
    class BinaryFunction = ::rocprim::plus<typename std::iterator_traits<InputIterator>::value_type>
>
inline
cudaError_t segmented_exclusive_scan(void * temporary_storage,
                                     size_t& storage_size,
                                     InputIterator input,
                                     OutputIterator output,
                                     unsigned int segments,
                                     OffsetIterator begin_offsets,
                                     OffsetIterator end_offsets,
                                     const InitValueType initial_value,
                                     BinaryFunction scan_op = BinaryFunction(),
                                     cudaStream_t stream = 0,
                                     bool debug_synchronous = false)
{
    // Public entry point: select the exclusive variant of the shared
    // implementation and pass every argument through unchanged.
    return detail::segmented_scan_impl<true, Config>(
        temporary_storage, storage_size,
        input, output,
        segments, begin_offsets, end_offsets,
        initial_value, scan_op,
        stream, debug_synchronous
    );
}
/// \brief Parallel segmented inclusive scan primitive for device level.
///
/// segmented_inclusive_scan function performs a device-wide inclusive scan operation
/// across multiple sequences from \p input using binary \p scan_op operator. Beginnings
/// of the segments should be marked by value convertible to \p true at corresponding
/// position in \p flags range.
///
/// \par Overview
/// * Returns the required size of \p temporary_storage in \p storage_size
/// if \p temporary_storage is a null pointer.
/// * Ranges specified by \p input, \p output, and \p flags must have at least \p size elements.
/// * \p value_type of \p HeadFlagIterator iterator should be convertible to \p bool type.
///
/// \tparam Config - [optional] configuration of the primitive. It can be \p scan_config or
/// a custom class with the same members.
/// \tparam InputIterator - random-access iterator type of the input range. Must meet the
/// requirements of a C++ RandomAccessIterator concept. It can be a simple pointer type.
/// \tparam OutputIterator - random-access iterator type of the output range. Must meet the
/// requirements of a C++ RandomAccessIterator concept. It can be a simple pointer type.
/// \tparam HeadFlagIterator - random-access iterator type of flags. Must meet the
/// requirements of a C++ RandomAccessIterator concept. It can be a simple pointer type.
/// \tparam BinaryFunction - type of binary function used for scan operation. Default type
/// is \p rocprim::plus<T>, where \p T is a \p value_type of \p InputIterator.
///
/// \param [in] temporary_storage - pointer to a device-accessible temporary storage. When
/// a null pointer is passed, the required allocation size (in bytes) is written to
/// \p storage_size and function returns without performing the scan operation.
/// \param [in,out] storage_size - reference to a size (in bytes) of \p temporary_storage.
/// \param [in] input - iterator to the first element in the range to scan.
/// \param [out] output - iterator to the first element in the output range.
/// \param [in] head_flags - iterator to the first element in the range of head flags marking
/// beginnings of each segment in the input range.
/// \param [in] size - number of elements in the input range.
/// \param [in] scan_op - binary operation function object that will be used for scan.
/// The signature of the function should be equivalent to the following:
/// <tt>T f(const T &a, const T &b);</tt>. The signature does not need to have
/// <tt>const &</tt>, but function object must not modify the objects passed to it.
/// The default value is \p BinaryFunction().
/// \param [in] stream - [optional] HIP stream object. The default is \p 0 (default stream).
/// \param [in] debug_synchronous - [optional] If true, synchronization after every kernel
/// launch is forced in order to check for errors. The default value is \p false.
///
/// \returns \p cudaSuccess (\p 0) after successful scan; otherwise a HIP runtime error of
/// type \p cudaError_t.
///
/// \par Example
/// \parblock
/// In this example a device-level segmented inclusive sum operation is performed on
/// an array of integer values (<tt>short</tt>s are added into <tt>int</tt>s).
///
/// \code{.cpp}
/// #include <rocprim/rocprim.hpp>
///
/// // Prepare input and output (declare pointers, allocate device memory etc.)
/// size_t size; // e.g., 8
/// short * input; // e.g., [1, 2, 3, 4, 5, 6, 7, 8]
/// int * flags; // e.g., [1, 0, 0, 1, 0, 1, 0, 0]
/// int * output; // empty array of 8 elements
///
/// size_t temporary_storage_size_bytes;
/// void * temporary_storage_ptr = nullptr;
/// // Get required size of the temporary storage
/// rocprim::segmented_inclusive_scan(
/// temporary_storage_ptr, temporary_storage_size_bytes,
/// input, output, flags, size, ::rocprim::plus<int>()
/// );
///
/// // allocate temporary storage
/// cudaMalloc(&temporary_storage_ptr, temporary_storage_size_bytes);
///
/// // perform scan
/// rocprim::segmented_inclusive_scan(
/// temporary_storage_ptr, temporary_storage_size_bytes,
/// input, output, flags, size, ::rocprim::plus<int>()
/// );
/// // output: [1, 3, 6, 4, 9, 6, 13, 21]
/// \endcode
/// \endparblock
template<class Config = default_config,
         class InputIterator,
         class OutputIterator,
         class HeadFlagIterator,
         class BinaryFunction
         = ::rocprim::plus<typename std::iterator_traits<InputIterator>::value_type>>
inline cudaError_t segmented_inclusive_scan(void*            temporary_storage,
                                            size_t&          storage_size,
                                            InputIterator    input,
                                            OutputIterator   output,
                                            HeadFlagIterator head_flags,
                                            size_t           size,
                                            BinaryFunction   scan_op           = BinaryFunction(),
                                            cudaStream_t     stream            = 0,
                                            bool             debug_synchronous = false)
{
    // Results are accumulated in the value type of the input range.
    using value_t     = typename std::iterator_traits<InputIterator>::value_type;
    using head_flag_t = typename std::iterator_traits<HeadFlagIterator>::value_type;
    // Adapter that lets scan_op be applied to (value, head flag) tuples;
    // presumably it restarts the accumulation wherever a head flag is set.
    using flagged_op_t = detail::headflag_scan_op_wrapper<value_t, head_flag_t, BinaryFunction>;

    // The segmented scan is expressed as an ordinary inclusive scan over
    // (value, head flag) pairs.
    auto flagged_input = rocprim::make_zip_iterator(rocprim::make_tuple(input, head_flags));

    // Only the value half of each scanned pair is stored; the flag half is
    // routed to a discard iterator.
    auto values_only_output = rocprim::make_zip_iterator(
        rocprim::make_tuple(output, rocprim::make_discard_iterator()));

    flagged_op_t flagged_op(scan_op);

    return inclusive_scan<Config>(temporary_storage,
                                  storage_size,
                                  flagged_input,
                                  values_only_output,
                                  size,
                                  flagged_op,
                                  stream,
                                  debug_synchronous);
}
/// \brief Parallel segmented exclusive scan primitive for device level.
///
/// segmented_exclusive_scan function performs a device-wide exclusive scan operation
/// across multiple sequences from \p input using binary \p scan_op operator. Beginnings
/// of the segments should be marked by a value convertible to \p true at the corresponding
/// position in the \p head_flags range.
///
/// \par Overview
/// * Returns the required size of \p temporary_storage in \p storage_size
/// if \p temporary_storage is a null pointer.
/// * Ranges specified by \p input, \p output, and \p head_flags must have at least \p size elements.
/// * \p value_type of \p HeadFlagIterator iterator should be convertible to \p bool type.
///
/// \tparam Config - [optional] configuration of the primitive. It can be \p scan_config or
/// a custom class with the same members.
/// \tparam InputIterator - random-access iterator type of the input range. Must meet the
/// requirements of a C++ RandomAccessIterator concept. It can be a simple pointer type.
/// \tparam OutputIterator - random-access iterator type of the output range. Must meet the
/// requirements of a C++ RandomAccessIterator concept. It can be a simple pointer type.
/// \tparam HeadFlagIterator - random-access iterator type of flags. Must meet the
/// requirements of a C++ RandomAccessIterator concept. It can be a simple pointer type.
/// \tparam InitValueType - type of the initial value.
/// \tparam BinaryFunction - type of binary function used for scan operation. Default type
/// is \p rocprim::plus<T>, where \p T is a \p value_type of \p InputIterator.
///
/// \param [in] temporary_storage - pointer to a device-accessible temporary storage. When
/// a null pointer is passed, the required allocation size (in bytes) is written to
/// \p storage_size and function returns without performing the scan operation.
/// \param [in,out] storage_size - reference to a size (in bytes) of \p temporary_storage.
/// \param [in] input - iterator to the first element in the range to scan.
/// \param [out] output - iterator to the first element in the output range.
/// \param [in] head_flags - iterator to the first element in the range of head flags marking
/// beginnings of each segment in the input range.
/// \param [in] initial_value - initial value to start the scan.
/// \param [in] size - number of element in the input range.
/// \param [in] scan_op - binary operation function object that will be used for scan.
/// The signature of the function should be equivalent to the following:
/// <tt>T f(const T &a, const T &b);</tt>. The signature does not need to have
/// <tt>const &</tt>, but function object must not modify the objects passed to it.
/// The default value is \p BinaryFunction().
/// \param [in] stream - [optional] HIP stream object. The default is \p 0 (default stream).
/// \param [in] debug_synchronous - [optional] If true, synchronization after every kernel
/// launch is forced in order to check for errors. The default value is \p false.
///
/// \returns \p cudaSuccess (\p 0) after successful scan; otherwise a HIP runtime error of
/// type \p cudaError_t.
///
/// \par Example
/// \parblock
/// In this example a device-level segmented exclusive sum operation is performed on
/// an array of integer values (<tt>short</tt>s are added into <tt>int</tt>s).
///
/// \code{.cpp}
/// #include <rocprim/rocprim.hpp>
///
/// // Prepare input and output (declare pointers, allocate device memory etc.)
/// size_t size; // e.g., 8
/// short * input; // e.g., [1, 2, 3, 4, 5, 6, 7, 8]
/// int * flags; // e.g., [1, 0, 0, 1, 0, 1, 0, 0]
/// int init; // e.g., 9
/// int * output; // empty array of 8 elements
///
/// size_t temporary_storage_size_bytes;
/// void * temporary_storage_ptr = nullptr;
/// // Get required size of the temporary storage
/// rocprim::segmented_exclusive_scan(
/// temporary_storage_ptr, temporary_storage_size_bytes,
/// input, output, flags, init, size, ::rocprim::plus<int>()
/// );
///
/// // allocate temporary storage
/// cudaMalloc(&temporary_storage_ptr, temporary_storage_size_bytes);
///
/// // perform scan
/// rocprim::segmented_exclusive_scan(
/// temporary_storage_ptr, temporary_storage_size_bytes,
/// input, output, flags, init, size, ::rocprim::plus<int>()
/// );
/// // output: [9, 10, 12, 9, 13, 9, 15, 22]
/// \endcode
/// \endparblock
template<class Config = default_config,
         class InputIterator,
         class OutputIterator,
         class InitValueType,
         class HeadFlagIterator,
         class BinaryFunction
         = ::rocprim::plus<typename std::iterator_traits<InputIterator>::value_type>>
inline cudaError_t segmented_exclusive_scan(void*               temporary_storage,
                                            size_t&             storage_size,
                                            InputIterator       input,
                                            OutputIterator      output,
                                            HeadFlagIterator    head_flags,
                                            const InitValueType initial_value,
                                            size_t              size,
                                            BinaryFunction      scan_op = BinaryFunction(),
                                            cudaStream_t        stream  = 0,
                                            bool                debug_synchronous = false)
{
    // Results are accumulated in the type of the initial value.
    using result_type = InitValueType;
    using flag_type   = typename std::iterator_traits<HeadFlagIterator>::value_type;
    // Adapter that lets scan_op be applied to (value, head flag) tuples.
    using flagged_op_t = detail::headflag_scan_op_wrapper<result_type, flag_type, BinaryFunction>;

    const result_type initial_value_converted = static_cast<result_type>(initial_value);

    // Builds the i-th (value, flag) pair fed to the exclusive scan. The flag is
    // taken from position i + 1, so the last item of each segment is marked as
    // the head of the next one and carries the initial value instead of input[i].
    auto shifted_pair_at
        = [input, head_flags, initial_value_converted, size] ROCPRIM_DEVICE(const size_t i)
    {
        // The final element has no successor, so its flag stays false.
        flag_type next_is_head(false);
        if(i + 1 < size)
        {
            next_is_head = head_flags[i + 1];
        }

        // input[i] is read only when the next element does not open a new segment.
        result_type item = initial_value_converted;
        if(!next_is_head)
        {
            item = input[i];
        }
        return rocprim::make_tuple(item, next_is_head);
    };

    auto shifted_input = rocprim::make_transform_iterator(
        rocprim::make_counting_iterator<size_t>(0), shifted_pair_at);

    // Only the value half of each scanned pair is stored; the flag half is discarded.
    auto values_only_output = rocprim::make_zip_iterator(
        rocprim::make_tuple(output, rocprim::make_discard_iterator()));

    // The initial value acts as the head of the first segment.
    auto first_segment_head = rocprim::make_tuple(initial_value_converted, flag_type(true));

    flagged_op_t flagged_op(scan_op);

    return exclusive_scan<Config>(temporary_storage,
                                  storage_size,
                                  shifted_input,
                                  values_only_output,
                                  first_segment_head,
                                  size,
                                  flagged_op,
                                  stream,
                                  debug_synchronous);
}
/// @}
// end of group devicemodule
END_ROCPRIM_NAMESPACE
#endif // ROCPRIM_DEVICE_DEVICE_SEGMENTED_SCAN_HPP_
3rdparty/cub/rocprim/device/device_select.hpp
0 → 100644
View file @
f8a481f8
// Copyright (c) 2018-2019 Advanced Micro Devices, Inc. All rights reserved.
//
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to deal
// in the Software without restriction, including without limitation the rights
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
// copies of the Software, and to permit persons to whom the Software is
// furnished to do so, subject to the following conditions:
//
// The above copyright notice and this permission notice shall be included in
// all copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
// THE SOFTWARE.
#ifndef ROCPRIM_DEVICE_DEVICE_SELECT_HPP_
#define ROCPRIM_DEVICE_DEVICE_SELECT_HPP_
#include <type_traits>
#include <iterator>
#include "../config.hpp"
#include "../detail/various.hpp"
#include "../detail/binary_op_wrappers.hpp"
#include "../iterator/transform_iterator.hpp"
#include "device_scan.hpp"
#include "device_partition.hpp"
BEGIN_ROCPRIM_NAMESPACE
/// \addtogroup devicemodule
/// @{
namespace
detail
{
// Error-check / debug-timing helper macro for kernel launch sequences.
//
// The expansion relies on names that must already exist at the point of use:
//   - error             : status of the preceding call, compared to cudaSuccess
//   - debug_synchronous : when true, enables the synchronizing/timing branch
//   - stream            : stream passed to cudaStreamSynchronize
// Macro arguments:
//   - name  : label streamed to std::cout
//   - size  : value streamed to std::cout in parentheses after the label
//   - start : time point subtracted from high_resolution_clock::now()
//
// Behavior: returns `error` from the enclosing function if it is not
// cudaSuccess. Otherwise, when debug_synchronous is set, prints name and size,
// synchronizes the stream (returning the synchronization error if there is
// one), and prints the elapsed time since `start` in milliseconds.
//
// NOTE: the `auto error` declared inside the debug branch shadows the outer
// `error` within that inner scope.
// NOTE(review): the expansion uses std::cout and std::chrono, but this header
// does not include <iostream> or <chrono> directly - presumably they arrive via
// other includes; confirm before using the macro here. The macro is #undef'd
// at the end of this header.
#define ROCPRIM_DETAIL_HIP_SYNC_AND_RETURN_ON_ERROR(name, size, start) \
{ \
if(error != cudaSuccess) return error; \
if(debug_synchronous) \
{ \
std::cout << name << "(" << size << ")"; \
auto error = cudaStreamSynchronize(stream); \
if(error != cudaSuccess) return error; \
auto end = std::chrono::high_resolution_clock::now(); \
auto d = std::chrono::duration_cast<std::chrono::duration<double>>(end - start); \
std::cout << " " << d.count() * 1000 << " ms" << '\n'; \
} \
}
}
// end detail namespace
/// \brief Parallel select primitive for device level using range of flags.
///
/// Performs a device-wide selection based on input \p flags. If a value from \p input
/// should be selected and copied into \p output range the corresponding item from
/// \p flags range should be set to such value that can be implicitly converted to
/// \p true (\p bool type).
///
/// \par Overview
/// * Returns the required size of \p temporary_storage in \p storage_size
/// if \p temporary_storage is a null pointer.
/// * Ranges specified by \p input and \p flags must have at least \p size elements.
/// * Range specified by \p output must have at least so many elements, that all positively
/// flagged values can be copied into it.
/// * Range specified by \p selected_count_output must have at least 1 element.
/// * Values of \p flags range should be implicitly convertible to `bool` type.
///
/// \tparam Config - [optional] configuration of the primitive. It can be \p select_config or
/// a custom class with the same members.
/// \tparam InputIterator - random-access iterator type of the input range. It can be
/// a simple pointer type.
/// \tparam FlagIterator - random-access iterator type of the flag range. It can be
/// a simple pointer type.
/// \tparam OutputIterator - random-access iterator type of the output range. It can be
/// a simple pointer type.
/// \tparam SelectedCountOutputIterator - random-access iterator type of the selected_count_output
/// value. It can be a simple pointer type.
///
/// \param [in] temporary_storage - pointer to a device-accessible temporary storage. When
/// a null pointer is passed, the required allocation size (in bytes) is written to
/// \p storage_size and function returns without performing the select operation.
/// \param [in,out] storage_size - reference to a size (in bytes) of \p temporary_storage.
/// \param [in] input - iterator to the first element in the range to select values from.
/// \param [in] flags - iterator to the selection flag corresponding to the first element from \p input range.
/// \param [out] output - iterator to the first element in the output range.
/// \param [out] selected_count_output - iterator to the total number of selected values (length of \p output).
/// \param [in] size - number of element in the input range.
/// \param [in] stream - [optional] HIP stream object. The default is \p 0 (default stream).
/// \param [in] debug_synchronous - [optional] If true, synchronization after every kernel
/// launch is forced in order to check for errors. The default value is \p false.
///
/// \par Example
/// \parblock
/// In this example a device-level select operation is performed on an array of
/// integer values with array of <tt>char</tt>s used as flags.
///
/// \code{.cpp}
/// #include <rocprim/rocprim.hpp>
///
/// // Prepare input and output (declare pointers, allocate device memory etc.)
/// size_t input_size; // e.g., 8
/// int * input; // e.g., [1, 2, 3, 4, 5, 6, 7, 8]
/// char * flags; // e.g., [0, 1, 1, 0, 0, 1, 0, 1]
/// int * output; // empty array of 8 elements
/// size_t * output_count; // empty array of 1 element
///
/// size_t temporary_storage_size_bytes;
/// void * temporary_storage_ptr = nullptr;
/// // Get required size of the temporary storage
/// rocprim::select(
/// temporary_storage_ptr, temporary_storage_size_bytes,
/// input, flags,
/// output, output_count,
/// input_size
/// );
///
/// // allocate temporary storage
/// cudaMalloc(&temporary_storage_ptr, temporary_storage_size_bytes);
///
/// // perform selection
/// rocprim::select(
/// temporary_storage_ptr, temporary_storage_size_bytes,
/// input, flags,
/// output, output_count,
/// input_size
/// );
/// // output: [2, 3, 6, 8]
/// // output_count: 4
/// \endcode
/// \endparblock
template<class Config = default_config,
         class InputIterator,
         class FlagIterator,
         class OutputIterator,
         class SelectedCountOutputIterator>
inline cudaError_t select(void*                       temporary_storage,
                          size_t&                     storage_size,
                          InputIterator               input,
                          FlagIterator                flags,
                          OutputIterator              output,
                          SelectedCountOutputIterator selected_count_output,
                          const size_t                size,
                          const cudaStream_t          stream            = 0,
                          const bool                  debug_synchronous = false)
{
    // Flag-based selection uses neither an inequality operator nor a unary
    // predicate, so both slots are filled with empty placeholder objects.
    using placeholder_t = ::rocprim::empty_type;
    using index_t       = unsigned int;

    // Keys only: there is no value range on either the input or the output side.
    placeholder_t* const absent_values = nullptr;

    return detail::partition_impl<detail::select_method::flag, true, Config, index_t>(
        temporary_storage,
        storage_size,
        input,
        absent_values,
        flags,
        output,
        absent_values,
        selected_count_output,
        size,
        placeholder_t(), // unused inequality operation
        stream,
        debug_synchronous,
        placeholder_t() // unused unary predicate
    );
}
/// \brief Parallel select primitive for device level using selection operator.
///
/// Performs a device-wide selection using selection operator. If a value \p x from \p input
/// should be selected and copied into \p output range, then <tt>predicate(x)</tt> has to
/// return \p true.
///
/// \par Overview
/// * Returns the required size of \p temporary_storage in \p storage_size
/// if \p temporary_storage is a null pointer.
/// * Range specified by \p input must have at least \p size elements.
/// * Range specified by \p output must have at least so many elements, that all selected
/// values can be copied into it.
/// * Range specified by \p selected_count_output must have at least 1 element.
///
/// \tparam Config - [optional] configuration of the primitive. It can be \p select_config or
/// a custom class with the same members.
/// \tparam InputIterator - random-access iterator type of the input range. It can be
/// a simple pointer type.
/// \tparam OutputIterator - random-access iterator type of the output range. It can be
/// a simple pointer type.
/// \tparam SelectedCountOutputIterator - random-access iterator type of the selected_count_output
/// value. It can be a simple pointer type.
/// \tparam UnaryPredicate - type of a unary selection predicate.
///
/// \param [in] temporary_storage - pointer to a device-accessible temporary storage. When
/// a null pointer is passed, the required allocation size (in bytes) is written to
/// \p storage_size and function returns without performing the select operation.
/// \param [in,out] storage_size - reference to a size (in bytes) of \p temporary_storage.
/// \param [in] input - iterator to the first element in the range to select values from.
/// \param [out] output - iterator to the first element in the output range.
/// \param [out] selected_count_output - iterator to the total number of selected values (length of \p output).
/// \param [in] size - number of element in the input range.
/// \param [in] predicate - unary function object that will be used for selecting values.
/// The signature of the function should be equivalent to the following:
/// <tt>bool f(const T &a);</tt>. The signature does not need to have
/// <tt>const &</tt>, but function object must not modify the object passed to it.
/// \param [in] stream - [optional] HIP stream object. The default is \p 0 (default stream).
/// \param [in] debug_synchronous - [optional] If true, synchronization after every kernel
/// launch is forced in order to check for errors. The default value is \p false.
///
/// \par Example
/// \parblock
/// In this example a device-level select operation is performed on an array of
/// integer values, only even values are selected.
///
/// \code{.cpp}
/// #include <rocprim/rocprim.hpp>
///
/// auto predicate =
/// [] __device__ (int a) -> bool
/// {
/// return (a%2) == 0;
/// };
///
/// // Prepare input and output (declare pointers, allocate device memory etc.)
/// size_t input_size; // e.g., 8
/// int * input; // e.g., [1, 2, 3, 4, 5, 6, 7, 8]
/// int * output; // empty array of 8 elements
/// size_t * output_count; // empty array of 1 element
///
/// size_t temporary_storage_size_bytes;
/// void * temporary_storage_ptr = nullptr;
/// // Get required size of the temporary storage
/// rocprim::select(
/// temporary_storage_ptr, temporary_storage_size_bytes,
/// input, output, output_count,
/// input_size, predicate
/// );
///
/// // allocate temporary storage
/// cudaMalloc(&temporary_storage_ptr, temporary_storage_size_bytes);
///
/// // perform selection
/// rocprim::select(
/// temporary_storage_ptr, temporary_storage_size_bytes,
/// input, output, output_count,
/// input_size, predicate
/// );
/// // output: [2, 4, 6, 8]
/// // output_count: 4
/// \endcode
/// \endparblock
template<class Config = default_config,
         class InputIterator,
         class OutputIterator,
         class SelectedCountOutputIterator,
         class UnaryPredicate>
inline cudaError_t select(void*                       temporary_storage,
                          size_t&                     storage_size,
                          InputIterator               input,
                          OutputIterator              output,
                          SelectedCountOutputIterator selected_count_output,
                          const size_t                size,
                          UnaryPredicate              predicate,
                          const cudaStream_t          stream            = 0,
                          const bool                  debug_synchronous = false)
{
    // Predicate-based selection needs neither flags nor an inequality operator;
    // empty placeholders stand in for both.
    using placeholder_t = ::rocprim::empty_type;
    using index_t       = unsigned int;

    // No flag range is supplied.
    placeholder_t* absent_flags = nullptr;
    // Keys only: there is no value range on either the input or the output side.
    placeholder_t* const absent_values = nullptr;

    return detail::partition_impl<detail::select_method::predicate, true, Config, index_t>(
        temporary_storage,
        storage_size,
        input,
        absent_values,
        absent_flags,
        output,
        absent_values,
        selected_count_output,
        size,
        placeholder_t(), // unused inequality operation
        stream,
        debug_synchronous,
        predicate);
}
/// \brief Device-level parallel unique primitive.
///
/// From given \p input range unique primitive eliminates all but the first element from every
/// consecutive group of equivalent elements and copies them into \p output.
///
/// \par Overview
/// * Returns the required size of \p temporary_storage in \p storage_size
/// if \p temporary_storage is a null pointer.
/// * Range specified by \p input must have at least \p size elements.
/// * Range specified by \p output must have at least so many elements, that all selected
/// values can be copied into it.
/// * Range specified by \p unique_count_output must have at least 1 element.
/// * By default <tt>InputIterator::value_type</tt>'s equality operator is used to check
/// if elements are equivalent.
///
/// \tparam InputIterator - random-access iterator type of the input range. It can be
/// a simple pointer type.
/// \tparam OutputIterator - random-access iterator type of the output range. It can be
/// a simple pointer type.
/// \tparam UniqueCountOutputIterator - random-access iterator type of the unique_count_output
/// value used to return number of unique values. It can be a simple pointer type.
/// \tparam EqualityOp - type of an binary operator used to compare values for equality.
///
/// \param [in] temporary_storage - pointer to a device-accessible temporary storage. When
/// a null pointer is passed, the required allocation size (in bytes) is written to
/// \p storage_size and function returns without performing the unique operation.
/// \param [in,out] storage_size - reference to a size (in bytes) of \p temporary_storage.
/// \param [in] input - iterator to the first element in the range to select values from.
/// \param [out] output - iterator to the first element in the output range.
/// \param [out] unique_count_output - iterator to the total number of selected values (length of \p output).
/// \param [in] size - number of element in the input range.
/// \param [in] equality_op - [optional] binary function object used to compare input values for equality.
/// The signature of the function should be equivalent to the following:
/// <tt>bool equal_to(const T &a, const T &b);</tt>. The signature does not need to have
/// <tt>const &</tt>, but function object must not modify the object passed to it.
/// \param [in] stream - [optional] HIP stream object. The default is \p 0 (default stream).
/// \param [in] debug_synchronous - [optional] If true, synchronization after every kernel
/// launch is forced in order to check for errors. The default value is \p false.
///
/// \par Example
/// \parblock
/// In this example a device-level unique operation is performed on an array of integer values.
///
/// \code{.cpp}
/// #include <rocprim/rocprim.hpp>
///
/// // Prepare input and output (declare pointers, allocate device memory etc.)
/// size_t input_size; // e.g., 8
/// int * input; // e.g., [1, 4, 2, 4, 4, 7, 7, 7]
/// int * output; // empty array of 8 elements
/// size_t * output_count; // empty array of 1 element
///
/// size_t temporary_storage_size_bytes;
/// void * temporary_storage_ptr = nullptr;
/// // Get required size of the temporary storage
/// rocprim::unique(
/// temporary_storage_ptr, temporary_storage_size_bytes,
/// input, output, output_count,
/// input_size
/// );
///
/// // allocate temporary storage
/// cudaMalloc(&temporary_storage_ptr, temporary_storage_size_bytes);
///
/// // perform unique operation
/// rocprim::unique(
/// temporary_storage_ptr, temporary_storage_size_bytes,
/// input, output, output_count,
/// input_size
/// );
/// // output: [1, 4, 2, 4, 7]
/// // output_count: 5
/// \endcode
/// \endparblock
template<class Config = default_config,
         class InputIterator,
         class OutputIterator,
         class UniqueCountOutputIterator,
         class EqualityOp
         = ::rocprim::equal_to<typename std::iterator_traits<InputIterator>::value_type>>
inline cudaError_t unique(void*                     temporary_storage,
                          size_t&                   storage_size,
                          InputIterator             input,
                          OutputIterator            output,
                          UniqueCountOutputIterator unique_count_output,
                          const size_t              size,
                          EqualityOp                equality_op       = EqualityOp(),
                          const cudaStream_t        stream            = 0,
                          const bool                debug_synchronous = false)
{
    // The unique method uses neither flags nor a unary predicate; empty
    // placeholders stand in for both.
    using placeholder_t = ::rocprim::empty_type;
    using index_t       = unsigned int;

    // No flag range is supplied.
    const placeholder_t* absent_flags = nullptr;
    // Keys only: there is no value range on either the input or the output side.
    placeholder_t* const absent_values = nullptr;

    // partition_impl is handed an inequality operation, so the caller's
    // equality operator is wrapped accordingly.
    detail::inequality_wrapper<EqualityOp> not_equal_op(equality_op);

    return detail::partition_impl<detail::select_method::unique, true, Config, index_t>(
        temporary_storage,
        storage_size,
        input,
        absent_values,
        absent_flags,
        output,
        absent_values,
        unique_count_output,
        size,
        not_equal_op,
        stream,
        debug_synchronous,
        placeholder_t() // unused unary predicate
    );
}
/// \brief Device-level parallel unique by key primitive.
///
/// From the given \p keys_input range, the unique_by_key primitive eliminates all but the first
/// key from every consecutive group of equivalent keys, and copies the remaining keys and their
/// corresponding values into \p keys_output and \p values_output.
///
/// \par Overview
/// * Returns the required size of \p temporary_storage in \p storage_size
/// if \p temporary_storage is a null pointer.
/// * Ranges specified by \p keys_input and \p values_input must have at least \p size elements each.
/// * Ranges specified by \p keys_output and \p values_output must each have at least so many elements,
/// that all selected values can be copied into them.
/// * Range specified by \p unique_count_output must have at least 1 element.
/// * By default <tt>KeyIterator::value_type</tt>'s equality operator is used to check
/// if keys are equivalent.
///
/// \tparam KeyIterator - random-access iterator type of the input key range. It can be
/// a simple pointer type.
/// \tparam ValueIterator - random-access iterator type of the input value range. It can be
/// a simple pointer type.
/// \tparam OutputKeyIterator - random-access iterator type of the output key range. It can be
/// a simple pointer type.
/// \tparam OutputValueIterator - random-access iterator type of the output value range. It can be
/// a simple pointer type.
/// \tparam UniqueCountOutputIterator - random-access iterator type of the unique_count_output
/// value used to return number of unique keys and values. It can be a simple pointer type.
/// \tparam EqualityOp - type of an binary operator used to compare keys for equality.
///
/// \param [in] temporary_storage - pointer to a device-accessible temporary storage. When
/// a null pointer is passed, the required allocation size (in bytes) is written to
/// \p storage_size and function returns without performing the unique operation.
/// \param [in,out] storage_size - reference to a size (in bytes) of \p temporary_storage.
/// \param [in] keys_input - iterator to the first element in the range to select keys from.
/// \param [in] values_input - iterator to the first element in the range of values corresponding to keys
/// \param [out] keys_output - iterator to the first element in the output key range.
/// \param [out] values_output - iterator to the first element in the output value range.
/// \param [out] unique_count_output - iterator to the total number of selected values (length of \p output).
/// \param [in] size - number of element in the input range.
/// \param [in] equality_op - [optional] binary function object used to compare input values for equality.
/// The signature of the function should be equivalent to the following:
/// <tt>bool equal_to(const T &a, const T &b);</tt>. The signature does not need to have
/// <tt>const &</tt>, but function object must not modify the object passed to it.
/// \param [in] stream - [optional] HIP stream object. The default is \p 0 (default stream).
/// \param [in] debug_synchronous - [optional] If true, synchronization after every kernel
/// launch is forced in order to check for errors. The default value is \p false.
template<typename Config = default_config,
         typename KeyIterator,
         typename ValueIterator,
         typename OutputKeyIterator,
         typename OutputValueIterator,
         typename UniqueCountOutputIterator,
         typename EqualityOp
         = ::rocprim::equal_to<typename std::iterator_traits<KeyIterator>::value_type>>
inline cudaError_t unique_by_key(void*                           temporary_storage,
                                 size_t&                         storage_size,
                                 const KeyIterator               keys_input,
                                 const ValueIterator             values_input,
                                 const OutputKeyIterator         keys_output,
                                 const OutputValueIterator       values_output,
                                 const UniqueCountOutputIterator unique_count_output,
                                 const size_t                    size,
                                 const EqualityOp                equality_op = EqualityOp(),
                                 const cudaStream_t              stream      = 0,
                                 const bool                      debug_synchronous = false)
{
    using index_t = unsigned int;

    // The unique method uses neither flags nor a unary predicate; empty
    // placeholders stand in for both.
    ::rocprim::empty_type* const absent_flags = nullptr;
    const ::rocprim::empty_type  unused_predicate{};

    // partition_impl is handed an inequality operation, so the caller's
    // equality operator is wrapped accordingly.
    const detail::inequality_wrapper<EqualityOp> not_equal_op(equality_op);

    return detail::partition_impl<detail::select_method::unique, true, Config, index_t>(
        temporary_storage,
        storage_size,
        keys_input,
        values_input,
        absent_flags,
        keys_output,
        values_output,
        unique_count_output,
        size,
        not_equal_op,
        stream,
        debug_synchronous,
        unused_predicate);
}
#undef ROCPRIM_DETAIL_HIP_SYNC_AND_RETURN_ON_ERROR
/// @}
// end of group devicemodule
END_ROCPRIM_NAMESPACE
#endif // ROCPRIM_DEVICE_DEVICE_SELECT_HPP_
3rdparty/cub/rocprim/device/device_select_config.hpp
0 → 100644
View file @
f8a481f8
// Copyright (c) 2018-2019 Advanced Micro Devices, Inc. All rights reserved.
//
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to deal
// in the Software without restriction, including without limitation the rights
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
// copies of the Software, and to permit persons to whom the Software is
// furnished to do so, subject to the following conditions:
//
// The above copyright notice and this permission notice shall be included in
// all copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
// THE SOFTWARE.
#ifndef ROCPRIM_DEVICE_DEVICE_SELECT_CONFIG_HPP_
#define ROCPRIM_DEVICE_DEVICE_SELECT_CONFIG_HPP_
#include <type_traits>
#include "../config.hpp"
#include "../detail/various.hpp"
#include "../block/block_load.hpp"
#include "../block/block_scan.hpp"
#include "config_types.hpp"
/// \addtogroup primitivesmodule_deviceconfigs
/// @{
BEGIN_ROCPRIM_NAMESPACE
/// \brief Configuration of device-level select operation.
///
/// \tparam BlockSize - number of threads in a block.
/// \tparam ItemsPerThread - number of items processed by each thread.
/// \tparam KeyBlockLoadMethod - method for loading input keys.
/// \tparam ValueBlockLoadMethod - method for loading input values.
/// \tparam FlagBlockLoadMethod - method for loading flag values.
/// \tparam BlockScanMethod - algorithm for block scan.
/// \tparam SizeLimit - limit on the number of items for a single select kernel launch.
template<unsigned int                    BlockSize,
         unsigned int                    ItemsPerThread,
         ::rocprim::block_load_method    KeyBlockLoadMethod,
         ::rocprim::block_load_method    ValueBlockLoadMethod,
         ::rocprim::block_load_method    FlagBlockLoadMethod,
         ::rocprim::block_scan_algorithm BlockScanMethod,
         unsigned int                    SizeLimit = ROCPRIM_GRID_SIZE_LIMIT>
struct select_config
{
    // Each member simply re-exports the template argument of the matching name.

    /// \brief Threads per block.
    static constexpr unsigned int block_size{BlockSize};
    /// \brief Items handled by every thread.
    static constexpr unsigned int items_per_thread{ItemsPerThread};
    /// \brief Block load method used for input keys.
    static constexpr block_load_method key_block_load_method{KeyBlockLoadMethod};
    /// \brief Block load method used for input values.
    static constexpr block_load_method value_block_load_method{ValueBlockLoadMethod};
    /// \brief Block load method used for flag values.
    static constexpr block_load_method flag_block_load_method{FlagBlockLoadMethod};
    /// \brief Block scan algorithm.
    static constexpr block_scan_algorithm block_scan_method{BlockScanMethod};
    /// \brief Upper bound on the number of items for a single select kernel launch.
    static constexpr unsigned int size_limit{SizeLimit};
};
namespace
detail
{
/// Select tuning that default_select_config associates with architecture case 803
/// (and also passes as its trailing, non-case argument).
///
/// \tparam Key - type whose size drives the block size limit and items per thread.
template<class Key>
struct select_config_803
{
    /// sizeof(Key) expressed in units of sizeof(int), computed with ceiling_div.
    static constexpr unsigned int item_scale
        = ::rocprim::detail::ceiling_div<unsigned int>(sizeof(Key), sizeof(int));

    /// Resulting configuration: block size derived from 256 through limit_block_size,
    /// 13 / item_scale items per thread (never fewer than 1), transposed block loads
    /// for keys, values and flags, and the warp-scan based block scan.
    using type = select_config<
        limit_block_size<256U, sizeof(Key), ROCPRIM_WARP_SIZE_64>::value,
        ::rocprim::max(1u, 13u / item_scale),
        ::rocprim::block_load_method::block_load_transpose,
        ::rocprim::block_load_method::block_load_transpose,
        ::rocprim::block_load_method::block_load_transpose,
        ::rocprim::block_scan_algorithm::using_warp_scan
    >;
};
/// Select tuning that default_select_config associates with architecture case 900.
///
/// \tparam Key - type whose size drives the block size limit and items per thread.
template<class Key>
struct select_config_900
{
    /// sizeof(Key) expressed in units of sizeof(int), computed with ceiling_div.
    static constexpr unsigned int item_scale
        = ::rocprim::detail::ceiling_div<unsigned int>(sizeof(Key), sizeof(int));

    /// Resulting configuration: block size derived from 256 through limit_block_size,
    /// 15 / item_scale items per thread (never fewer than 1), transposed block loads
    /// for keys, values and flags, and the warp-scan based block scan.
    using type = select_config<
        limit_block_size<256U, sizeof(Key), ROCPRIM_WARP_SIZE_64>::value,
        ::rocprim::max(1u, 15u / item_scale),
        ::rocprim::block_load_method::block_load_transpose,
        ::rocprim::block_load_method::block_load_transpose,
        ::rocprim::block_load_method::block_load_transpose,
        ::rocprim::block_scan_algorithm::using_warp_scan
    >;
};
/// Select tuning that default_select_config associates with architecture case
/// ROCPRIM_ARCH_90a.
///
/// \tparam Key - type whose size drives the block size limit and items per thread
/// (default_select_config instantiates this template with its key type).
template<class Key>
struct select_config_90a
{
    /// sizeof(Key) expressed in units of sizeof(int), computed with ceiling_div.
    static constexpr unsigned int item_scale
        = ::rocprim::detail::ceiling_div<unsigned int>(sizeof(Key), sizeof(int));

    /// Resulting configuration: block size derived from 256 through limit_block_size,
    /// 15 / item_scale items per thread (never fewer than 1), transposed block loads
    /// for keys, values and flags, and the warp-scan based block scan.
    using type = select_config<
        limit_block_size<256U, sizeof(Key), ROCPRIM_WARP_SIZE_64>::value,
        ::rocprim::max(1u, 15u / item_scale),
        ::rocprim::block_load_method::block_load_transpose,
        ::rocprim::block_load_method::block_load_transpose,
        ::rocprim::block_load_method::block_load_transpose,
        ::rocprim::block_scan_algorithm::using_warp_scan
    >;
};
/// Select tuning that default_select_config associates with architecture case 1030.
/// Unlike the other select_config_* variants in this file it passes
/// ROCPRIM_WARP_SIZE_32 to limit_block_size.
///
/// \tparam Key - type whose size drives the block size limit and items per thread
/// (default_select_config instantiates this template with its key type).
template<class Key>
struct select_config_1030
{
    /// sizeof(Key) expressed in units of sizeof(int), computed with ceiling_div.
    static constexpr unsigned int item_scale
        = ::rocprim::detail::ceiling_div<unsigned int>(sizeof(Key), sizeof(int));

    /// Resulting configuration: block size derived from 256 through limit_block_size,
    /// 15 / item_scale items per thread (never fewer than 1), transposed block loads
    /// for keys, values and flags, and the warp-scan based block scan.
    using type = select_config<
        limit_block_size<256U, sizeof(Key), ROCPRIM_WARP_SIZE_32>::value,
        ::rocprim::max(1u, 15u / item_scale),
        ::rocprim::block_load_method::block_load_transpose,
        ::rocprim::block_load_method::block_load_transpose,
        ::rocprim::block_load_method::block_load_transpose,
        ::rocprim::block_scan_algorithm::using_warp_scan
    >;
};
/// Maps a target architecture to one of the select_config_* tunings above by
/// deriving from select_arch.
///
/// \tparam TargetArch - architecture identifier matched against the listed cases.
/// \tparam Key - type forwarded to the chosen select_config_* template.
/// The third (unnamed) type parameter is accepted but not used.
///
/// NOTE(review): the trailing select_config_803<Key> argument is presumably the
/// choice when no case matches; select_arch is defined elsewhere - confirm there.
template<unsigned int TargetArch, class Key, class /*Value*/>
struct default_select_config
    : select_arch<
        TargetArch,
        select_arch_case<803, select_config_803<Key>>,
        select_arch_case<900, select_config_900<Key>>,
        select_arch_case<ROCPRIM_ARCH_90a, select_config_90a<Key>>,
        select_arch_case<1030, select_config_1030<Key>>,
        select_config_803<Key>
    >
{
};
}
// end namespace detail
END_ROCPRIM_NAMESPACE
/// @}
// end of group primitivesmodule_deviceconfigs
#endif // ROCPRIM_DEVICE_DEVICE_SELECT_CONFIG_HPP_
3rdparty/cub/rocprim/device/device_transform.hpp
0 → 100644
View file @
f8a481f8
// Copyright (c) 2017-2021 Advanced Micro Devices, Inc. All rights reserved.
//
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to deal
// in the Software without restriction, including without limitation the rights
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
// copies of the Software, and to permit persons to whom the Software is
// furnished to do so, subject to the following conditions:
//
// The above copyright notice and this permission notice shall be included in
// all copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
// THE SOFTWARE.
#ifndef ROCPRIM_DEVICE_DEVICE_TRANSFORM_HPP_
#define ROCPRIM_DEVICE_DEVICE_TRANSFORM_HPP_
#include <algorithm>
#include <type_traits>
#include <iterator>
#include "../config.hpp"
#include "../detail/various.hpp"
#include "../detail/match_result_type.hpp"
#include "../types/tuple.hpp"
#include "../iterator/zip_iterator.hpp"
#include "device_transform_config.hpp"
#include "detail/device_transform.hpp"
#include <chrono>
BEGIN_ROCPRIM_NAMESPACE
/// \addtogroup devicemodule
/// @{
namespace
detail
{
/// Kernel entry point for the device-level transform: it does nothing but forward
/// its arguments to transform_kernel_impl with the same compile-time parameters.
///
/// \tparam BlockSize - value passed to __launch_bounds__ and to the implementation.
/// \tparam ItemsPerThread - forwarded to the implementation.
/// \tparam ResultType - forwarded to the implementation.
///
/// \param input - iterator to the first input element of this launch.
/// \param size - number of elements covered by this launch.
/// \param output - iterator to the first output element of this launch.
/// \param transform_op - unary function object applied by the implementation.
template<
    unsigned int BlockSize,
    unsigned int ItemsPerThread,
    class ResultType,
    class InputIterator,
    class OutputIterator,
    class UnaryFunction
>
ROCPRIM_KERNEL
__launch_bounds__(BlockSize)
void transform_kernel(InputIterator input,
                      const size_t size,
                      OutputIterator output,
                      UnaryFunction transform_op)
{
    transform_kernel_impl<BlockSize, ItemsPerThread, ResultType>(
        input, size, output, transform_op
    );
}
// Post-launch check used by transform() in this file.
//
// Expands to a brace-enclosed block (so it may be used with or without a trailing
// semicolon) that:
//   1. reads cudaGetLastError() and returns it from the ENCLOSING function when it
//      is not cudaSuccess;
//   2. if `debug_synchronous` is true, prints `name` and `size`, synchronizes
//      `stream`, returns the synchronization error (if any) from the enclosing
//      function, and finally prints the time elapsed since `start` in milliseconds.
//
// The macro relies on `debug_synchronous` and `stream` being visible at the point of
// use; `start` must be a std::chrono::high_resolution_clock time point.
// Only /* */ comments may be placed inside the definition: a // comment would
// swallow the line continuations.
#define ROCPRIM_DETAIL_HIP_SYNC_AND_RETURN_ON_ERROR(name, size, start) \
{ \
auto _error = cudaGetLastError(); \
if(_error != cudaSuccess) return _error; \
if(debug_synchronous) \
{ \
std::cout << name << "(" << size << ")"; \
_error = cudaStreamSynchronize(stream); \
if(_error != cudaSuccess) return _error; \
auto _end = std::chrono::high_resolution_clock::now(); \
auto _d = std::chrono::duration_cast<std::chrono::duration<double>>(_end - start); \
std::cout << " " << _d.count() * 1000 << " ms" << '\n'; \
} \
}
}
// end of detail namespace
/// \brief Parallel transform primitive for device level.
///
/// transform function performs a device-wide transformation operation
/// using unary \p transform_op operator.
///
/// \par Overview
/// * Ranges specified by \p input and \p output must have at least \p size elements.
///
/// \tparam Config - [optional] configuration of the primitive. It can be \p transform_config or
/// a custom class with the same members.
/// \tparam InputIterator - random-access iterator type of the input range. Must meet the
/// requirements of a C++ InputIterator concept. It can be a simple pointer type.
/// \tparam OutputIterator - random-access iterator type of the output range. Must meet the
/// requirements of a C++ OutputIterator concept. It can be a simple pointer type.
/// \tparam UnaryFunction - type of unary function used for transform.
///
/// \param [in] input - iterator to the first element in the range to transform.
/// \param [out] output - iterator to the first element in the output range.
/// \param [in] size - number of elements in the input range.
/// \param [in] transform_op - unary operation function object that will be used for transform.
/// The signature of the function should be equivalent to the following:
/// <tt>U f(const T &a);</tt>. The signature does not need to have
/// <tt>const &</tt>, but function object must not modify the object passed to it.
/// \param [in] stream - [optional] HIP stream object. The default is \p 0 (default stream).
/// \param [in] debug_synchronous - [optional] If true, synchronization after every kernel
/// launch is forced in order to check for errors. The default value is \p false.
///
/// \par Example
/// \parblock
/// In this example a device-level transform operation is performed on an array of
/// integer values (<tt>short</tt>s are transformed into <tt>int</tt>s).
///
/// \code{.cpp}
/// #include <rocprim/rocprim.hpp>
///
/// // custom transform function
/// auto transform_op =
/// [] __device__ (int a) -> int
/// {
/// return a + 5;
/// };
///
/// // Prepare input and output (declare pointers, allocate device memory etc.)
/// size_t input_size; // e.g., 8
/// short * input; // e.g., [1, 2, 3, 4, 5, 6, 7, 8]
/// int * output; // empty array of 8 elements
///
/// // perform transform
/// rocprim::transform(
/// input, output, input_size, transform_op
/// );
/// // output: [6, 7, 8, 9, 10, 11, 12, 13]
/// \endcode
/// \endparblock
template<
    class Config = default_config,
    class InputIterator,
    class OutputIterator,
    class UnaryFunction
>
inline
cudaError_t transform(InputIterator input,
                      OutputIterator output,
                      const size_t size,
                      UnaryFunction transform_op,
                      const cudaStream_t stream = 0,
                      bool debug_synchronous = false)
{
    // An empty range needs no kernel launch.
    if(size == size_t(0))
    {
        return cudaSuccess;
    }

    using input_type = typename std::iterator_traits<InputIterator>::value_type;
    using result_type
        = typename ::rocprim::detail::invoke_result<UnaryFunction, input_type>::type;

    // Get default config if Config is default_config
    using config = detail::default_or_custom_config<
        Config,
        detail::default_transform_config<ROCPRIM_TARGET_ARCH, result_type>
    >;

    static constexpr unsigned int block_size = config::block_size;
    static constexpr unsigned int items_per_thread = config::items_per_thread;
    static constexpr auto items_per_block = block_size * items_per_thread;
    static constexpr auto size_limit = config::size_limit;

    // Largest number of blocks a single launch may use (at least one) and the
    // number of items such a launch covers.
    static constexpr auto number_of_blocks_limit
        = ::rocprim::max<size_t>(size_limit / items_per_block, 1);
    static constexpr auto aligned_size_limit = number_of_blocks_limit * items_per_block;

    // Total blocks over all launches; only reported in the debug output below.
    const auto number_of_blocks = (size + items_per_block - 1) / items_per_block;

    // The input is processed in chunks of at most aligned_size_limit items,
    // one kernel launch per chunk.
    const auto number_of_launch = (size + aligned_size_limit - 1) / aligned_size_limit;

    // Start point for time measurements (only set when debug_synchronous is true)
    std::chrono::high_resolution_clock::time_point launch_start;

    if(debug_synchronous)
    {
        std::cout << "block_size " << block_size << '\n';
        std::cout << "number of blocks " << number_of_blocks << '\n';
        std::cout << "number of blocks limit " << number_of_blocks_limit << '\n';
        std::cout << "items_per_block " << items_per_block << '\n';
    }

    size_t offset = 0;
    for(size_t launch = 0; launch < number_of_launch; ++launch)
    {
        // Every chunk is full-sized except possibly the last one.
        const auto current_size = std::min(size - offset, aligned_size_limit);
        const auto current_blocks = (current_size + items_per_block - 1) / items_per_block;

        if(debug_synchronous)
        {
            launch_start = std::chrono::high_resolution_clock::now();
        }

        detail::transform_kernel<
            block_size, items_per_thread, result_type,
            InputIterator, OutputIterator, UnaryFunction
        >
        <<<dim3(current_blocks), dim3(block_size), 0, stream>>>(
            input + offset, current_size, output + offset, transform_op
        );
        // Returns from this function on a launch (or, in debug mode, synchronization) error.
        ROCPRIM_DETAIL_HIP_SYNC_AND_RETURN_ON_ERROR("transform_kernel", current_size, launch_start);

        offset += aligned_size_limit;
    }

    return cudaSuccess;
}
/// \brief Parallel device-level transform primitive for two inputs.
///
/// transform function performs a device-wide transformation operation
/// on two input ranges using binary \p transform_op operator.
///
/// \par Overview
/// * Ranges specified by \p input1, \p input2, and \p output must have at least \p size elements.
///
/// \tparam Config - [optional] configuration of the primitive. It can be \p transform_config or
/// a custom class with the same members.
/// \tparam InputIterator1 - random-access iterator type of the input range. Must meet the
/// requirements of a C++ InputIterator concept. It can be a simple pointer type.
/// \tparam InputIterator2 - random-access iterator type of the input range. Must meet the
/// requirements of a C++ InputIterator concept. It can be a simple pointer type.
/// \tparam OutputIterator - random-access iterator type of the output range. Must meet the
/// requirements of a C++ OutputIterator concept. It can be a simple pointer type.
/// \tparam BinaryFunction - type of binary function used for transform.
///
/// \param [in] input1 - iterator to the first element in the 1st range to transform.
/// \param [in] input2 - iterator to the first element in the 2nd range to transform.
/// \param [out] output - iterator to the first element in the output range.
/// \param [in] size - number of elements in the input range.
/// \param [in] transform_op - binary operation function object that will be used for transform.
/// The signature of the function should be equivalent to the following:
/// <tt>U f(const T1& a, const T2& b);</tt>. The signature does not need to have
/// <tt>const &</tt>, but function object must not modify the object passed to it.
/// \param [in] stream - [optional] HIP stream object. The default is \p 0 (default stream).
/// \param [in] debug_synchronous - [optional] If true, synchronization after every kernel
/// launch is forced. Default value is \p false.
///
/// \par Example
/// \parblock
/// In this example a device-level transform operation is performed on two arrays of
/// integer values (element-wise sum is performed).
///
/// \code{.cpp}
/// #include <rocprim/rocprim.hpp>
///
/// // custom transform function
/// auto transform_op =
/// [] __device__ (int a, int b) -> int
/// {
/// return a + b;
/// };
///
/// // Prepare input and output (declare pointers, allocate device memory etc.)
/// size_t size; // e.g., 8
/// int* input1; // e.g., [1, 2, 3, 4, 5, 6, 7, 8]
/// int* input2; // e.g., [1, 2, 3, 4, 5, 6, 7, 8]
/// int* output; // empty array of 8 elements
///
/// // perform transform
/// rocprim::transform(
/// input1, input2, output, size, transform_op
/// );
/// // output: [2, 4, 6, 8, 10, 12, 14, 16]
/// \endcode
/// \endparblock
template<
    class Config = default_config,
    class InputIterator1,
    class InputIterator2,
    class OutputIterator,
    class BinaryFunction
>
inline
cudaError_t transform(InputIterator1 input1,
                      InputIterator2 input2,
                      OutputIterator output,
                      const size_t size,
                      BinaryFunction transform_op,
                      const cudaStream_t stream = 0,
                      bool debug_synchronous = false)
{
    using value_type1 = typename std::iterator_traits<InputIterator1>::value_type;
    using value_type2 = typename std::iterator_traits<InputIterator2>::value_type;

    // Reduce the two-input case to the single-input overload: both inputs are
    // zipped into one iterator over tuples, and the binary operator is wrapped by
    // unpack_binary_op so that it can be invoked on such a tuple.
    return transform<Config>(
        ::rocprim::make_zip_iterator(::rocprim::make_tuple(input1, input2)),
        output,
        size,
        detail::unpack_binary_op<value_type1, value_type2, BinaryFunction>(transform_op),
        stream,
        debug_synchronous
    );
}
#undef ROCPRIM_DETAIL_HIP_SYNC_AND_RETURN_ON_ERROR
/// @}
// end of group devicemodule
END_ROCPRIM_NAMESPACE
#endif // ROCPRIM_DEVICE_DEVICE_TRANSFORM_HPP_
3rdparty/cub/rocprim/device/device_transform_config.hpp
0 → 100644
View file @
f8a481f8
// Copyright (c) 2018-2021 Advanced Micro Devices, Inc. All rights reserved.
//
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to deal
// in the Software without restriction, including without limitation the rights
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
// copies of the Software, and to permit persons to whom the Software is
// furnished to do so, subject to the following conditions:
//
// The above copyright notice and this permission notice shall be included in
// all copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
// THE SOFTWARE.
#ifndef ROCPRIM_DEVICE_DEVICE_TRANSFORM_CONFIG_HPP_
#define ROCPRIM_DEVICE_DEVICE_TRANSFORM_CONFIG_HPP_
#include <type_traits>
#include "../config.hpp"
#include "../functional.hpp"
#include "../detail/various.hpp"
#include "config_types.hpp"
/// \addtogroup primitivesmodule_deviceconfigs
/// @{
BEGIN_ROCPRIM_NAMESPACE
/// \brief Configuration of device-level transform primitives.
// Alias of kernel_config with the same three parameters; the size limit falls
// back to ROCPRIM_GRID_SIZE_LIMIT when it is not given.
template<
    unsigned int BlockSize,
    unsigned int ItemsPerThread,
    unsigned int SizeLimit = ROCPRIM_GRID_SIZE_LIMIT
>
using transform_config = kernel_config<BlockSize, ItemsPerThread, SizeLimit>;
namespace
detail
{
/// Transform tuning that default_transform_config associates with architecture case 803.
///
/// \tparam Value - type whose size determines the number of items per thread.
template<class Value>
struct transform_config_803
{
    /// sizeof(Value) expressed in units of sizeof(int), computed with ceiling_div.
    static constexpr unsigned int item_scale
        = ::rocprim::detail::ceiling_div<unsigned int>(sizeof(Value), sizeof(int));

    /// Block size 256 with 16 / item_scale items per thread (never fewer than 1).
    using type = transform_config<256, ::rocprim::max(1u, 16u / item_scale)>;
};
/// Transform tuning that default_transform_config associates with architecture case 900
/// (and also passes as its trailing, non-case argument).
///
/// \tparam Value - type whose size determines the number of items per thread.
template<class Value>
struct transform_config_900
{
    /// sizeof(Value) expressed in units of sizeof(int), computed with ceiling_div.
    static constexpr unsigned int item_scale
        = ::rocprim::detail::ceiling_div<unsigned int>(sizeof(Value), sizeof(int));

    /// Block size 256 with 16 / item_scale items per thread (never fewer than 1).
    using type = transform_config<256, ::rocprim::max(1u, 16u / item_scale)>;
};
/// Transform tuning that default_transform_config associates with architecture case
/// ROCPRIM_ARCH_90a.
///
/// \tparam Value - type whose size determines the number of items per thread.
template<class Value>
struct transform_config_90a
{
    /// sizeof(Value) expressed in units of sizeof(int), computed with ceiling_div.
    static constexpr unsigned int item_scale
        = ::rocprim::detail::ceiling_div<unsigned int>(sizeof(Value), sizeof(int));

    /// Block size 256 with 16 / item_scale items per thread (never fewer than 1).
    using type = transform_config<256, ::rocprim::max(1u, 16u / item_scale)>;
};
/// Transform tuning that default_transform_config associates with architecture case 1030.
///
/// \tparam Value - type whose size determines the number of items per thread.
template<class Value>
struct transform_config_1030
{
    /// sizeof(Value) expressed in units of sizeof(int), computed with ceiling_div.
    static constexpr unsigned int item_scale
        = ::rocprim::detail::ceiling_div<unsigned int>(sizeof(Value), sizeof(int));

    /// Block size 256 with 16 / item_scale items per thread (never fewer than 1).
    using type = transform_config<256, ::rocprim::max(1u, 16u / item_scale)>;
};
/// Maps a target architecture to one of the transform_config_* tunings above by
/// deriving from select_arch.
///
/// \tparam TargetArch - architecture identifier matched against the listed cases.
/// \tparam Value - type forwarded to the chosen transform_config_* template.
///
/// NOTE(review): the trailing transform_config_900<Value> argument is presumably the
/// choice when no case matches; select_arch is defined elsewhere - confirm there.
template<unsigned int TargetArch, class Value>
struct default_transform_config
    : select_arch<
        TargetArch,
        select_arch_case<803, transform_config_803<Value>>,
        select_arch_case<900, transform_config_900<Value>>,
        select_arch_case<ROCPRIM_ARCH_90a, transform_config_90a<Value>>,
        select_arch_case<1030, transform_config_1030<Value>>,
        transform_config_900<Value>
    >
{
};
}
// end namespace detail
END_ROCPRIM_NAMESPACE
/// @}
// end of group primitivesmodule_deviceconfigs
#endif // ROCPRIM_DEVICE_DEVICE_TRANSFORM_CONFIG_HPP_
3rdparty/cub/rocprim/device/specialization/device_radix_merge_sort.hpp
0 → 100644
View file @
f8a481f8
// Copyright (c) 2021 Advanced Micro Devices, Inc. All rights reserved.
//
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to deal
// in the Software without restriction, including without limitation the rights
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
// copies of the Software, and to permit persons to whom the Software is
// furnished to do so, subject to the following conditions:
//
// The above copyright notice and this permission notice shall be included in
// all copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
// THE SOFTWARE.
#ifndef ROCPRIM_DEVICE_SPECIALIZATION_DEVICE_RADIX_MERGE_SORT_HPP_
#define ROCPRIM_DEVICE_SPECIALIZATION_DEVICE_RADIX_MERGE_SORT_HPP_
#include "../detail/device_radix_sort.hpp"
#include "../specialization/device_radix_single_sort.hpp"
BEGIN_ROCPRIM_NAMESPACE
namespace
detail
{
/// Kernel entry point for one merge pass of radix_sort_merge: it forwards all of
/// its arguments unchanged to radix_block_merge_impl.
///
/// \tparam BlockSize - value passed to __launch_bounds__ and to the implementation.
/// \tparam ItemsPerThread - forwarded to the implementation.
///
/// \param keys_input, values_input - source ranges of this pass.
/// \param keys_output, values_output - destination ranges of this pass.
/// \param input_size - number of items.
/// \param merge_items_per_block_size - run length supplied by the caller's merge loop.
/// \param compare_function - comparison function object handed to the implementation.
template<
    unsigned int BlockSize,
    unsigned int ItemsPerThread,
    class KeysInputIterator,
    class KeysOutputIterator,
    class ValuesInputIterator,
    class ValuesOutputIterator,
    class BinaryFunction
>
ROCPRIM_KERNEL
__launch_bounds__(BlockSize)
void radix_block_merge_kernel(KeysInputIterator keys_input,
                              KeysOutputIterator keys_output,
                              ValuesInputIterator values_input,
                              ValuesOutputIterator values_output,
                              const size_t input_size,
                              const unsigned int merge_items_per_block_size,
                              BinaryFunction compare_function)
{
    radix_block_merge_impl<BlockSize, ItemsPerThread>(
        keys_input, keys_output,
        values_input, values_output,
        input_size, merge_items_per_block_size,
        compare_function
    );
}
/// Sorts \p size keys (and, unless the value type is ::rocprim::empty_type, their
/// values) by first running sort_single_kernel over all blocks and then repeatedly
/// launching radix_block_merge_kernel with a doubling run length, ping-ponging
/// between the temporary buffers and the output ranges. If the last pass leaves the
/// data in the temporary buffers, it is copied to the outputs with ::rocprim::transform.
///
/// \tparam Config - provides Config::sort_merge::block_size and ::items_per_thread.
/// \tparam Descending - forwarded to the kernels / comparison object.
///
/// \param keys_input, values_input - ranges to sort.
/// \param keys_buffer, values_buffer - temporary storage written by the first kernel
/// and by every second merge pass.
/// \param keys_output, values_output - ranges that receive the final result.
/// \param size - number of items.
/// \param bit, end_bit - bit range [bit, end_bit) taken into account; when it spans
/// all bits of the key type, the comparison object is built without a bit range.
/// \param stream - stream on which all kernels are launched.
/// \param debug_synchronous - if true, configuration and per-kernel timings are
/// printed and the stream is synchronized after every launch.
///
/// \return cudaSuccess, or the first error reported by a launch, a synchronization
/// or the final copy.
template<
    class Config,
    bool Descending,
    class KeysInputIterator,
    class KeysOutputIterator,
    class ValuesInputIterator,
    class ValuesOutputIterator
>
inline
cudaError_t radix_sort_merge(KeysInputIterator keys_input,
                             typename std::iterator_traits<KeysInputIterator>::value_type * keys_buffer,
                             KeysOutputIterator keys_output,
                             ValuesInputIterator values_input,
                             typename std::iterator_traits<ValuesInputIterator>::value_type * values_buffer,
                             ValuesOutputIterator values_output,
                             unsigned int size,
                             unsigned int bit,
                             unsigned int end_bit,
                             cudaStream_t stream,
                             bool debug_synchronous)
{
    using key_type = typename std::iterator_traits<KeysInputIterator>::value_type;
    using value_type = typename std::iterator_traits<ValuesInputIterator>::value_type;

    constexpr bool with_values = !std::is_same<value_type, ::rocprim::empty_type>::value;

    constexpr unsigned int items_per_thread = Config::sort_merge::items_per_thread;
    constexpr unsigned int block_size = Config::sort_merge::block_size;
    constexpr unsigned int items_per_block = block_size * items_per_thread;

    // Nothing to sort; also avoids launching a kernel with an empty grid
    // (transform() in this library treats an empty range the same way).
    if(size == 0u)
    {
        return cudaSuccess;
    }

    const unsigned int current_radix_bits = end_bit - bit;

    // Ceiling division written so that it cannot wrap for sizes close to UINT_MAX
    // (size + items_per_block - 1 would).
    const unsigned int number_of_blocks
        = size / items_per_block + (size % items_per_block != 0u ? 1u : 0u);

    std::chrono::high_resolution_clock::time_point start;

    if(debug_synchronous)
    {
        std::cout << "block size " << block_size << '\n';
        std::cout << "items per thread " << items_per_thread << '\n';
        std::cout << "number of blocks " << number_of_blocks << '\n';
        std::cout << "bit " << bit << '\n';
        std::cout << "current_radix_bits " << current_radix_bits << '\n';
    }

    // First pass: input -> temporary buffers.
    if(debug_synchronous) start = std::chrono::high_resolution_clock::now();
    sort_single_kernel<block_size, items_per_thread, Descending>
        <<<dim3(number_of_blocks), dim3(block_size), 0, stream>>>(
            keys_input, keys_buffer, values_input, values_buffer,
            size, bit, current_radix_bits
        );
    ROCPRIM_DETAIL_HIP_SYNC_AND_RETURN_ON_ERROR("radix_sort_single", size, start)

    // The full key is compared unless only a sub-range of its bits is being sorted.
    const bool compare_whole_key = (current_radix_bits == sizeof(key_type) * 8);

    // true  <=> the most recent pass wrote into the temporary buffers.
    bool temporary_store = true;

    // The run length is tracked in 64 bits: doubling an unsigned int wraps around
    // (eventually to 0) once it exceeds 2^31, which would keep the condition true
    // forever for sizes above that. Every value that enters the loop body is
    // smaller than size, so narrowing it back to unsigned int is lossless.
    for(unsigned long long run_length = items_per_block; run_length < size; run_length *= 2)
    {
        const unsigned int block = static_cast<unsigned int>(run_length);

        temporary_store = !temporary_store;

        if(debug_synchronous) start = std::chrono::high_resolution_clock::now();

        if(temporary_store)
        {
            // outputs -> temporary buffers
            if(compare_whole_key)
            {
                radix_block_merge_kernel<block_size, items_per_thread>
                    <<<dim3(number_of_blocks), dim3(block_size), 0, stream>>>(
                        keys_output, keys_buffer, values_output, values_buffer,
                        size, block,
                        radix_merge_compare<Descending, false, key_type>()
                    );
            }
            else
            {
                radix_block_merge_kernel<block_size, items_per_thread>
                    <<<dim3(number_of_blocks), dim3(block_size), 0, stream>>>(
                        keys_output, keys_buffer, values_output, values_buffer,
                        size, block,
                        radix_merge_compare<Descending, true, key_type>(bit, current_radix_bits)
                    );
            }
        }
        else
        {
            // temporary buffers -> outputs
            if(compare_whole_key)
            {
                radix_block_merge_kernel<block_size, items_per_thread>
                    <<<dim3(number_of_blocks), dim3(block_size), 0, stream>>>(
                        keys_buffer, keys_output, values_buffer, values_output,
                        size, block,
                        radix_merge_compare<Descending, false, key_type>()
                    );
            }
            else
            {
                radix_block_merge_kernel<block_size, items_per_thread>
                    <<<dim3(number_of_blocks), dim3(block_size), 0, stream>>>(
                        keys_buffer, keys_output, values_buffer, values_output,
                        size, block,
                        radix_merge_compare<Descending, true, key_type>(bit, current_radix_bits)
                    );
            }
        }
        ROCPRIM_DETAIL_HIP_SYNC_AND_RETURN_ON_ERROR("radix_block_merge_kernel", size, start);
    }

    // The result ended up in the temporary buffers: copy it to the outputs.
    if(temporary_store)
    {
        cudaError_t error = ::rocprim::transform(
            keys_buffer, keys_output, size,
            ::rocprim::identity<key_type>(), stream, debug_synchronous
        );
        if(error != cudaSuccess) return error;

        if(with_values)
        {
            error = ::rocprim::transform(
                values_buffer, values_output, size,
                ::rocprim::identity<value_type>(), stream, debug_synchronous
            );
            if(error != cudaSuccess) return error;
        }
    }

    return cudaSuccess;
}
}
// end namespace detail
END_ROCPRIM_NAMESPACE
#endif // ROCPRIM_DEVICE_SPECIALIZATION_DEVICE_RADIX_MERGE_SORT_HPP_
3rdparty/cub/rocprim/device/specialization/device_radix_single_sort.hpp
0 → 100644
View file @
f8a481f8
// Copyright (c) 2021 Advanced Micro Devices, Inc. All rights reserved.
//
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to deal
// in the Software without restriction, including without limitation the rights
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
// copies of the Software, and to permit persons to whom the Software is
// furnished to do so, subject to the following conditions:
//
// The above copyright notice and this permission notice shall be included in
// all copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
// THE SOFTWARE.
#ifndef ROCPRIM_DEVICE_SPECIALIZATION_DEVICE_RADIX_SINGLE_SORT_HPP_
#define ROCPRIM_DEVICE_SPECIALIZATION_DEVICE_RADIX_SINGLE_SORT_HPP_
#include "../detail/device_radix_sort.hpp"
BEGIN_ROCPRIM_NAMESPACE
namespace
detail
{
// Post-launch check used by the radix sort dispatch functions.
//
// Expands to a brace-enclosed block (call sites in this library use it both with
// and without a trailing semicolon, so it must stay a plain block) that:
//   1. reads cudaGetLastError() and returns it from the ENCLOSING function when it
//      is not cudaSuccess;
//   2. if `debug_synchronous` is true, prints `name` and `size`, synchronizes
//      `stream`, returns the synchronization error (if any) from the enclosing
//      function, and finally prints the time elapsed since `start` in milliseconds.
//
// The macro relies on `debug_synchronous` and `stream` being visible at the point of
// use; `start` must be a std::chrono::high_resolution_clock time point.
// The synchronization result reuses `_error`: a separate `__error` variable would
// use an identifier reserved for the implementation (double underscore).
#define ROCPRIM_DETAIL_HIP_SYNC_AND_RETURN_ON_ERROR(name, size, start) \
    { \
        auto _error = cudaGetLastError(); \
        if(_error != cudaSuccess) return _error; \
        if(debug_synchronous) \
        { \
            std::cout << name << "(" << size << ")"; \
            _error = cudaStreamSynchronize(stream); \
            if(_error != cudaSuccess) return _error; \
            auto _end = std::chrono::high_resolution_clock::now(); \
            auto _d = std::chrono::duration_cast<std::chrono::duration<double>>(_end - start); \
            std::cout << " " << _d.count() * 1000 << " ms" << '\n'; \
        } \
    }
/// Kernel entry point that forwards all of its arguments unchanged to sort_single
/// with the same compile-time parameters.
///
/// \tparam BlockSize - value passed to __launch_bounds__ and to sort_single.
/// \tparam ItemsPerThread - forwarded to sort_single.
/// \tparam Descending - forwarded to sort_single.
///
/// \param keys_input, values_input - source ranges.
/// \param keys_output, values_output - destination ranges.
/// \param size - number of items.
/// \param bit - first bit of the key range supplied by the caller.
/// \param current_radix_bits - number of bits supplied by the caller (end_bit - bit
/// at the call sites in this library).
template<
    unsigned int BlockSize,
    unsigned int ItemsPerThread,
    bool Descending,
    class KeysInputIterator,
    class KeysOutputIterator,
    class ValuesInputIterator,
    class ValuesOutputIterator
>
ROCPRIM_KERNEL
__launch_bounds__(BlockSize)
void sort_single_kernel(KeysInputIterator keys_input,
                        KeysOutputIterator keys_output,
                        ValuesInputIterator values_input,
                        ValuesOutputIterator values_output,
                        unsigned int size,
                        unsigned int bit,
                        unsigned int current_radix_bits)
{
    sort_single<BlockSize, ItemsPerThread, Descending>(
        keys_input, keys_output,
        values_input, values_output,
        size, bit, current_radix_bits
    );
}
/// Launches sort_single_kernel once, with a grid of exactly one block of
/// \p BlockSize threads, on \p stream.
///
/// \param keys_input, values_input - source ranges.
/// \param keys_output, values_output - destination ranges.
/// \param size - number of items.
/// \param bit, end_bit - bit range; the kernel receives \p bit and end_bit - bit.
/// \param stream - stream used for the launch.
/// \param debug_synchronous - if true, the configuration and the kernel time are
/// printed and the stream is synchronized after the launch.
///
/// \return cudaSuccess, or the error reported after the launch / synchronization.
template<
    unsigned int BlockSize,
    unsigned int ItemsPerThread,
    bool Descending,
    class KeysInputIterator,
    class KeysOutputIterator,
    class ValuesInputIterator,
    class ValuesOutputIterator
>
inline
cudaError_t radix_sort_single(KeysInputIterator keys_input,
                              KeysOutputIterator keys_output,
                              ValuesInputIterator values_input,
                              ValuesOutputIterator values_output,
                              unsigned int size,
                              unsigned int bit,
                              unsigned int end_bit,
                              cudaStream_t stream,
                              bool debug_synchronous)
{
    const unsigned int current_radix_bits = end_bit - bit;

    // Only assigned (and only read by the macro below) in debug mode.
    std::chrono::high_resolution_clock::time_point start;

    if(debug_synchronous)
    {
        std::cout << "BlockSize " << BlockSize << '\n'
                  << "ItemsPerThread " << ItemsPerThread << '\n'
                  << "bit " << bit << '\n'
                  << "current_radix_bits " << current_radix_bits << '\n';
        start = std::chrono::high_resolution_clock::now();
    }

    sort_single_kernel<BlockSize, ItemsPerThread, Descending>
        <<<dim3(1), dim3(BlockSize), 0, stream>>>(
            keys_input, keys_output,
            values_input, values_output,
            size, bit, current_radix_bits
        );
    // Returns from this function on a launch (or, in debug mode, synchronization) error.
    ROCPRIM_DETAIL_HIP_SYNC_AND_RETURN_ON_ERROR("radix_sort_single", size, start)

    return cudaSuccess;
}
/// Last link of the radix_sort_single_limit* chain: always sorts with
/// radix_sort_single using 64 threads and 1 item per thread.
/// \p Config is accepted for symmetry with the other links but is not consulted.
template<
    class Config,
    bool Descending,
    class KeysInputIterator,
    class KeysOutputIterator,
    class ValuesInputIterator,
    class ValuesOutputIterator
>
inline
cudaError_t radix_sort_single_limit64(KeysInputIterator keys_input,
                                      KeysOutputIterator keys_output,
                                      ValuesInputIterator values_input,
                                      ValuesOutputIterator values_output,
                                      unsigned int size,
                                      unsigned int bit,
                                      unsigned int end_bit,
                                      cudaStream_t stream,
                                      bool debug_synchronous)
{
    return radix_sort_single<64U, 1U, Descending>(
        keys_input, keys_output,
        values_input, values_output,
        size, bit, end_bit,
        stream, debug_synchronous
    );
}
/// Sorts with radix_sort_single using 64 threads and 2 items per thread, unless the
/// input has at most 64 items and Config::force_single_kernel_config is false, in
/// which case the work is delegated to radix_sort_single_limit64.
template<
    class Config,
    bool Descending,
    class KeysInputIterator,
    class KeysOutputIterator,
    class ValuesInputIterator,
    class ValuesOutputIterator
>
inline
cudaError_t radix_sort_single_limit128(KeysInputIterator keys_input,
                                       KeysOutputIterator keys_output,
                                       ValuesInputIterator values_input,
                                       ValuesOutputIterator values_output,
                                       unsigned int size,
                                       unsigned int bit,
                                       unsigned int end_bit,
                                       cudaStream_t stream,
                                       bool debug_synchronous)
{
    // Stay on this configuration when it is forced or the smaller one is too small.
    if(Config::force_single_kernel_config || size > 64U)
    {
        return radix_sort_single<64U, 2U, Descending>(
            keys_input, keys_output,
            values_input, values_output,
            size, bit, end_bit,
            stream, debug_synchronous
        );
    }

    return radix_sort_single_limit64<Config, Descending>(
        keys_input, keys_output,
        values_input, values_output,
        size, bit, end_bit,
        stream, debug_synchronous
    );
}
/// Sorts with radix_sort_single using 64 threads and 3 items per thread, unless the
/// input has at most 128 items and Config::force_single_kernel_config is false, in
/// which case the work is delegated to radix_sort_single_limit128.
template<
    class Config,
    bool Descending,
    class KeysInputIterator,
    class KeysOutputIterator,
    class ValuesInputIterator,
    class ValuesOutputIterator
>
inline
cudaError_t radix_sort_single_limit192(KeysInputIterator keys_input,
                                       KeysOutputIterator keys_output,
                                       ValuesInputIterator values_input,
                                       ValuesOutputIterator values_output,
                                       unsigned int size,
                                       unsigned int bit,
                                       unsigned int end_bit,
                                       cudaStream_t stream,
                                       bool debug_synchronous)
{
    // Stay on this configuration when it is forced or the smaller one is too small.
    if(Config::force_single_kernel_config || size > 128U)
    {
        return radix_sort_single<64U, 3U, Descending>(
            keys_input, keys_output,
            values_input, values_output,
            size, bit, end_bit,
            stream, debug_synchronous
        );
    }

    return radix_sort_single_limit128<Config, Descending>(
        keys_input, keys_output,
        values_input, values_output,
        size, bit, end_bit,
        stream, debug_synchronous
    );
}
/// Sorts with radix_sort_single using 64 threads and 4 items per thread, unless the
/// input has at most 192 items and Config::force_single_kernel_config is false, in
/// which case the work is delegated to radix_sort_single_limit192.
template<
    class Config,
    bool Descending,
    class KeysInputIterator,
    class KeysOutputIterator,
    class ValuesInputIterator,
    class ValuesOutputIterator
>
inline
cudaError_t radix_sort_single_limit256(KeysInputIterator keys_input,
                                       KeysOutputIterator keys_output,
                                       ValuesInputIterator values_input,
                                       ValuesOutputIterator values_output,
                                       unsigned int size,
                                       unsigned int bit,
                                       unsigned int end_bit,
                                       cudaStream_t stream,
                                       bool debug_synchronous)
{
    // Stay on this configuration when it is forced or the smaller one is too small.
    if(Config::force_single_kernel_config || size > 192U)
    {
        return radix_sort_single<64U, 4U, Descending>(
            keys_input, keys_output,
            values_input, values_output,
            size, bit, end_bit,
            stream, debug_synchronous
        );
    }

    return radix_sort_single_limit192<Config, Descending>(
        keys_input, keys_output,
        values_input, values_output,
        size, bit, end_bit,
        stream, debug_synchronous
    );
}
/// Size-based dispatcher for the single-kernel radix sort (320-item tier, per its name).
///
/// If Config::force_single_kernel_config is set, or \p size is greater than 256, the sort is
/// carried out by radix_sort_single<64U, 5U, Descending> (64 * 5 = 320). Otherwise the call
/// is delegated to radix_sort_single_limit256. All arguments are forwarded unchanged and the
/// callee's cudaError_t is returned.
template<class Config,
         bool Descending,
         class KeysInputIterator,
         class KeysOutputIterator,
         class ValuesInputIterator,
         class ValuesOutputIterator>
inline cudaError_t radix_sort_single_limit320(
    KeysInputIterator keys_input, KeysOutputIterator keys_output,
    ValuesInputIterator values_input, ValuesOutputIterator values_output,
    unsigned int size, unsigned int bit, unsigned int end_bit,
    cudaStream_t stream, bool debug_synchronous)
{
    // Forced single-kernel configuration, or too many items for the next smaller tier.
    if(Config::force_single_kernel_config || size > 256U)
    {
        return radix_sort_single<64U, 5U, Descending>(
            keys_input, keys_output, values_input, values_output,
            size, bit, end_bit, stream, debug_synchronous);
    }
    return radix_sort_single_limit256<Config, Descending>(
        keys_input, keys_output, values_input, values_output,
        size, bit, end_bit, stream, debug_synchronous);
}
/// Size-based dispatcher for the single-kernel radix sort (512-item tier, per its name).
///
/// If Config::force_single_kernel_config is set, or \p size is greater than 320, the sort is
/// carried out by radix_sort_single<256U, 2U, Descending> (256 * 2 = 512). Otherwise the call
/// is delegated to radix_sort_single_limit320. All arguments are forwarded unchanged and the
/// callee's cudaError_t is returned.
template<class Config,
         bool Descending,
         class KeysInputIterator,
         class KeysOutputIterator,
         class ValuesInputIterator,
         class ValuesOutputIterator>
inline cudaError_t radix_sort_single_limit512(
    KeysInputIterator keys_input, KeysOutputIterator keys_output,
    ValuesInputIterator values_input, ValuesOutputIterator values_output,
    unsigned int size, unsigned int bit, unsigned int end_bit,
    cudaStream_t stream, bool debug_synchronous)
{
    // Forced single-kernel configuration, or too many items for the next smaller tier.
    if(Config::force_single_kernel_config || size > 320U)
    {
        return radix_sort_single<256U, 2U, Descending>(
            keys_input, keys_output, values_input, values_output,
            size, bit, end_bit, stream, debug_synchronous);
    }
    return radix_sort_single_limit320<Config, Descending>(
        keys_input, keys_output, values_input, values_output,
        size, bit, end_bit, stream, debug_synchronous);
}
/// Size-based dispatcher for the single-kernel radix sort (768-item tier, per its name).
///
/// If Config::force_single_kernel_config is set, or \p size is greater than 512, the sort is
/// carried out by radix_sort_single<256U, 3U, Descending> (256 * 3 = 768). Otherwise the call
/// is delegated to radix_sort_single_limit512. All arguments are forwarded unchanged and the
/// callee's cudaError_t is returned.
template<class Config,
         bool Descending,
         class KeysInputIterator,
         class KeysOutputIterator,
         class ValuesInputIterator,
         class ValuesOutputIterator>
inline cudaError_t radix_sort_single_limit768(
    KeysInputIterator keys_input, KeysOutputIterator keys_output,
    ValuesInputIterator values_input, ValuesOutputIterator values_output,
    unsigned int size, unsigned int bit, unsigned int end_bit,
    cudaStream_t stream, bool debug_synchronous)
{
    // Forced single-kernel configuration, or too many items for the next smaller tier.
    if(Config::force_single_kernel_config || size > 512U)
    {
        return radix_sort_single<256U, 3U, Descending>(
            keys_input, keys_output, values_input, values_output,
            size, bit, end_bit, stream, debug_synchronous);
    }
    return radix_sort_single_limit512<Config, Descending>(
        keys_input, keys_output, values_input, values_output,
        size, bit, end_bit, stream, debug_synchronous);
}
/// Size-based dispatcher for the single-kernel radix sort (1024-item tier, per its name).
///
/// If Config::force_single_kernel_config is set, or \p size is greater than 768, the sort is
/// carried out by radix_sort_single<256U, 4U, Descending> (256 * 4 = 1024). Otherwise the
/// call is delegated to radix_sort_single_limit768. All arguments are forwarded unchanged and
/// the callee's cudaError_t is returned.
template<class Config,
         bool Descending,
         class KeysInputIterator,
         class KeysOutputIterator,
         class ValuesInputIterator,
         class ValuesOutputIterator>
inline cudaError_t radix_sort_single_limit1024(
    KeysInputIterator keys_input, KeysOutputIterator keys_output,
    ValuesInputIterator values_input, ValuesOutputIterator values_output,
    unsigned int size, unsigned int bit, unsigned int end_bit,
    cudaStream_t stream, bool debug_synchronous)
{
    // Forced single-kernel configuration, or too many items for the next smaller tier.
    if(Config::force_single_kernel_config || size > 768U)
    {
        return radix_sort_single<256U, 4U, Descending>(
            keys_input, keys_output, values_input, values_output,
            size, bit, end_bit, stream, debug_synchronous);
    }
    return radix_sort_single_limit768<Config, Descending>(
        keys_input, keys_output, values_input, values_output,
        size, bit, end_bit, stream, debug_synchronous);
}
/// Size-based dispatcher for the single-kernel radix sort (1536-item tier, per its name).
///
/// If Config::force_single_kernel_config is set, or \p size is greater than 1024, the sort is
/// carried out by radix_sort_single<256U, 6U, Descending> (256 * 6 = 1536). Otherwise the
/// call is delegated to radix_sort_single_limit1024. All arguments are forwarded unchanged and
/// the callee's cudaError_t is returned.
template<class Config,
         bool Descending,
         class KeysInputIterator,
         class KeysOutputIterator,
         class ValuesInputIterator,
         class ValuesOutputIterator>
inline cudaError_t radix_sort_single_limit1536(
    KeysInputIterator keys_input, KeysOutputIterator keys_output,
    ValuesInputIterator values_input, ValuesOutputIterator values_output,
    unsigned int size, unsigned int bit, unsigned int end_bit,
    cudaStream_t stream, bool debug_synchronous)
{
    // Forced single-kernel configuration, or too many items for the next smaller tier.
    if(Config::force_single_kernel_config || size > 1024U)
    {
        return radix_sort_single<256U, 6U, Descending>(
            keys_input, keys_output, values_input, values_output,
            size, bit, end_bit, stream, debug_synchronous);
    }
    return radix_sort_single_limit1024<Config, Descending>(
        keys_input, keys_output, values_input, values_output,
        size, bit, end_bit, stream, debug_synchronous);
}
/// Size-based dispatcher for the single-kernel radix sort (2048-item tier, per its name).
///
/// If Config::force_single_kernel_config is set, or \p size is greater than 1536, the sort is
/// carried out by radix_sort_single<256U, 8U, Descending> (256 * 8 = 2048). Otherwise the
/// call is delegated to radix_sort_single_limit1536. All arguments are forwarded unchanged and
/// the callee's cudaError_t is returned.
template<class Config,
         bool Descending,
         class KeysInputIterator,
         class KeysOutputIterator,
         class ValuesInputIterator,
         class ValuesOutputIterator>
inline cudaError_t radix_sort_single_limit2048(
    KeysInputIterator keys_input, KeysOutputIterator keys_output,
    ValuesInputIterator values_input, ValuesOutputIterator values_output,
    unsigned int size, unsigned int bit, unsigned int end_bit,
    cudaStream_t stream, bool debug_synchronous)
{
    // Forced single-kernel configuration, or too many items for the next smaller tier.
    if(Config::force_single_kernel_config || size > 1536U)
    {
        return radix_sort_single<256U, 8U, Descending>(
            keys_input, keys_output, values_input, values_output,
            size, bit, end_bit, stream, debug_synchronous);
    }
    return radix_sort_single_limit1536<Config, Descending>(
        keys_input, keys_output, values_input, values_output,
        size, bit, end_bit, stream, debug_synchronous);
}
/// Size-based dispatcher for the single-kernel radix sort (2560-item tier, per its name).
///
/// If Config::force_single_kernel_config is set, or \p size is greater than 2048, the sort is
/// carried out by radix_sort_single<256U, 10U, Descending> (256 * 10 = 2560). Otherwise the
/// call is delegated to radix_sort_single_limit2048. All arguments are forwarded unchanged and
/// the callee's cudaError_t is returned.
template<class Config,
         bool Descending,
         class KeysInputIterator,
         class KeysOutputIterator,
         class ValuesInputIterator,
         class ValuesOutputIterator>
inline cudaError_t radix_sort_single_limit2560(
    KeysInputIterator keys_input, KeysOutputIterator keys_output,
    ValuesInputIterator values_input, ValuesOutputIterator values_output,
    unsigned int size, unsigned int bit, unsigned int end_bit,
    cudaStream_t stream, bool debug_synchronous)
{
    // Forced single-kernel configuration, or too many items for the next smaller tier.
    if(Config::force_single_kernel_config || size > 2048U)
    {
        return radix_sort_single<256U, 10U, Descending>(
            keys_input, keys_output, values_input, values_output,
            size, bit, end_bit, stream, debug_synchronous);
    }
    return radix_sort_single_limit2048<Config, Descending>(
        keys_input, keys_output, values_input, values_output,
        size, bit, end_bit, stream, debug_synchronous);
}
/// Size-based dispatcher for the single-kernel radix sort (3072-item tier, per its name).
///
/// If Config::force_single_kernel_config is set, or \p size is greater than 2560, the sort is
/// carried out by radix_sort_single<256U, 12U, Descending> (256 * 12 = 3072). Otherwise the
/// call is delegated to radix_sort_single_limit2560. All arguments are forwarded unchanged and
/// the callee's cudaError_t is returned.
template<class Config,
         bool Descending,
         class KeysInputIterator,
         class KeysOutputIterator,
         class ValuesInputIterator,
         class ValuesOutputIterator>
inline cudaError_t radix_sort_single_limit3072(
    KeysInputIterator keys_input, KeysOutputIterator keys_output,
    ValuesInputIterator values_input, ValuesOutputIterator values_output,
    unsigned int size, unsigned int bit, unsigned int end_bit,
    cudaStream_t stream, bool debug_synchronous)
{
    // Forced single-kernel configuration, or too many items for the next smaller tier.
    if(Config::force_single_kernel_config || size > 2560U)
    {
        return radix_sort_single<256U, 12U, Descending>(
            keys_input, keys_output, values_input, values_output,
            size, bit, end_bit, stream, debug_synchronous);
    }
    return radix_sort_single_limit2560<Config, Descending>(
        keys_input, keys_output, values_input, values_output,
        size, bit, end_bit, stream, debug_synchronous);
}
/// Size-based dispatcher for the single-kernel radix sort (3584-item tier, per its name).
///
/// If Config::force_single_kernel_config is set, or \p size is greater than 3072, the sort is
/// carried out by radix_sort_single<256U, 14U, Descending> (256 * 14 = 3584). Otherwise the
/// call is delegated to radix_sort_single_limit3072. All arguments are forwarded unchanged and
/// the callee's cudaError_t is returned.
template<class Config,
         bool Descending,
         class KeysInputIterator,
         class KeysOutputIterator,
         class ValuesInputIterator,
         class ValuesOutputIterator>
inline cudaError_t radix_sort_single_limit3584(
    KeysInputIterator keys_input, KeysOutputIterator keys_output,
    ValuesInputIterator values_input, ValuesOutputIterator values_output,
    unsigned int size, unsigned int bit, unsigned int end_bit,
    cudaStream_t stream, bool debug_synchronous)
{
    // Forced single-kernel configuration, or too many items for the next smaller tier.
    if(Config::force_single_kernel_config || size > 3072U)
    {
        return radix_sort_single<256U, 14U, Descending>(
            keys_input, keys_output, values_input, values_output,
            size, bit, end_bit, stream, debug_synchronous);
    }
    return radix_sort_single_limit3072<Config, Descending>(
        keys_input, keys_output, values_input, values_output,
        size, bit, end_bit, stream, debug_synchronous);
}
/// Size-based dispatcher for the single-kernel radix sort (4096-item tier, per its name).
///
/// If Config::force_single_kernel_config is set, or \p size is greater than 3584, the sort is
/// carried out by radix_sort_single<256U, 16U, Descending> (256 * 16 = 4096). Otherwise the
/// call is delegated to radix_sort_single_limit3584. All arguments are forwarded unchanged and
/// the callee's cudaError_t is returned.
template<class Config,
         bool Descending,
         class KeysInputIterator,
         class KeysOutputIterator,
         class ValuesInputIterator,
         class ValuesOutputIterator>
inline cudaError_t radix_sort_single_limit4096(
    KeysInputIterator keys_input, KeysOutputIterator keys_output,
    ValuesInputIterator values_input, ValuesOutputIterator values_output,
    unsigned int size, unsigned int bit, unsigned int end_bit,
    cudaStream_t stream, bool debug_synchronous)
{
    // Forced single-kernel configuration, or too many items for the next smaller tier.
    if(Config::force_single_kernel_config || size > 3584U)
    {
        return radix_sort_single<256U, 16U, Descending>(
            keys_input, keys_output, values_input, values_output,
            size, bit, end_bit, stream, debug_synchronous);
    }
    return radix_sort_single_limit3584<Config, Descending>(
        keys_input, keys_output, values_input, values_output,
        size, bit, end_bit, stream, debug_synchronous);
}
/// Config-driven entry point, enabled when
/// Config::sort_single::items_per_thread * Config::sort_single::block_size is at most 64.
/// Forwards every argument to radix_sort_single_limit64<Config, Descending> and returns
/// its status.
template<class Config,
         bool Descending,
         class KeysInputIterator,
         class KeysOutputIterator,
         class ValuesInputIterator,
         class ValuesOutputIterator>
inline auto radix_sort_single(
    KeysInputIterator keys_input, KeysOutputIterator keys_output,
    ValuesInputIterator values_input, ValuesOutputIterator values_output,
    unsigned int size, unsigned int bit, unsigned int end_bit,
    cudaStream_t stream, bool debug_synchronous)
    -> typename std::enable_if<
        Config::sort_single::items_per_thread * Config::sort_single::block_size <= 64U,
        cudaError_t>::type
{
    const cudaError_t status = radix_sort_single_limit64<Config, Descending>(
        keys_input, keys_output, values_input, values_output,
        size, bit, end_bit, stream, debug_synchronous);
    return status;
}
/// Config-driven entry point, enabled when
/// Config::sort_single::items_per_thread * Config::sort_single::block_size lies in (64, 128].
/// Forwards every argument to radix_sort_single_limit128<Config, Descending> and returns
/// its status.
template<class Config,
         bool Descending,
         class KeysInputIterator,
         class KeysOutputIterator,
         class ValuesInputIterator,
         class ValuesOutputIterator>
inline auto radix_sort_single(
    KeysInputIterator keys_input, KeysOutputIterator keys_output,
    ValuesInputIterator values_input, ValuesOutputIterator values_output,
    unsigned int size, unsigned int bit, unsigned int end_bit,
    cudaStream_t stream, bool debug_synchronous)
    -> typename std::enable_if<
        (Config::sort_single::items_per_thread * Config::sort_single::block_size > 64U)
            && Config::sort_single::items_per_thread * Config::sort_single::block_size <= 128U,
        cudaError_t>::type
{
    const cudaError_t status = radix_sort_single_limit128<Config, Descending>(
        keys_input, keys_output, values_input, values_output,
        size, bit, end_bit, stream, debug_synchronous);
    return status;
}
/// Config-driven entry point, enabled when
/// Config::sort_single::items_per_thread * Config::sort_single::block_size lies in (128, 192].
/// Forwards every argument to radix_sort_single_limit192<Config, Descending> and returns
/// its status.
template<class Config,
         bool Descending,
         class KeysInputIterator,
         class KeysOutputIterator,
         class ValuesInputIterator,
         class ValuesOutputIterator>
inline auto radix_sort_single(
    KeysInputIterator keys_input, KeysOutputIterator keys_output,
    ValuesInputIterator values_input, ValuesOutputIterator values_output,
    unsigned int size, unsigned int bit, unsigned int end_bit,
    cudaStream_t stream, bool debug_synchronous)
    -> typename std::enable_if<
        (Config::sort_single::items_per_thread * Config::sort_single::block_size > 128U)
            && Config::sort_single::items_per_thread * Config::sort_single::block_size <= 192U,
        cudaError_t>::type
{
    const cudaError_t status = radix_sort_single_limit192<Config, Descending>(
        keys_input, keys_output, values_input, values_output,
        size, bit, end_bit, stream, debug_synchronous);
    return status;
}
/// Config-driven entry point, enabled when
/// Config::sort_single::items_per_thread * Config::sort_single::block_size lies in (192, 256].
/// Forwards every argument to radix_sort_single_limit256<Config, Descending> and returns
/// its status.
template<class Config,
         bool Descending,
         class KeysInputIterator,
         class KeysOutputIterator,
         class ValuesInputIterator,
         class ValuesOutputIterator>
inline auto radix_sort_single(
    KeysInputIterator keys_input, KeysOutputIterator keys_output,
    ValuesInputIterator values_input, ValuesOutputIterator values_output,
    unsigned int size, unsigned int bit, unsigned int end_bit,
    cudaStream_t stream, bool debug_synchronous)
    -> typename std::enable_if<
        (Config::sort_single::items_per_thread * Config::sort_single::block_size > 192U)
            && Config::sort_single::items_per_thread * Config::sort_single::block_size <= 256U,
        cudaError_t>::type
{
    const cudaError_t status = radix_sort_single_limit256<Config, Descending>(
        keys_input, keys_output, values_input, values_output,
        size, bit, end_bit, stream, debug_synchronous);
    return status;
}
/// Config-driven entry point, enabled when
/// Config::sort_single::items_per_thread * Config::sort_single::block_size lies in (256, 320].
/// Forwards every argument to radix_sort_single_limit320<Config, Descending> and returns
/// its status.
template<class Config,
         bool Descending,
         class KeysInputIterator,
         class KeysOutputIterator,
         class ValuesInputIterator,
         class ValuesOutputIterator>
inline auto radix_sort_single(
    KeysInputIterator keys_input, KeysOutputIterator keys_output,
    ValuesInputIterator values_input, ValuesOutputIterator values_output,
    unsigned int size, unsigned int bit, unsigned int end_bit,
    cudaStream_t stream, bool debug_synchronous)
    -> typename std::enable_if<
        (Config::sort_single::items_per_thread * Config::sort_single::block_size > 256U)
            && Config::sort_single::items_per_thread * Config::sort_single::block_size <= 320U,
        cudaError_t>::type
{
    const cudaError_t status = radix_sort_single_limit320<Config, Descending>(
        keys_input, keys_output, values_input, values_output,
        size, bit, end_bit, stream, debug_synchronous);
    return status;
}
/// Config-driven entry point, enabled when
/// Config::sort_single::items_per_thread * Config::sort_single::block_size lies in (320, 512].
/// Forwards every argument to radix_sort_single_limit512<Config, Descending> and returns
/// its status.
template<class Config,
         bool Descending,
         class KeysInputIterator,
         class KeysOutputIterator,
         class ValuesInputIterator,
         class ValuesOutputIterator>
inline auto radix_sort_single(
    KeysInputIterator keys_input, KeysOutputIterator keys_output,
    ValuesInputIterator values_input, ValuesOutputIterator values_output,
    unsigned int size, unsigned int bit, unsigned int end_bit,
    cudaStream_t stream, bool debug_synchronous)
    -> typename std::enable_if<
        (Config::sort_single::items_per_thread * Config::sort_single::block_size > 320U)
            && Config::sort_single::items_per_thread * Config::sort_single::block_size <= 512U,
        cudaError_t>::type
{
    const cudaError_t status = radix_sort_single_limit512<Config, Descending>(
        keys_input, keys_output, values_input, values_output,
        size, bit, end_bit, stream, debug_synchronous);
    return status;
}
/// Config-driven entry point, enabled when
/// Config::sort_single::items_per_thread * Config::sort_single::block_size lies in (512, 768].
/// Forwards every argument to radix_sort_single_limit768<Config, Descending> and returns
/// its status.
template<class Config,
         bool Descending,
         class KeysInputIterator,
         class KeysOutputIterator,
         class ValuesInputIterator,
         class ValuesOutputIterator>
inline auto radix_sort_single(
    KeysInputIterator keys_input, KeysOutputIterator keys_output,
    ValuesInputIterator values_input, ValuesOutputIterator values_output,
    unsigned int size, unsigned int bit, unsigned int end_bit,
    cudaStream_t stream, bool debug_synchronous)
    -> typename std::enable_if<
        (Config::sort_single::items_per_thread * Config::sort_single::block_size > 512U)
            && Config::sort_single::items_per_thread * Config::sort_single::block_size <= 768U,
        cudaError_t>::type
{
    const cudaError_t status = radix_sort_single_limit768<Config, Descending>(
        keys_input, keys_output, values_input, values_output,
        size, bit, end_bit, stream, debug_synchronous);
    return status;
}
/// Config-driven entry point, enabled when
/// Config::sort_single::items_per_thread * Config::sort_single::block_size lies in (768, 1024].
/// Forwards every argument to radix_sort_single_limit1024<Config, Descending> and returns
/// its status.
template<class Config,
         bool Descending,
         class KeysInputIterator,
         class KeysOutputIterator,
         class ValuesInputIterator,
         class ValuesOutputIterator>
inline auto radix_sort_single(
    KeysInputIterator keys_input, KeysOutputIterator keys_output,
    ValuesInputIterator values_input, ValuesOutputIterator values_output,
    unsigned int size, unsigned int bit, unsigned int end_bit,
    cudaStream_t stream, bool debug_synchronous)
    -> typename std::enable_if<
        (Config::sort_single::items_per_thread * Config::sort_single::block_size > 768U)
            && Config::sort_single::items_per_thread * Config::sort_single::block_size <= 1024U,
        cudaError_t>::type
{
    const cudaError_t status = radix_sort_single_limit1024<Config, Descending>(
        keys_input, keys_output, values_input, values_output,
        size, bit, end_bit, stream, debug_synchronous);
    return status;
}
/// Config-driven entry point, enabled when
/// Config::sort_single::items_per_thread * Config::sort_single::block_size lies in (1024, 1536].
/// Forwards every argument to radix_sort_single_limit1536<Config, Descending> and returns
/// its status.
template<class Config,
         bool Descending,
         class KeysInputIterator,
         class KeysOutputIterator,
         class ValuesInputIterator,
         class ValuesOutputIterator>
inline auto radix_sort_single(
    KeysInputIterator keys_input, KeysOutputIterator keys_output,
    ValuesInputIterator values_input, ValuesOutputIterator values_output,
    unsigned int size, unsigned int bit, unsigned int end_bit,
    cudaStream_t stream, bool debug_synchronous)
    -> typename std::enable_if<
        (Config::sort_single::items_per_thread * Config::sort_single::block_size > 1024U)
            && Config::sort_single::items_per_thread * Config::sort_single::block_size <= 1536U,
        cudaError_t>::type
{
    const cudaError_t status = radix_sort_single_limit1536<Config, Descending>(
        keys_input, keys_output, values_input, values_output,
        size, bit, end_bit, stream, debug_synchronous);
    return status;
}
/// Config-driven entry point, enabled when
/// Config::sort_single::items_per_thread * Config::sort_single::block_size lies in (1536, 2048].
/// Forwards every argument to radix_sort_single_limit2048<Config, Descending> and returns
/// its status.
template<class Config,
         bool Descending,
         class KeysInputIterator,
         class KeysOutputIterator,
         class ValuesInputIterator,
         class ValuesOutputIterator>
inline auto radix_sort_single(
    KeysInputIterator keys_input, KeysOutputIterator keys_output,
    ValuesInputIterator values_input, ValuesOutputIterator values_output,
    unsigned int size, unsigned int bit, unsigned int end_bit,
    cudaStream_t stream, bool debug_synchronous)
    -> typename std::enable_if<
        (Config::sort_single::items_per_thread * Config::sort_single::block_size > 1536U)
            && Config::sort_single::items_per_thread * Config::sort_single::block_size <= 2048U,
        cudaError_t>::type
{
    const cudaError_t status = radix_sort_single_limit2048<Config, Descending>(
        keys_input, keys_output, values_input, values_output,
        size, bit, end_bit, stream, debug_synchronous);
    return status;
}
/// Config-driven entry point, enabled when
/// Config::sort_single::items_per_thread * Config::sort_single::block_size lies in (2048, 2560].
/// Forwards every argument to radix_sort_single_limit2560<Config, Descending> and returns
/// its status.
template<class Config,
         bool Descending,
         class KeysInputIterator,
         class KeysOutputIterator,
         class ValuesInputIterator,
         class ValuesOutputIterator>
inline auto radix_sort_single(
    KeysInputIterator keys_input, KeysOutputIterator keys_output,
    ValuesInputIterator values_input, ValuesOutputIterator values_output,
    unsigned int size, unsigned int bit, unsigned int end_bit,
    cudaStream_t stream, bool debug_synchronous)
    -> typename std::enable_if<
        (Config::sort_single::items_per_thread * Config::sort_single::block_size > 2048U)
            && Config::sort_single::items_per_thread * Config::sort_single::block_size <= 2560U,
        cudaError_t>::type
{
    const cudaError_t status = radix_sort_single_limit2560<Config, Descending>(
        keys_input, keys_output, values_input, values_output,
        size, bit, end_bit, stream, debug_synchronous);
    return status;
}
/// Config-driven entry point, enabled when
/// Config::sort_single::items_per_thread * Config::sort_single::block_size lies in (2560, 3072].
/// Forwards every argument to radix_sort_single_limit3072<Config, Descending> and returns
/// its status.
template<class Config,
         bool Descending,
         class KeysInputIterator,
         class KeysOutputIterator,
         class ValuesInputIterator,
         class ValuesOutputIterator>
inline auto radix_sort_single(
    KeysInputIterator keys_input, KeysOutputIterator keys_output,
    ValuesInputIterator values_input, ValuesOutputIterator values_output,
    unsigned int size, unsigned int bit, unsigned int end_bit,
    cudaStream_t stream, bool debug_synchronous)
    -> typename std::enable_if<
        (Config::sort_single::items_per_thread * Config::sort_single::block_size > 2560)
            && Config::sort_single::items_per_thread * Config::sort_single::block_size <= 3072,
        cudaError_t>::type
{
    const cudaError_t status = radix_sort_single_limit3072<Config, Descending>(
        keys_input, keys_output, values_input, values_output,
        size, bit, end_bit, stream, debug_synchronous);
    return status;
}
/// Config-driven entry point, enabled when
/// Config::sort_single::items_per_thread * Config::sort_single::block_size lies in (3072, 3584].
/// Forwards every argument to radix_sort_single_limit3584<Config, Descending> and returns
/// its status.
template<class Config,
         bool Descending,
         class KeysInputIterator,
         class KeysOutputIterator,
         class ValuesInputIterator,
         class ValuesOutputIterator>
inline auto radix_sort_single(
    KeysInputIterator keys_input, KeysOutputIterator keys_output,
    ValuesInputIterator values_input, ValuesOutputIterator values_output,
    unsigned int size, unsigned int bit, unsigned int end_bit,
    cudaStream_t stream, bool debug_synchronous)
    -> typename std::enable_if<
        (Config::sort_single::items_per_thread * Config::sort_single::block_size > 3072)
            && Config::sort_single::items_per_thread * Config::sort_single::block_size <= 3584,
        cudaError_t>::type
{
    const cudaError_t status = radix_sort_single_limit3584<Config, Descending>(
        keys_input, keys_output, values_input, values_output,
        size, bit, end_bit, stream, debug_synchronous);
    return status;
}
/// Config-driven entry point, enabled when
/// Config::sort_single::items_per_thread * Config::sort_single::block_size lies in (3584, 4096].
/// Forwards every argument to radix_sort_single_limit4096<Config, Descending> and returns
/// its status.
template<class Config,
         bool Descending,
         class KeysInputIterator,
         class KeysOutputIterator,
         class ValuesInputIterator,
         class ValuesOutputIterator>
inline auto radix_sort_single(
    KeysInputIterator keys_input, KeysOutputIterator keys_output,
    ValuesInputIterator values_input, ValuesOutputIterator values_output,
    unsigned int size, unsigned int bit, unsigned int end_bit,
    cudaStream_t stream, bool debug_synchronous)
    -> typename std::enable_if<
        (Config::sort_single::items_per_thread * Config::sort_single::block_size > 3584)
            && Config::sort_single::items_per_thread * Config::sort_single::block_size <= 4096,
        cudaError_t>::type
{
    const cudaError_t status = radix_sort_single_limit4096<Config, Descending>(
        keys_input, keys_output, values_input, values_output,
        size, bit, end_bit, stream, debug_synchronous);
    return status;
}
/// Config-driven entry point, enabled when
/// Config::sort_single::items_per_thread * Config::sort_single::block_size exceeds 4096.
///
/// Inputs of 4096 items or more are sorted by the kernel instantiated with the Config's own
/// block_size and items_per_thread; smaller inputs go through radix_sort_single_limit4096.
///
/// NOTE(review): an input of exactly 4096 items takes the Config-sized path because the
/// comparison is strict, and unlike the radix_sort_single_limit* helpers this overload does
/// not consult Config::force_single_kernel_config itself - confirm both are intended.
template<class Config,
         bool Descending,
         class KeysInputIterator,
         class KeysOutputIterator,
         class ValuesInputIterator,
         class ValuesOutputIterator>
inline auto radix_sort_single(
    KeysInputIterator keys_input, KeysOutputIterator keys_output,
    ValuesInputIterator values_input, ValuesOutputIterator values_output,
    unsigned int size, unsigned int bit, unsigned int end_bit,
    cudaStream_t stream, bool debug_synchronous)
    -> typename std::enable_if<
        (Config::sort_single::items_per_thread * Config::sort_single::block_size > 4096),
        cudaError_t>::type
{
    // Large input: use the configuration exactly as requested.
    if(size >= 4096)
    {
        return radix_sort_single<Config::sort_single::block_size,
                                 Config::sort_single::items_per_thread,
                                 Descending>(
            keys_input, keys_output, values_input, values_output,
            size, bit, end_bit, stream, debug_synchronous);
    }
    // Small input: let the tiered helpers pick a smaller instantiation.
    return radix_sort_single_limit4096<Config, Descending>(
        keys_input, keys_output, values_input, values_output,
        size, bit, end_bit, stream, debug_synchronous);
}
}
// end namespace detail
END_ROCPRIM_NAMESPACE
#endif // ROCPRIM_DEVICE_SPECIALIZATION_DEVICE_RADIX_SINGLE_SORT_HPP_
3rdparty/cub/rocprim/functional.hpp
0 → 100644
View file @
f8a481f8
// Copyright (c) 2017-2022 Advanced Micro Devices, Inc. All rights reserved.
//
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to deal
// in the Software without restriction, including without limitation the rights
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
// copies of the Software, and to permit persons to whom the Software is
// furnished to do so, subject to the following conditions:
//
// The above copyright notice and this permission notice shall be included in
// all copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
// THE SOFTWARE.
#ifndef ROCPRIM_FUNCTIONAL_HPP_
#define ROCPRIM_FUNCTIONAL_HPP_
#include <functional>
// Meta configuration for rocPRIM
#include "config.hpp"
BEGIN_ROCPRIM_NAMESPACE
/// \addtogroup utilsmodule_functional
/// @{
/// Prints \p message followed by a newline from the thread(s) whose per-axis
/// values of (threadIdx + blockIdx * blockDim) sum to zero - normally only the
/// first thread of the first block. It reads threadIdx/blockIdx/blockDim, so it
/// can only be used where those are available.
/// The argument is stringified with `#message`, so it is printed exactly as it
/// is spelled at the call site (a string literal keeps its quotes).
/// NOTE(review): the expansion is a bare `{ ... }` block, not `do { } while(0)`;
/// take care when it is the sole statement of an if/else.
#define ROCPRIM_PRINT_ERROR_ONCE(message) \
{ \
unsigned int idx = threadIdx.x + (blockIdx.x * blockDim.x); \
idx += threadIdx.y + (blockIdx.y * blockDim.y); \
idx += threadIdx.z + (blockIdx.z * blockDim.z); \
if (idx == 0) \
printf("%s\n", #message); \
}
/// Returns a copy of the larger of \p a and \p b, compared with operator<.
/// When neither argument is less than the other, \p a is returned.
template<class T>
ROCPRIM_HOST_DEVICE inline constexpr T max(const T& a, const T& b)
{
    return (a < b) ? b : a;
}
/// Returns a copy of the smaller of \p a and \p b, compared with operator<.
/// When \p a is not less than \p b, \p b is returned.
template<class T>
ROCPRIM_HOST_DEVICE inline constexpr T min(const T& a, const T& b)
{
    return (a < b) ? a : b;
}
/// Exchanges the values of \p a and \p b through a temporary copy
/// (one copy construction followed by two copy assignments).
template<class T>
ROCPRIM_HOST_DEVICE inline void swap(T& a, T& b)
{
    T saved_a = a;
    a         = b;
    b         = saved_a;
}
/// Function object returning `a < b` for two values of type \p T.
template<class T = void>
struct less
{
    ROCPRIM_HOST_DEVICE inline constexpr bool operator()(const T& a, const T& b) const
    {
        return (a < b);
    }
};

/// Specialization for `void`: operand types are deduced per call and may differ.
template<>
struct less<void>
{
    template<class T, class U>
    ROCPRIM_HOST_DEVICE inline constexpr bool operator()(const T& a, const U& b) const
    {
        return (a < b);
    }
};
/// Function object returning `a <= b` for two values of type \p T.
template<class T = void>
struct less_equal
{
    ROCPRIM_HOST_DEVICE inline constexpr bool operator()(const T& a, const T& b) const
    {
        return a <= b;
    }
};

/// Specialization for `void`: operand types are deduced per call.
/// The two operands may have different types, in line with less<void>; calls
/// with identical operand types behave exactly as before.
template<>
struct less_equal<void>
{
    template<class T, class U>
    ROCPRIM_HOST_DEVICE inline constexpr bool operator()(const T& a, const U& b) const
    {
        return a <= b;
    }
};
/// Function object returning `a > b` for two values of type \p T.
template<class T = void>
struct greater
{
    ROCPRIM_HOST_DEVICE inline constexpr bool operator()(const T& a, const T& b) const
    {
        return a > b;
    }
};

/// Specialization for `void`: operand types are deduced per call.
/// The two operands may have different types, in line with less<void>; calls
/// with identical operand types behave exactly as before.
template<>
struct greater<void>
{
    template<class T, class U>
    ROCPRIM_HOST_DEVICE inline constexpr bool operator()(const T& a, const U& b) const
    {
        return a > b;
    }
};
/// Function object returning `a >= b` for two values of type \p T.
template<class T = void>
struct greater_equal
{
    ROCPRIM_HOST_DEVICE inline constexpr bool operator()(const T& a, const T& b) const
    {
        return a >= b;
    }
};

/// Specialization for `void`: operand types are deduced per call.
/// The two operands may have different types, in line with less<void>; calls
/// with identical operand types behave exactly as before.
template<>
struct greater_equal<void>
{
    template<class T, class U>
    ROCPRIM_HOST_DEVICE inline constexpr bool operator()(const T& a, const U& b) const
    {
        return a >= b;
    }
};
/// Function object returning `a == b` for two values of type \p T.
template<class T = void>
struct equal_to
{
    ROCPRIM_HOST_DEVICE inline constexpr bool operator()(const T& a, const T& b) const
    {
        return a == b;
    }
};

/// Specialization for `void`: operand types are deduced per call.
/// The two operands may have different types, in line with less<void>; calls
/// with identical operand types behave exactly as before.
template<>
struct equal_to<void>
{
    template<class T, class U>
    ROCPRIM_HOST_DEVICE inline constexpr bool operator()(const T& a, const U& b) const
    {
        return a == b;
    }
};
/// Function object returning `a != b` for two values of type \p T.
template<class T = void>
struct not_equal_to
{
    ROCPRIM_HOST_DEVICE inline constexpr bool operator()(const T& a, const T& b) const
    {
        return a != b;
    }
};

/// Specialization for `void`: operand types are deduced per call.
/// The two operands may have different types, in line with less<void>; calls
/// with identical operand types behave exactly as before.
template<>
struct not_equal_to<void>
{
    template<class T, class U>
    ROCPRIM_HOST_DEVICE inline constexpr bool operator()(const T& a, const U& b) const
    {
        return a != b;
    }
};
/// Function object returning `a + b`, converted to \p T, for two values of type \p T.
template<class T = void>
struct plus
{
    ROCPRIM_HOST_DEVICE inline constexpr T operator()(const T& a, const T& b) const
    {
        return (a + b);
    }
};

/// Specialization for `void`: the (single, shared) operand type is deduced per call
/// and is also the return type.
template<>
struct plus<void>
{
    template<typename T>
    ROCPRIM_HOST_DEVICE inline constexpr T operator()(const T& a, const T& b) const
    {
        return (a + b);
    }
};
/// Function object returning `a - b`, converted to \p T, for two values of type \p T.
template<class T = void>
struct minus
{
    ROCPRIM_HOST_DEVICE inline constexpr T operator()(const T& a, const T& b) const
    {
        return (a - b);
    }
};

/// Specialization for `void`: the (single, shared) operand type is deduced per call
/// and is also the return type.
template<>
struct minus<void>
{
    template<typename T>
    ROCPRIM_HOST_DEVICE inline constexpr T operator()(const T& a, const T& b) const
    {
        return (a - b);
    }
};
/// Function object returning `a * b`, converted to \p T, for two values of type \p T.
template<class T = void>
struct multiplies
{
    ROCPRIM_HOST_DEVICE inline constexpr T operator()(const T& a, const T& b) const
    {
        return (a * b);
    }
};

/// Specialization for `void`: the (single, shared) operand type is deduced per call
/// and is also the return type.
template<>
struct multiplies<void>
{
    template<typename T>
    ROCPRIM_HOST_DEVICE inline constexpr T operator()(const T& a, const T& b) const
    {
        return (a * b);
    }
};
/// Function object returning a copy of the larger of two values of type \p T,
/// compared with operator<. When neither is less than the other, \p a is returned.
template<class T = void>
struct maximum
{
    ROCPRIM_HOST_DEVICE inline constexpr T operator()(const T& a, const T& b) const
    {
        return (a < b) ? b : a;
    }
};

/// Specialization for `void`: the (single, shared) operand type is deduced per call
/// and is also the return type.
template<>
struct maximum<void>
{
    template<typename T>
    ROCPRIM_HOST_DEVICE inline constexpr T operator()(const T& a, const T& b) const
    {
        return (a < b) ? b : a;
    }
};
/// Function object returning a copy of the smaller of two values of type \p T,
/// compared with operator<. When \p a is not less than \p b, \p b is returned.
template<class T = void>
struct minimum
{
    ROCPRIM_HOST_DEVICE inline constexpr T operator()(const T& a, const T& b) const
    {
        return (a < b) ? a : b;
    }
};

/// Specialization for `void`: the (single, shared) operand type is deduced per call
/// and is also the return type.
template<>
struct minimum<void>
{
    template<typename T>
    ROCPRIM_HOST_DEVICE inline constexpr T operator()(const T& a, const T& b) const
    {
        return (a < b) ? a : b;
    }
};
/// Function object returning a copy of its argument of type \p T.
template<class T = void>
struct identity
{
    ROCPRIM_HOST_DEVICE inline constexpr T operator()(const T& a) const
    {
        return (a);
    }
};

/// Specialization for `void`: the argument type is deduced per call; a copy is returned.
template<>
struct identity<void>
{
    template<typename T>
    ROCPRIM_HOST_DEVICE inline constexpr T operator()(const T& a) const
    {
        return (a);
    }
};
/**
 * \brief Statically determine log2(N), rounded up.
 *
 * The primary template halves CURRENT_VAL (starting at N) and increments COUNT
 * until CURRENT_VAL reaches 0; the partial specialization for CURRENT_VAL == 0
 * then derives the result from COUNT.
 *
 * For example:
 *     Log2<8>::VALUE   // 3
 *     Log2<3>::VALUE   // 2
 *
 * NOTE(review): only positive N appear to be supported - N == 0 would shift by
 * (COUNT - 1) == -1 in the base case.
 */
template<int N, int CURRENT_VAL = N, int COUNT = 0>
struct Log2
{
    /// Static logarithm value
    enum
    {
        // Inductive case: drop the lowest bit and count one more step.
        VALUE = Log2<N, (CURRENT_VAL >> 1), COUNT + 1>::VALUE
    };
};

#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document
template<int N, int COUNT>
struct Log2<N, 0, COUNT>
{
    enum
    {
        // Base case: 2^(COUNT - 1) is the highest set bit of N. If N is larger than
        // that power of two, round up to COUNT; otherwise N is exactly 2^(COUNT - 1).
        VALUE = ((1 << (COUNT - 1)) < N) ? COUNT : (COUNT - 1)
    };
};
#endif // DOXYGEN_SHOULD_SKIP_THIS
/******************************************************************************
* Conditional types
******************************************************************************/
/**
 * \brief Type equality test
 *
 * Primary template, used when \p A and \p B are different types:
 * VALUE is 0 and NEGATE is 1.
 */
template<typename A, typename B>
struct Equals
{
    enum
    {
        VALUE  = 0,
        NEGATE = 1
    };
};

#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document
// Both arguments name the same type: VALUE is 1 and NEGATE is 0.
template<typename A>
struct Equals<A, A>
{
    enum
    {
        VALUE  = 1,
        NEGATE = 0
    };
};
#endif // DOXYGEN_SHOULD_SKIP_THIS
/// Wraps the compile-time integer \p A in a distinct type; the value is
/// available as the enumerator VALUE.
template<int A>
struct Int2Type
{
    enum
    {
        VALUE = A
    };
};
/// @}
// end of group utilsmodule_functional
END_ROCPRIM_NAMESPACE
#endif // ROCPRIM_FUNCTIONAL_HPP_
3rdparty/cub/rocprim/intrinsics.hpp
0 → 100644
View file @
f8a481f8
// Copyright (c) 2017-2020 Advanced Micro Devices, Inc. All rights reserved.
//
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to deal
// in the Software without restriction, including without limitation the rights
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
// copies of the Software, and to permit persons to whom the Software is
// furnished to do so, subject to the following conditions:
//
// The above copyright notice and this permission notice shall be included in
// all copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
// THE SOFTWARE.
#ifndef ROCPRIM_INTRINSICS_HPP_
#define ROCPRIM_INTRINSICS_HPP_
// Meta configuration for rocPRIM
#include "config.hpp"
#include "intrinsics/atomic.hpp"
#include "intrinsics/bit.hpp"
#include "intrinsics/thread.hpp"
#include "intrinsics/warp.hpp"
#include "intrinsics/warp_shuffle.hpp"
#endif // ROCPRIM_INTRINSICS_HPP_
3rdparty/cub/rocprim/intrinsics/atomic.hpp
0 → 100644
View file @
f8a481f8
// Copyright (c) 2017-2022 Advanced Micro Devices, Inc. All rights reserved.
//
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to deal
// in the Software without restriction, including without limitation the rights
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
// copies of the Software, and to permit persons to whom the Software is
// furnished to do so, subject to the following conditions:
//
// The above copyright notice and this permission notice shall be included in
// all copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
// THE SOFTWARE.
#ifndef ROCPRIM_INTRINSICS_ATOMIC_HPP_
#define ROCPRIM_INTRINSICS_ATOMIC_HPP_
#include "../config.hpp"
BEGIN_ROCPRIM_NAMESPACE
namespace detail
{

/// \brief Atomic add for <tt>unsigned int</tt>.
///
/// Forwards \p address and \p value to the global <tt>::atomicAdd</tt>
/// and returns whatever that call returns.
ROCPRIM_DEVICE ROCPRIM_INLINE
unsigned int atomic_add(unsigned int* address, unsigned int value)
{
    const unsigned int result = ::atomicAdd(address, value);
    return result;
}

/// \brief Atomic add for <tt>int</tt>; forwards to <tt>::atomicAdd</tt>.
ROCPRIM_DEVICE ROCPRIM_INLINE
int atomic_add(int* address, int value)
{
    const int result = ::atomicAdd(address, value);
    return result;
}

/// \brief Atomic add for <tt>float</tt>; forwards to <tt>::atomicAdd</tt>.
ROCPRIM_DEVICE ROCPRIM_INLINE
float atomic_add(float* address, float value)
{
    const float result = ::atomicAdd(address, value);
    return result;
}

/// \brief Atomic add for <tt>unsigned long long</tt>; forwards to <tt>::atomicAdd</tt>.
ROCPRIM_DEVICE ROCPRIM_INLINE
unsigned long long atomic_add(unsigned long long* address, unsigned long long value)
{
    const unsigned long long result = ::atomicAdd(address, value);
    return result;
}

/// \brief Wrapping atomic increment.
///
/// Forwards to <tt>::atomicInc</tt>, passing \p value as its second argument.
ROCPRIM_DEVICE ROCPRIM_INLINE
unsigned int atomic_wrapinc(unsigned int* address, unsigned int value)
{
    const unsigned int result = ::atomicInc(address, value);
    return result;
}

/// \brief Atomic exchange for <tt>unsigned int</tt>; forwards to <tt>::atomicExch</tt>.
ROCPRIM_DEVICE ROCPRIM_INLINE
unsigned int atomic_exch(unsigned int* address, unsigned int value)
{
    const unsigned int result = ::atomicExch(address, value);
    return result;
}

/// \brief Atomic exchange for <tt>unsigned long long</tt>; forwards to <tt>::atomicExch</tt>.
ROCPRIM_DEVICE ROCPRIM_INLINE
unsigned long long atomic_exch(unsigned long long* address, unsigned long long value)
{
    const unsigned long long result = ::atomicExch(address, value);
    return result;
}

} // end namespace detail
END_ROCPRIM_NAMESPACE
#endif // ROCPRIM_INTRINSICS_ATOMIC_HPP_
3rdparty/cub/rocprim/intrinsics/bit.hpp
0 → 100644
View file @
f8a481f8
// Copyright (c) 2017-2021 Advanced Micro Devices, Inc. All rights reserved.
//
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to deal
// in the Software without restriction, including without limitation the rights
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
// copies of the Software, and to permit persons to whom the Software is
// furnished to do so, subject to the following conditions:
//
// The above copyright notice and this permission notice shall be included in
// all copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
// THE SOFTWARE.
#ifndef ROCPRIM_INTRINSICS_BIT_HPP_
#define ROCPRIM_INTRINSICS_BIT_HPP_
#include "../config.hpp"
BEGIN_ROCPRIM_NAMESPACE
/// \addtogroup intrinsicsmodule
/// @{
/// \brief Extracts the bit at position \p i of \p x.
///
/// \param x - value to read the bit from
/// \param i - zero-based bit position, counted from the least-significant bit
/// \return the selected bit as 0 or 1
ROCPRIM_DEVICE ROCPRIM_INLINE
int get_bit(int x, int i)
{
    const int shifted = x >> i;
    return shifted & 1;
}
/// \brief Bit count (32-bit input).
///
/// Returns the number of bits set in \p x, as computed by <tt>__popc</tt>.
ROCPRIM_DEVICE ROCPRIM_INLINE
unsigned int bit_count(unsigned int x)
{
    const auto set_bits = __popc(x);
    return set_bits;
}
/// \brief Bit count (64-bit input).
///
/// Returns the number of bits set in \p x, as computed by <tt>__popcll</tt>.
ROCPRIM_DEVICE ROCPRIM_INLINE
unsigned int bit_count(unsigned long long x)
{
    const auto set_bits = __popcll(x);
    return set_bits;
}
/// @}
// end of group intrinsicsmodule
END_ROCPRIM_NAMESPACE
#endif // ROCPRIM_INTRINSICS_BIT_HPP_
3rdparty/cub/rocprim/intrinsics/thread.hpp
0 → 100644
View file @
f8a481f8
// Copyright (c) 2017-2021 Advanced Micro Devices, Inc. All rights reserved.
//
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to deal
// in the Software without restriction, including without limitation the rights
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
// copies of the Software, and to permit persons to whom the Software is
// furnished to do so, subject to the following conditions:
//
// The above copyright notice and this permission notice shall be included in
// all copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
// THE SOFTWARE.
#ifndef ROCPRIM_INTRINSICS_THREAD_HPP_
#define ROCPRIM_INTRINSICS_THREAD_HPP_
#include <atomic>
#include "../config.hpp"
#include "../detail/various.hpp"
BEGIN_ROCPRIM_NAMESPACE
/// \addtogroup intrinsicsmodule
/// @{
// Sizes
/// \brief [DEPRECATED] Number of threads in a hardware warp.
///
/// The value is constant for a device.
/// This function is not supported for the gfx1030 architecture and will be
/// removed in a future release; use host_warp_size() or device_warp_size() instead.
ROCPRIM_HOST_DEVICE inline constexpr
unsigned int warp_size()
{
    return warpSize;
}
/// \brief Returns the number of threads in a hardware warp of the currently
/// selected device. On the host side this value is only available at run time.
///
/// It is constant for a device.
///
/// \return the \p warpSize field of the device properties, or the all-ones
/// value (the unsigned conversion of -1) if either runtime query fails.
ROCPRIM_HOST inline
unsigned int host_warp_size()
{
    // Failure sentinel. The function has always reported errors by returning -1
    // converted to its unsigned return type; the conversion is now explicit.
    constexpr unsigned int query_failed = static_cast<unsigned int>(-1);

    int default_hip_device = 0;
    // Check this status before using the id: if the query fails the id is not
    // meaningful and must not be passed to the properties query.
    if(cudaGetDevice(&default_hip_device) != cudaSuccess)
    {
        return query_failed;
    }

    cudaDeviceProp device_prop;
    if(cudaGetDeviceProperties(&device_prop, default_hip_device) != cudaSuccess)
    {
        return query_failed;
    }

    return device_prop.warpSize;
}
/// \brief Number of threads in a hardware warp for the compilation target.
/// On the device side this constant is available at compile time.
///
/// It is constant for a device.
ROCPRIM_DEVICE ROCPRIM_INLINE
constexpr unsigned int device_warp_size()
{
    return warpSize;
}
/// \brief Returns the flat size of a multidimensional block (tile), i.e. the
/// product of the three \p blockDim components.
ROCPRIM_DEVICE ROCPRIM_INLINE
unsigned int flat_block_size()
{
    const unsigned int slice_size = blockDim.y * blockDim.x;
    return blockDim.z * slice_size;
}
/// \brief Returns the flat size of a multidimensional tile (block).
///
/// Alias of flat_block_size().
ROCPRIM_DEVICE ROCPRIM_INLINE
unsigned int flat_tile_size()
{
    const unsigned int size = flat_block_size();
    return size;
}
// IDs
/// \brief Returns the identifier of the calling thread within its warp.
ROCPRIM_DEVICE ROCPRIM_INLINE
unsigned int lane_id()
{
#ifdef __HIP_CPU_RT__
    // CPU runtime: derive the lane from the fiber id.
    using namespace hip::detail;
    return id(Fiber::this_fiber()) % warpSize;
#else
    return ::__lane_id();
#endif
}
/// \brief Returns the flat (linear, 1D) thread identifier in a multidimensional block (tile).
///
/// The x dimension varies fastest, then y, then z.
ROCPRIM_DEVICE ROCPRIM_INLINE
unsigned int flat_block_thread_id()
{
    const unsigned int row_pitch   = blockDim.x;
    const unsigned int slice_pitch = blockDim.y * row_pitch;
    return (threadIdx.z * slice_pitch) + (threadIdx.y * row_pitch) + threadIdx.x;
}
/// \brief Returns the flat (linear, 1D) thread identifier in a multidimensional block (tile).
/// Use the template parameters to optimize 1D or 2D kernels.
///
/// This overload is selected for 1D blocks (\p BlockSizeY and \p BlockSizeZ
/// both equal to 1) and only reads \p threadIdx.x.
template<unsigned int BlockSizeX, unsigned int BlockSizeY, unsigned int BlockSizeZ>
ROCPRIM_DEVICE ROCPRIM_INLINE
auto flat_block_thread_id()
    -> typename std::enable_if<(BlockSizeY == 1 && BlockSizeZ == 1), unsigned int>::type
{
    const unsigned int flat_id = threadIdx.x;
    return flat_id;
}
/// \brief Flat thread identifier for 2D blocks (\p BlockSizeY greater than 1,
/// \p BlockSizeZ equal to 1); the z component is not read.
template<unsigned int BlockSizeX, unsigned int BlockSizeY, unsigned int BlockSizeZ>
ROCPRIM_DEVICE ROCPRIM_INLINE
auto flat_block_thread_id()
    -> typename std::enable_if<(BlockSizeY > 1 && BlockSizeZ == 1), unsigned int>::type
{
    const unsigned int row_offset = threadIdx.y * blockDim.x;
    return threadIdx.x + row_offset;
}
/// \brief Flat thread identifier for 3D blocks (\p BlockSizeZ greater than 1).
///
/// The condition used to be <tt>BlockSizeY > 1 && BlockSizeZ > 1</tt>, which left
/// shapes with \p BlockSizeY equal to 1 and \p BlockSizeZ greater than 1 without
/// any matching overload. The full 3D formula is valid for every \p BlockSizeY,
/// and the condition remains disjoint from the 1D and 2D overloads, which both
/// require \p BlockSizeZ equal to 1.
template<unsigned int BlockSizeX, unsigned int BlockSizeY, unsigned int BlockSizeZ>
ROCPRIM_DEVICE ROCPRIM_INLINE
auto flat_block_thread_id()
    -> typename std::enable_if<(BlockSizeZ > 1), unsigned int>::type
{
    return threadIdx.x
           + (threadIdx.y * blockDim.x)
           + (threadIdx.z * blockDim.y * blockDim.x);
}
/// \brief Returns the flat (linear, 1D) thread identifier in a multidimensional tile (block).
///
/// Alias of flat_block_thread_id().
ROCPRIM_DEVICE ROCPRIM_INLINE
unsigned int flat_tile_thread_id()
{
    const unsigned int flat_id = flat_block_thread_id();
    return flat_id;
}
/// \brief Returns the warp id of the calling thread within its block (tile):
/// the flat thread id divided by device_warp_size().
ROCPRIM_DEVICE ROCPRIM_INLINE
unsigned int warp_id()
{
    const unsigned int flat_id = flat_block_thread_id();
    return flat_id / device_warp_size();
}
/// \brief Returns the warp id for an already computed flat thread id.
///
/// \param flat_id - flat (linear) thread identifier within the block
ROCPRIM_DEVICE ROCPRIM_INLINE
unsigned int warp_id(unsigned int flat_id)
{
    const unsigned int threads_per_warp = device_warp_size();
    return flat_id / threads_per_warp;
}
/// \brief Returns the warp id in a block (tile). Use the template parameters to
/// optimize 1D or 2D kernels.
///
/// The flat thread id comes from the matching
/// flat_block_thread_id<BlockSizeX, BlockSizeY, BlockSizeZ>() overload.
template<unsigned int BlockSizeX, unsigned int BlockSizeY, unsigned int BlockSizeZ>
ROCPRIM_DEVICE ROCPRIM_INLINE
unsigned int warp_id()
{
    const unsigned int flat_id = flat_block_thread_id<BlockSizeX, BlockSizeY, BlockSizeZ>();
    return flat_id / device_warp_size();
}
/// \brief Returns the flat (linear, 1D) block identifier in a multidimensional grid.
///
/// The x dimension varies fastest, then y, then z.
ROCPRIM_DEVICE ROCPRIM_INLINE
unsigned int flat_block_id()
{
    const unsigned int row_pitch   = gridDim.x;
    const unsigned int slice_pitch = gridDim.y * row_pitch;
    return (blockIdx.z * slice_pitch) + (blockIdx.y * row_pitch) + blockIdx.x;
}
/// \brief Flat block identifier, selected when \p BlockSizeY and \p BlockSizeZ
/// are both 1; only \p blockIdx.x is read.
template<unsigned int BlockSizeX, unsigned int BlockSizeY, unsigned int BlockSizeZ>
ROCPRIM_DEVICE ROCPRIM_INLINE
auto flat_block_id()
    -> typename std::enable_if<(BlockSizeY == 1 && BlockSizeZ == 1), unsigned int>::type
{
    const unsigned int flat_id = blockIdx.x;
    return flat_id;
}
/// \brief Flat block identifier, selected when \p BlockSizeY is greater than 1
/// and \p BlockSizeZ equals 1; the z component is not read.
template<unsigned int BlockSizeX, unsigned int BlockSizeY, unsigned int BlockSizeZ>
ROCPRIM_DEVICE ROCPRIM_INLINE
auto flat_block_id()
    -> typename std::enable_if<(BlockSizeY > 1 && BlockSizeZ == 1), unsigned int>::type
{
    const unsigned int row_offset = blockIdx.y * gridDim.x;
    return blockIdx.x + row_offset;
}
/// \brief Flat block identifier using all three grid dimensions, selected when
/// \p BlockSizeZ is greater than 1.
///
/// The condition used to be <tt>BlockSizeY > 1 && BlockSizeZ > 1</tt>, which left
/// the combination of \p BlockSizeY equal to 1 and \p BlockSizeZ greater than 1
/// without any matching overload. The full 3D formula is valid for every
/// \p BlockSizeY, and the condition remains disjoint from the other two
/// overloads, which both require \p BlockSizeZ equal to 1.
template<unsigned int BlockSizeX, unsigned int BlockSizeY, unsigned int BlockSizeZ>
ROCPRIM_DEVICE ROCPRIM_INLINE
auto flat_block_id()
    -> typename std::enable_if<(BlockSizeZ > 1), unsigned int>::type
{
    return blockIdx.x
           + (blockIdx.y * gridDim.x)
           + (blockIdx.z * gridDim.y * gridDim.x);
}
// Sync
/// \brief Synchronizes all threads in a block (tile) by calling <tt>__syncthreads()</tt>.
ROCPRIM_DEVICE ROCPRIM_INLINE
void syncthreads()
{
    __syncthreads();
}
/// \brief Wave-level barrier.
///
/// With SIMT all lanes of a wave reach a convergence point simultaneously, so
/// no special instruction is needed in the ISA; this forwards to
/// <tt>__builtin_amdgcn_wave_barrier()</tt>.
ROCPRIM_DEVICE ROCPRIM_INLINE
void wave_barrier()
{
    __builtin_amdgcn_wave_barrier();
}
namespace
detail
{
/// \brief Returns thread identifier in a multidimensional block (tile) by dimension.
///
/// \tparam Dim - dimension index; must be 0 (x), 1 (y) or 2 (z).
///
/// The real values come from the explicit specializations for 0, 1 and 2
/// generated by ROCPRIM_DETAIL_DEFINE_HIP_API_ID_FUNCS further down. This primary
/// template is therefore only instantiated for an invalid \p Dim, and the
/// assertion has to reject exactly that case. The previous condition
/// (<tt>Dim > 2</tt>) was inverted and accepted every invalid value.
template<unsigned int Dim>
ROCPRIM_DEVICE ROCPRIM_INLINE
unsigned int block_thread_id()
{
    static_assert(Dim <= 2, "Dim must be 0, 1 or 2");
    // dummy return, correct values handled by specializations
    return 0;
}
/// \brief Returns block identifier in a multidimensional grid by dimension.
///
/// \tparam Dim - dimension index; must be 0 (x), 1 (y) or 2 (z).
///
/// The real values come from the explicit specializations for 0, 1 and 2
/// generated by ROCPRIM_DETAIL_DEFINE_HIP_API_ID_FUNCS further down. This primary
/// template is therefore only instantiated for an invalid \p Dim, and the
/// assertion has to reject exactly that case. The previous condition
/// (<tt>Dim > 2</tt>) was inverted and accepted every invalid value.
template<unsigned int Dim>
ROCPRIM_DEVICE ROCPRIM_INLINE
unsigned int block_id()
{
    static_assert(Dim <= 2, "Dim must be 0, 1 or 2");
    // dummy return, correct values handled by specializations
    return 0;
}
/// \brief Returns block size in a multidimensional grid by dimension.
///
/// \tparam Dim - dimension index; must be 0 (x), 1 (y) or 2 (z).
///
/// The real values come from the explicit specializations for 0, 1 and 2
/// generated by ROCPRIM_DETAIL_DEFINE_HIP_API_ID_FUNCS further down. This primary
/// template is therefore only instantiated for an invalid \p Dim, and the
/// assertion has to reject exactly that case. The previous condition
/// (<tt>Dim > 2</tt>) was inverted and accepted every invalid value.
template<unsigned int Dim>
ROCPRIM_DEVICE ROCPRIM_INLINE
unsigned int block_size()
{
    static_assert(Dim <= 2, "Dim must be 0, 1 or 2");
    // dummy return, correct values handled by specializations
    return 0;
}
/// \brief Returns grid size by dimension.
///
/// \tparam Dim - dimension index; must be 0 (x), 1 (y) or 2 (z).
///
/// The real values come from the explicit specializations for 0, 1 and 2
/// generated by ROCPRIM_DETAIL_DEFINE_HIP_API_ID_FUNCS further down. This primary
/// template is therefore only instantiated for an invalid \p Dim, and the
/// assertion has to reject exactly that case. The previous condition
/// (<tt>Dim > 2</tt>) was inverted and accepted every invalid value.
template<unsigned int Dim>
ROCPRIM_DEVICE ROCPRIM_INLINE
unsigned int grid_size()
{
    static_assert(Dim <= 2, "Dim must be 0, 1 or 2");
    // dummy return, correct values handled by specializations
    return 0;
}
// Helper macros that generate the explicit specializations of block_thread_id,
// block_id, block_size and grid_size for Dim = 0, 1 and 2.
//
// ROCPRIM_DETAIL_CONCAT places its two arguments next to each other (it does not
// token-paste with ##), so a prefix such as `threadIdx.` followed by the suffix
// `x` expands to the member access `threadIdx. x`.
#define ROCPRIM_DETAIL_CONCAT(A, B) A B
// Defines the specialization name<dim>() that returns `prefix suffix`.
#define ROCPRIM_DETAIL_DEFINE_HIP_API_ID_FUNC(name, prefix, dim, suffix) \
template<> \
ROCPRIM_DEVICE ROCPRIM_INLINE \
unsigned int name<dim>() \
{ \
return ROCPRIM_DETAIL_CONCAT(prefix, suffix); \
}
// Defines the three specializations of `name`: 0 -> x, 1 -> y, 2 -> z.
#define ROCPRIM_DETAIL_DEFINE_HIP_API_ID_FUNCS(name, prefix) \
ROCPRIM_DETAIL_DEFINE_HIP_API_ID_FUNC(name, prefix, 0, x) \
ROCPRIM_DETAIL_DEFINE_HIP_API_ID_FUNC(name, prefix, 1, y) \
ROCPRIM_DETAIL_DEFINE_HIP_API_ID_FUNC(name, prefix, 2, z)
// block_thread_id<0|1|2>() -> threadIdx.x|y|z
ROCPRIM_DETAIL_DEFINE_HIP_API_ID_FUNCS
(
block_thread_id
,
threadIdx
.)
// block_id<0|1|2>() -> blockIdx.x|y|z
ROCPRIM_DETAIL_DEFINE_HIP_API_ID_FUNCS
(
block_id
,
blockIdx
.)
// block_size<0|1|2>() -> blockDim.x|y|z
ROCPRIM_DETAIL_DEFINE_HIP_API_ID_FUNCS
(
block_size
,
blockDim
.)
// grid_size<0|1|2>() -> gridDim.x|y|z
ROCPRIM_DETAIL_DEFINE_HIP_API_ID_FUNCS
(
grid_size
,
gridDim
.)
// The helper macros are local to this header; remove them again.
#undef ROCPRIM_DETAIL_DEFINE_HIP_API_ID_FUNCS
#undef ROCPRIM_DETAIL_DEFINE_HIP_API_ID_FUNC
#undef ROCPRIM_DETAIL_CONCAT
// Returns the thread id within a "logical warp", which can be smaller than a
// hardware warp. This overload is selected when LogicalWarpSize is a power of
// two, so the modulo can be computed with a bit mask.
template<unsigned int LogicalWarpSize>
ROCPRIM_DEVICE ROCPRIM_INLINE
auto logical_lane_id()
    -> typename std::enable_if<detail::is_power_of_two(LogicalWarpSize), unsigned int>::type
{
    // Same as lane_id() % LogicalWarpSize for a power-of-two size.
    constexpr unsigned int lane_mask = LogicalWarpSize - 1;
    return lane_id() & lane_mask;
}
// Returns the thread id within a "logical warp" whose size is not a power of
// two; falls back to a plain modulo.
template<unsigned int LogicalWarpSize>
ROCPRIM_DEVICE ROCPRIM_INLINE
auto logical_lane_id()
    -> typename std::enable_if<!detail::is_power_of_two(LogicalWarpSize), unsigned int>::type
{
    const unsigned int hardware_lane = lane_id();
    return hardware_lane % LogicalWarpSize;
}
// Specialization for a logical warp that spans the whole hardware warp: the
// logical lane id is the hardware lane id.
template<>
ROCPRIM_DEVICE ROCPRIM_INLINE
unsigned int logical_lane_id<device_warp_size()>()
{
    const unsigned int hardware_lane = lane_id();
    return hardware_lane;
}
// Returns the id of the calling thread's "logical warp" within the block:
// the flat thread id divided by LogicalWarpSize.
template<unsigned int LogicalWarpSize>
ROCPRIM_DEVICE ROCPRIM_INLINE
unsigned int logical_warp_id()
{
    const unsigned int flat_id = flat_block_thread_id();
    return flat_id / LogicalWarpSize;
}
// Specialization for a logical warp that spans the whole hardware warp: the
// logical warp id is the hardware warp id.
template<>
ROCPRIM_DEVICE ROCPRIM_INLINE
unsigned int logical_warp_id<device_warp_size()>()
{
    const unsigned int hardware_warp = warp_id();
    return hardware_warp;
}
// Memory fence; forwards to ::__threadfence_system().
ROCPRIM_DEVICE ROCPRIM_INLINE
void memory_fence_system()
{
    ::__threadfence_system();
}
// Memory fence; forwards to ::__threadfence_block().
ROCPRIM_DEVICE ROCPRIM_INLINE
void memory_fence_block()
{
    ::__threadfence_block();
}
// Memory fence; forwards to ::__threadfence().
ROCPRIM_DEVICE ROCPRIM_INLINE
void memory_fence_device()
{
    ::__threadfence();
}
}
/// @}
// end of group intrinsicsmodule
END_ROCPRIM_NAMESPACE
#endif // ROCPRIM_INTRINSICS_THREAD_HPP_
3rdparty/cub/rocprim/intrinsics/warp.hpp
0 → 100644
View file @
f8a481f8
// Copyright (c) 2017-2021 Advanced Micro Devices, Inc. All rights reserved.
//
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to deal
// in the Software without restriction, including without limitation the rights
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
// copies of the Software, and to permit persons to whom the Software is
// furnished to do so, subject to the following conditions:
//
// The above copyright notice and this permission notice shall be included in
// all copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
// THE SOFTWARE.
#ifndef ROCPRIM_INTRINSICS_WARP_HPP_
#define ROCPRIM_INTRINSICS_WARP_HPP_
#include "../config.hpp"
BEGIN_ROCPRIM_NAMESPACE
/// \addtogroup intrinsicsmodule
/// @{
/// Evaluates \p predicate for all active work-items in the warp and returns an
/// integer whose <tt>i</tt>-th bit is set if and only if \p predicate is
/// <tt>true</tt> for the <tt>i</tt>-th thread of the warp and that thread is active.
///
/// \param predicate - input to be evaluated for all active lanes
ROCPRIM_DEVICE ROCPRIM_INLINE
lane_mask_type ballot(int predicate)
{
    const lane_mask_type votes = ::__ballot(predicate);
    return votes;
}
/// \brief Masked bit count
///
/// For each thread, returns \p add plus the number of bits of \p x that are set
/// at positions lower than the thread's own lane index, i.e. the set bits that
/// belong to lanes which come before the current thread.
///
/// \param x - lane mask whose bits are counted
/// \param add - value added to the count
ROCPRIM_DEVICE ROCPRIM_INLINE
unsigned int masked_bit_count(lane_mask_type x, unsigned int add = 0)
{
    int c;
#ifndef __HIP_CPU_RT__
    #if __AMDGCN_WAVEFRONT_SIZE == 32
        #ifdef __CUDACC__
    c = ::__builtin_amdgcn_mbcnt_lo(x, add);
        #else
    c = ::__mbcnt_lo(x, add);
        #endif
    #else
        // Wider wavefront: count the low 32 lanes first, then feed that count
        // into the query for the upper 32 lanes.
        #ifdef __CUDACC__
    c = ::__builtin_amdgcn_mbcnt_lo(static_cast<int>(x), add);
    c = ::__builtin_amdgcn_mbcnt_hi(static_cast<int>(x >> 32), c);
        #else
    c = ::__mbcnt_lo(static_cast<int>(x), add);
    c = ::__mbcnt_hi(static_cast<int>(x >> 32), c);
        #endif
    #endif
#else
    using namespace hip::detail;
    const auto tidx{id(Fiber::this_fiber()) % warpSize};
    // Keep only the bits below the current lane. Shifting the bitset left by
    // (warpSize - tidx) drops every bit at position >= tidx, and a shift by the
    // full width (lane 0) is well defined for std::bitset and yields zero. The
    // earlier code shifted the integer right instead, which kept the high bits
    // and shifted by the full integer width for lane 0.
    std::bitset<warpSize> bits{x};
    bits <<= (warpSize - tidx);
    c = static_cast<unsigned int>(bits.count()) + add;
#endif
    return c;
}
namespace
detail
{
/// Warp vote "any". On the device this forwards to <tt>::__any</tt>; under the
/// HIP CPU runtime each fiber records its predicate in a shared bitset and the
/// result is whether any bit of that bitset is set.
ROCPRIM_DEVICE ROCPRIM_INLINE
int warp_any(int predicate)
{
#ifdef __HIP_CPU_RT__
    using namespace hip::detail;
    const auto lane = id(Fiber::this_fiber()) % warpSize;
    auto& votes     = Tile::scratchpad<std::bitset<warpSize>, 1>()[0];
    votes[lane]     = static_cast<bool>(predicate);
    // Wait until every fiber of the tile has stored its vote.
    barrier(Tile::this_tile());
    return votes.any();
#else
    return ::__any(predicate);
#endif
}
/// Warp vote "all". On the device this forwards to <tt>::__all</tt>; under the
/// HIP CPU runtime each fiber records its predicate in a shared bitset and the
/// result is whether all bits of that bitset are set.
ROCPRIM_DEVICE ROCPRIM_INLINE
int warp_all(int predicate)
{
#ifdef __HIP_CPU_RT__
    using namespace hip::detail;
    const auto lane = id(Fiber::this_fiber()) % warpSize;
    auto& votes     = Tile::scratchpad<std::bitset<warpSize>, 1>()[0];
    votes[lane]     = static_cast<bool>(predicate);
    // Wait until every fiber of the tile has stored its vote.
    barrier(Tile::this_tile());
    return votes.all();
#else
    return ::__all(predicate);
#endif
}
}
// end detail namespace
/// @}
// end of group intrinsicsmodule
/**
 * Compute a 32b mask of threads having the same least-significant
 * LABEL_BITS of \p label as the calling thread.
 *
 * For every label bit, the warp is balloted on "this bit is set". A thread
 * whose own bit is clear bitwise-complements the ballot, so in both cases the
 * per-bit mask marks the lanes that agree with the caller on that bit. The
 * per-bit masks are intersected; only the low 32 bits are returned.
 *
 * Defects fixed relative to the earlier code:
 *  - the complement used logical NOT (`!mask`), which collapses the mask to
 *    0 or 1 instead of inverting every lane bit;
 *  - `1 << BIT` was evaluated in `int`, which is wrong for BIT == 31;
 *  - the result was uninitialized when LABEL_BITS <= 0; it now starts as
 *    all ones (no bits compared, so no lane is excluded).
 *
 * \tparam LABEL_BITS - number of least-significant bits of \p label to compare
 * \param label - value whose low bits are matched across the warp
 */
template<int LABEL_BITS>
ROCPRIM_DEVICE ROCPRIM_INLINE
unsigned int MatchAny(unsigned int label)
{
    unsigned int retval = ~0u;

    // Extract masks of common threads for each bit
    ROCPRIM_UNROLL
    for(int BIT = 0; BIT < LABEL_BITS; ++BIT)
    {
        const unsigned long long current_bit = 1ull << BIT;
        const bool bit_match = (label & current_bit) == current_bit;

        unsigned long long mask = ballot(bit_match);
        if(!bit_match)
        {
            // Lanes that agree with this thread are those that voted "clear".
            mask = ~mask;
        }

        // Remove peers who differ
        retval &= static_cast<unsigned int>(mask);
    }

    return retval;
}
END_ROCPRIM_NAMESPACE
#endif // ROCPRIM_INTRINSICS_WARP_HPP_
3rdparty/cub/rocprim/intrinsics/warp_shuffle.hpp
0 → 100644
View file @
f8a481f8
// Copyright (c) 2017-2022 Advanced Micro Devices, Inc. All rights reserved.
//
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to deal
// in the Software without restriction, including without limitation the rights
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
// copies of the Software, and to permit persons to whom the Software is
// furnished to do so, subject to the following conditions:
//
// The above copyright notice and this permission notice shall be included in
// all copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
// THE SOFTWARE.
#ifndef ROCPRIM_INTRINSICS_WARP_SHUFFLE_HPP_
#define ROCPRIM_INTRINSICS_WARP_SHUFFLE_HPP_
#include <type_traits>
#include "../config.hpp"
#include "thread.hpp"
/// \addtogroup warpmodule
/// @{
BEGIN_ROCPRIM_NAMESPACE
namespace
detail
{
#ifdef __HIP_CPU_RT__
// TODO: consider adding macro checks relaying to std::bit_cast when compiled
// using C++20.
/// Reinterprets the object representation of \p src as a value of type \p To by
/// copying its bytes. Only participates in overload resolution when both types
/// have the same size and are trivially copyable. In this header it is compiled
/// only when __HIP_CPU_RT__ is defined.
template<class To, class From>
std::enable_if_t<sizeof(To) == sizeof(From)
                     && std::is_trivially_copyable_v<From>
                     && std::is_trivially_copyable_v<To>,
                 To>
// constexpr support needs compiler magic
bit_cast(const From& src) noexcept
{
    To result;
    std::memcpy(&result, &src, sizeof(To));
    return result;
}
#endif
/// Applies \p op to \p input one <tt>int</tt>-sized word at a time and
/// reassembles the result.
///
/// Selected when \p T is trivially copyable and its size is a multiple of
/// <tt>sizeof(int)</tt>, so the value can be bit-cast to an array of ints.
///
/// \param input - value to split into words
/// \param op - callable taking and returning one <tt>int</tt> word
template<class T, class ShuffleOp>
ROCPRIM_DEVICE ROCPRIM_INLINE
typename std::enable_if<std::is_trivially_copyable<T>::value && (sizeof(T) % sizeof(int) == 0),
                        T>::type
warp_shuffle_op(const T& input, ShuffleOp&& op)
{
    constexpr int word_count = (sizeof(T) + sizeof(int) - 1) / sizeof(int);

    // Word-wise view of T.
    struct word_buffer
    {
        int words[word_count];
    };

#ifdef __HIP_CPU_RT__
    word_buffer buffer = bit_cast<word_buffer>(input);
#else
    word_buffer buffer = __builtin_bit_cast(word_buffer, input);
#endif

    ROCPRIM_UNROLL
    for(int w = 0; w < word_count; w++)
    {
        buffer.words[w] = op(buffer.words[w]);
    }

#ifdef __HIP_CPU_RT__
    return bit_cast<T>(buffer);
#else
    return __builtin_bit_cast(T, buffer);
#endif
}
/// Applies \p op to \p input one <tt>int</tt>-sized word at a time and
/// reassembles the result.
///
/// Fallback overload for types that are not trivially copyable or whose size is
/// not a multiple of <tt>sizeof(int)</tt>. The value is processed through byte
/// copies; the last chunk may be shorter than a full word.
///
/// \param input - value to split into words
/// \param op - callable taking and returning one <tt>int</tt> word
template<class T, class ShuffleOp>
ROCPRIM_DEVICE ROCPRIM_INLINE
typename std::enable_if<!(std::is_trivially_copyable<T>::value && (sizeof(T) % sizeof(int) == 0)),
                        T>::type
warp_shuffle_op(const T& input, ShuffleOp&& op)
{
    constexpr int words_no = (sizeof(T) + sizeof(int) - 1) / sizeof(int);

    T output;
    ROCPRIM_UNROLL
    for(int i = 0; i < words_no; i++)
    {
        // Number of bytes in this chunk; smaller than sizeof(int) only for the tail.
        const size_t s = std::min(sizeof(int), sizeof(T) - i * sizeof(int));
        // Zero-initialized so that a short tail chunk never leaves part of the
        // word indeterminate before it is read by op().
        int word = 0;
#ifdef __HIP_CPU_RT__
        std::memcpy(&word, reinterpret_cast<const char*>(&input) + i * sizeof(int), s);
#else
        __builtin_memcpy(&word, reinterpret_cast<const char*>(&input) + i * sizeof(int), s);
#endif
        word = op(word);
#ifdef __HIP_CPU_RT__
        std::memcpy(reinterpret_cast<char*>(&output) + i * sizeof(int), &word, s);
#else
        __builtin_memcpy(reinterpret_cast<char*>(&output) + i * sizeof(int), &word, s);
#endif
    }

    return output;
}
/// Moves \p input between lanes, word by word, with
/// <tt>__builtin_amdgcn_mov_dpp</tt> using the compile-time DPP controls given
/// as template parameters. Under the HIP CPU runtime each word is returned
/// unchanged.
template<class T, int dpp_ctrl, int row_mask = 0xf, int bank_mask = 0xf, bool bound_ctrl = false>
ROCPRIM_DEVICE ROCPRIM_INLINE
T warp_move_dpp(const T& input)
{
    const auto move_word = [=](int v) -> int
    {
        // TODO: clean-up, this function activates based ROCPRIM_DETAIL_USE_DPP, however inclusion and
        //       parsing of the template happens unconditionally. The condition causing compilation to
        //       fail is ordinary host-compilers looking at the headers. Non-hipcc compilers don't define
        //       __builtin_amdgcn_update_dpp, hence fail to parse the template altogether. (Except MSVC
        //       because even using /permissive- they somehow still do delayed parsing of the body of
        //       function templates, even though they pinky-swear they don't.)
#if defined(__HIP_CPU_RT__)
        return v;
#else
        return ::__builtin_amdgcn_mov_dpp(v, dpp_ctrl, row_mask, bank_mask, bound_ctrl);
#endif
    };
    return detail::warp_shuffle_op(input, move_word);
}
/// \brief Swizzle for any data type.
///
/// Each thread in the warp obtains \p input from the <tt>src_lane</tt>-th
/// thread in the warp, where <tt>src_lane</tt> is the current lane with
/// <tt>mask</tt> applied. Implemented word by word with
/// <tt>__builtin_amdgcn_ds_swizzle</tt>.
///
/// \param input - input to pass to other threads
template<class T, int mask>
ROCPRIM_DEVICE ROCPRIM_INLINE
T warp_swizzle(const T& input)
{
    const auto swizzle_word = [=](int v) -> int
    {
        return ::__builtin_amdgcn_ds_swizzle(v, mask);
    };
    return detail::warp_shuffle_op(input, swizzle_word);
}
}
// end namespace detail
/// \brief Shuffle for any data type.
///
/// Each thread in the warp obtains \p input from the <tt>src_lane</tt>-th thread
/// in the warp. If \p width is less than device_warp_size(), each subsection of
/// the warp behaves as a separate entity with a starting logical lane id of 0.
/// If \p src_lane is not in the [0; \p width) range, the returned value is
/// equal to the \p input passed by the <tt>src_lane modulo width</tt> thread.
///
/// Note: the optional \p width parameter must be a power of 2; results are
/// undefined if it is not a power of 2, or if it is greater than device_warp_size().
///
/// \param input - input to pass to other threads
/// \param src_lane - warp id of the thread whose \p input should be returned
/// \param width - logical warp width
template<class T>
ROCPRIM_DEVICE ROCPRIM_INLINE
T warp_shuffle(const T& input, const int src_lane, const int width = device_warp_size())
{
    const auto shuffle_word = [=](int v) -> int
    {
        return __shfl(v, src_lane, width);
    };
    return detail::warp_shuffle_op(input, shuffle_word);
}
/// \brief Shuffle up for any data type.
///
/// The <tt>i</tt>-th thread in the warp obtains \p input from the
/// <tt>i-delta</tt>-th thread in the warp. If <tt>i-delta</tt> is not in the
/// [0; \p width) range, the thread's own \p input is returned.
///
/// Note: the optional \p width parameter must be a power of 2; results are
/// undefined if it is not a power of 2, or if it is greater than device_warp_size().
///
/// \param input - input to pass to other threads
/// \param delta - offset for calculating source lane id
/// \param width - logical warp width
template<class T>
ROCPRIM_DEVICE ROCPRIM_INLINE
T warp_shuffle_up(const T& input, const unsigned int delta, const int width = device_warp_size())
{
    const auto shuffle_word = [=](int v) -> int
    {
        return __shfl_up(v, delta, width);
    };
    return detail::warp_shuffle_op(input, shuffle_word);
}
/// \brief Shuffle down for any data type.
///
/// The <tt>i</tt>-th thread in the warp obtains \p input from the
/// <tt>i+delta</tt>-th thread in the warp. If <tt>i+delta</tt> is not in the
/// [0; \p width) range, the thread's own \p input is returned.
///
/// Note: the optional \p width parameter must be a power of 2; results are
/// undefined if it is not a power of 2, or if it is greater than device_warp_size().
///
/// \param input - input to pass to other threads
/// \param delta - offset for calculating source lane id
/// \param width - logical warp width
template<class T>
ROCPRIM_DEVICE ROCPRIM_INLINE
T warp_shuffle_down(const T& input, const unsigned int delta, const int width = device_warp_size())
{
    const auto shuffle_word = [=](int v) -> int
    {
        return __shfl_down(v, delta, width);
    };
    return detail::warp_shuffle_op(input, shuffle_word);
}
/// \brief Shuffle XOR for any data type.
///
/// The <tt>i</tt>-th thread in the warp obtains \p input from the
/// <tt>i^lane_mask</tt>-th thread in the warp.
///
/// Note: the optional \p width parameter must be a power of 2; results are
/// undefined if it is not a power of 2, or if it is greater than device_warp_size().
///
/// \param input - input to pass to other threads
/// \param lane_mask - mask used for calculating source lane id
/// \param width - logical warp width
template<class T>
ROCPRIM_DEVICE ROCPRIM_INLINE
T warp_shuffle_xor(const T& input, const int lane_mask, const int width = device_warp_size())
{
    const auto shuffle_word = [=](int v) -> int
    {
        return __shfl_xor(v, lane_mask, width);
    };
    return detail::warp_shuffle_op(input, shuffle_word);
}
END_ROCPRIM_NAMESPACE
#endif // ROCPRIM_INTRINSICS_WARP_SHUFFLE_HPP_
/// @}
// end of group warpmodule
3rdparty/cub/rocprim/iterator.hpp
0 → 100644
View file @
f8a481f8
// Copyright (c) 2017-2021 Advanced Micro Devices, Inc. All rights reserved.
//
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to deal
// in the Software without restriction, including without limitation the rights
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
// copies of the Software, and to permit persons to whom the Software is
// furnished to do so, subject to the following conditions:
//
// The above copyright notice and this permission notice shall be included in
// all copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
// THE SOFTWARE.
#ifndef ROCPRIM_ITERATOR_HPP_
#define ROCPRIM_ITERATOR_HPP_
// Meta configuration for rocPRIM
#include "config.hpp"
#include "iterator/arg_index_iterator.hpp"
#include "iterator/constant_iterator.hpp"
#include "iterator/counting_iterator.hpp"
#include "iterator/discard_iterator.hpp"
#ifndef __HIP_CPU_RT__
#include "iterator/texture_cache_iterator.hpp"
#endif
#include "iterator/transform_iterator.hpp"
#include "iterator/zip_iterator.hpp"
#endif // ROCPRIM_ITERATOR_HPP_
3rdparty/cub/rocprim/iterator/arg_index_iterator.hpp
0 → 100644
View file @
f8a481f8
// Copyright (c) 2017-2021 Advanced Micro Devices, Inc. All rights reserved.
//
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to deal
// in the Software without restriction, including without limitation the rights
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
// copies of the Software, and to permit persons to whom the Software is
// furnished to do so, subject to the following conditions:
//
// The above copyright notice and this permission notice shall be included in
// all copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
// THE SOFTWARE.
#ifndef ROCPRIM_ITERATOR_ARG_INDEX_ITERATOR_HPP_
#define ROCPRIM_ITERATOR_ARG_INDEX_ITERATOR_HPP_
#include <iterator>
#include <iostream>
#include <cstddef>
#include <type_traits>
#include "../config.hpp"
#include "../types/key_value_pair.hpp"
/// \addtogroup iteratormodule
/// @{
BEGIN_ROCPRIM_NAMESPACE
/// \class arg_index_iterator
/// \brief A random-access input (read-only) iterator adaptor for pairing dereferenced values
/// with their indices.
///
/// \par Overview
/// * Dereferencing arg_index_iterator returns a value of \p key_value_pair<Difference, InputValueType>
/// type, which includes value from the underlying range and its index in that range.
/// * \p std::iterator_traits<InputIterator>::value_type should be convertible to \p InputValueType.
///
/// \tparam InputIterator - type of the underlying random-access input iterator. Must be
/// a random-access iterator.
/// \tparam Difference - type used to identify the distance between iterators and as the index type
/// in the output pair type (see \p value_type).
/// \tparam InputValueType - value type used in the output pair type (see \p value_type).
template<
    class InputIterator,
    class Difference     = std::ptrdiff_t,
    class InputValueType = typename std::iterator_traits<InputIterator>::value_type
>
class arg_index_iterator
{
private:
    using input_category = typename std::iterator_traits<InputIterator>::iterator_category;

public:
    /// The type of the value that can be obtained by dereferencing the iterator.
    using value_type = ::rocprim::key_value_pair<Difference, InputValueType>;
    /// \brief A reference type of the type iterated over (\p value_type).
    /// It's `const` since arg_index_iterator is a read-only iterator.
    using reference = const value_type&;
    /// \brief A pointer type of the type iterated over (\p value_type).
    /// It's `const` since arg_index_iterator is a read-only iterator.
    using pointer = const value_type*;
    /// A type used to identify the distance between iterators.
    using difference_type = Difference;
    /// The category of the iterator.
    using iterator_category = std::random_access_iterator_tag;

#ifndef DOXYGEN_SHOULD_SKIP_THIS
    using self_type = arg_index_iterator;
#endif

    static_assert(std::is_same<input_category, iterator_category>::value,
                  "InputIterator must be a random-access iterator");

    ROCPRIM_HOST_DEVICE inline
    ~arg_index_iterator() = default;

    /// \brief Creates a new arg_index_iterator.
    ///
    /// \param iterator input iterator pointing to the input range.
    /// \param offset index of the \p iterator in the input range.
    ROCPRIM_HOST_DEVICE inline
    arg_index_iterator(InputIterator iterator, difference_type offset = 0)
        : iterator_(iterator), offset_(offset)
    {
    }

    /// \brief Advances both the underlying iterator and the stored offset by one.
    ROCPRIM_HOST_DEVICE inline
    arg_index_iterator& operator++()
    {
        iterator_++;
        offset_++;
        return *this;
    }

    //! \skip_doxy_start
    // Post-increment: advances the iterator and returns a copy of its previous state.
    ROCPRIM_HOST_DEVICE inline
    arg_index_iterator operator++(int)
    {
        arg_index_iterator old_ai = *this;
        iterator_++;
        offset_++;
        return old_ai;
    }

    // Pre-decrement: moves both the underlying iterator and the offset back by one.
    // Provided because the iterator advertises std::random_access_iterator_tag.
    ROCPRIM_HOST_DEVICE inline
    arg_index_iterator& operator--()
    {
        iterator_--;
        offset_--;
        return *this;
    }

    // Post-decrement: steps back and returns a copy of the previous state.
    ROCPRIM_HOST_DEVICE inline
    arg_index_iterator operator--(int)
    {
        arg_index_iterator old_ai = *this;
        iterator_--;
        offset_--;
        return old_ai;
    }

    // Builds the result by value from the current offset and the dereferenced
    // underlying iterator; nothing is cached inside the iterator.
    ROCPRIM_HOST_DEVICE inline
    value_type operator*() const
    {
        value_type ret(offset_, *iterator_);
        return ret;
    }

    // NOTE(review): operator*() returns by value, so this takes the address of a
    // temporary. It presumably only compiles as long as it is never instantiated;
    // confirm before relying on it.
    ROCPRIM_HOST_DEVICE inline
    pointer operator->() const
    {
        return &(*(*this));
    }

    // Returns a copy moved forward by `distance` (iterator and offset alike).
    ROCPRIM_HOST_DEVICE inline
    arg_index_iterator operator+(difference_type distance) const
    {
        return arg_index_iterator(iterator_ + distance, offset_ + distance);
    }

    ROCPRIM_HOST_DEVICE inline
    arg_index_iterator& operator+=(difference_type distance)
    {
        iterator_ += distance;
        offset_ += distance;
        return *this;
    }

    // Returns a copy moved backward by `distance` (iterator and offset alike).
    ROCPRIM_HOST_DEVICE inline
    arg_index_iterator operator-(difference_type distance) const
    {
        return arg_index_iterator(iterator_ - distance, offset_ - distance);
    }

    ROCPRIM_HOST_DEVICE inline
    arg_index_iterator& operator-=(difference_type distance)
    {
        iterator_ -= distance;
        offset_ -= distance;
        return *this;
    }

    // Distance between the underlying iterators; the stored offsets are ignored.
    ROCPRIM_HOST_DEVICE inline
    difference_type operator-(arg_index_iterator other) const
    {
        return iterator_ - other.iterator_;
    }

    // Equivalent to *(*this + distance).
    ROCPRIM_HOST_DEVICE inline
    value_type operator[](difference_type distance) const
    {
        arg_index_iterator i = (*this) + distance;
        return *i;
    }

    // Equality requires both the underlying iterator and the offset to match.
    ROCPRIM_HOST_DEVICE inline
    bool operator==(arg_index_iterator other) const
    {
        return (iterator_ == other.iterator_) && (offset_ == other.offset_);
    }

    ROCPRIM_HOST_DEVICE inline
    bool operator!=(arg_index_iterator other) const
    {
        return (iterator_ != other.iterator_) || (offset_ != other.offset_);
    }

    // Ordering follows the underlying iterators: `a < b` holds when `a` precedes
    // `b`, i.e. when `a - b` is negative. (The previous implementation compared
    // the difference with the opposite sign and thus returned inverted results.)
    ROCPRIM_HOST_DEVICE inline
    bool operator<(arg_index_iterator other) const
    {
        return (iterator_ - other.iterator_) < 0;
    }

    ROCPRIM_HOST_DEVICE inline
    bool operator<=(arg_index_iterator other) const
    {
        return (iterator_ - other.iterator_) <= 0;
    }

    ROCPRIM_HOST_DEVICE inline
    bool operator>(arg_index_iterator other) const
    {
        return (iterator_ - other.iterator_) > 0;
    }

    ROCPRIM_HOST_DEVICE inline
    bool operator>=(arg_index_iterator other) const
    {
        return (iterator_ - other.iterator_) >= 0;
    }

    // Resets the stored offset to zero; the underlying iterator is untouched.
    ROCPRIM_HOST_DEVICE inline
    void normalize()
    {
        offset_ = 0;
    }

    // Stream insertion is accepted but intentionally prints nothing.
    friend std::ostream& operator<<(std::ostream& os, const arg_index_iterator& /* iter */)
    {
        return os;
    }
    //! \skip_doxy_end

private:
    InputIterator   iterator_; // current position in the input range
    difference_type offset_;   // index reported as the key of the dereferenced pair
};
// Commutative form of addition (distance + iterator); forwards to the member
// operator+ of arg_index_iterator.
template<
    class InputIterator,
    class Difference,
    class InputValueType
>
ROCPRIM_HOST_DEVICE inline
arg_index_iterator<InputIterator, Difference, InputValueType>
operator+(typename arg_index_iterator<InputIterator, Difference, InputValueType>::difference_type distance,
          const arg_index_iterator<InputIterator, Difference, InputValueType>& iterator)
{
    return iterator.operator+(distance);
}
/// Helper that builds an arg_index_iterator wrapping \p iterator, with \p offset
/// recorded as the position (index) of \p iterator in the input range.
///
/// \tparam InputIterator - type of the underlying random-access input iterator. Must be
/// a random-access iterator.
/// \tparam Difference - type used to identify the distance between iterators and as the index
/// type in the output pair type (see \p value_type in arg_index_iterator).
/// \tparam InputValueType - value type used in the output pair type (see \p value_type
/// in arg_index_iterator).
///
/// \param iterator input iterator pointing to the input range.
/// \param offset index of the \p iterator in the input range.
template<
    class InputIterator,
    class Difference     = std::ptrdiff_t,
    class InputValueType = typename std::iterator_traits<InputIterator>::value_type
>
ROCPRIM_HOST_DEVICE inline
arg_index_iterator<InputIterator, Difference, InputValueType>
make_arg_index_iterator(InputIterator iterator, Difference offset = 0)
{
    using result_type = arg_index_iterator<InputIterator, Difference, InputValueType>;
    return result_type(iterator, offset);
}
END_ROCPRIM_NAMESPACE
/// @}
// end of group iteratormodule
#endif // ROCPRIM_ITERATOR_ARG_INDEX_ITERATOR_HPP_
3rdparty/cub/rocprim/iterator/constant_iterator.hpp
0 → 100644
View file @
f8a481f8
// Copyright (c) 2017-2021 Advanced Micro Devices, Inc. All rights reserved.
//
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to deal
// in the Software without restriction, including without limitation the rights
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
// copies of the Software, and to permit persons to whom the Software is
// furnished to do so, subject to the following conditions:
//
// The above copyright notice and this permission notice shall be included in
// all copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
// THE SOFTWARE.
#ifndef ROCPRIM_ITERATOR_CONSTANT_ITERATOR_HPP_
#define ROCPRIM_ITERATOR_CONSTANT_ITERATOR_HPP_
#include <iterator>
#include <iostream>
#include <cstddef>
#include <type_traits>
#include "../config.hpp"
/// \addtogroup iteratormodule
/// @{
BEGIN_ROCPRIM_NAMESPACE
/// \class constant_iterator
/// \brief A random-access input (read-only) iterator which generates a sequence
/// of homogeneous values.
///
/// \par Overview
/// * A constant_iterator represents a pointer into a range of identical values.
/// * Using it to simulate a range filled with the same value saves
/// memory capacity and bandwidth.
///
/// \tparam ValueType - type of value that can be obtained by dereferencing the iterator.
/// \tparam Difference - a type used to identify the distance between iterators.
template<
    class ValueType,
    class Difference = std::ptrdiff_t
>
class constant_iterator
{
public:
    /// The type of the value that can be obtained by dereferencing the iterator.
    using value_type = typename std::remove_const<ValueType>::type;
    /// \brief A reference type of the type iterated over (\p value_type).
    /// It's the same as `value_type` since constant_iterator is a read-only
    /// iterator and does not have an underlying buffer.
    using reference = value_type; // constant_iterator is not writable
    /// \brief A pointer type of the type iterated over (\p value_type).
    /// It's `const` since constant_iterator is a read-only iterator.
    using pointer = const value_type*; // constant_iterator is not writable
    /// A type used to identify the distance between iterators.
    using difference_type = Difference;
    /// The category of the iterator.
    using iterator_category = std::random_access_iterator_tag;

#ifndef DOXYGEN_SHOULD_SKIP_THIS
    using self_type = constant_iterator;
#endif

    /// \brief Creates constant_iterator and sets its initial value to \p value.
    ///
    /// \param value initial value
    /// \param index optional index for constant_iterator
    ROCPRIM_HOST_DEVICE inline
    explicit constant_iterator(const value_type value, const size_t index = 0)
        : value_(value), index_(index)
    {
    }

    ROCPRIM_HOST_DEVICE inline
    ~constant_iterator() = default;

    //! \skip_doxy_start
    // Dereferencing always yields a copy of the stored value, whatever the index.
    ROCPRIM_HOST_DEVICE inline
    value_type operator*() const
    {
        return value_;
    }

    ROCPRIM_HOST_DEVICE inline
    pointer operator->() const
    {
        return &value_;
    }

    // Increment/decrement only move the index; the value never changes.
    ROCPRIM_HOST_DEVICE inline
    constant_iterator& operator++()
    {
        index_++;
        return *this;
    }

    ROCPRIM_HOST_DEVICE inline
    constant_iterator operator++(int)
    {
        constant_iterator old_ci = *this;
        index_++;
        return old_ci;
    }

    ROCPRIM_HOST_DEVICE inline
    constant_iterator& operator--()
    {
        index_--;
        return *this;
    }

    ROCPRIM_HOST_DEVICE inline
    constant_iterator operator--(int)
    {
        constant_iterator old_ci = *this;
        index_--;
        return old_ci;
    }

    ROCPRIM_HOST_DEVICE inline
    constant_iterator operator+(difference_type distance) const
    {
        return constant_iterator(value_, index_ + distance);
    }

    ROCPRIM_HOST_DEVICE inline
    constant_iterator& operator+=(difference_type distance)
    {
        index_ += distance;
        return *this;
    }

    ROCPRIM_HOST_DEVICE inline
    constant_iterator operator-(difference_type distance) const
    {
        return constant_iterator(value_, index_ - distance);
    }

    ROCPRIM_HOST_DEVICE inline
    constant_iterator& operator-=(difference_type distance)
    {
        index_ -= distance;
        return *this;
    }

    // Distance between two iterators, computed from their indices only.
    ROCPRIM_HOST_DEVICE inline
    difference_type operator-(constant_iterator other) const
    {
        return static_cast<difference_type>(index_ - other.index_);
    }
    //! \skip_doxy_end

    /// Constant_iterator is not writable, so we don't return a reference,
    /// just something convertible to a reference. That matches the requirements
    /// of the RandomAccessIterator concept.
    ROCPRIM_HOST_DEVICE inline
    value_type operator[](difference_type) const
    {
        return value_;
    }

    //! \skip_doxy_start
    // Two iterators are equal only if both the value and the index match.
    ROCPRIM_HOST_DEVICE inline
    bool operator==(constant_iterator other) const
    {
        return value_ == other.value_ && index_ == other.index_;
    }

    ROCPRIM_HOST_DEVICE inline
    bool operator!=(constant_iterator other) const
    {
        return !(*this == other);
    }

    // Relational operators order iterators by index: `a < b` holds when the
    // distance from `a` to `b` is positive.
    ROCPRIM_HOST_DEVICE inline
    bool operator<(constant_iterator other) const
    {
        return distance_to(other) > 0;
    }

    ROCPRIM_HOST_DEVICE inline
    bool operator<=(constant_iterator other) const
    {
        return distance_to(other) >= 0;
    }

    ROCPRIM_HOST_DEVICE inline
    bool operator>(constant_iterator other) const
    {
        return distance_to(other) < 0;
    }

    ROCPRIM_HOST_DEVICE inline
    bool operator>=(constant_iterator other) const
    {
        return distance_to(other) <= 0;
    }

    // Prints the stored value in square brackets; the index is not printed.
    friend std::ostream& operator<<(std::ostream& os, const constant_iterator& iter)
    {
        os << "[" << iter.value_ << "]";
        return os;
    }
    //! \skip_doxy_end

private:
    // Signed distance from *this to `other` (other.index_ - index_).
    // Marked ROCPRIM_HOST_DEVICE like the relational operators that call it, so
    // it is usable in every context they are.
    ROCPRIM_HOST_DEVICE inline
    difference_type distance_to(const constant_iterator& other) const
    {
        return difference_type(other.index_) - difference_type(index_);
    }

    value_type value_; // the value returned by every dereference
    size_t     index_; // position within the simulated range
};
// Commutative form of addition (distance + iterator); forwards to the member
// operator+ of constant_iterator.
template<
    class ValueType,
    class Difference
>
ROCPRIM_HOST_DEVICE inline
constant_iterator<ValueType, Difference>
operator+(typename constant_iterator<ValueType, Difference>::difference_type distance,
          const constant_iterator<ValueType, Difference>& iter)
{
    return iter.operator+(distance);
}
/// Helper that builds a constant_iterator whose stored value is \p value.
///
/// \tparam ValueType - type of value that can be obtained by dereferencing created iterator.
/// \tparam Difference - a type used to identify the distance between constant_iterator iterators.
///
/// \param value - initial value for constant_iterator.
/// \param index - optional index for constant_iterator.
template<
    class ValueType,
    class Difference = std::ptrdiff_t
>
ROCPRIM_HOST_DEVICE inline
constant_iterator<ValueType, Difference>
make_constant_iterator(ValueType value, size_t index = 0)
{
    using result_type = constant_iterator<ValueType, Difference>;
    return result_type(value, index);
}
END_ROCPRIM_NAMESPACE
/// @}
// end of group iteratormodule
#endif // ROCPRIM_ITERATOR_CONSTANT_ITERATOR_HPP_
3rdparty/cub/rocprim/iterator/counting_iterator.hpp
0 → 100644
View file @
f8a481f8
// Copyright (c) 2017-2021 Advanced Micro Devices, Inc. All rights reserved.
//
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to deal
// in the Software without restriction, including without limitation the rights
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
// copies of the Software, and to permit persons to whom the Software is
// furnished to do so, subject to the following conditions:
//
// The above copyright notice and this permission notice shall be included in
// all copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
// THE SOFTWARE.
#ifndef ROCPRIM_ITERATOR_COUNTING_ITERATOR_HPP_
#define ROCPRIM_ITERATOR_COUNTING_ITERATOR_HPP_
#include <iterator>
#include <iostream>
#include <cstddef>
#include <type_traits>
#include "../config.hpp"
#include "../type_traits.hpp"
/// \addtogroup iteratormodule
/// @{
BEGIN_ROCPRIM_NAMESPACE
/// \class counting_iterator
/// \brief A random-access input (read-only) iterator over a sequence of consecutive integer values.
///
/// \par Overview
/// * A counting_iterator represents a pointer into a range of sequentially increasing values.
/// * Using it to simulate a range filled with a sequence of consecutive values saves
/// memory capacity and bandwidth.
///
/// \tparam Incrementable - type of value that can be obtained by dereferencing the iterator.
/// Must be an integral type.
/// \tparam Difference - a type used to identify the distance between iterators.
template<
    class Incrementable,
    class Difference = std::ptrdiff_t
>
class counting_iterator
{
public:
    /// The type of the value that can be obtained by dereferencing the iterator.
    using value_type = typename std::remove_const<Incrementable>::type;
    /// \brief A reference type of the type iterated over (\p value_type).
    /// It's the same as `value_type` since counting_iterator is a read-only
    /// iterator and does not have an underlying buffer.
    using reference = value_type; // counting_iterator is not writable
    /// \brief A pointer type of the type iterated over (\p value_type).
    /// It's `const` since counting_iterator is a read-only iterator.
    using pointer = const value_type*; // counting_iterator is not writable
    /// A type used to identify the distance between iterators.
    using difference_type = Difference;
    /// The category of the iterator.
    using iterator_category = std::random_access_iterator_tag;

    static_assert(std::is_integral<value_type>::value, "Incrementable must be integral type");

#ifndef DOXYGEN_SHOULD_SKIP_THIS
    using self_type = counting_iterator;
#endif

    /// \brief Creates counting_iterator without an explicit initial value.
    ///
    /// The constructor is defaulted, so the stored value is zero only when the
    /// iterator is value-initialized (e.g. `counting_iterator<int>{}`).
    ROCPRIM_HOST_DEVICE inline
    counting_iterator() = default;

    ROCPRIM_HOST_DEVICE inline
    ~counting_iterator() = default;

    /// \brief Creates counting_iterator and sets its initial value to \p value.
    ///
    /// \param value initial value
    ROCPRIM_HOST_DEVICE inline
    explicit counting_iterator(const value_type value)
        : value_(value)
    {
    }

    //! \skip_doxy_start
    // Moving the iterator changes the value it yields: the position IS the value.
    ROCPRIM_HOST_DEVICE inline
    counting_iterator& operator++()
    {
        value_++;
        return *this;
    }

    ROCPRIM_HOST_DEVICE inline
    counting_iterator operator++(int)
    {
        counting_iterator old_ci = *this;
        value_++;
        return old_ci;
    }

    ROCPRIM_HOST_DEVICE inline
    counting_iterator& operator--()
    {
        value_--;
        return *this;
    }

    ROCPRIM_HOST_DEVICE inline
    counting_iterator operator--(int)
    {
        counting_iterator old_ci = *this;
        value_--;
        return old_ci;
    }

    ROCPRIM_HOST_DEVICE inline
    value_type operator*() const
    {
        return value_;
    }

    ROCPRIM_HOST_DEVICE inline
    pointer operator->() const
    {
        return &value_;
    }

    // Arithmetic converts the distance to value_type before applying it.
    ROCPRIM_HOST_DEVICE inline
    counting_iterator operator+(difference_type distance) const
    {
        return counting_iterator(value_ + static_cast<value_type>(distance));
    }

    ROCPRIM_HOST_DEVICE inline
    counting_iterator& operator+=(difference_type distance)
    {
        value_ += static_cast<value_type>(distance);
        return *this;
    }

    ROCPRIM_HOST_DEVICE inline
    counting_iterator operator-(difference_type distance) const
    {
        return counting_iterator(value_ - static_cast<value_type>(distance));
    }

    ROCPRIM_HOST_DEVICE inline
    counting_iterator& operator-=(difference_type distance)
    {
        value_ -= static_cast<value_type>(distance);
        return *this;
    }

    // Distance between two iterators: the difference of their values.
    ROCPRIM_HOST_DEVICE inline
    difference_type operator-(counting_iterator other) const
    {
        return static_cast<difference_type>(value_ - other.value_);
    }

    // counting_iterator is not writable, so we don't return reference,
    // just something convertible to reference. That matches requirement
    // of RandomAccessIterator concept
    ROCPRIM_HOST_DEVICE inline
    value_type operator[](difference_type distance) const
    {
        return value_ + static_cast<value_type>(distance);
    }

    ROCPRIM_HOST_DEVICE inline
    bool operator==(counting_iterator other) const
    {
        return this->equal_value(value_, other.value_);
    }

    ROCPRIM_HOST_DEVICE inline
    bool operator!=(counting_iterator other) const
    {
        return !(*this == other);
    }

    // Relational operators order iterators by value: `a < b` holds when the
    // distance from `a` to `b` is positive.
    ROCPRIM_HOST_DEVICE inline
    bool operator<(counting_iterator other) const
    {
        return distance_to(other) > 0;
    }

    ROCPRIM_HOST_DEVICE inline
    bool operator<=(counting_iterator other) const
    {
        return distance_to(other) >= 0;
    }

    ROCPRIM_HOST_DEVICE inline
    bool operator>(counting_iterator other) const
    {
        return distance_to(other) < 0;
    }

    ROCPRIM_HOST_DEVICE inline
    bool operator>=(counting_iterator other) const
    {
        return distance_to(other) <= 0;
    }

    // Prints the current value in square brackets.
    friend std::ostream& operator<<(std::ostream& os, const counting_iterator& iter)
    {
        os << "[" << iter.value_ << "]";
        return os;
    }
    //! \skip_doxy_end

private:
    // The private helpers below carry ROCPRIM_HOST_DEVICE like the public
    // operators that call them, so they are usable in every context those are.

    // Plain equality of two values of the same type.
    template<class T>
    ROCPRIM_HOST_DEVICE inline
    bool equal_value(const T& x, const T& y) const
    {
        return (x == y);
    }

    // Signed distance from *this to `other` (other.value_ - value_), computed
    // after converting both values to difference_type.
    ROCPRIM_HOST_DEVICE inline
    difference_type distance_to(const counting_iterator& other) const
    {
        return difference_type(other.value_) - difference_type(value_);
    }

    value_type value_; // current value, which also acts as the position
};
// Commutative form of addition (distance + iterator); forwards to the member
// operator+ of counting_iterator.
template<
    class Incrementable,
    class Difference
>
ROCPRIM_HOST_DEVICE inline
counting_iterator<Incrementable, Difference>
operator+(typename counting_iterator<Incrementable, Difference>::difference_type distance,
          const counting_iterator<Incrementable, Difference>& iter)
{
    return iter.operator+(distance);
}
/// Helper that builds a counting_iterator whose initial value is \p value.
///
/// \tparam Incrementable - type of value that can be obtained by dereferencing created iterator.
/// \tparam Difference - a type used to identify the distance between counting_iterator iterators.
///
/// \param value - initial value for counting_iterator.
template<
    class Incrementable,
    class Difference = std::ptrdiff_t
>
ROCPRIM_HOST_DEVICE inline
counting_iterator<Incrementable, Difference>
make_counting_iterator(Incrementable value)
{
    using result_type = counting_iterator<Incrementable, Difference>;
    return result_type(value);
}
END_ROCPRIM_NAMESPACE
/// @}
// end of group iteratormodule
#endif // ROCPRIM_ITERATOR_COUNTING_ITERATOR_HPP_
3rdparty/cub/rocprim/iterator/detail/replace_first_iterator.hpp
0 → 100644
View file @
f8a481f8
// Copyright (c) 2017-2021 Advanced Micro Devices, Inc. All rights reserved.
//
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to deal
// in the Software without restriction, including without limitation the rights
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
// copies of the Software, and to permit persons to whom the Software is
// furnished to do so, subject to the following conditions:
//
// The above copyright notice and this permission notice shall be included in
// all copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
// THE SOFTWARE.
#ifndef ROCPRIM_ITERATOR_REPLACE_FIRST_ITERATOR_HPP_
#define ROCPRIM_ITERATOR_REPLACE_FIRST_ITERATOR_HPP_
#include <iterator>
#include <cstddef>
#include <type_traits>
#include "../../config.hpp"
BEGIN_ROCPRIM_NAMESPACE
namespace
detail
{
// Read-only iterator adaptor that presents a given value in place of the first
// element of a range. Used in exclusive scan-by-key and exclusive segmented scan
// to avoid allocating additional memory and/or running additional kernels.
//
// Important: while the internal index is 0 the underlying iterator is never
// dereferenced, so it does not matter if it is an invalid pointer at that position.
//
// Usage:
//  * input - start of your input range
//  * value - value that should be used as first element of new range.
//
//  replace_first_iterator<InputIterator>(input - 1, value);
//
//  (input - 1) will never be dereferenced.
template<class InputIterator>
class replace_first_iterator
{
private:
    using input_category = typename std::iterator_traits<InputIterator>::iterator_category;

    static_assert(
        std::is_same<input_category, std::random_access_iterator_tag>::value,
        "InputIterator must be a random-access iterator"
    );

public:
    using value_type        = typename std::iterator_traits<InputIterator>::value_type;
    using reference         = value_type;
    using pointer           = const value_type*;
    using difference_type   = typename std::iterator_traits<InputIterator>::difference_type;
    using iterator_category = std::random_access_iterator_tag;

    ROCPRIM_HOST_DEVICE inline
    ~replace_first_iterator() = default;

    // iterator - underlying iterator; value - substitute returned while the
    // index is 0; index - starting position of this adaptor (defaults to 0).
    ROCPRIM_HOST_DEVICE inline
    replace_first_iterator(InputIterator iterator, value_type value, size_t index = 0)
        : iterator_(iterator), value_(value), index_(index)
    {
    }

    // Pre-increment: steps the underlying iterator and the index together.
    ROCPRIM_HOST_DEVICE inline
    replace_first_iterator& operator++()
    {
        iterator_++;
        index_++;
        return *this;
    }

    // Post-increment: returns the state from before the step.
    ROCPRIM_HOST_DEVICE inline
    replace_first_iterator operator++(int)
    {
        replace_first_iterator snapshot(*this);
        ++(*this);
        return snapshot;
    }

    // Any position other than index 0 reads through the underlying iterator;
    // index 0 yields the substitute value without touching the iterator.
    ROCPRIM_HOST_DEVICE inline
    value_type operator*() const
    {
        if(index_ != 0)
        {
            return *iterator_;
        }
        return value_;
    }

    // Element access relative to the current position.
    ROCPRIM_HOST_DEVICE inline
    value_type operator[](difference_type distance) const
    {
        return *((*this) + distance);
    }

    // Returns a copy advanced by `distance`; the substitute value is carried along.
    ROCPRIM_HOST_DEVICE inline
    replace_first_iterator operator+(difference_type distance) const
    {
        replace_first_iterator shifted(iterator_ + distance, value_, index_ + distance);
        return shifted;
    }

    ROCPRIM_HOST_DEVICE inline
    replace_first_iterator& operator+=(difference_type distance)
    {
        iterator_ += distance;
        index_ += distance;
        return *this;
    }

private:
    InputIterator iterator_; // underlying position
    value_type    value_;    // value reported at index 0
    size_t        index_;    // position of this adaptor; 0 selects value_
};
}
// end of detail namespace
END_ROCPRIM_NAMESPACE
#endif // ROCPRIM_ITERATOR_REPLACE_FIRST_ITERATOR_HPP_
Prev
1
…
3
4
5
6
7
8
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment