Commit f8a481f8 authored by zhouxiang

Add the cub header files from dtk

parent 7b7c64c5
/******************************************************************************
* Copyright (c) 2011-2021, NVIDIA CORPORATION. All rights reserved.
* Modifications Copyright (c) 2022, Advanced Micro Devices, Inc. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of the NVIDIA CORPORATION nor the
* names of its contributors may be used to endorse or promote products
* derived from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
* DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
******************************************************************************/
#ifndef HIPCUB_ROCPRIM_DEVICE_DEVICE_ADJACENT_DIFFERENCE_HPP_
#define HIPCUB_ROCPRIM_DEVICE_DEVICE_ADJACENT_DIFFERENCE_HPP_
#include "../config.hpp"
#include <cub/thread/thread_operators.cuh>
#include <cub/rocprim/device/device_adjacent_difference.hpp>
BEGIN_HIPCUB_NAMESPACE
struct DeviceAdjacentDifference
{
template <typename InputIteratorT,
typename OutputIteratorT,
typename DifferenceOpT = ::cub::Difference>
static HIPCUB_RUNTIME_FUNCTION cudaError_t
SubtractLeftCopy(void *d_temp_storage,
std::size_t &temp_storage_bytes,
InputIteratorT d_input,
OutputIteratorT d_output,
std::size_t num_items,
DifferenceOpT difference_op = {},
cudaStream_t stream = 0,
bool debug_synchronous = false)
{
return (cudaError_t)::rocprim::adjacent_difference(
d_temp_storage, temp_storage_bytes, d_input, d_output,
num_items, difference_op, stream, debug_synchronous
);
}
template <typename RandomAccessIteratorT,
typename DifferenceOpT = ::cub::Difference>
static HIPCUB_RUNTIME_FUNCTION cudaError_t
SubtractLeft(void *d_temp_storage,
std::size_t &temp_storage_bytes,
RandomAccessIteratorT d_input,
std::size_t num_items,
DifferenceOpT difference_op = {},
cudaStream_t stream = 0,
bool debug_synchronous = false)
{
return (cudaError_t)::rocprim::adjacent_difference_inplace(
d_temp_storage, temp_storage_bytes, d_input,
num_items, difference_op, stream, debug_synchronous
);
}
template <typename InputIteratorT,
typename OutputIteratorT,
typename DifferenceOpT = ::cub::Difference>
static HIPCUB_RUNTIME_FUNCTION cudaError_t
SubtractRightCopy(void *d_temp_storage,
std::size_t &temp_storage_bytes,
InputIteratorT d_input,
OutputIteratorT d_output,
std::size_t num_items,
DifferenceOpT difference_op = {},
cudaStream_t stream = 0,
bool debug_synchronous = false)
{
return (cudaError_t)::rocprim::adjacent_difference_right(
d_temp_storage, temp_storage_bytes, d_input, d_output,
num_items, difference_op, stream, debug_synchronous
);
}
template <typename RandomAccessIteratorT,
typename DifferenceOpT = ::cub::Difference>
static HIPCUB_RUNTIME_FUNCTION cudaError_t
SubtractRight(void *d_temp_storage,
std::size_t &temp_storage_bytes,
RandomAccessIteratorT d_input,
std::size_t num_items,
DifferenceOpT difference_op = {},
cudaStream_t stream = 0,
bool debug_synchronous = false)
{
return (cudaError_t)::rocprim::adjacent_difference_right_inplace(
d_temp_storage, temp_storage_bytes, d_input,
num_items, difference_op, stream, debug_synchronous
);
}
};
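// ---------------------------------------------------------------------------
// Usage sketch (an illustrative addition, not part of the upstream header).
// Every device-wide call here follows the usual two-pass pattern: a first
// call with d_temp_storage == nullptr only writes the required size into
// temp_storage_bytes; the caller allocates that much device memory and
// repeats the call to do the work. The snippet assumes the CUDA runtime
// allocation API (consistent with the cudaError_t/cudaStream_t signatures
// above) and that these headers open namespace cub, as the ::cub::
// qualifications elsewhere in this commit suggest.
//
// \code
// int *d_in, *d_out;   // device arrays of n ints, assumed already allocated
// int  n = 1024;
// std::size_t temp_bytes = 0;
// void       *d_temp     = nullptr;
// // Pass 1: query temporary storage size only.
// cub::DeviceAdjacentDifference::SubtractLeftCopy(
//     d_temp, temp_bytes, d_in, d_out, n);
// cudaMalloc(&d_temp, temp_bytes);
// // Pass 2: d_out[0] = d_in[0]; d_out[i] = d_in[i] - d_in[i-1] for i > 0.
// cub::DeviceAdjacentDifference::SubtractLeftCopy(
//     d_temp, temp_bytes, d_in, d_out, n);
// cudaFree(d_temp);
// \endcode
// ---------------------------------------------------------------------------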
END_HIPCUB_NAMESPACE
#endif // HIPCUB_ROCPRIM_DEVICE_DEVICE_ADJACENT_DIFFERENCE_HPP_
/******************************************************************************
* Copyright (c) 2010-2011, Duane Merrill. All rights reserved.
* Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved.
* Modifications Copyright (c) 2017-2020, Advanced Micro Devices, Inc. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of the NVIDIA CORPORATION nor the
* names of its contributors may be used to endorse or promote products
* derived from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
* DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
******************************************************************************/
#ifndef HIPCUB_ROCPRIM_DEVICE_DEVICE_HISTOGRAM_HPP_
#define HIPCUB_ROCPRIM_DEVICE_DEVICE_HISTOGRAM_HPP_
#include "../config.hpp"
#include "../util_type.cuh"
#include <cub/rocprim/device/device_histogram.hpp>
BEGIN_HIPCUB_NAMESPACE
struct DeviceHistogram
{
template<
typename SampleIteratorT,
typename CounterT,
typename LevelT,
typename OffsetT
>
HIPCUB_RUNTIME_FUNCTION static
cudaError_t HistogramEven(void * d_temp_storage,
size_t& temp_storage_bytes,
SampleIteratorT d_samples,
CounterT * d_histogram,
int num_levels,
LevelT lower_level,
LevelT upper_level,
OffsetT num_samples,
cudaStream_t stream = 0,
bool debug_synchronous = false)
{
return (cudaError_t)::rocprim::histogram_even(
d_temp_storage, temp_storage_bytes,
d_samples, num_samples,
d_histogram,
num_levels, lower_level, upper_level,
stream, debug_synchronous
);
}
template<
typename SampleIteratorT,
typename CounterT,
typename LevelT,
typename OffsetT
>
HIPCUB_RUNTIME_FUNCTION static
cudaError_t HistogramEven(void * d_temp_storage,
size_t& temp_storage_bytes,
SampleIteratorT d_samples,
CounterT * d_histogram,
int num_levels,
LevelT lower_level,
LevelT upper_level,
OffsetT num_row_samples,
OffsetT num_rows,
size_t row_stride_bytes,
cudaStream_t stream = 0,
bool debug_synchronous = false)
{
return (cudaError_t)::rocprim::histogram_even(
d_temp_storage, temp_storage_bytes,
d_samples, num_row_samples, num_rows, row_stride_bytes,
d_histogram,
num_levels, lower_level, upper_level,
stream, debug_synchronous
);
}
template<
int NUM_CHANNELS,
int NUM_ACTIVE_CHANNELS,
typename SampleIteratorT,
typename CounterT,
typename LevelT,
typename OffsetT
>
HIPCUB_RUNTIME_FUNCTION static
cudaError_t MultiHistogramEven(void * d_temp_storage,
size_t& temp_storage_bytes,
SampleIteratorT d_samples,
CounterT * d_histogram[NUM_ACTIVE_CHANNELS],
int num_levels[NUM_ACTIVE_CHANNELS],
LevelT lower_level[NUM_ACTIVE_CHANNELS],
LevelT upper_level[NUM_ACTIVE_CHANNELS],
OffsetT num_pixels,
cudaStream_t stream = 0,
bool debug_synchronous = false)
{
// rocprim takes the level counts as unsigned int, so widen CUB's int interface.
unsigned int levels[NUM_ACTIVE_CHANNELS];
for(unsigned int channel = 0; channel < NUM_ACTIVE_CHANNELS; channel++)
{
levels[channel] = num_levels[channel];
}
return (cudaError_t)::rocprim::multi_histogram_even<NUM_CHANNELS, NUM_ACTIVE_CHANNELS>(
d_temp_storage, temp_storage_bytes,
d_samples, num_pixels,
d_histogram,
levels, lower_level, upper_level,
stream, debug_synchronous
);
}
template<
int NUM_CHANNELS,
int NUM_ACTIVE_CHANNELS,
typename SampleIteratorT,
typename CounterT,
typename LevelT,
typename OffsetT
>
HIPCUB_RUNTIME_FUNCTION static
cudaError_t MultiHistogramEven(void * d_temp_storage,
size_t& temp_storage_bytes,
SampleIteratorT d_samples,
CounterT * d_histogram[NUM_ACTIVE_CHANNELS],
int num_levels[NUM_ACTIVE_CHANNELS],
LevelT lower_level[NUM_ACTIVE_CHANNELS],
LevelT upper_level[NUM_ACTIVE_CHANNELS],
OffsetT num_row_pixels,
OffsetT num_rows,
size_t row_stride_bytes,
cudaStream_t stream = 0,
bool debug_synchronous = false)
{
unsigned int levels[NUM_ACTIVE_CHANNELS];
for(unsigned int channel = 0; channel < NUM_ACTIVE_CHANNELS; channel++)
{
levels[channel] = num_levels[channel];
}
return (cudaError_t)::rocprim::multi_histogram_even<NUM_CHANNELS, NUM_ACTIVE_CHANNELS>(
d_temp_storage, temp_storage_bytes,
d_samples, num_row_pixels, num_rows, row_stride_bytes,
d_histogram,
levels, lower_level, upper_level,
stream, debug_synchronous
);
}
template<
typename SampleIteratorT,
typename CounterT,
typename LevelT,
typename OffsetT
>
HIPCUB_RUNTIME_FUNCTION static
cudaError_t HistogramRange(void * d_temp_storage,
size_t& temp_storage_bytes,
SampleIteratorT d_samples,
CounterT * d_histogram,
int num_levels,
LevelT * d_levels,
OffsetT num_samples,
cudaStream_t stream = 0,
bool debug_synchronous = false)
{
return (cudaError_t)::rocprim::histogram_range(
d_temp_storage, temp_storage_bytes,
d_samples, num_samples,
d_histogram,
num_levels, d_levels,
stream, debug_synchronous
);
}
template<
typename SampleIteratorT,
typename CounterT,
typename LevelT,
typename OffsetT
>
HIPCUB_RUNTIME_FUNCTION static
cudaError_t HistogramRange(void * d_temp_storage,
size_t& temp_storage_bytes,
SampleIteratorT d_samples,
CounterT * d_histogram,
int num_levels,
LevelT * d_levels,
OffsetT num_row_samples,
OffsetT num_rows,
size_t row_stride_bytes,
cudaStream_t stream = 0,
bool debug_synchronous = false)
{
return (cudaError_t)::rocprim::histogram_range(
d_temp_storage, temp_storage_bytes,
d_samples, num_row_samples, num_rows, row_stride_bytes,
d_histogram,
num_levels, d_levels,
stream, debug_synchronous
);
}
template<
int NUM_CHANNELS,
int NUM_ACTIVE_CHANNELS,
typename SampleIteratorT,
typename CounterT,
typename LevelT,
typename OffsetT
>
HIPCUB_RUNTIME_FUNCTION static
cudaError_t MultiHistogramRange(void * d_temp_storage,
size_t& temp_storage_bytes,
SampleIteratorT d_samples,
CounterT * d_histogram[NUM_ACTIVE_CHANNELS],
int num_levels[NUM_ACTIVE_CHANNELS],
LevelT * d_levels[NUM_ACTIVE_CHANNELS],
OffsetT num_pixels,
cudaStream_t stream = 0,
bool debug_synchronous = false)
{
unsigned int levels[NUM_ACTIVE_CHANNELS];
for(unsigned int channel = 0; channel < NUM_ACTIVE_CHANNELS; channel++)
{
levels[channel] = num_levels[channel];
}
return (cudaError_t)::rocprim::multi_histogram_range<NUM_CHANNELS, NUM_ACTIVE_CHANNELS>(
d_temp_storage, temp_storage_bytes,
d_samples, num_pixels,
d_histogram,
levels, d_levels,
stream, debug_synchronous
);
}
template<
int NUM_CHANNELS,
int NUM_ACTIVE_CHANNELS,
typename SampleIteratorT,
typename CounterT,
typename LevelT,
typename OffsetT
>
HIPCUB_RUNTIME_FUNCTION static
cudaError_t MultiHistogramRange(void * d_temp_storage,
size_t& temp_storage_bytes,
SampleIteratorT d_samples,
CounterT * d_histogram[NUM_ACTIVE_CHANNELS],
int num_levels[NUM_ACTIVE_CHANNELS],
LevelT * d_levels[NUM_ACTIVE_CHANNELS],
OffsetT num_row_pixels,
OffsetT num_rows,
size_t row_stride_bytes,
cudaStream_t stream = 0,
bool debug_synchronous = false)
{
unsigned int levels[NUM_ACTIVE_CHANNELS];
for(unsigned int channel = 0; channel < NUM_ACTIVE_CHANNELS; channel++)
{
levels[channel] = num_levels[channel];
}
return (cudaError_t)::rocprim::multi_histogram_range<NUM_CHANNELS, NUM_ACTIVE_CHANNELS>(
d_temp_storage, temp_storage_bytes,
d_samples, num_row_pixels, num_rows, row_stride_bytes,
d_histogram,
levels, d_levels,
stream, debug_synchronous
);
}
};
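// ---------------------------------------------------------------------------
// Usage sketch (illustrative; same two-pass convention and namespace
// assumption as the adjacent-difference example above). HistogramEven with
// num_levels = N produces N - 1 equal-width bins over
// [lower_level, upper_level); the bin counters are written, not accumulated.
//
// \code
// float *d_samples;  // n samples on the device, assumed to lie in [0, 1)
// int   *d_hist;     // 6 bin counters on the device
// int    n = 1024;
// std::size_t temp_bytes = 0;
// void       *d_temp     = nullptr;
// cub::DeviceHistogram::HistogramEven(
//     d_temp, temp_bytes, d_samples, d_hist,
//     7, 0.0f, 1.0f, n);                       // 7 levels -> 6 bins
// cudaMalloc(&d_temp, temp_bytes);
// cub::DeviceHistogram::HistogramEven(
//     d_temp, temp_bytes, d_samples, d_hist,
//     7, 0.0f, 1.0f, n);
// cudaFree(d_temp);
// \endcode
// ---------------------------------------------------------------------------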
END_HIPCUB_NAMESPACE
#endif // HIPCUB_ROCPRIM_DEVICE_DEVICE_HISTOGRAM_HPP_
/******************************************************************************
* Copyright (c) 2010-2011, Duane Merrill. All rights reserved.
* Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved.
* Modifications Copyright (c) 2017-2021, Advanced Micro Devices, Inc. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of the NVIDIA CORPORATION nor the
* names of its contributors may be used to endorse or promote products
* derived from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
* DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
******************************************************************************/
#ifndef HIPCUB_ROCPRIM_DEVICE_DEVICE_MERGE_SORT_HPP_
#define HIPCUB_ROCPRIM_DEVICE_DEVICE_MERGE_SORT_HPP_
#include "../config.hpp"
#include "../util_type.cuh"
#include <cub/rocprim/device/device_merge_sort.hpp>
BEGIN_HIPCUB_NAMESPACE
struct DeviceMergeSort
{
template<typename KeyIteratorT, typename ValueIteratorT, typename OffsetT, typename CompareOpT>
HIPCUB_RUNTIME_FUNCTION static cudaError_t SortPairs(void * d_temp_storage,
std::size_t & temp_storage_bytes,
KeyIteratorT d_keys,
ValueIteratorT d_items,
OffsetT num_items,
CompareOpT compare_op,
cudaStream_t stream = 0,
bool debug_synchronous = false)
{
// rocprim::merge_sort takes separate input and output iterators; passing the
// same iterators for both performs the sort in place.
return (cudaError_t)::rocprim::merge_sort(d_temp_storage,
temp_storage_bytes,
d_keys,
d_keys,
d_items,
d_items,
num_items,
compare_op,
stream,
debug_synchronous);
}
template<typename KeyInputIteratorT,
typename ValueInputIteratorT,
typename KeyIteratorT,
typename ValueIteratorT,
typename OffsetT,
typename CompareOpT>
HIPCUB_RUNTIME_FUNCTION static cudaError_t SortPairsCopy(void * d_temp_storage,
std::size_t & temp_storage_bytes,
KeyInputIteratorT d_input_keys,
ValueInputIteratorT d_input_items,
KeyIteratorT d_output_keys,
ValueIteratorT d_output_items,
OffsetT num_items,
CompareOpT compare_op,
cudaStream_t stream = 0,
bool debug_synchronous = false)
{
return (cudaError_t)::rocprim::merge_sort(d_temp_storage,
temp_storage_bytes,
d_input_keys,
d_output_keys,
d_input_items,
d_output_items,
num_items,
compare_op,
stream,
debug_synchronous);
}
template<typename KeyIteratorT, typename OffsetT, typename CompareOpT>
HIPCUB_RUNTIME_FUNCTION static cudaError_t SortKeys(void * d_temp_storage,
std::size_t & temp_storage_bytes,
KeyIteratorT d_keys,
OffsetT num_items,
CompareOpT compare_op,
cudaStream_t stream = 0,
bool debug_synchronous = false)
{
return (cudaError_t)::rocprim::merge_sort(
d_temp_storage, temp_storage_bytes,
d_keys, d_keys, num_items,
compare_op, stream, debug_synchronous
);
}
template<typename KeyInputIteratorT,
typename KeyIteratorT,
typename OffsetT,
typename CompareOpT>
HIPCUB_RUNTIME_FUNCTION static cudaError_t SortKeysCopy(void * d_temp_storage,
std::size_t & temp_storage_bytes,
KeyInputIteratorT d_input_keys,
KeyIteratorT d_output_keys,
OffsetT num_items,
CompareOpT compare_op,
cudaStream_t stream = 0,
bool debug_synchronous = false)
{
return (cudaError_t)::rocprim::merge_sort(
d_temp_storage, temp_storage_bytes,
d_input_keys, d_output_keys, num_items,
compare_op, stream, debug_synchronous
);
}
template <typename KeyIteratorT,
typename ValueIteratorT,
typename OffsetT,
typename CompareOpT>
HIPCUB_RUNTIME_FUNCTION static cudaError_t
StableSortPairs(void *d_temp_storage,
std::size_t &temp_storage_bytes,
KeyIteratorT d_keys,
ValueIteratorT d_items,
OffsetT num_items,
CompareOpT compare_op,
cudaStream_t stream = 0,
bool debug_synchronous = false)
{
// rocprim::merge_sort is stable, so the stable variants forward to the same
// call as SortPairs/SortKeys.
return (cudaError_t)::rocprim::merge_sort(d_temp_storage,
temp_storage_bytes,
d_keys,
d_keys,
d_items,
d_items,
num_items,
compare_op,
stream,
debug_synchronous);
}
template<typename KeyIteratorT, typename OffsetT, typename CompareOpT>
HIPCUB_RUNTIME_FUNCTION static cudaError_t StableSortKeys(void * d_temp_storage,
std::size_t & temp_storage_bytes,
KeyIteratorT d_keys,
OffsetT num_items,
CompareOpT compare_op,
cudaStream_t stream = 0,
bool debug_synchronous = false)
{
return (cudaError_t)::rocprim::merge_sort(
d_temp_storage, temp_storage_bytes,
d_keys, d_keys, num_items,
compare_op, stream, debug_synchronous
);
}
};
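// ---------------------------------------------------------------------------
// Usage sketch (illustrative; CustomLess is a hypothetical comparator, and
// the cub:: qualification is assumed as in the earlier sketches). Merge sort
// accepts any strict-weak-ordering comparator, unlike radix sort, which is
// restricted to arithmetic-like key types.
//
// \code
// struct CustomLess
// {
//     __host__ __device__ bool operator()(int a, int b) const { return a < b; }
// };
//
// int   *d_keys;    // n keys on the device, sorted in place
// float *d_values;  // n values, permuted alongside the keys
// int    n = 1024;
// std::size_t temp_bytes = 0;
// void       *d_temp     = nullptr;
// cub::DeviceMergeSort::SortPairs(
//     d_temp, temp_bytes, d_keys, d_values, n, CustomLess());
// cudaMalloc(&d_temp, temp_bytes);
// cub::DeviceMergeSort::SortPairs(
//     d_temp, temp_bytes, d_keys, d_values, n, CustomLess());
// cudaFree(d_temp);
// \endcode
// ---------------------------------------------------------------------------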
END_HIPCUB_NAMESPACE
#endif // HIPCUB_ROCPRIM_DEVICE_DEVICE_MERGE_SORT_HPP_
/******************************************************************************
* Copyright (c) 2010-2011, Duane Merrill. All rights reserved.
* Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved.
* Modifications Copyright (c) 2017-2020, Advanced Micro Devices, Inc. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of the NVIDIA CORPORATION nor the
* names of its contributors may be used to endorse or promote products
* derived from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
* DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
******************************************************************************/
#ifndef HIPCUB_ROCPRIM_DEVICE_PARTITION_HPP_
#define HIPCUB_ROCPRIM_DEVICE_PARTITION_HPP_
#include "../config.hpp"
#include <cub/rocprim/device/device_partition.hpp>
BEGIN_HIPCUB_NAMESPACE
struct DevicePartition
{
template <
typename InputIteratorT,
typename FlagIterator,
typename OutputIteratorT,
typename NumSelectedIteratorT>
HIPCUB_RUNTIME_FUNCTION __forceinline__
static cudaError_t Flagged(
void* d_temp_storage, ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
size_t &temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
InputIteratorT d_in, ///< [in] Pointer to the input sequence of data items
FlagIterator d_flags, ///< [in] Pointer to the input sequence of selection flags
OutputIteratorT d_out, ///< [out] Pointer to the output sequence of partitioned data items
NumSelectedIteratorT d_num_selected_out, ///< [out] Pointer to the output total number of items selected (i.e., the offset of the unselected partition)
int num_items, ///< [in] Total number of items to select from
cudaStream_t stream = 0, ///< [in] <b>[optional]</b> Stream to launch kernels within. Default is stream<sub>0</sub>.
bool debug_synchronous = false) ///< [in] <b>[optional]</b> Whether or not to synchronize the stream after every kernel launch to check for errors. May cause significant slowdown. Default is \p false.
{
return (cudaError_t)rocprim::partition(
d_temp_storage,
temp_storage_bytes,
d_in,
d_flags,
d_out,
d_num_selected_out,
num_items,
stream,
debug_synchronous);
}
template <
typename InputIteratorT,
typename OutputIteratorT,
typename NumSelectedIteratorT,
typename SelectOp>
HIPCUB_RUNTIME_FUNCTION __forceinline__
static cudaError_t If(
void* d_temp_storage, ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
size_t &temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
InputIteratorT d_in, ///< [in] Pointer to the input sequence of data items
OutputIteratorT d_out, ///< [out] Pointer to the output sequence of partitioned data items
NumSelectedIteratorT d_num_selected_out, ///< [out] Pointer to the output total number of items selected (i.e., the offset of the unselected partition)
int num_items, ///< [in] Total number of items to select from
SelectOp select_op, ///< [in] Unary selection operator
cudaStream_t stream = 0, ///< [in] <b>[optional]</b> Stream to launch kernels within. Default is stream<sub>0</sub>.
bool debug_synchronous = false) ///< [in] <b>[optional]</b> Whether or not to synchronize the stream after every kernel launch to check for errors. May cause significant slowdown. Default is \p false.
{
return (cudaError_t)rocprim::partition(
d_temp_storage,
temp_storage_bytes,
d_in,
d_out,
d_num_selected_out,
num_items,
select_op,
stream,
debug_synchronous);
}
template <typename InputIteratorT,
typename FirstOutputIteratorT,
typename SecondOutputIteratorT,
typename UnselectedOutputIteratorT,
typename NumSelectedIteratorT,
typename SelectFirstPartOp,
typename SelectSecondPartOp>
HIPCUB_RUNTIME_FUNCTION __forceinline__ static cudaError_t
If(void *d_temp_storage,
std::size_t &temp_storage_bytes,
InputIteratorT d_in,
FirstOutputIteratorT d_first_part_out,
SecondOutputIteratorT d_second_part_out,
UnselectedOutputIteratorT d_unselected_out,
NumSelectedIteratorT d_num_selected_out,
int num_items,
SelectFirstPartOp select_first_part_op,
SelectSecondPartOp select_second_part_op,
cudaStream_t stream = 0,
bool debug_synchronous = false)
{
return (cudaError_t)rocprim::partition_three_way(
d_temp_storage,
temp_storage_bytes,
d_in,
d_first_part_out,
d_second_part_out,
d_unselected_out,
d_num_selected_out,
num_items,
select_first_part_op,
select_second_part_op,
stream,
debug_synchronous
);
}
};
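// ---------------------------------------------------------------------------
// Usage sketch (illustrative; namespace assumed as above). Flagged compacts
// the items whose flag is nonzero to the front of d_out, with the unselected
// items written after them (CUB documents the unselected tail in reverse
// order); *d_num_selected_out receives the partition boundary.
//
// \code
// int  *d_in;                // n input items on the device
// char *d_flags;             // n selection flags (0 or 1)
// int  *d_out;               // n partitioned output items
// int  *d_num_selected_out;  // single device counter
// int   n = 1024;
// std::size_t temp_bytes = 0;
// void       *d_temp     = nullptr;
// cub::DevicePartition::Flagged(
//     d_temp, temp_bytes, d_in, d_flags, d_out, d_num_selected_out, n);
// cudaMalloc(&d_temp, temp_bytes);
// cub::DevicePartition::Flagged(
//     d_temp, temp_bytes, d_in, d_flags, d_out, d_num_selected_out, n);
// cudaFree(d_temp);
// \endcode
// ---------------------------------------------------------------------------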
END_HIPCUB_NAMESPACE
#endif
/******************************************************************************
* Copyright (c) 2010-2011, Duane Merrill. All rights reserved.
* Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved.
* Modifications Copyright (c) 2017-2020, Advanced Micro Devices, Inc. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of the NVIDIA CORPORATION nor the
* names of its contributors may be used to endorse or promote products
* derived from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
* DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
******************************************************************************/
#ifndef HIPCUB_ROCPRIM_DEVICE_DEVICE_RADIX_SORT_HPP_
#define HIPCUB_ROCPRIM_DEVICE_DEVICE_RADIX_SORT_HPP_
#include "../config.hpp"
#include "../util_type.cuh"
#include <cub/rocprim/device/device_radix_sort.hpp>
BEGIN_HIPCUB_NAMESPACE
struct DeviceRadixSort
{
template<typename KeyT, typename ValueT, typename NumItemsT>
HIPCUB_RUNTIME_FUNCTION static
cudaError_t SortPairs(void * d_temp_storage,
size_t& temp_storage_bytes,
const KeyT * d_keys_in,
KeyT * d_keys_out,
const ValueT * d_values_in,
ValueT * d_values_out,
NumItemsT num_items,
int begin_bit = 0,
int end_bit = sizeof(KeyT) * 8,
cudaStream_t stream = 0,
bool debug_synchronous = false)
{
return (cudaError_t)::rocprim::radix_sort_pairs(
d_temp_storage, temp_storage_bytes,
d_keys_in, d_keys_out, d_values_in, d_values_out, num_items,
begin_bit, end_bit,
stream, debug_synchronous
);
}
template<typename KeyT, typename ValueT, typename NumItemsT>
HIPCUB_RUNTIME_FUNCTION static
cudaError_t SortPairs(void * d_temp_storage,
size_t& temp_storage_bytes,
DoubleBuffer<KeyT>& d_keys,
DoubleBuffer<ValueT>& d_values,
NumItemsT num_items,
int begin_bit = 0,
int end_bit = sizeof(KeyT) * 8,
cudaStream_t stream = 0,
bool debug_synchronous = false)
{
::rocprim::double_buffer<KeyT> d_keys_db = detail::to_double_buffer(d_keys);
::rocprim::double_buffer<ValueT> d_values_db = detail::to_double_buffer(d_values);
cudaError_t error = (cudaError_t)::rocprim::radix_sort_pairs(
d_temp_storage, temp_storage_bytes,
d_keys_db, d_values_db, num_items,
begin_bit, end_bit,
stream, debug_synchronous
);
detail::update_double_buffer(d_keys, d_keys_db);
detail::update_double_buffer(d_values, d_values_db);
return error;
}
template<typename KeyT, typename ValueT, typename NumItemsT>
HIPCUB_RUNTIME_FUNCTION static
cudaError_t SortPairsDescending(void * d_temp_storage,
size_t& temp_storage_bytes,
const KeyT * d_keys_in,
KeyT * d_keys_out,
const ValueT * d_values_in,
ValueT * d_values_out,
NumItemsT num_items,
int begin_bit = 0,
int end_bit = sizeof(KeyT) * 8,
cudaStream_t stream = 0,
bool debug_synchronous = false)
{
return (cudaError_t)::rocprim::radix_sort_pairs_desc(
d_temp_storage, temp_storage_bytes,
d_keys_in, d_keys_out, d_values_in, d_values_out, num_items,
begin_bit, end_bit,
stream, debug_synchronous
);
}
template<typename KeyT, typename ValueT, typename NumItemsT>
HIPCUB_RUNTIME_FUNCTION static
cudaError_t SortPairsDescending(void * d_temp_storage,
size_t& temp_storage_bytes,
DoubleBuffer<KeyT>& d_keys,
DoubleBuffer<ValueT>& d_values,
NumItemsT num_items,
int begin_bit = 0,
int end_bit = sizeof(KeyT) * 8,
cudaStream_t stream = 0,
bool debug_synchronous = false)
{
::rocprim::double_buffer<KeyT> d_keys_db = detail::to_double_buffer(d_keys);
::rocprim::double_buffer<ValueT> d_values_db = detail::to_double_buffer(d_values);
cudaError_t error = (cudaError_t)::rocprim::radix_sort_pairs_desc(
d_temp_storage, temp_storage_bytes,
d_keys_db, d_values_db, num_items,
begin_bit, end_bit,
stream, debug_synchronous
);
detail::update_double_buffer(d_keys, d_keys_db);
detail::update_double_buffer(d_values, d_values_db);
return error;
}
template<typename KeyT, typename NumItemsT>
HIPCUB_RUNTIME_FUNCTION static
cudaError_t SortKeys(void * d_temp_storage,
size_t& temp_storage_bytes,
const KeyT * d_keys_in,
KeyT * d_keys_out,
NumItemsT num_items,
int begin_bit = 0,
int end_bit = sizeof(KeyT) * 8,
cudaStream_t stream = 0,
bool debug_synchronous = false)
{
return (cudaError_t)::rocprim::radix_sort_keys(
d_temp_storage, temp_storage_bytes,
d_keys_in, d_keys_out, num_items,
begin_bit, end_bit,
stream, debug_synchronous
);
}
template<typename KeyT, typename NumItemsT>
HIPCUB_RUNTIME_FUNCTION static
cudaError_t SortKeys(void * d_temp_storage,
size_t& temp_storage_bytes,
DoubleBuffer<KeyT>& d_keys,
NumItemsT num_items,
int begin_bit = 0,
int end_bit = sizeof(KeyT) * 8,
cudaStream_t stream = 0,
bool debug_synchronous = false)
{
::rocprim::double_buffer<KeyT> d_keys_db = detail::to_double_buffer(d_keys);
cudaError_t error = (cudaError_t)::rocprim::radix_sort_keys(
d_temp_storage, temp_storage_bytes,
d_keys_db, num_items,
begin_bit, end_bit,
stream, debug_synchronous
);
detail::update_double_buffer(d_keys, d_keys_db);
return error;
}
template<typename KeyT, typename NumItemsT>
HIPCUB_RUNTIME_FUNCTION static
cudaError_t SortKeysDescending(void * d_temp_storage,
size_t& temp_storage_bytes,
const KeyT * d_keys_in,
KeyT * d_keys_out,
NumItemsT num_items,
int begin_bit = 0,
int end_bit = sizeof(KeyT) * 8,
cudaStream_t stream = 0,
bool debug_synchronous = false)
{
return (cudaError_t)::rocprim::radix_sort_keys_desc(
d_temp_storage, temp_storage_bytes,
d_keys_in, d_keys_out, num_items,
begin_bit, end_bit,
stream, debug_synchronous
);
}
template<typename KeyT, typename NumItemsT>
HIPCUB_RUNTIME_FUNCTION static
cudaError_t SortKeysDescending(void * d_temp_storage,
size_t& temp_storage_bytes,
DoubleBuffer<KeyT>& d_keys,
NumItemsT num_items,
int begin_bit = 0,
int end_bit = sizeof(KeyT) * 8,
cudaStream_t stream = 0,
bool debug_synchronous = false)
{
::rocprim::double_buffer<KeyT> d_keys_db = detail::to_double_buffer(d_keys);
cudaError_t error = (cudaError_t)::rocprim::radix_sort_keys_desc(
d_temp_storage, temp_storage_bytes,
d_keys_db, num_items,
begin_bit, end_bit,
stream, debug_synchronous
);
detail::update_double_buffer(d_keys, d_keys_db);
return error;
}
};
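// ---------------------------------------------------------------------------
// Usage sketch (illustrative; namespace assumed as above). The DoubleBuffer
// overloads ping-pong between two caller-provided buffers, which saves a
// separate output allocation; afterwards d_keys.Current() points at the
// sorted data, which is why the wrappers above copy the rocprim
// double_buffer state back into the hipCUB DoubleBuffer.
//
// \code
// unsigned int *d_key_buf, *d_key_alt_buf;  // two n-element device buffers
// int n = 1024;
// cub::DoubleBuffer<unsigned int> d_keys(d_key_buf, d_key_alt_buf);
// std::size_t temp_bytes = 0;
// void       *d_temp     = nullptr;
// cub::DeviceRadixSort::SortKeys(d_temp, temp_bytes, d_keys, n);
// cudaMalloc(&d_temp, temp_bytes);
// cub::DeviceRadixSort::SortKeys(d_temp, temp_bytes, d_keys, n);
// unsigned int *d_sorted = d_keys.Current();  // sorted keys live here
// cudaFree(d_temp);
// \endcode
// ---------------------------------------------------------------------------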
END_HIPCUB_NAMESPACE
#endif // HIPCUB_ROCPRIM_DEVICE_DEVICE_RADIX_SORT_HPP_
/******************************************************************************
* Copyright (c) 2010-2011, Duane Merrill. All rights reserved.
* Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved.
* Modifications Copyright (c) 2017-2020, Advanced Micro Devices, Inc. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of the NVIDIA CORPORATION nor the
* names of its contributors may be used to endorse or promote products
* derived from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
* DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
******************************************************************************/
#ifndef HIPCUB_ROCPRIM_DEVICE_DEVICE_REDUCE_HPP_
#define HIPCUB_ROCPRIM_DEVICE_DEVICE_REDUCE_HPP_
#include <limits>
#include <iterator>
#include <cuda_fp16.h> // __half
#include <thrust/system/cuda/cuda_bfloat16.h> // cuda_bfloat16
#include "../config.hpp"
#include "../iterator/arg_index_input_iterator.cuh"
#include "../thread/thread_operators.cuh"
#include <cub/rocprim/device/device_reduce.hpp>
#include <cub/rocprim/device/device_reduce_by_key.hpp>
BEGIN_HIPCUB_NAMESPACE
namespace detail
{
template<class T>
inline
T get_lowest_value()
{
return std::numeric_limits<T>::lowest();
}
template<>
inline
__half get_lowest_value<__half>()
{
    // 0xfbff is the bit pattern of -65504, the lowest finite __half;
    // std::numeric_limits provides no specialization for __half.
    unsigned short lowest_half = 0xfbff;
    __half lowest_value = *reinterpret_cast<__half*>(&lowest_half);
    return lowest_value;
}
template<>
inline
cuda_bfloat16 get_lowest_value<cuda_bfloat16>()
{
    // Lowest finite bfloat16 value (approximately -3.39e+38).
    return cuda_bfloat16(-3.38953138925e+38f);
}
template<class T>
inline
T get_max_value()
{
return std::numeric_limits<T>::max();
}
template<>
inline
__half get_max_value<__half>()
{
    // 0x7bff is the bit pattern of +65504, the greatest finite __half.
    unsigned short max_half = 0x7bff;
    __half max_value = *reinterpret_cast<__half*>(&max_half);
    return max_value;
}
template<>
inline
cuda_bfloat16 get_max_value<cuda_bfloat16>()
{
    // Greatest finite bfloat16 value (approximately 3.39e+38).
    return cuda_bfloat16(3.38953138925e+38f);
}
} // end detail namespace
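// ---------------------------------------------------------------------------
// The bit patterns used above are the extreme finite half-precision values:
// 0x7bff encodes +65504 and 0xfbff encodes -65504. std::numeric_limits cannot
// supply them because __half is not a standard arithmetic type. A host-side
// sanity check, sketched under the assumption that <cuda_fp16.h>, <cstring>,
// and <cassert> are available:
//
// \code
// unsigned short bits = 0x7bff;
// __half h;
// std::memcpy(&h, &bits, sizeof(h));    // type-pun without aliasing UB
// assert(__half2float(h) == 65504.0f);  // greatest finite __half
// \endcode
// ---------------------------------------------------------------------------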
class DeviceReduce
{
public:
template <
typename InputIteratorT,
typename OutputIteratorT,
typename ReduceOpT,
typename T
>
HIPCUB_RUNTIME_FUNCTION static
cudaError_t Reduce(void *d_temp_storage,
size_t &temp_storage_bytes,
InputIteratorT d_in,
OutputIteratorT d_out,
int num_items,
ReduceOpT reduction_op,
T init,
cudaStream_t stream = 0,
bool debug_synchronous = false)
{
return (cudaError_t)::rocprim::reduce(
d_temp_storage, temp_storage_bytes,
d_in, d_out, init, num_items,
::cub::detail::convert_result_type<InputIteratorT, OutputIteratorT>(reduction_op),
stream, debug_synchronous
);
}
template <
typename InputIteratorT,
typename OutputIteratorT
>
HIPCUB_RUNTIME_FUNCTION static
cudaError_t Sum(void *d_temp_storage,
size_t &temp_storage_bytes,
InputIteratorT d_in,
OutputIteratorT d_out,
int num_items,
cudaStream_t stream = 0,
bool debug_synchronous = false)
{
using T = typename std::iterator_traits<InputIteratorT>::value_type;
return Reduce(
d_temp_storage, temp_storage_bytes,
d_in, d_out, num_items, ::cub::Sum(), T(0),
stream, debug_synchronous
);
}
template <
typename InputIteratorT,
typename OutputIteratorT
>
HIPCUB_RUNTIME_FUNCTION static
cudaError_t Min(void *d_temp_storage,
size_t &temp_storage_bytes,
InputIteratorT d_in,
OutputIteratorT d_out,
int num_items,
cudaStream_t stream = 0,
bool debug_synchronous = false)
{
using T = typename std::iterator_traits<InputIteratorT>::value_type;
return Reduce(
d_temp_storage, temp_storage_bytes,
d_in, d_out, num_items, ::cub::Min(), detail::get_max_value<T>(),
stream, debug_synchronous
);
}
template <
typename InputIteratorT,
typename OutputIteratorT
>
HIPCUB_RUNTIME_FUNCTION static
cudaError_t ArgMin(void *d_temp_storage,
size_t &temp_storage_bytes,
InputIteratorT d_in,
OutputIteratorT d_out,
int num_items,
cudaStream_t stream = 0,
bool debug_synchronous = false)
{
using OffsetT = int;
using T = typename std::iterator_traits<InputIteratorT>::value_type;
using O = typename std::iterator_traits<OutputIteratorT>::value_type;
using OutputTupleT =
typename std::conditional<
std::is_same<O, void>::value,
KeyValuePair<OffsetT, T>,
O
>::type;
using OutputValueT = typename OutputTupleT::Value;
using IteratorT = ArgIndexInputIterator<InputIteratorT, OffsetT, OutputValueT>;
IteratorT d_indexed_in(d_in);
// Sentinel for empty input: out-of-range index 1 paired with the maximum value.
OutputTupleT init(1, detail::get_max_value<T>());
return Reduce(
d_temp_storage, temp_storage_bytes,
d_indexed_in, d_out, num_items, ::cub::ArgMin(), init,
stream, debug_synchronous
);
}
template <
typename InputIteratorT,
typename OutputIteratorT
>
HIPCUB_RUNTIME_FUNCTION static
cudaError_t Max(void *d_temp_storage,
size_t &temp_storage_bytes,
InputIteratorT d_in,
OutputIteratorT d_out,
int num_items,
cudaStream_t stream = 0,
bool debug_synchronous = false)
{
using T = typename std::iterator_traits<InputIteratorT>::value_type;
return Reduce(
d_temp_storage, temp_storage_bytes,
d_in, d_out, num_items, ::cub::Max(), detail::get_lowest_value<T>(),
stream, debug_synchronous
);
}
template <
typename InputIteratorT,
typename OutputIteratorT
>
HIPCUB_RUNTIME_FUNCTION static
cudaError_t ArgMax(void *d_temp_storage,
size_t &temp_storage_bytes,
InputIteratorT d_in,
OutputIteratorT d_out,
int num_items,
cudaStream_t stream = 0,
bool debug_synchronous = false)
{
using OffsetT = int;
using T = typename std::iterator_traits<InputIteratorT>::value_type;
using O = typename std::iterator_traits<OutputIteratorT>::value_type;
using OutputTupleT =
typename std::conditional<
std::is_same<O, void>::value,
KeyValuePair<OffsetT, T>,
O
>::type;
using OutputValueT = typename OutputTupleT::Value;
using IteratorT = ArgIndexInputIterator<InputIteratorT, OffsetT, OutputValueT>;
IteratorT d_indexed_in(d_in);
// Sentinel for empty input: out-of-range index 1 paired with the lowest value.
OutputTupleT init(1, detail::get_lowest_value<T>());
return Reduce(
d_temp_storage, temp_storage_bytes,
d_indexed_in, d_out, num_items, ::cub::ArgMax(), init,
stream, debug_synchronous
);
}
template<
typename KeysInputIteratorT,
typename UniqueOutputIteratorT,
typename ValuesInputIteratorT,
typename AggregatesOutputIteratorT,
typename NumRunsOutputIteratorT,
typename ReductionOpT
>
HIPCUB_RUNTIME_FUNCTION static
cudaError_t ReduceByKey(void * d_temp_storage,
size_t& temp_storage_bytes,
KeysInputIteratorT d_keys_in,
UniqueOutputIteratorT d_unique_out,
ValuesInputIteratorT d_values_in,
AggregatesOutputIteratorT d_aggregates_out,
NumRunsOutputIteratorT d_num_runs_out,
ReductionOpT reduction_op,
int num_items,
cudaStream_t stream = 0,
bool debug_synchronous = false)
{
using key_compare_op =
::rocprim::equal_to<typename std::iterator_traits<KeysInputIteratorT>::value_type>;
return (cudaError_t)::rocprim::reduce_by_key(
d_temp_storage, temp_storage_bytes,
d_keys_in, d_values_in, num_items,
d_unique_out, d_aggregates_out, d_num_runs_out,
::cub::detail::convert_result_type<ValuesInputIteratorT, AggregatesOutputIteratorT>(reduction_op),
key_compare_op(),
stream, debug_synchronous
);
}
};
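// ---------------------------------------------------------------------------
// Usage sketch (illustrative; namespace assumed as above). ArgMax reduces to
// a KeyValuePair holding the index and value of the maximum; an empty input
// yields the sentinel {1, lowest} built from the init value above.
//
// \code
// float *d_in;                           // n input values on the device
// cub::KeyValuePair<int, float> *d_out;  // single device result
// int n = 1024;
// std::size_t temp_bytes = 0;
// void       *d_temp     = nullptr;
// cub::DeviceReduce::ArgMax(d_temp, temp_bytes, d_in, d_out, n);
// cudaMalloc(&d_temp, temp_bytes);
// cub::DeviceReduce::ArgMax(d_temp, temp_bytes, d_in, d_out, n);
// // After copying the result to the host, .key is the argmax index and
// // .value the maximum itself.
// cudaFree(d_temp);
// \endcode
// ---------------------------------------------------------------------------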
END_HIPCUB_NAMESPACE
#endif // HIPCUB_ROCPRIM_DEVICE_DEVICE_REDUCE_HPP_
/******************************************************************************
* Copyright (c) 2010-2011, Duane Merrill. All rights reserved.
* Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved.
* Modifications Copyright (c) 2017-2020, Advanced Micro Devices, Inc. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of the NVIDIA CORPORATION nor the
* names of its contributors may be used to endorse or promote products
* derived from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
* DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
******************************************************************************/
#ifndef HIPCUB_ROCPRIM_DEVICE_DEVICE_RUN_LENGTH_ENCODE_HPP_
#define HIPCUB_ROCPRIM_DEVICE_DEVICE_RUN_LENGTH_ENCODE_HPP_
#include "../config.hpp"
#include <cub/rocprim/device/device_run_length_encode.hpp>
BEGIN_HIPCUB_NAMESPACE
class DeviceRunLengthEncode
{
public:
template<
typename InputIteratorT,
typename UniqueOutputIteratorT,
typename LengthsOutputIteratorT,
typename NumRunsOutputIteratorT
>
HIPCUB_RUNTIME_FUNCTION static
cudaError_t Encode(void * d_temp_storage,
size_t& temp_storage_bytes,
InputIteratorT d_in,
UniqueOutputIteratorT d_unique_out,
LengthsOutputIteratorT d_counts_out,
NumRunsOutputIteratorT d_num_runs_out,
int num_items,
cudaStream_t stream = 0,
bool debug_synchronous = false)
{
return (cudaError_t)::rocprim::run_length_encode(
d_temp_storage, temp_storage_bytes,
d_in, num_items,
d_unique_out, d_counts_out, d_num_runs_out,
stream, debug_synchronous
);
}
template<
typename InputIteratorT,
typename OffsetsOutputIteratorT,
typename LengthsOutputIteratorT,
typename NumRunsOutputIteratorT
>
HIPCUB_RUNTIME_FUNCTION static
cudaError_t NonTrivialRuns(void * d_temp_storage,
size_t& temp_storage_bytes,
InputIteratorT d_in,
OffsetsOutputIteratorT d_offsets_out,
LengthsOutputIteratorT d_lengths_out,
NumRunsOutputIteratorT d_num_runs_out,
int num_items,
cudaStream_t stream = 0,
bool debug_synchronous = false)
{
return (cudaError_t)::rocprim::run_length_encode_non_trivial_runs(
d_temp_storage, temp_storage_bytes,
d_in, num_items,
d_offsets_out, d_lengths_out, d_num_runs_out,
stream, debug_synchronous
);
}
};
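// ---------------------------------------------------------------------------
// Usage sketch (illustrative; namespace assumed as above). Encode compresses
// consecutive runs: for input [0, 0, 1, 1, 1, 2] it writes uniques [0, 1, 2],
// counts [2, 3, 1], and sets the run counter to 3.
//
// \code
// int *d_in;            // n input items on the device
// int *d_unique_out;    // up to n unique run values
// int *d_counts_out;    // up to n run lengths
// int *d_num_runs_out;  // single device counter
// int  n = 1024;
// std::size_t temp_bytes = 0;
// void       *d_temp     = nullptr;
// cub::DeviceRunLengthEncode::Encode(
//     d_temp, temp_bytes, d_in, d_unique_out, d_counts_out, d_num_runs_out, n);
// cudaMalloc(&d_temp, temp_bytes);
// cub::DeviceRunLengthEncode::Encode(
//     d_temp, temp_bytes, d_in, d_unique_out, d_counts_out, d_num_runs_out, n);
// cudaFree(d_temp);
// \endcode
// ---------------------------------------------------------------------------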
END_HIPCUB_NAMESPACE
#endif // HIPCUB_ROCPRIM_DEVICE_DEVICE_RUN_LENGTH_ENCODE_HPP_
/******************************************************************************
* Copyright (c) 2010-2011, Duane Merrill. All rights reserved.
* Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved.
* Modifications Copyright (c) 2017-2020, Advanced Micro Devices, Inc. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of the NVIDIA CORPORATION nor the
* names of its contributors may be used to endorse or promote products
* derived from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
* DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
******************************************************************************/
#ifndef HIPCUB_ROCPRIM_DEVICE_DEVICE_SCAN_HPP_
#define HIPCUB_ROCPRIM_DEVICE_DEVICE_SCAN_HPP_
#include <iostream>
#include "../config.hpp"
#include "../thread/thread_operators.cuh"
#include <cub/rocprim/device/device_scan.hpp>
#include <cub/rocprim/device/device_scan_by_key.hpp>
BEGIN_HIPCUB_NAMESPACE
class DeviceScan
{
public:
template <
typename InputIteratorT,
typename OutputIteratorT
>
HIPCUB_RUNTIME_FUNCTION static
cudaError_t InclusiveSum(void *d_temp_storage,
size_t &temp_storage_bytes,
InputIteratorT d_in,
OutputIteratorT d_out,
size_t num_items,
cudaStream_t stream = 0,
bool debug_synchronous = false)
{
return InclusiveScan(
d_temp_storage, temp_storage_bytes,
d_in, d_out, ::cub::Sum(), num_items,
stream, debug_synchronous
);
}
template <
typename InputIteratorT,
typename OutputIteratorT,
typename ScanOpT
>
HIPCUB_RUNTIME_FUNCTION static
cudaError_t InclusiveScan(void *d_temp_storage,
size_t &temp_storage_bytes,
InputIteratorT d_in,
OutputIteratorT d_out,
ScanOpT scan_op,
size_t num_items,
cudaStream_t stream = 0,
bool debug_synchronous = false)
{
return (cudaError_t)::rocprim::inclusive_scan(
d_temp_storage, temp_storage_bytes,
d_in, d_out, num_items,
scan_op,
stream, debug_synchronous
);
}
template <
typename InputIteratorT,
typename OutputIteratorT
>
HIPCUB_RUNTIME_FUNCTION static
cudaError_t ExclusiveSum(void *d_temp_storage,
size_t &temp_storage_bytes,
InputIteratorT d_in,
OutputIteratorT d_out,
size_t num_items,
cudaStream_t stream = 0,
bool debug_synchronous = false)
{
using T = typename std::iterator_traits<InputIteratorT>::value_type;
return ExclusiveScan(
d_temp_storage, temp_storage_bytes,
d_in, d_out, ::cub::Sum(), T(0), num_items,
stream, debug_synchronous
);
}
template <
typename InputIteratorT,
typename OutputIteratorT,
typename ScanOpT,
typename InitValueT
>
HIPCUB_RUNTIME_FUNCTION static
cudaError_t ExclusiveScan(void *d_temp_storage,
size_t &temp_storage_bytes,
InputIteratorT d_in,
OutputIteratorT d_out,
ScanOpT scan_op,
InitValueT init_value,
size_t num_items,
cudaStream_t stream = 0,
bool debug_synchronous = false)
{
return (cudaError_t)::rocprim::exclusive_scan(
d_temp_storage, temp_storage_bytes,
d_in, d_out, init_value, num_items,
scan_op,
stream, debug_synchronous
);
}
template <
typename InputIteratorT,
typename OutputIteratorT,
typename ScanOpT,
typename InitValueT,
typename InitValueIterT = InitValueT*
>
HIPCUB_RUNTIME_FUNCTION static
cudaError_t ExclusiveScan(void *d_temp_storage,
size_t &temp_storage_bytes,
InputIteratorT d_in,
OutputIteratorT d_out,
ScanOpT scan_op,
FutureValue<InitValueT, InitValueIterT> init_value,
int num_items,
cudaStream_t stream = 0,
bool debug_synchronous = false)
{
return (cudaError_t)::rocprim::exclusive_scan(
d_temp_storage, temp_storage_bytes,
d_in, d_out, init_value, num_items,
scan_op,
stream, debug_synchronous
);
}
template <
typename KeysInputIteratorT,
typename ValuesInputIteratorT,
typename ValuesOutputIteratorT,
typename EqualityOpT = ::cub::Equality
>
HIPCUB_RUNTIME_FUNCTION static
cudaError_t ExclusiveSumByKey(void *d_temp_storage,
size_t &temp_storage_bytes,
KeysInputIteratorT d_keys_in,
ValuesInputIteratorT d_values_in,
ValuesOutputIteratorT d_values_out,
int num_items,
EqualityOpT equality_op = EqualityOpT(),
cudaStream_t stream = 0,
bool debug_synchronous = false)
{
using in_value_type = typename std::iterator_traits<ValuesInputIteratorT>::value_type;
return (cudaError_t)::rocprim::exclusive_scan_by_key(
d_temp_storage, temp_storage_bytes,
d_keys_in, d_values_in, d_values_out,
static_cast<in_value_type>(0), static_cast<size_t>(num_items),
::cub::Sum(), equality_op, stream, debug_synchronous
);
}
template <
typename KeysInputIteratorT,
typename ValuesInputIteratorT,
typename ValuesOutputIteratorT,
typename ScanOpT,
typename InitValueT,
typename EqualityOpT = ::cub::Equality
>
HIPCUB_RUNTIME_FUNCTION static
cudaError_t ExclusiveScanByKey(void *d_temp_storage,
size_t &temp_storage_bytes,
KeysInputIteratorT d_keys_in,
ValuesInputIteratorT d_values_in,
ValuesOutputIteratorT d_values_out,
ScanOpT scan_op,
InitValueT init_value,
int num_items,
EqualityOpT equality_op = EqualityOpT(),
cudaStream_t stream = 0,
bool debug_synchronous = false)
{
return (cudaError_t)::rocprim::exclusive_scan_by_key(
d_temp_storage, temp_storage_bytes,
d_keys_in, d_values_in, d_values_out,
init_value, static_cast<size_t>(num_items),
scan_op, equality_op, stream, debug_synchronous
);
}
template <
typename KeysInputIteratorT,
typename ValuesInputIteratorT,
typename ValuesOutputIteratorT,
typename EqualityOpT = ::cub::Equality
>
HIPCUB_RUNTIME_FUNCTION static
cudaError_t InclusiveSumByKey(void *d_temp_storage,
size_t &temp_storage_bytes,
KeysInputIteratorT d_keys_in,
ValuesInputIteratorT d_values_in,
ValuesOutputIteratorT d_values_out,
int num_items,
EqualityOpT equality_op = EqualityOpT(),
cudaStream_t stream = 0,
bool debug_synchronous = false)
{
return (cudaError_t)::rocprim::inclusive_scan_by_key(
d_temp_storage, temp_storage_bytes,
d_keys_in, d_values_in, d_values_out,
static_cast<size_t>(num_items), ::cub::Sum(),
equality_op, stream, debug_synchronous
);
}
template <
typename KeysInputIteratorT,
typename ValuesInputIteratorT,
typename ValuesOutputIteratorT,
typename ScanOpT,
typename EqualityOpT = ::cub::Equality
>
HIPCUB_RUNTIME_FUNCTION static
cudaError_t InclusiveScanByKey(void *d_temp_storage,
size_t &temp_storage_bytes,
KeysInputIteratorT d_keys_in,
ValuesInputIteratorT d_values_in,
ValuesOutputIteratorT d_values_out,
ScanOpT scan_op,
int num_items,
EqualityOpT equality_op = EqualityOpT(),
cudaStream_t stream = 0,
bool debug_synchronous = false)
{
return (cudaError_t)::rocprim::inclusive_scan_by_key(
d_temp_storage, temp_storage_bytes,
d_keys_in, d_values_in, d_values_out,
static_cast<size_t>(num_items), scan_op,
equality_op, stream, debug_synchronous
);
}
};
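// ---------------------------------------------------------------------------
// Usage sketch (illustrative; namespace assumed as above). For input
// [8, 6, 7, 5], InclusiveSum yields [8, 14, 21, 26] while ExclusiveSum yields
// [0, 8, 14, 21]: the exclusive variant writes at each position the sum of
// all preceding items only.
//
// \code
// int *d_in, *d_out;  // n-element device arrays
// std::size_t n = 1024;
// std::size_t temp_bytes = 0;
// void       *d_temp     = nullptr;
// cub::DeviceScan::ExclusiveSum(d_temp, temp_bytes, d_in, d_out, n);
// cudaMalloc(&d_temp, temp_bytes);
// cub::DeviceScan::ExclusiveSum(d_temp, temp_bytes, d_in, d_out, n);
// cudaFree(d_temp);
// \endcode
// ---------------------------------------------------------------------------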
END_HIPCUB_NAMESPACE
#endif // HIPCUB_ROCPRIM_DEVICE_DEVICE_SCAN_HPP_
/******************************************************************************
* Copyright (c) 2010-2011, Duane Merrill. All rights reserved.
* Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved.
* Modifications Copyright (c) 2017-2020, Advanced Micro Devices, Inc. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of the NVIDIA CORPORATION nor the
* names of its contributors may be used to endorse or promote products
* derived from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
* DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
******************************************************************************/
#ifndef HIPCUB_ROCPRIM_DEVICE_DEVICE_SEGMENTED_RADIX_SORT_HPP_
#define HIPCUB_ROCPRIM_DEVICE_DEVICE_SEGMENTED_RADIX_SORT_HPP_
#include "../config.hpp"
#include "../util_type.cuh"
#include <cub/rocprim/device/device_segmented_radix_sort.hpp>
BEGIN_HIPCUB_NAMESPACE
struct DeviceSegmentedRadixSort
{
template<typename KeyT, typename ValueT, typename OffsetIteratorT>
HIPCUB_RUNTIME_FUNCTION static
cudaError_t SortPairs(void * d_temp_storage,
size_t& temp_storage_bytes,
const KeyT * d_keys_in,
KeyT * d_keys_out,
const ValueT * d_values_in,
ValueT * d_values_out,
int num_items,
int num_segments,
OffsetIteratorT d_begin_offsets,
OffsetIteratorT d_end_offsets,
int begin_bit = 0,
int end_bit = sizeof(KeyT) * 8,
cudaStream_t stream = 0,
bool debug_synchronous = false)
{
return (cudaError_t)::rocprim::segmented_radix_sort_pairs(
d_temp_storage, temp_storage_bytes,
d_keys_in, d_keys_out, d_values_in, d_values_out, num_items,
num_segments, d_begin_offsets, d_end_offsets,
begin_bit, end_bit,
stream, debug_synchronous
);
}
template<typename KeyT, typename ValueT, typename OffsetIteratorT>
HIPCUB_RUNTIME_FUNCTION static
cudaError_t SortPairs(void * d_temp_storage,
size_t& temp_storage_bytes,
DoubleBuffer<KeyT>& d_keys,
DoubleBuffer<ValueT>& d_values,
int num_items,
int num_segments,
OffsetIteratorT d_begin_offsets,
OffsetIteratorT d_end_offsets,
int begin_bit = 0,
int end_bit = sizeof(KeyT) * 8,
cudaStream_t stream = 0,
bool debug_synchronous = false)
{
::rocprim::double_buffer<KeyT> d_keys_db = detail::to_double_buffer(d_keys);
::rocprim::double_buffer<ValueT> d_values_db = detail::to_double_buffer(d_values);
cudaError_t error = (cudaError_t)::rocprim::segmented_radix_sort_pairs(
d_temp_storage, temp_storage_bytes,
d_keys_db, d_values_db, num_items,
num_segments, d_begin_offsets, d_end_offsets,
begin_bit, end_bit,
stream, debug_synchronous
);
detail::update_double_buffer(d_keys, d_keys_db);
detail::update_double_buffer(d_values, d_values_db);
return error;
}
template<typename KeyT, typename ValueT, typename OffsetIteratorT>
HIPCUB_RUNTIME_FUNCTION static
cudaError_t SortPairsDescending(void * d_temp_storage,
size_t& temp_storage_bytes,
const KeyT * d_keys_in,
KeyT * d_keys_out,
const ValueT * d_values_in,
ValueT * d_values_out,
int num_items,
int num_segments,
OffsetIteratorT d_begin_offsets,
OffsetIteratorT d_end_offsets,
int begin_bit = 0,
int end_bit = sizeof(KeyT) * 8,
cudaStream_t stream = 0,
bool debug_synchronous = false)
{
return (cudaError_t)::rocprim::segmented_radix_sort_pairs_desc(
d_temp_storage, temp_storage_bytes,
d_keys_in, d_keys_out, d_values_in, d_values_out, num_items,
num_segments, d_begin_offsets, d_end_offsets,
begin_bit, end_bit,
stream, debug_synchronous
);
}
template<typename KeyT, typename ValueT, typename OffsetIteratorT>
HIPCUB_RUNTIME_FUNCTION static
cudaError_t SortPairsDescending(void * d_temp_storage,
size_t& temp_storage_bytes,
DoubleBuffer<KeyT>& d_keys,
DoubleBuffer<ValueT>& d_values,
int num_items,
int num_segments,
OffsetIteratorT d_begin_offsets,
OffsetIteratorT d_end_offsets,
int begin_bit = 0,
int end_bit = sizeof(KeyT) * 8,
cudaStream_t stream = 0,
bool debug_synchronous = false)
{
::rocprim::double_buffer<KeyT> d_keys_db = detail::to_double_buffer(d_keys);
::rocprim::double_buffer<ValueT> d_values_db = detail::to_double_buffer(d_values);
cudaError_t error = (cudaError_t)::rocprim::segmented_radix_sort_pairs_desc(
d_temp_storage, temp_storage_bytes,
d_keys_db, d_values_db, num_items,
num_segments, d_begin_offsets, d_end_offsets,
begin_bit, end_bit,
stream, debug_synchronous
);
detail::update_double_buffer(d_keys, d_keys_db);
detail::update_double_buffer(d_values, d_values_db);
return error;
}
template<typename KeyT, typename OffsetIteratorT>
HIPCUB_RUNTIME_FUNCTION static
cudaError_t SortKeys(void * d_temp_storage,
size_t& temp_storage_bytes,
const KeyT * d_keys_in,
KeyT * d_keys_out,
int num_items,
int num_segments,
OffsetIteratorT d_begin_offsets,
OffsetIteratorT d_end_offsets,
int begin_bit = 0,
int end_bit = sizeof(KeyT) * 8,
cudaStream_t stream = 0,
bool debug_synchronous = false)
{
return (cudaError_t)::rocprim::segmented_radix_sort_keys(
d_temp_storage, temp_storage_bytes,
d_keys_in, d_keys_out, num_items,
num_segments, d_begin_offsets, d_end_offsets,
begin_bit, end_bit,
stream, debug_synchronous
);
}
template<typename KeyT, typename OffsetIteratorT>
HIPCUB_RUNTIME_FUNCTION static
cudaError_t SortKeys(void * d_temp_storage,
size_t& temp_storage_bytes,
DoubleBuffer<KeyT>& d_keys,
int num_items,
int num_segments,
OffsetIteratorT d_begin_offsets,
OffsetIteratorT d_end_offsets,
int begin_bit = 0,
int end_bit = sizeof(KeyT) * 8,
cudaStream_t stream = 0,
bool debug_synchronous = false)
{
::rocprim::double_buffer<KeyT> d_keys_db = detail::to_double_buffer(d_keys);
cudaError_t error = (cudaError_t)::rocprim::segmented_radix_sort_keys(
d_temp_storage, temp_storage_bytes,
d_keys_db, num_items,
num_segments, d_begin_offsets, d_end_offsets,
begin_bit, end_bit,
stream, debug_synchronous
);
detail::update_double_buffer(d_keys, d_keys_db);
return error;
}
template<typename KeyT, typename OffsetIteratorT>
HIPCUB_RUNTIME_FUNCTION static
cudaError_t SortKeysDescending(void * d_temp_storage,
size_t& temp_storage_bytes,
const KeyT * d_keys_in,
KeyT * d_keys_out,
int num_items,
int num_segments,
OffsetIteratorT d_begin_offsets,
OffsetIteratorT d_end_offsets,
int begin_bit = 0,
int end_bit = sizeof(KeyT) * 8,
cudaStream_t stream = 0,
bool debug_synchronous = false)
{
return (cudaError_t)::rocprim::segmented_radix_sort_keys_desc(
d_temp_storage, temp_storage_bytes,
d_keys_in, d_keys_out, num_items,
num_segments, d_begin_offsets, d_end_offsets,
begin_bit, end_bit,
stream, debug_synchronous
);
}
template<typename KeyT, typename OffsetIteratorT>
HIPCUB_RUNTIME_FUNCTION static
cudaError_t SortKeysDescending(void * d_temp_storage,
size_t& temp_storage_bytes,
DoubleBuffer<KeyT>& d_keys,
int num_items,
int num_segments,
OffsetIteratorT d_begin_offsets,
OffsetIteratorT d_end_offsets,
int begin_bit = 0,
int end_bit = sizeof(KeyT) * 8,
cudaStream_t stream = 0,
bool debug_synchronous = false)
{
::rocprim::double_buffer<KeyT> d_keys_db = detail::to_double_buffer(d_keys);
cudaError_t error = (cudaError_t)::rocprim::segmented_radix_sort_keys_desc(
d_temp_storage, temp_storage_bytes,
d_keys_db, num_items,
num_segments, d_begin_offsets, d_end_offsets,
begin_bit, end_bit,
stream, debug_synchronous
);
detail::update_double_buffer(d_keys, d_keys_db);
return error;
}
};
END_HIPCUB_NAMESPACE
#endif // HIPCUB_ROCPRIM_DEVICE_DEVICE_SEGMENTED_RADIX_SORT_HPP_
/******************************************************************************
* Copyright (c) 2010-2011, Duane Merrill. All rights reserved.
* Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved.
* Modifications Copyright (c) 2017-2020, Advanced Micro Devices, Inc. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of the NVIDIA CORPORATION nor the
* names of its contributors may be used to endorse or promote products
* derived from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
* DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
******************************************************************************/
#ifndef HIPCUB_ROCPRIM_DEVICE_DEVICE_SEGMENTED_REDUCE_HPP_
#define HIPCUB_ROCPRIM_DEVICE_DEVICE_SEGMENTED_REDUCE_HPP_
#include <limits>
#include <iterator>
#include "../config.hpp"
#include "../thread/thread_operators.cuh"
#include "../iterator/arg_index_input_iterator.cuh"
#include <cub/rocprim/device/device_segmented_reduce.hpp>
BEGIN_HIPCUB_NAMESPACE
struct DeviceSegmentedReduce
{
template<
typename InputIteratorT,
typename OutputIteratorT,
typename OffsetIteratorT,
typename ReductionOp,
typename T
>
HIPCUB_RUNTIME_FUNCTION static
cudaError_t Reduce(void * d_temp_storage,
size_t& temp_storage_bytes,
InputIteratorT d_in,
OutputIteratorT d_out,
int num_segments,
OffsetIteratorT d_begin_offsets,
OffsetIteratorT d_end_offsets,
ReductionOp reduction_op,
T initial_value,
cudaStream_t stream = 0,
bool debug_synchronous = false)
{
return (cudaError_t)::rocprim::segmented_reduce(
d_temp_storage, temp_storage_bytes,
d_in, d_out,
num_segments, d_begin_offsets, d_end_offsets,
::cub::detail::convert_result_type<InputIteratorT, OutputIteratorT>(reduction_op),
initial_value,
stream, debug_synchronous
);
}
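    // Illustrative sketch of the usual two-phase temp-storage pattern for this
    // API (d_in, d_out and d_offsets are hypothetical, pre-allocated device
    // pointers; segment i spans [d_offsets[i], d_offsets[i + 1])):
    //
    //   void  *d_temp_storage     = nullptr;
    //   size_t temp_storage_bytes = 0;
    //   // First call: d_temp_storage == nullptr, only sizes the workspace.
    //   DeviceSegmentedReduce::Reduce(d_temp_storage, temp_storage_bytes,
    //                                 d_in, d_out, num_segments,
    //                                 d_offsets, d_offsets + 1,
    //                                 ::cub::Sum(), 0);
    //   cudaMalloc(&d_temp_storage, temp_storage_bytes);
    //   // Second call: actually runs the segmented reduction.
    //   DeviceSegmentedReduce::Reduce(d_temp_storage, temp_storage_bytes,
    //                                 d_in, d_out, num_segments,
    //                                 d_offsets, d_offsets + 1,
    //                                 ::cub::Sum(), 0);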
template<
typename InputIteratorT,
typename OutputIteratorT,
typename OffsetIteratorT
>
HIPCUB_RUNTIME_FUNCTION static
cudaError_t Sum(void * d_temp_storage,
size_t& temp_storage_bytes,
InputIteratorT d_in,
OutputIteratorT d_out,
int num_segments,
OffsetIteratorT d_begin_offsets,
OffsetIteratorT d_end_offsets,
cudaStream_t stream = 0,
bool debug_synchronous = false)
{
using input_type = typename std::iterator_traits<InputIteratorT>::value_type;
return Reduce(
d_temp_storage, temp_storage_bytes,
d_in, d_out,
num_segments, d_begin_offsets, d_end_offsets,
::cub::Sum(), input_type(),
stream, debug_synchronous
);
}
template<
typename InputIteratorT,
typename OutputIteratorT,
typename OffsetIteratorT
>
HIPCUB_RUNTIME_FUNCTION static
cudaError_t Min(void * d_temp_storage,
size_t& temp_storage_bytes,
InputIteratorT d_in,
OutputIteratorT d_out,
int num_segments,
OffsetIteratorT d_begin_offsets,
OffsetIteratorT d_end_offsets,
cudaStream_t stream = 0,
bool debug_synchronous = false)
{
using input_type = typename std::iterator_traits<InputIteratorT>::value_type;
return Reduce(
d_temp_storage, temp_storage_bytes,
d_in, d_out,
num_segments, d_begin_offsets, d_end_offsets,
::cub::Min(), std::numeric_limits<input_type>::max(),
stream, debug_synchronous
);
}
template<
typename InputIteratorT,
typename OutputIteratorT,
typename OffsetIteratorT
>
HIPCUB_RUNTIME_FUNCTION static
cudaError_t ArgMin(void * d_temp_storage,
size_t& temp_storage_bytes,
InputIteratorT d_in,
OutputIteratorT d_out,
int num_segments,
OffsetIteratorT d_begin_offsets,
OffsetIteratorT d_end_offsets,
cudaStream_t stream = 0,
bool debug_synchronous = false)
{
using OffsetT = int;
using T = typename std::iterator_traits<InputIteratorT>::value_type;
using O = typename std::iterator_traits<OutputIteratorT>::value_type;
using OutputTupleT = typename std::conditional<
std::is_same<O, void>::value,
KeyValuePair<OffsetT, T>,
O
>::type;
using OutputValueT = typename OutputTupleT::Value;
using IteratorT = ArgIndexInputIterator<InputIteratorT, OffsetT, OutputValueT>;
IteratorT d_indexed_in(d_in);
const OutputTupleT init(1, std::numeric_limits<T>::max());
return Reduce(
d_temp_storage, temp_storage_bytes,
d_indexed_in, d_out,
num_segments, d_begin_offsets, d_end_offsets,
::cub::ArgMin(), init,
stream, debug_synchronous
);
}
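    // Illustrative sketch: ArgMin writes one KeyValuePair per segment, with
    // .key holding the minimizing index within the segment and .value the
    // minimum itself. d_in, d_argmin and d_offsets are assumed pre-allocated
    // device pointers (the two-phase sizing call is omitted for brevity):
    //
    //   KeyValuePair<int, float> *d_argmin;  // one pair per segment
    //   DeviceSegmentedReduce::ArgMin(d_temp_storage, temp_storage_bytes,
    //                                 d_in, d_argmin, num_segments,
    //                                 d_offsets, d_offsets + 1);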
template<
typename InputIteratorT,
typename OutputIteratorT,
typename OffsetIteratorT
>
HIPCUB_RUNTIME_FUNCTION static
cudaError_t Max(void * d_temp_storage,
size_t& temp_storage_bytes,
InputIteratorT d_in,
OutputIteratorT d_out,
int num_segments,
OffsetIteratorT d_begin_offsets,
OffsetIteratorT d_end_offsets,
cudaStream_t stream = 0,
bool debug_synchronous = false)
{
using input_type = typename std::iterator_traits<InputIteratorT>::value_type;
return Reduce(
d_temp_storage, temp_storage_bytes,
d_in, d_out,
num_segments, d_begin_offsets, d_end_offsets,
::cub::Max(), std::numeric_limits<input_type>::lowest(),
stream, debug_synchronous
);
}
template<
typename InputIteratorT,
typename OutputIteratorT,
typename OffsetIteratorT
>
HIPCUB_RUNTIME_FUNCTION static
cudaError_t ArgMax(void * d_temp_storage,
size_t& temp_storage_bytes,
InputIteratorT d_in,
OutputIteratorT d_out,
int num_segments,
OffsetIteratorT d_begin_offsets,
OffsetIteratorT d_end_offsets,
cudaStream_t stream = 0,
bool debug_synchronous = false)
{
using OffsetT = int;
using T = typename std::iterator_traits<InputIteratorT>::value_type;
using O = typename std::iterator_traits<OutputIteratorT>::value_type;
using OutputTupleT = typename std::conditional<
std::is_same<O, void>::value,
KeyValuePair<OffsetT, T>,
O
>::type;
using OutputValueT = typename OutputTupleT::Value;
using IteratorT = ArgIndexInputIterator<InputIteratorT, OffsetT, OutputValueT>;
IteratorT d_indexed_in(d_in);
const OutputTupleT init(1, std::numeric_limits<T>::lowest());
return Reduce(
d_temp_storage, temp_storage_bytes,
d_indexed_in, d_out,
num_segments, d_begin_offsets, d_end_offsets,
::cub::ArgMax(), init,
stream, debug_synchronous
);
}
};
END_HIPCUB_NAMESPACE
#endif // HIPCUB_ROCPRIM_DEVICE_DEVICE_SEGMENTED_REDUCE_HPP_
/******************************************************************************
* Copyright (c) 2010-2011, Duane Merrill. All rights reserved.
* Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved.
* Modifications Copyright (c) 2017-2020, Advanced Micro Devices, Inc. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of the NVIDIA CORPORATION nor the
* names of its contributors may be used to endorse or promote products
* derived from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
* DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
******************************************************************************/
#ifndef HIPCUB_ROCPRIM_DEVICE_DEVICE_SEGMENTED_SORT_HPP_
#define HIPCUB_ROCPRIM_DEVICE_DEVICE_SEGMENTED_SORT_HPP_
#include "../config.hpp"
#include "../util_type.cuh"
#include <cub/rocprim/device/device_segmented_radix_sort.hpp>
BEGIN_HIPCUB_NAMESPACE
struct DeviceSegmentedSort
{
template<typename KeyT, typename ValueT, typename OffsetIteratorT>
HIPCUB_RUNTIME_FUNCTION static
cudaError_t SortPairs(void * d_temp_storage,
size_t& temp_storage_bytes,
const KeyT * d_keys_in,
KeyT * d_keys_out,
const ValueT * d_values_in,
ValueT * d_values_out,
int num_items,
int num_segments,
OffsetIteratorT d_begin_offsets,
OffsetIteratorT d_end_offsets,
cudaStream_t stream = 0,
bool debug_synchronous = false)
{
return (cudaError_t)::rocprim::segmented_radix_sort_pairs(
d_temp_storage, temp_storage_bytes,
d_keys_in, d_keys_out, d_values_in, d_values_out, num_items,
num_segments, d_begin_offsets, d_end_offsets,
0, sizeof(KeyT) * 8,
stream, debug_synchronous
);
}
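    // Illustrative sketch: unlike DeviceSegmentedRadixSort, this entry point
    // fixes the bit range to [0, sizeof(KeyT) * 8). d_keys_*, d_values_* and
    // d_offsets are assumed pre-allocated device pointers; as usual, call once
    // with a null workspace to size it, then again to sort:
    //
    //   DeviceSegmentedSort::SortPairs(nullptr, temp_storage_bytes,
    //                                  d_keys_in, d_keys_out,
    //                                  d_values_in, d_values_out,
    //                                  num_items, num_segments,
    //                                  d_offsets, d_offsets + 1);
    //   cudaMalloc(&d_temp_storage, temp_storage_bytes);
    //   DeviceSegmentedSort::SortPairs(d_temp_storage, temp_storage_bytes,
    //                                  d_keys_in, d_keys_out,
    //                                  d_values_in, d_values_out,
    //                                  num_items, num_segments,
    //                                  d_offsets, d_offsets + 1);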
template<typename KeyT, typename ValueT, typename OffsetIteratorT>
HIPCUB_RUNTIME_FUNCTION static
cudaError_t SortPairs(void * d_temp_storage,
size_t& temp_storage_bytes,
DoubleBuffer<KeyT>& d_keys,
DoubleBuffer<ValueT>& d_values,
int num_items,
int num_segments,
OffsetIteratorT d_begin_offsets,
OffsetIteratorT d_end_offsets,
cudaStream_t stream = 0,
bool debug_synchronous = false)
{
::rocprim::double_buffer<KeyT> d_keys_db = detail::to_double_buffer(d_keys);
::rocprim::double_buffer<ValueT> d_values_db = detail::to_double_buffer(d_values);
cudaError_t error = (cudaError_t)::rocprim::segmented_radix_sort_pairs(
d_temp_storage, temp_storage_bytes,
d_keys_db, d_values_db, num_items,
num_segments, d_begin_offsets, d_end_offsets,
0, sizeof(KeyT) * 8,
stream, debug_synchronous
);
detail::update_double_buffer(d_keys, d_keys_db);
detail::update_double_buffer(d_values, d_values_db);
return error;
}
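    // Illustrative sketch of the DoubleBuffer overload: the sort may ping-pong
    // between the two buffers, and update_double_buffer() records which one
    // holds the result, so read the output through Current(). The raw buffer
    // pointers below are assumed pre-allocated device arrays:
    //
    //   DoubleBuffer<float> d_keys(d_key_buf, d_key_alt_buf);
    //   DoubleBuffer<int>   d_values(d_val_buf, d_val_alt_buf);
    //   DeviceSegmentedSort::SortPairs(d_temp_storage, temp_storage_bytes,
    //                                  d_keys, d_values, num_items,
    //                                  num_segments, d_offsets, d_offsets + 1);
    //   float *sorted_keys = d_keys.Current();
    //   int   *sorted_vals = d_values.Current();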
template<typename KeyT, typename ValueT, typename OffsetIteratorT>
HIPCUB_RUNTIME_FUNCTION static
cudaError_t SortPairsDescending(void * d_temp_storage,
size_t& temp_storage_bytes,
const KeyT * d_keys_in,
KeyT * d_keys_out,
const ValueT * d_values_in,
ValueT * d_values_out,
int num_items,
int num_segments,
OffsetIteratorT d_begin_offsets,
OffsetIteratorT d_end_offsets,
cudaStream_t stream = 0,
bool debug_synchronous = false)
{
return (cudaError_t)::rocprim::segmented_radix_sort_pairs_desc(
d_temp_storage, temp_storage_bytes,
d_keys_in, d_keys_out, d_values_in, d_values_out, num_items,
num_segments, d_begin_offsets, d_end_offsets,
0, sizeof(KeyT) * 8,
stream, debug_synchronous
);
}
template<typename KeyT, typename ValueT, typename OffsetIteratorT>
HIPCUB_RUNTIME_FUNCTION static
cudaError_t SortPairsDescending(void * d_temp_storage,
size_t& temp_storage_bytes,
DoubleBuffer<KeyT>& d_keys,
DoubleBuffer<ValueT>& d_values,
int num_items,
int num_segments,
OffsetIteratorT d_begin_offsets,
OffsetIteratorT d_end_offsets,
cudaStream_t stream = 0,
bool debug_synchronous = false)
{
::rocprim::double_buffer<KeyT> d_keys_db = detail::to_double_buffer(d_keys);
::rocprim::double_buffer<ValueT> d_values_db = detail::to_double_buffer(d_values);
cudaError_t error = (cudaError_t)::rocprim::segmented_radix_sort_pairs_desc(
d_temp_storage, temp_storage_bytes,
d_keys_db, d_values_db, num_items,
num_segments, d_begin_offsets, d_end_offsets,
0, sizeof(KeyT) * 8,
stream, debug_synchronous
);
detail::update_double_buffer(d_keys, d_keys_db);
detail::update_double_buffer(d_values, d_values_db);
return error;
}
template<typename KeyT, typename OffsetIteratorT>
HIPCUB_RUNTIME_FUNCTION static
cudaError_t SortKeys(void * d_temp_storage,
size_t& temp_storage_bytes,
const KeyT * d_keys_in,
KeyT * d_keys_out,
int num_items,
int num_segments,
OffsetIteratorT d_begin_offsets,
OffsetIteratorT d_end_offsets,
cudaStream_t stream = 0,
bool debug_synchronous = false)
{
return (cudaError_t)::rocprim::segmented_radix_sort_keys(
d_temp_storage, temp_storage_bytes,
d_keys_in, d_keys_out, num_items,
num_segments, d_begin_offsets, d_end_offsets,
0, sizeof(KeyT) * 8,
stream, debug_synchronous
);
}
template<typename KeyT, typename OffsetIteratorT>
HIPCUB_RUNTIME_FUNCTION static
cudaError_t SortKeys(void * d_temp_storage,
size_t& temp_storage_bytes,
DoubleBuffer<KeyT>& d_keys,
int num_items,
int num_segments,
OffsetIteratorT d_begin_offsets,
OffsetIteratorT d_end_offsets,
cudaStream_t stream = 0,
bool debug_synchronous = false)
{
::rocprim::double_buffer<KeyT> d_keys_db = detail::to_double_buffer(d_keys);
cudaError_t error = (cudaError_t)::rocprim::segmented_radix_sort_keys(
d_temp_storage, temp_storage_bytes,
d_keys_db, num_items,
num_segments, d_begin_offsets, d_end_offsets,
0, sizeof(KeyT) * 8,
stream, debug_synchronous
);
detail::update_double_buffer(d_keys, d_keys_db);
return error;
}
template<typename KeyT, typename OffsetIteratorT>
HIPCUB_RUNTIME_FUNCTION static
cudaError_t SortKeysDescending(void * d_temp_storage,
size_t& temp_storage_bytes,
const KeyT * d_keys_in,
KeyT * d_keys_out,
int num_items,
int num_segments,
OffsetIteratorT d_begin_offsets,
OffsetIteratorT d_end_offsets,
cudaStream_t stream = 0,
bool debug_synchronous = false)
{
return (cudaError_t)::rocprim::segmented_radix_sort_keys_desc(
d_temp_storage, temp_storage_bytes,
d_keys_in, d_keys_out, num_items,
num_segments, d_begin_offsets, d_end_offsets,
0, sizeof(KeyT) * 8,
stream, debug_synchronous
);
}
template<typename KeyT, typename OffsetIteratorT>
HIPCUB_RUNTIME_FUNCTION static
cudaError_t SortKeysDescending(void * d_temp_storage,
size_t& temp_storage_bytes,
DoubleBuffer<KeyT>& d_keys,
int num_items,
int num_segments,
OffsetIteratorT d_begin_offsets,
OffsetIteratorT d_end_offsets,
cudaStream_t stream = 0,
bool debug_synchronous = false)
{
::rocprim::double_buffer<KeyT> d_keys_db = detail::to_double_buffer(d_keys);
cudaError_t error = (cudaError_t)::rocprim::segmented_radix_sort_keys_desc(
d_temp_storage, temp_storage_bytes,
d_keys_db, num_items,
num_segments, d_begin_offsets, d_end_offsets,
0, sizeof(KeyT) * 8,
stream, debug_synchronous
);
detail::update_double_buffer(d_keys, d_keys_db);
return error;
}
template<typename KeyT, typename ValueT, typename OffsetIteratorT>
HIPCUB_RUNTIME_FUNCTION static
cudaError_t StableSortPairs(void * d_temp_storage,
size_t& temp_storage_bytes,
const KeyT * d_keys_in,
KeyT * d_keys_out,
const ValueT * d_values_in,
ValueT * d_values_out,
int num_items,
int num_segments,
OffsetIteratorT d_begin_offsets,
OffsetIteratorT d_end_offsets,
cudaStream_t stream = 0,
bool debug_synchronous = false)
{
return SortPairs(
d_temp_storage, temp_storage_bytes,
d_keys_in, d_keys_out, d_values_in, d_values_out, num_items,
num_segments, d_begin_offsets, d_end_offsets,
stream, debug_synchronous
);
}
template<typename KeyT, typename ValueT, typename OffsetIteratorT>
HIPCUB_RUNTIME_FUNCTION static
cudaError_t StableSortPairs(void * d_temp_storage,
size_t& temp_storage_bytes,
DoubleBuffer<KeyT>& d_keys,
DoubleBuffer<ValueT>& d_values,
int num_items,
int num_segments,
OffsetIteratorT d_begin_offsets,
OffsetIteratorT d_end_offsets,
cudaStream_t stream = 0,
bool debug_synchronous = false)
{
return SortPairs(
d_temp_storage, temp_storage_bytes,
d_keys, d_values, num_items,
num_segments, d_begin_offsets, d_end_offsets,
stream, debug_synchronous
);
}
template<typename KeyT, typename ValueT, typename OffsetIteratorT>
HIPCUB_RUNTIME_FUNCTION static
cudaError_t StableSortPairsDescending(void * d_temp_storage,
size_t& temp_storage_bytes,
const KeyT * d_keys_in,
KeyT * d_keys_out,
const ValueT * d_values_in,
ValueT * d_values_out,
int num_items,
int num_segments,
OffsetIteratorT d_begin_offsets,
OffsetIteratorT d_end_offsets,
cudaStream_t stream = 0,
bool debug_synchronous = false)
{
return SortPairsDescending(
d_temp_storage, temp_storage_bytes,
d_keys_in, d_keys_out, d_values_in, d_values_out, num_items,
num_segments, d_begin_offsets, d_end_offsets,
stream, debug_synchronous
);
}
template<typename KeyT, typename ValueT, typename OffsetIteratorT>
HIPCUB_RUNTIME_FUNCTION static
cudaError_t StableSortPairsDescending(void * d_temp_storage,
size_t& temp_storage_bytes,
DoubleBuffer<KeyT>& d_keys,
DoubleBuffer<ValueT>& d_values,
int num_items,
int num_segments,
OffsetIteratorT d_begin_offsets,
OffsetIteratorT d_end_offsets,
cudaStream_t stream = 0,
bool debug_synchronous = false)
{
return SortPairsDescending(
d_temp_storage, temp_storage_bytes,
d_keys, d_values, num_items,
num_segments, d_begin_offsets, d_end_offsets,
stream, debug_synchronous
);
}
template<typename KeyT, typename OffsetIteratorT>
HIPCUB_RUNTIME_FUNCTION static
cudaError_t StableSortKeys(void * d_temp_storage,
size_t& temp_storage_bytes,
const KeyT * d_keys_in,
KeyT * d_keys_out,
int num_items,
int num_segments,
OffsetIteratorT d_begin_offsets,
OffsetIteratorT d_end_offsets,
cudaStream_t stream = 0,
bool debug_synchronous = false)
{
return SortKeys(
d_temp_storage, temp_storage_bytes,
d_keys_in, d_keys_out, num_items,
num_segments, d_begin_offsets, d_end_offsets,
stream, debug_synchronous
);
}
template<typename KeyT, typename OffsetIteratorT>
HIPCUB_RUNTIME_FUNCTION static
cudaError_t StableSortKeys(void * d_temp_storage,
size_t& temp_storage_bytes,
DoubleBuffer<KeyT>& d_keys,
int num_items,
int num_segments,
OffsetIteratorT d_begin_offsets,
OffsetIteratorT d_end_offsets,
cudaStream_t stream = 0,
bool debug_synchronous = false)
{
return SortKeys(
d_temp_storage, temp_storage_bytes,
d_keys, num_items,
num_segments, d_begin_offsets, d_end_offsets,
stream, debug_synchronous
);
}
template<typename KeyT, typename OffsetIteratorT>
HIPCUB_RUNTIME_FUNCTION static
cudaError_t StableSortKeysDescending(void * d_temp_storage,
size_t& temp_storage_bytes,
const KeyT * d_keys_in,
KeyT * d_keys_out,
int num_items,
int num_segments,
OffsetIteratorT d_begin_offsets,
OffsetIteratorT d_end_offsets,
cudaStream_t stream = 0,
bool debug_synchronous = false)
{
return SortKeysDescending(
d_temp_storage, temp_storage_bytes,
d_keys_in, d_keys_out, num_items,
num_segments, d_begin_offsets, d_end_offsets,
stream, debug_synchronous
);
}
template<typename KeyT, typename OffsetIteratorT>
HIPCUB_RUNTIME_FUNCTION static
cudaError_t StableSortKeysDescending(void * d_temp_storage,
size_t& temp_storage_bytes,
DoubleBuffer<KeyT>& d_keys,
int num_items,
int num_segments,
OffsetIteratorT d_begin_offsets,
OffsetIteratorT d_end_offsets,
cudaStream_t stream = 0,
bool debug_synchronous = false)
{
return SortKeysDescending(
d_temp_storage, temp_storage_bytes,
d_keys, num_items,
num_segments, d_begin_offsets, d_end_offsets,
stream, debug_synchronous
);
}
};
END_HIPCUB_NAMESPACE
#endif // HIPCUB_ROCPRIM_DEVICE_DEVICE_SEGMENTED_SORT_HPP_
/******************************************************************************
* Copyright (c) 2010-2011, Duane Merrill. All rights reserved.
* Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved.
* Modifications Copyright (c) 2017-2020, Advanced Micro Devices, Inc. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of the NVIDIA CORPORATION nor the
* names of its contributors may be used to endorse or promote products
* derived from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
* DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
******************************************************************************/
#ifndef HIPCUB_ROCPRIM_DEVICE_DEVICE_SELECT_HPP_
#define HIPCUB_ROCPRIM_DEVICE_DEVICE_SELECT_HPP_
#include "../config.hpp"
#include "../thread/thread_operators.cuh"
#include <cub/rocprim/device/device_select.hpp>
BEGIN_HIPCUB_NAMESPACE
class DeviceSelect
{
public:
template <
typename InputIteratorT,
typename FlagIterator,
typename OutputIteratorT,
typename NumSelectedIteratorT
>
HIPCUB_RUNTIME_FUNCTION static
cudaError_t Flagged(void *d_temp_storage,
size_t &temp_storage_bytes,
InputIteratorT d_in,
FlagIterator d_flags,
OutputIteratorT d_out,
NumSelectedIteratorT d_num_selected_out,
int num_items,
cudaStream_t stream = 0,
bool debug_synchronous = false)
{
return (cudaError_t)::rocprim::select(
d_temp_storage, temp_storage_bytes,
d_in, d_flags, d_out, d_num_selected_out, num_items,
stream, debug_synchronous
);
}
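    // Illustrative sketch: items whose corresponding flag is non-zero are
    // compacted into d_out, and the count of kept items is written through
    // d_num_selected_out. All pointers are assumed pre-allocated device
    // memory (two-phase sizing call omitted):
    //
    //   // d_in: [1, 2, 3, 4]   d_flags: [1, 0, 0, 1]
    //   DeviceSelect::Flagged(d_temp_storage, temp_storage_bytes,
    //                         d_in, d_flags, d_out, d_num_selected_out,
    //                         num_items);
    //   // d_out -> [1, 4], *d_num_selected_out -> 2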
template <
typename InputIteratorT,
typename OutputIteratorT,
typename NumSelectedIteratorT,
typename SelectOp
>
HIPCUB_RUNTIME_FUNCTION static
cudaError_t If(void *d_temp_storage,
size_t &temp_storage_bytes,
InputIteratorT d_in,
OutputIteratorT d_out,
NumSelectedIteratorT d_num_selected_out,
int num_items,
SelectOp select_op,
cudaStream_t stream = 0,
bool debug_synchronous = false)
{
return (cudaError_t)::rocprim::select(
d_temp_storage, temp_storage_bytes,
d_in, d_out, d_num_selected_out, num_items, select_op,
stream, debug_synchronous
);
}
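    // Illustrative sketch: the selection predicate can be any host/device
    // callable functor; the one below is hypothetical:
    //
    //   struct LessThan
    //   {
    //       int threshold;
    //       __host__ __device__ bool operator()(const int &x) const
    //       {
    //           return x < threshold;
    //       }
    //   };
    //   DeviceSelect::If(d_temp_storage, temp_storage_bytes,
    //                    d_in, d_out, d_num_selected_out, num_items,
    //                    LessThan{100});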
template <
typename InputIteratorT,
typename OutputIteratorT,
typename NumSelectedIteratorT
>
HIPCUB_RUNTIME_FUNCTION static
cudaError_t Unique(void *d_temp_storage,
size_t &temp_storage_bytes,
InputIteratorT d_in,
OutputIteratorT d_out,
NumSelectedIteratorT d_num_selected_out,
int num_items,
cudaStream_t stream = 0,
bool debug_synchronous = false)
{
return (cudaError_t)::rocprim::unique(
d_temp_storage, temp_storage_bytes,
d_in, d_out, d_num_selected_out, num_items, cub::Equality(),
stream, debug_synchronous
);
}
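    // Illustrative sketch: Unique keeps the first item of each run of
    // consecutive equal items (like std::unique); pointers are assumed
    // pre-allocated device memory:
    //
    //   // d_in: [0, 0, 1, 1, 2, 0]
    //   DeviceSelect::Unique(d_temp_storage, temp_storage_bytes,
    //                        d_in, d_out, d_num_selected_out, num_items);
    //   // d_out -> [0, 1, 2, 0], *d_num_selected_out -> 4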
template <
typename KeyIteratorT,
typename ValueIteratorT,
typename OutputKeyIteratorT,
typename OutputValueIteratorT,
typename NumSelectedIteratorT
>
HIPCUB_RUNTIME_FUNCTION static
cudaError_t UniqueByKey(void *d_temp_storage,
size_t &temp_storage_bytes,
KeyIteratorT d_keys_input,
ValueIteratorT d_values_input,
OutputKeyIteratorT d_keys_output,
OutputValueIteratorT d_values_output,
NumSelectedIteratorT d_num_selected_out,
int num_items,
cudaStream_t stream = 0,
bool debug_synchronous = false)
{
return (cudaError_t)::rocprim::unique_by_key(
d_temp_storage, temp_storage_bytes,
d_keys_input, d_values_input,
d_keys_output, d_values_output,
d_num_selected_out, num_items, cub::Equality(),
stream, debug_synchronous
);
}
};
END_HIPCUB_NAMESPACE
#endif // HIPCUB_ROCPRIM_DEVICE_DEVICE_SELECT_HPP_
/******************************************************************************
* Copyright (c) 2010-2011, Duane Merrill. All rights reserved.
* Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved.
* Modifications Copyright (c) 2017-2021, Advanced Micro Devices, Inc. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of the NVIDIA CORPORATION nor the
* names of its contributors may be used to endorse or promote products
* derived from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
* DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
******************************************************************************/
#ifndef HIPCUB_ROCPRIM_DEVICE_DEVICE_SPMV_HPP_
#define HIPCUB_ROCPRIM_DEVICE_DEVICE_SPMV_HPP_
#include "../config.hpp"
#include "../iterator/tex_ref_input_iterator.cuh"
BEGIN_HIPCUB_NAMESPACE
class DeviceSpmv
{
public:
template <
typename ValueT, ///< Matrix and vector value type
typename OffsetT> ///< Signed integer type for sequence offsets
struct SpmvParams
{
ValueT* d_values; ///< Pointer to the array of \p num_nonzeros values of the corresponding nonzero elements of matrix <b>A</b>.
OffsetT* d_row_end_offsets; ///< Pointer to the array of \p m offsets demarcating the end of every row in \p d_column_indices and \p d_values
        OffsetT*        d_column_indices;   ///< Pointer to the array of \p num_nonzeros column-indices of the corresponding nonzero elements of matrix <b>A</b>. (Indices are zero-based.)
ValueT* d_vector_x; ///< Pointer to the array of \p num_cols values corresponding to the dense input vector <em>x</em>
ValueT* d_vector_y; ///< Pointer to the array of \p num_rows values corresponding to the dense output vector <em>y</em>
int num_rows; ///< Number of rows of matrix <b>A</b>.
int num_cols; ///< Number of columns of matrix <b>A</b>.
int num_nonzeros; ///< Number of nonzero elements of matrix <b>A</b>.
ValueT alpha; ///< Alpha multiplicand
ValueT beta; ///< Beta addend-multiplicand
::cub::TexRefInputIterator<ValueT, 66778899, OffsetT> t_vector_x;
};
static constexpr uint32_t CsrMVKernel_MaxThreads = 256;
template <typename ValueT>
static __global__ void
CsrMVKernel(SpmvParams<ValueT, int> spmv_params)
{
__shared__ ValueT partial;
        const int32_t row_id = blockIdx.x;
if(threadIdx.x == 0)
{
partial = spmv_params.beta * spmv_params.d_vector_y[row_id];
}
__syncthreads();
        int32_t row_offset = (row_id == 0) ? (0) : (spmv_params.d_row_end_offsets[row_id - 1]);
        const int32_t row_end = spmv_params.d_row_end_offsets[row_id];
        // Stride the block's threads across the row's nonzeros so the final,
        // partially-full stripe is not skipped by truncating division.
        for(int32_t offset = row_offset + threadIdx.x; offset < row_end; offset += blockDim.x)
        {
            ValueT t_value =
                spmv_params.alpha *
                spmv_params.d_values[offset] *
                spmv_params.d_vector_x[spmv_params.d_column_indices[offset]];
            atomicAdd(&partial, t_value);
        }
        // All threads must reach this barrier uniformly before thread 0
        // publishes the accumulated row result.
        __syncthreads();
        if(threadIdx.x == 0)
        {
            spmv_params.d_vector_y[row_id] = partial;
        }
}
template <typename ValueT>
HIPCUB_RUNTIME_FUNCTION
static cudaError_t CsrMV(
void* d_temp_storage, ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
size_t& temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
ValueT* d_values, ///< [in] Pointer to the array of \p num_nonzeros values of the corresponding nonzero elements of matrix <b>A</b>.
int* d_row_offsets, ///< [in] Pointer to the array of \p m + 1 offsets demarcating the start of every row in \p d_column_indices and \p d_values (with the final entry being equal to \p num_nonzeros)
        int*                 d_column_indices,        ///< [in] Pointer to the array of \p num_nonzeros column-indices of the corresponding nonzero elements of matrix <b>A</b>. (Indices are zero-based.)
ValueT* d_vector_x, ///< [in] Pointer to the array of \p num_cols values corresponding to the dense input vector <em>x</em>
ValueT* d_vector_y, ///< [out] Pointer to the array of \p num_rows values corresponding to the dense output vector <em>y</em>
int num_rows, ///< [in] number of rows of matrix <b>A</b>.
int num_cols, ///< [in] number of columns of matrix <b>A</b>.
int num_nonzeros, ///< [in] number of nonzero elements of matrix <b>A</b>.
cudaStream_t stream = 0, ///< [in] <b>[optional]</b> hip stream to launch kernels within. Default is stream<sub>0</sub>.
bool debug_synchronous = false) ///< [in] <b>[optional]</b> Whether or not to synchronize the stream after every kernel launch to check for errors. May cause significant slowdown. Default is \p false.
{
SpmvParams<ValueT, int> spmv_params;
spmv_params.d_values = d_values;
spmv_params.d_row_end_offsets = d_row_offsets + 1;
spmv_params.d_column_indices = d_column_indices;
spmv_params.d_vector_x = d_vector_x;
spmv_params.d_vector_y = d_vector_y;
spmv_params.num_rows = num_rows;
spmv_params.num_cols = num_cols;
spmv_params.num_nonzeros = num_nonzeros;
spmv_params.alpha = 1.0;
spmv_params.beta = 0.0;
cudaError_t status;
if(d_temp_storage == nullptr)
{
            // Make sure the user won't try to allocate 0 bytes of memory,
            // because hipMalloc returns nullptr when the size is zero.
            temp_storage_bytes = 4;
            return cudaSuccess;
}
else
{
            size_t block_size = min(static_cast<uint32_t>(num_cols), DeviceSpmv::CsrMVKernel_MaxThreads);
size_t grid_size = num_rows;
CsrMVKernel<<<grid_size, block_size, 0, stream>>>(spmv_params);
status = hipGetLastError();
}
return status;
}
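    // Illustrative sketch: computes y = A * x for a CSR matrix (alpha and
    // beta are fixed above to 1 and 0). Array arguments are assumed
    // pre-allocated device pointers; the first call only sets
    // temp_storage_bytes:
    //
    //   DeviceSpmv::CsrMV(nullptr, temp_storage_bytes,
    //                     d_values, d_row_offsets, d_column_indices,
    //                     d_vector_x, d_vector_y,
    //                     num_rows, num_cols, num_nonzeros);
    //   cudaMalloc(&d_temp_storage, temp_storage_bytes);
    //   DeviceSpmv::CsrMV(d_temp_storage, temp_storage_bytes,
    //                     d_values, d_row_offsets, d_column_indices,
    //                     d_vector_x, d_vector_y,
    //                     num_rows, num_cols, num_nonzeros);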
};
END_HIPCUB_NAMESPACE
#endif // HIPCUB_ROCPRIM_DEVICE_DEVICE_SPMV_HPP_
/******************************************************************************
* Copyright (c) 2011, Duane Merrill. All rights reserved.
* Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved.
* Modifications Copyright (c) 2021, Advanced Micro Devices, Inc. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of the NVIDIA CORPORATION nor the
* names of its contributors may be used to endorse or promote products
* derived from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
* DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
******************************************************************************/
#ifndef HIPCUB_ROCPRIM_GRID_GRID_BARRIER_HPP_
#define HIPCUB_ROCPRIM_GRID_GRID_BARRIER_HPP_
#include <type_traits>
#include "../config.hpp"
#include "../thread/thread_load.cuh"
BEGIN_HIPCUB_NAMESPACE
/**
* \addtogroup GridModule
* @{
*/
/**
 * \brief GridBarrier implements a software global barrier among thread blocks within a CUDA grid
*/
class GridBarrier
{
protected :
typedef unsigned int SyncFlag;
// Counters in global device memory
SyncFlag* d_sync;
public:
/**
* Constructor
*/
GridBarrier() : d_sync(NULL) {}
/**
 * \brief Synchronize
*/
__device__ __forceinline__ void Sync() const
{
volatile SyncFlag *d_vol_sync = d_sync;
// Threadfence and syncthreads to make sure global writes are visible before
// thread-0 reports in with its sync counter
__threadfence();
__syncthreads();
if (blockIdx.x == 0)
{
// Report in ourselves
if (threadIdx.x == 0)
{
d_vol_sync[blockIdx.x] = 1;
}
__syncthreads();
// Wait for everyone else to report in
for (uint32_t peer_block = threadIdx.x; peer_block < gridDim.x; peer_block += blockDim.x)
{
while (ThreadLoad<LOAD_CG>(d_sync + peer_block) == 0)
{
__threadfence_block();
}
}
__syncthreads();
// Let everyone know it's safe to proceed
for (uint32_t peer_block = threadIdx.x; peer_block < gridDim.x; peer_block += blockDim.x)
{
d_vol_sync[peer_block] = 0;
}
}
else
{
if (threadIdx.x == 0)
{
// Report in
d_vol_sync[blockIdx.x] = 1;
// Wait for acknowledgment
while (ThreadLoad<LOAD_CG>(d_sync + blockIdx.x) == 1)
{
__threadfence_block();
}
}
__syncthreads();
}
}
};
/**
* \brief GridBarrierLifetime extends GridBarrier to provide lifetime management of the temporary device storage needed for cooperation.
*
* Uses RAII for lifetime, i.e., device resources are reclaimed when
* the destructor is called.
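 *
 * \par
 * A minimal usage sketch (illustrative; \p my_kernel and the launch shape are
 * hypothetical):
 * \code
 * // __global__ void my_kernel(GridBarrier barrier) { ...; barrier.Sync(); ... }
 * GridBarrierLifetime barrier;
 * barrier.Setup(grid_size);                      // lazily allocates and zeroes the sync counters
 * my_kernel<<<grid_size, block_size>>>(barrier); // passed by value as a GridBarrier
 * \endcode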
*/
class GridBarrierLifetime : public GridBarrier
{
protected:
// Number of bytes backed by d_sync
size_t sync_bytes;
public:
/**
* Constructor
*/
GridBarrierLifetime() : GridBarrier(), sync_bytes(0) {}
/**
     * Frees device storage and resets the progress counters
*/
cudaError_t HostReset()
{
cudaError_t retval = cudaSuccess;
if (d_sync)
{
retval = cudaFree(d_sync);
d_sync = NULL;
}
sync_bytes = 0;
return retval;
}
/**
* Destructor
*/
virtual ~GridBarrierLifetime()
{
HostReset();
}
/**
* Sets up the progress counters for the next kernel launch (lazily
* allocating and initializing them if necessary)
*/
cudaError_t Setup(int sweep_grid_size)
{
cudaError_t retval = cudaSuccess;
do {
size_t new_sync_bytes = sweep_grid_size * sizeof(SyncFlag);
if (new_sync_bytes > sync_bytes)
{
if (d_sync)
{
if ((retval = cudaFree(d_sync))) break;
}
sync_bytes = new_sync_bytes;
// Allocate and initialize to zero
if ((retval = cudaMalloc((void**) &d_sync, sync_bytes))) break;
if ((retval = cudaMemset(d_sync, 0, new_sync_bytes))) break;
}
} while (0);
return retval;
}
};
END_HIPCUB_NAMESPACE
#endif // HIPCUB_ROCPRIM_GRID_GRID_BARRIER_HPP_
/******************************************************************************
* Copyright (c) 2011, Duane Merrill. All rights reserved.
* Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved.
* Modifications Copyright (c) 2021, Advanced Micro Devices, Inc. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of the NVIDIA CORPORATION nor the
* names of its contributors may be used to endorse or promote products
* derived from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
* DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
******************************************************************************/
#ifndef HIPCUB_ROCPRIM_GRID_GRID_EVEN_SHARE_HPP_
#define HIPCUB_ROCPRIM_GRID_GRID_EVEN_SHARE_HPP_
#include <type_traits>
#include "../config.hpp"
#include "grid_mapping.cuh"
#include "../util_type.cuh"
BEGIN_HIPCUB_NAMESPACE
/**
* \addtogroup GridModule
* @{
*/
/**
* \brief GridEvenShare is a descriptor utility for distributing input among
* CUDA thread blocks in an "even-share" fashion. Each thread block gets roughly
* the same number of input tiles.
*
* \par Overview
* Each thread block is assigned a consecutive sequence of input tiles. To help
* preserve alignment and eliminate the overhead of guarded loads for all but the
 * last thread block, GridEvenShare assigns one of three different amounts of
* work to a given thread block: "big", "normal", or "last". The "big" workloads
* are one scheduling grain larger than "normal". The "last" work unit for the
* last thread block may be partially-full if the input is not an even multiple of
* the scheduling grain size.
*
* \par
* Before invoking a child grid, a parent thread will typically construct an
* instance of GridEvenShare. The instance can be passed to child thread blocks
* which can initialize their per-thread block offsets using \p BlockInit().
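 *
 * \par
 * A minimal sketch of that pattern (illustrative; \p consume_kernel,
 * \p TILE_ITEMS and \p BLOCK_THREADS are hypothetical):
 * \code
 * GridEvenShare<int> even_share;
 * even_share.DispatchInit(num_items, max_grid_size, TILE_ITEMS);
 * consume_kernel<<<even_share.grid_size, BLOCK_THREADS>>>(d_in, even_share);
 * // Inside the kernel, each block initializes its range and then iterates:
 * //   even_share.BlockInit<TILE_ITEMS, GRID_MAPPING_RAKE>();
 * //   for (int i = even_share.block_offset; i < even_share.block_end;
 * //        i += even_share.block_stride) { ... }
 * \endcode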
*/
template <typename OffsetT>
struct GridEvenShare
{
private:
int total_tiles;
int big_shares;
OffsetT big_share_items;
OffsetT normal_share_items;
OffsetT normal_base_offset;
public:
/// Total number of input items
OffsetT num_items;
/// Grid size in thread blocks
int grid_size;
/// OffsetT into input marking the beginning of the owning thread block's segment of input tiles
OffsetT block_offset;
/// OffsetT into input of marking the end (one-past) of the owning thread block's segment of input tiles
OffsetT block_end;
/// Stride between input tiles
OffsetT block_stride;
/**
* \brief Constructor.
*/
__host__ __device__ __forceinline__ GridEvenShare() :
total_tiles(0),
big_shares(0),
big_share_items(0),
normal_share_items(0),
normal_base_offset(0),
num_items(0),
grid_size(0),
block_offset(0),
block_end(0),
block_stride(0)
{}
/**
* \brief Dispatch initializer. To be called prior to kernel launch.
*/
__host__ __device__ __forceinline__ void DispatchInit(
OffsetT num_items_, ///< Total number of input items
        int max_grid_size,          ///< Maximum grid size allowable (actual grid size may be less if not warranted by the number of input items)
int tile_items) ///< Number of data items per input tile
{
this->block_offset = num_items_; // Initialize past-the-end
this->block_end = num_items_; // Initialize past-the-end
this->num_items = num_items_;
this->total_tiles = static_cast<int>(cub::DivideAndRoundUp(num_items_, tile_items));
this->grid_size = min(total_tiles, max_grid_size);
int avg_tiles_per_block = total_tiles / grid_size;
// leftover grains go to big blocks:
this->big_shares = total_tiles - (avg_tiles_per_block * grid_size);
this->normal_share_items = avg_tiles_per_block * tile_items;
this->normal_base_offset = big_shares * tile_items;
this->big_share_items = normal_share_items + tile_items;
}
/**
* \brief Initializes ranges for the specified thread block index. Specialized
* for a "raking" access pattern in which each thread block is assigned a
* consecutive sequence of input tiles.
*/
template <int TILE_ITEMS>
__device__ __forceinline__ void BlockInit(
int block_id,
Int2Type<GRID_MAPPING_RAKE> /*strategy_tag*/)
{
block_stride = TILE_ITEMS;
if (block_id < big_shares)
{
// This thread block gets a big share of grains (avg_tiles_per_block + 1)
block_offset = (block_id * big_share_items);
block_end = block_offset + big_share_items;
}
else if (block_id < total_tiles)
{
// This thread block gets a normal share of grains (avg_tiles_per_block)
block_offset = normal_base_offset + (block_id * normal_share_items);
block_end = min(num_items, block_offset + normal_share_items);
}
// Else default past-the-end
}
    /**
     * \brief Block-initialization, specialized for a "strip mining" access
     * pattern in which the input tiles assigned to each thread block are
     * separated by a stride equal to the extent of the grid.
     */
template <int TILE_ITEMS>
__device__ __forceinline__ void BlockInit(
int block_id,
Int2Type<GRID_MAPPING_STRIP_MINE> /*strategy_tag*/)
{
block_stride = grid_size * TILE_ITEMS;
block_offset = (block_id * TILE_ITEMS);
block_end = num_items;
}
    /**
     * \brief Block-initialization that dispatches, via an \p Int2Type tag, to
     * the specialization selected by the \p STRATEGY template parameter.
     */
template <
int TILE_ITEMS,
GridMappingStrategy STRATEGY>
__device__ __forceinline__ void BlockInit()
{
BlockInit<TILE_ITEMS>(blockIdx.x, Int2Type<STRATEGY>());
}
/**
* \brief Block-initialization, specialized for a "raking" access
* pattern in which each thread block is assigned a consecutive sequence
* of input tiles.
*/
template <int TILE_ITEMS>
__device__ __forceinline__ void BlockInit(
OffsetT block_offset, ///< [in] Threadblock begin offset (inclusive)
OffsetT block_end) ///< [in] Threadblock end offset (exclusive)
{
this->block_offset = block_offset;
this->block_end = block_end;
this->block_stride = TILE_ITEMS;
}
};
/** @} */ // end group GridModule
END_HIPCUB_NAMESPACE
#endif // HIPCUB_ROCPRIM_GRID_GRID_EVEN_SHARE_HPP_
/******************************************************************************
* Copyright (c) 2011, Duane Merrill. All rights reserved.
* Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of the NVIDIA CORPORATION nor the
* names of its contributors may be used to endorse or promote products
* derived from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
* DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
******************************************************************************/
#ifndef HIPCUB_ROCPRIM_GRID_GRID_MAPPING_HPP_
#define HIPCUB_ROCPRIM_GRID_GRID_MAPPING_HPP_
#include <type_traits>
#include "../config.hpp"
#include "../thread/thread_load.cuh"
BEGIN_HIPCUB_NAMESPACE
/**
* \addtogroup GridModule
* @{
*/
/******************************************************************************
* Mapping policies
*****************************************************************************/
/**
* \brief cub::GridMappingStrategy enumerates alternative strategies for mapping constant-sized tiles of device-wide data onto a grid of CUDA thread blocks.
*/
enum GridMappingStrategy
{
/**
* \brief An a "raking" access pattern in which each thread block is
* assigned a consecutive sequence of input tiles
*
* \par Overview
* The input is evenly partitioned into \p p segments, where \p p is
* constant and corresponds loosely to the number of thread blocks that may
* actively reside on the target device. Each segment is comprised of
* consecutive tiles, where a tile is a small, constant-sized unit of input
* to be processed to completion before the thread block terminates or
* obtains more work. The kernel invokes \p p thread blocks, each
* of which iteratively consumes a segment of <em>n</em>/<em>p</em> elements
* in tile-size increments.
*/
GRID_MAPPING_RAKE,
/**
* \brief An a "strip mining" access pattern in which the input tiles assigned
* to each thread block are separated by a stride equal to the the extent of
* the grid.
*
* \par Overview
* The input is evenly partitioned into \p p sets, where \p p is
* constant and corresponds loosely to the number of thread blocks that may
* actively reside on the target device. Each set is comprised of
* data tiles separated by stride \p tiles, where a tile is a small,
* constant-sized unit of input to be processed to completion before the
* thread block terminates or obtains more work. The kernel invokes \p p
* thread blocks, each of which iteratively consumes a segment of
* <em>n</em>/<em>p</em> elements in tile-size increments.
*/
GRID_MAPPING_STRIP_MINE,
/**
* \brief A dynamic "queue-based" strategy for assigning input tiles to thread blocks.
*
* \par Overview
* The input is treated as a queue to be dynamically consumed by a grid of
* thread blocks. Work is atomically dequeued in tiles, where a tile is a
* unit of input to be processed to completion before the thread block
* terminates or obtains more work. The grid size \p p is constant,
* loosely corresponding to the number of thread blocks that may actively
* reside on the target device.
*/
GRID_MAPPING_DYNAMIC,
};
/** @} */ // end group GridModule
END_HIPCUB_NAMESPACE
#endif // HIPCUB_ROCPRIM_GRID_GRID_MAPPING_HPP_
/******************************************************************************
* Copyright (c) 2011, Duane Merrill. All rights reserved.
* Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved.
* Modifications Copyright (c) 2021, Advanced Micro Devices, Inc. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of the NVIDIA CORPORATION nor the
* names of its contributors may be used to endorse or promote products
* derived from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
* DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
******************************************************************************/
#ifndef HIPCUB_ROCPRIM_GRID_GRID_QUEUE_HPP_
#define HIPCUB_ROCPRIM_GRID_GRID_QUEUE_HPP_
#include <type_traits>
#include "../config.hpp"
BEGIN_HIPCUB_NAMESPACE
/**
* \addtogroup GridModule
* @{
*/
/**
* \brief GridQueue is a descriptor utility for dynamic queue management.
*
* \par Overview
 * GridQueue descriptors provide abstractions for "filling" or
* "draining" globally-shared vectors.
*
* \par
* A "filling" GridQueue works by atomically-adding to a zero-initialized counter,
* returning a unique offset for the calling thread to write its items.
* The GridQueue maintains the total "fill-size". The fill counter must be reset
* using GridQueue::ResetFill by the host or kernel instance prior to the kernel instance that
* will be filling.
*
* \par
 * Similarly, a "draining" GridQueue works by atomically-incrementing a
* zero-initialized counter, returning a unique offset for the calling thread to
* read its items. Threads can safely drain until the array's logical fill-size is
* exceeded. The drain counter must be reset using GridQueue::ResetDrain or
* GridQueue::FillAndResetDrain by the host or kernel instance prior to the kernel instance that
 * will be draining. (For dynamic work distribution of existing data, the corresponding fill-size
* is simply the number of elements in the array.)
*
* \par
* Iterative work management can be implemented simply with a pair of flip-flopping
* work buffers, each with an associated set of fill and drain GridQueue descriptors.
*
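 * \par
 * A minimal drain-side sketch (illustrative; \p d_storage, \p TILE_ITEMS and
 * \p consume() are hypothetical):
 * \code
 * GridQueue<int> queue(d_storage); // at least GridQueue<int>::AllocationSize() bytes
 * queue.FillAndResetDrain(num_items, stream); // host side, before the launch
 * // Device side, each thread block repeatedly claims a tile of work:
 * //   int offset = queue.Drain(TILE_ITEMS);
 * //   while (offset < num_items) { consume(offset); offset = queue.Drain(TILE_ITEMS); }
 * \endcode
 *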
* \tparam OffsetT Signed integer type for global offsets
*/
template <typename OffsetT>
class GridQueue
{
private:
/// Counter indices
enum
{
FILL = 0,
DRAIN = 1,
};
/// Pair of counters
OffsetT *d_counters;
public:
/// Returns the device allocation size in bytes needed to construct a GridQueue instance
__host__ __device__ __forceinline__
static size_t AllocationSize()
{
return sizeof(OffsetT) * 2;
}
/// Constructs an invalid GridQueue descriptor
__host__ __device__ __forceinline__ GridQueue()
:
d_counters(NULL)
{}
/// Constructs a GridQueue descriptor around the device storage allocation
__host__ __device__ __forceinline__ GridQueue(
void *d_storage) ///< Device allocation to back the GridQueue. Must be at least as big as <tt>AllocationSize()</tt>.
:
d_counters((OffsetT*) d_storage)
{}
/// This operation sets the fill-size and resets the drain counter, preparing the GridQueue for draining in the next kernel instance. To be called by the host or by a kernel prior to that which will be draining.
HIPCUB_DEVICE cudaError_t FillAndResetDrain(
OffsetT fill_size,
cudaStream_t stream = 0)
{
cudaError_t result = cudaErrorUnknown;
(void)stream;
d_counters[FILL] = fill_size;
d_counters[DRAIN] = 0;
result = cudaSuccess;
return result;
}
HIPCUB_HOST cudaError_t FillAndResetDrain(
OffsetT fill_size,
cudaStream_t stream = 0)
{
cudaError_t result = cudaErrorUnknown;
OffsetT counters[2];
counters[FILL] = fill_size;
counters[DRAIN] = 0;
result = CubDebug(cudaMemcpyAsync(d_counters, counters, sizeof(OffsetT) * 2, cudaMemcpyHostToDevice, stream));
return result;
}
/// This operation resets the drain so that it may advance to meet the existing fill-size. To be called by the host or by a kernel prior to that which will be draining.
HIPCUB_DEVICE cudaError_t ResetDrain(cudaStream_t stream = 0)
{
cudaError_t result = cudaErrorUnknown;
(void)stream;
d_counters[DRAIN] = 0;
result = cudaSuccess;
return result;
}
HIPCUB_HOST cudaError_t ResetDrain(cudaStream_t stream = 0)
{
cudaError_t result = cudaErrorUnknown;
result = CubDebug(cudaMemsetAsync(d_counters + DRAIN, 0, sizeof(OffsetT), stream));
return result;
}
/// This operation resets the fill counter. To be called by the host or by a kernel prior to that which will be filling.
HIPCUB_DEVICE cudaError_t ResetFill(cudaStream_t stream = 0)
{
cudaError_t result = cudaErrorUnknown;
(void)stream;
d_counters[FILL] = 0;
result = cudaSuccess;
return result;
}
HIPCUB_HOST cudaError_t ResetFill(cudaStream_t stream = 0)
{
cudaError_t result = cudaErrorUnknown;
result = CubDebug(cudaMemsetAsync(d_counters + FILL, 0, sizeof(OffsetT), stream));
return result;
}
/// Returns the fill-size established by the parent or by the previous kernel.
HIPCUB_DEVICE cudaError_t FillSize(
OffsetT &fill_size,
cudaStream_t stream = 0)
{
cudaError_t result = cudaErrorUnknown;
(void)stream;
fill_size = d_counters[FILL];
result = cudaSuccess;
return result;
}
HIPCUB_HOST cudaError_t FillSize(
OffsetT &fill_size,
cudaStream_t stream = 0)
{
cudaError_t result = cudaErrorUnknown;
result = CubDebug(cudaMemcpyAsync(&fill_size, d_counters + FILL, sizeof(OffsetT), cudaMemcpyDeviceToHost, stream));
return result;
}
    /// Drain \p num_items from the queue. Returns offset from which to read items. To be called from a CUDA kernel.
HIPCUB_DEVICE OffsetT Drain(OffsetT num_items)
{
return atomicAdd(d_counters + DRAIN, num_items);
}
    /// Fill \p num_items into the queue. Returns offset from which to write items. To be called from a CUDA kernel.
HIPCUB_DEVICE OffsetT Fill(OffsetT num_items)
{
return atomicAdd(d_counters + FILL, num_items);
}
};
#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document
/**
* Reset grid queue (call with 1 block of 1 thread)
*/
template <typename OffsetT>
__global__ void FillAndResetDrainKernel(
GridQueue<OffsetT> grid_queue,
OffsetT num_items)
{
grid_queue.FillAndResetDrain(num_items);
}
#endif // DOXYGEN_SHOULD_SKIP_THIS
/** @} */ // end group GridModule
END_HIPCUB_NAMESPACE
#endif // HIPCUB_ROCPRIM_GRID_GRID_QUEUE_HPP_
/******************************************************************************
* Copyright (c) 2011, Duane Merrill. All rights reserved.
* Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved.
* Modifications Copyright (c) 2017-2020, Advanced Micro Devices, Inc. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of the NVIDIA CORPORATION nor the
* names of its contributors may be used to endorse or promote products
* derived from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
* DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
******************************************************************************/
#ifndef HIPCUB_ROCPRIM_ITERATOR_ARG_INDEX_INPUT_ITERATOR_HPP_
#define HIPCUB_ROCPRIM_ITERATOR_ARG_INDEX_INPUT_ITERATOR_HPP_
#include <iterator>
#include <iostream>
#include "../config.hpp"
#include <cub/rocprim/iterator/arg_index_iterator.hpp>
#if (THRUST_VERSION >= 100700)
// This iterator is compatible with Thrust API 1.7 and newer
#include <thrust/iterator/iterator_facade.h>
#include <thrust/iterator/iterator_traits.h>
#endif // THRUST_VERSION
BEGIN_HIPCUB_NAMESPACE
#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document
template<
typename InputIterator,
typename Difference = std::ptrdiff_t,
typename Value = typename std::iterator_traits<InputIterator>::value_type
>
using ArgIndexInputIterator = ::rocprim::arg_index_iterator<InputIterator, Difference, Value>;
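/**
 * \par Snippet (illustrative sketch, not part of the original header)
 * Dereferencing the wrapped rocPRIM iterator yields a key/value pair whose
 * \p key is the element's offset and whose \p value is the element itself;
 * the data below is hypothetical.
 * \code
 * float data[] = {8.0f, 6.0f, 7.0f, 5.0f};
 * hipcub::ArgIndexInputIterator<float*> itr(data);
 * auto pair = itr[2];
 * // pair.key == 2, pair.value == 7.0f
 * \endcode
 */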
#endif
END_HIPCUB_NAMESPACE
#endif // HIPCUB_ROCPRIM_ITERATOR_ARG_INDEX_INPUT_ITERATOR_HPP_
/******************************************************************************
* Copyright (c) 2010-2011, Duane Merrill. All rights reserved.
* Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved.
* Modifications Copyright (c) 2017-2020, Advanced Micro Devices, Inc. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of the NVIDIA CORPORATION nor the
* names of its contributors may be used to endorse or promote products
* derived from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
* DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
******************************************************************************/
#ifndef HIPCUB_ROCPRIM_ITERATOR_CACHE_MODIFIED_INPUT_ITERATOR_HPP_
#define HIPCUB_ROCPRIM_ITERATOR_CACHE_MODIFIED_INPUT_ITERATOR_HPP_
#include <iterator>
#include <iostream>
#include "../thread/thread_load.cuh"
#include "../util_type.cuh"
#if (THRUST_VERSION >= 100700)
// This iterator is compatible with Thrust API 1.7 and newer
#include <thrust/iterator/iterator_facade.h>
#include <thrust/iterator/iterator_traits.h>
#endif // THRUST_VERSION
BEGIN_HIPCUB_NAMESPACE
template <
CacheLoadModifier MODIFIER,
typename ValueType,
typename OffsetT = ptrdiff_t>
class CacheModifiedInputIterator
{
public:
// Required iterator traits
typedef CacheModifiedInputIterator self_type; ///< My own type
typedef OffsetT difference_type; ///< Type to express the result of subtracting one iterator from another
typedef ValueType value_type; ///< The type of the element the iterator can point to
typedef ValueType* pointer; ///< The type of a pointer to an element the iterator can point to
typedef ValueType reference; ///< The type of a reference to an element the iterator can point to
typedef std::random_access_iterator_tag iterator_category; ///< The iterator category
public:
/// Wrapped native pointer
ValueType* ptr;
    /// Constructor (templated on the possibly cv-qualified pointer type so that
    /// const-qualified pointers can be wrapped, matching CacheModifiedOutputIterator)
    template <typename QualifiedValueType>
    __host__ __device__ __forceinline__ CacheModifiedInputIterator(
        QualifiedValueType* ptr)                    ///< Native pointer to wrap
    :
        ptr(const_cast<typename std::remove_cv<QualifiedValueType>::type *>(ptr))
    {}
/// Postfix increment
__host__ __device__ __forceinline__ self_type operator++(int)
{
self_type retval = *this;
ptr++;
return retval;
}
/// Prefix increment
__host__ __device__ __forceinline__ self_type operator++()
{
ptr++;
return *this;
}
/// Indirection
__device__ __forceinline__ reference operator*() const
{
return ThreadLoad<MODIFIER>(ptr);
}
/// Addition
template <typename Distance>
__host__ __device__ __forceinline__ self_type operator+(Distance n) const
{
self_type retval(ptr + n);
return retval;
}
/// Addition assignment
template <typename Distance>
__host__ __device__ __forceinline__ self_type& operator+=(Distance n)
{
ptr += n;
return *this;
}
/// Subtraction
template <typename Distance>
__host__ __device__ __forceinline__ self_type operator-(Distance n) const
{
self_type retval(ptr - n);
return retval;
}
/// Subtraction assignment
template <typename Distance>
__host__ __device__ __forceinline__ self_type& operator-=(Distance n)
{
ptr -= n;
return *this;
}
/// Distance
__host__ __device__ __forceinline__ difference_type operator-(self_type other) const
{
return ptr - other.ptr;
}
/// Array subscript
template <typename Distance>
__device__ __forceinline__ reference operator[](Distance n) const
{
return ThreadLoad<MODIFIER>(ptr + n);
}
/// Structure dereference
__device__ __forceinline__ pointer operator->()
{
return &ThreadLoad<MODIFIER>(ptr);
}
/// Equal to
__host__ __device__ __forceinline__ bool operator==(const self_type& rhs)
{
return (ptr == rhs.ptr);
}
/// Not equal to
__host__ __device__ __forceinline__ bool operator!=(const self_type& rhs)
{
return (ptr != rhs.ptr);
}
#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document
/// ostream operator
friend std::ostream& operator<<(std::ostream& os, const self_type& /*itr*/)
{
return os;
}
#endif
};
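/**
 * \par Snippet (illustrative sketch, not part of the original header)
 * Wrapping a raw device pointer so that in-kernel reads are issued through
 * ThreadLoad with the requested modifier; the LOAD_CG choice and the kernel
 * below are assumptions made for illustration only.
 * \code
 * __global__ void ReadKernel(const float *d_in, float *d_out, int n)
 * {
 *     hipcub::CacheModifiedInputIterator<hipcub::LOAD_CG, float> itr(d_in);
 *     int i = blockIdx.x * blockDim.x + threadIdx.x;
 *     if (i < n)
 *         d_out[i] = itr[i];   // load routed through the LOAD_CG modifier
 * }
 * \endcode
 */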
END_HIPCUB_NAMESPACE
#endif // HIPCUB_ROCPRIM_ITERATOR_CACHE_MODIFIED_INPUT_ITERATOR_HPP_
/******************************************************************************
* Copyright (c) 2010-2011, Duane Merrill. All rights reserved.
* Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved.
* Modifications Copyright (c) 2017-2020, Advanced Micro Devices, Inc. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of the NVIDIA CORPORATION nor the
* names of its contributors may be used to endorse or promote products
* derived from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
* DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
******************************************************************************/
#ifndef HIPCUB_ROCPRIM_ITERATOR_CACHE_MODIFIED_OUTPUT_ITERATOR_HPP_
#define HIPCUB_ROCPRIM_ITERATOR_CACHE_MODIFIED_OUTPUT_ITERATOR_HPP_
#include <iterator>
#include <iostream>
#include "../thread/thread_load.cuh"
#include "../thread/thread_store.cuh"
#include "../util_type.cuh"
#if (THRUST_VERSION >= 100700)
// This iterator is compatible with Thrust API 1.7 and newer
#include <thrust/iterator/iterator_facade.h>
#include <thrust/iterator/iterator_traits.h>
#endif // THRUST_VERSION
BEGIN_HIPCUB_NAMESPACE
template <
CacheStoreModifier MODIFIER,
typename ValueType,
typename OffsetT = ptrdiff_t>
class CacheModifiedOutputIterator
{
private:
    // Proxy reference: assignment through it routes the store via ThreadStore
    struct Reference
    {
        ValueType* ptr;
        /// Constructor
        __host__ __device__ __forceinline__ Reference(ValueType* ptr) : ptr(ptr) {}
        /// Assignment applies the cache store modifier
        __device__ __forceinline__ ValueType operator=(ValueType val)
        {
            ThreadStore<MODIFIER>(ptr, val);
            return val;
        }
    };
public:
// Required iterator traits
typedef CacheModifiedOutputIterator self_type; ///< My own type
typedef OffsetT difference_type; ///< Type to express the result of subtracting one iterator from another
typedef void value_type; ///< The type of the element the iterator can point to
typedef void pointer; ///< The type of a pointer to an element the iterator can point to
typedef Reference reference; ///< The type of a reference to an element the iterator can point to
typedef std::random_access_iterator_tag iterator_category; ///< The iterator category
private:
ValueType* ptr;
public:
/// Constructor
template <typename QualifiedValueType>
__host__ __device__ __forceinline__ CacheModifiedOutputIterator(
QualifiedValueType* ptr) ///< Native pointer to wrap
:
ptr(const_cast<typename std::remove_cv<QualifiedValueType>::type *>(ptr))
{}
/// Postfix increment
__host__ __device__ __forceinline__ self_type operator++(int)
{
self_type retval = *this;
ptr++;
return retval;
}
/// Prefix increment
__host__ __device__ __forceinline__ self_type operator++()
{
ptr++;
return *this;
}
/// Indirection
__host__ __device__ __forceinline__ reference operator*() const
{
return Reference(ptr);
}
/// Addition
template <typename Distance>
__host__ __device__ __forceinline__ self_type operator+(Distance n) const
{
self_type retval(ptr + n);
return retval;
}
/// Addition assignment
template <typename Distance>
__host__ __device__ __forceinline__ self_type& operator+=(Distance n)
{
ptr += n;
return *this;
}
/// Subtraction
template <typename Distance>
__host__ __device__ __forceinline__ self_type operator-(Distance n) const
{
self_type retval(ptr - n);
return retval;
}
/// Subtraction assignment
template <typename Distance>
__host__ __device__ __forceinline__ self_type& operator-=(Distance n)
{
ptr -= n;
return *this;
}
/// Distance
__host__ __device__ __forceinline__ difference_type operator-(self_type other) const
{
return ptr - other.ptr;
}
/// Array subscript
template <typename Distance>
__host__ __device__ __forceinline__ reference operator[](Distance n) const
{
return Reference(ptr + n);
}
/// Equal to
__host__ __device__ __forceinline__ bool operator==(const self_type& rhs)
{
return (ptr == rhs.ptr);
}
/// Not equal to
__host__ __device__ __forceinline__ bool operator!=(const self_type& rhs)
{
return (ptr != rhs.ptr);
}
#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document
/// ostream operator
friend std::ostream& operator<<(std::ostream& os, const self_type& itr)
{
(void)itr;
return os;
}
#endif
};
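/**
 * \par Snippet (illustrative sketch, not part of the original header)
 * Assignment through the iterator's proxy reference issues a ThreadStore
 * with the requested modifier; the STORE_CG choice and the kernel below are
 * assumptions made for illustration only.
 * \code
 * __global__ void WriteKernel(float *d_out, int n)
 * {
 *     hipcub::CacheModifiedOutputIterator<hipcub::STORE_CG, float> itr(d_out);
 *     int i = blockIdx.x * blockDim.x + threadIdx.x;
 *     if (i < n)
 *         itr[i] = static_cast<float>(i);   // store routed through STORE_CG
 * }
 * \endcode
 */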
END_HIPCUB_NAMESPACE
#endif // HIPCUB_ROCPRIM_ITERATOR_CACHE_MODIFIED_OUTPUT_ITERATOR_HPP_