Commit 0211193c authored by zhuwenwen's avatar zhuwenwen
Browse files

initial llama

parents
Pipeline #509 failed with stages
in 0 seconds
/******************************************************************************
* Copyright (c) 2010-2011, Duane Merrill. All rights reserved.
* Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved.
* Modifications Copyright (c) 2021, Advanced Micro Devices, Inc. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of the NVIDIA CORPORATION nor the
* names of its contributors may be used to endorse or promote products
* derived from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
* DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
******************************************************************************/
#ifndef HIPCUB_ROCPRIM_BLOCK_BLOCK_RUN_LENGTH_DECODE_HPP_
#define HIPCUB_ROCPRIM_BLOCK_BLOCK_RUN_LENGTH_DECODE_HPP_
#include "../config.hpp"
#include "../thread/thread_search.cuh"
#include "../util_math.cuh"
#include "../util_ptx.cuh"
#include "../util_type.cuh"
#include "block_scan.cuh"
#include <limits>
#include <type_traits>
BEGIN_HIPCUB_NAMESPACE
/**
* \brief The BlockRunLengthDecode class supports decoding a run-length encoded array of items. That is, given
* the two arrays run_value[N] and run_lengths[N], run_value[i] is repeated run_lengths[i] many times in the output
* array.
* Due to the nature of the run-length decoding algorithm ("decompression"), the output size of the run-length decoded
* array is runtime-dependent and potentially without any upper bound. To address this, BlockRunLengthDecode allows
* retrieving a "window" from the run-length decoded array. The window's offset can be specified and BLOCK_THREADS *
* DECODED_ITEMS_PER_THREAD (i.e., referred to as window_size) decoded items from the specified window will be returned.
*
* \note: Trailing runs of length 0 are supported (i.e., they may only appear at the end of the run_lengths array).
* A run of length zero may not be followed by a run length that is not zero.
*
* \par
* \code
* __global__ void ExampleKernel(...)
* {
* // Specialising BlockRunLengthDecode to run-length decode items of type uint64_t
* using RunItemT = uint64_t;
* // Type large enough to index into the run-length decoded array
* using RunLengthT = uint32_t;
*
* // Specialising BlockRunLengthDecode for a 1D block of 128 threads
* constexpr int BLOCK_DIM_X = 128;
* // Specialising BlockRunLengthDecode to have each thread contribute 2 run-length encoded runs
* constexpr int RUNS_PER_THREAD = 2;
* // Specialising BlockRunLengthDecode to have each thread hold 4 run-length decoded items
* constexpr int DECODED_ITEMS_PER_THREAD = 4;
*
* // Specialize BlockRadixSort for a 1D block of 128 threads owning 4 integer items each
* using BlockRunLengthDecodeT =
* cub::BlockRunLengthDecode<RunItemT, BLOCK_DIM_X, RUNS_PER_THREAD, DECODED_ITEMS_PER_THREAD>;
*
* // Allocate shared memory for BlockRunLengthDecode
* __shared__ typename BlockRunLengthDecodeT::TempStorage temp_storage;
*
* // The run-length encoded items and how often they shall be repeated in the run-length decoded output
* RunItemT run_values[RUNS_PER_THREAD];
* RunLengthT run_lengths[RUNS_PER_THREAD];
* ...
*
* // Initialize the BlockRunLengthDecode with the runs that we want to run-length decode
* uint32_t total_decoded_size = 0;
* BlockRunLengthDecodeT block_rld(temp_storage, run_values, run_lengths, total_decoded_size);
*
* // Run-length decode ("decompress") the runs into a window buffer of limited size. This is repeated until all runs
* // have been decoded.
* uint32_t decoded_window_offset = 0U;
* while (decoded_window_offset < total_decoded_size)
* {
* RunLengthT relative_offsets[DECODED_ITEMS_PER_THREAD];
* RunItemT decoded_items[DECODED_ITEMS_PER_THREAD];
*
* // The number of decoded items that are valid within this window (aka pass) of run-length decoding
* uint32_t num_valid_items = total_decoded_size - decoded_window_offset;
* block_rld.RunLengthDecode(decoded_items, relative_offsets, decoded_window_offset);
*
* decoded_window_offset += BLOCK_DIM_X * DECODED_ITEMS_PER_THREAD;
*
* ...
* }
* }
* \endcode
* \par
* Suppose the set of input \p run_values across the block of threads is
* <tt>{ [0, 1], [2, 3], [4, 5], [6, 7], ..., [254, 255] }</tt> and
* \p run_lengths is <tt>{ [1, 2], [3, 4], [5, 1], [2, 3], ..., [5, 1] }</tt>.
* The corresponding output \p decoded_items in those threads will be <tt>{ [0, 1, 1, 2], [2, 2, 3, 3], [3, 3, 4, 4],
* [4, 4, 4, 5], ..., [169, 169, 170, 171] }</tt> and \p relative_offsets will be <tt>{ [0, 0, 1, 0], [1, 2, 0, 1], [2,
* 3, 0, 1], [2, 3, 4, 0], ..., [3, 4, 0, 0] }</tt> during the first iteration of the while loop.
*
* \tparam ItemT The data type of the items being run-length decoded
* \tparam BLOCK_DIM_X The thread block length in threads along the X dimension
* \tparam RUNS_PER_THREAD The number of consecutive runs that each thread contributes
* \tparam DECODED_ITEMS_PER_THREAD The maximum number of decoded items that each thread holds
* \tparam DecodedOffsetT Type used to index into the block's decoded items (large enough to hold the sum over all the
* runs' lengths)
* \tparam BLOCK_DIM_Y The thread block length in threads along the Y dimension
* \tparam BLOCK_DIM_Z The thread block length in threads along the Z dimension
*/
template <typename ItemT,
int BLOCK_DIM_X,
int RUNS_PER_THREAD,
int DECODED_ITEMS_PER_THREAD,
typename DecodedOffsetT = uint32_t,
int BLOCK_DIM_Y = 1,
int BLOCK_DIM_Z = 1>
class BlockRunLengthDecode
{
//---------------------------------------------------------------------
// CONFIGS & TYPE ALIASES
//---------------------------------------------------------------------
private:
/// The thread block size in threads
static constexpr int BLOCK_THREADS = BLOCK_DIM_X * BLOCK_DIM_Y * BLOCK_DIM_Z;
/// The number of runs that the block decodes (out-of-bounds items may be padded with run lengths of '0')
static constexpr int BLOCK_RUNS = BLOCK_THREADS * RUNS_PER_THREAD;
/// BlockScan used to determine the beginning of each run (i.e., prefix sum over the runs' length)
using RunOffsetScanT = BlockScan<DecodedOffsetT, BLOCK_DIM_X, BLOCK_SCAN_WARP_SCANS, BLOCK_DIM_Y, BLOCK_DIM_Z>;
/// Type used to index into the block's runs
using RunOffsetT = uint32_t;
/// Shared memory type required by this thread block.
/// NOTE: the scan storage and the runs storage are a union — the prefix scan's
/// temporary storage is only needed during InitWithRunLengths, before the runs
/// arrays are written (separated by a CTA_SYNC), so the aliasing is safe.
union _TempStorage
{
typename RunOffsetScanT::TempStorage offset_scan;
struct
{
ItemT run_values[BLOCK_RUNS];
DecodedOffsetT run_offsets[BLOCK_RUNS];
} runs;
}; // union TempStorage
/// Internal storage allocator (used when the user does not provide pre-allocated shared memory)
HIPCUB_DEVICE __forceinline__ _TempStorage &PrivateStorage()
{
__shared__ _TempStorage private_storage;
return private_storage;
}
/// Shared storage reference
_TempStorage &temp_storage;
/// Linear thread-id
uint32_t linear_tid;
public:
/// \brief The operations exposed by BlockRunLengthDecode require a temporary memory allocation of
/// this nested type for thread communication (may alias other per-block allocations).
struct TempStorage : Uninitialized<_TempStorage>
{
};
//---------------------------------------------------------------------
// CONSTRUCTOR
//---------------------------------------------------------------------
/**
* \brief Constructor specialised for user-provided temporary storage, initializing using the runs' lengths. The
* algorithm's temporary storage may not be repurposed between the constructor call and subsequent
* <b>RunLengthDecode</b> calls.
*/
template <typename RunLengthT, typename TotalDecodedSizeT>
HIPCUB_DEVICE __forceinline__ BlockRunLengthDecode(TempStorage &temp_storage,
ItemT (&run_values)[RUNS_PER_THREAD],
RunLengthT (&run_lengths)[RUNS_PER_THREAD],
TotalDecodedSizeT &total_decoded_size)
: temp_storage(temp_storage.Alias()), linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z))
{
InitWithRunLengths(run_values, run_lengths, total_decoded_size);
}
/**
* \brief Constructor specialised for user-provided temporary storage, initializing using the runs' offsets. The
* algorithm's temporary storage may not be repurposed between the constructor call and subsequent
* <b>RunLengthDecode</b> calls.
*/
template <typename UserRunOffsetT>
HIPCUB_DEVICE __forceinline__ BlockRunLengthDecode(TempStorage &temp_storage,
ItemT (&run_values)[RUNS_PER_THREAD],
UserRunOffsetT (&run_offsets)[RUNS_PER_THREAD])
: temp_storage(temp_storage.Alias()), linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z))
{
InitWithRunOffsets(run_values, run_offsets);
}
/**
* \brief Constructor specialised for static temporary storage, initializing using the runs' lengths.
*/
template <typename RunLengthT, typename TotalDecodedSizeT>
HIPCUB_DEVICE __forceinline__ BlockRunLengthDecode(ItemT (&run_values)[RUNS_PER_THREAD],
RunLengthT (&run_lengths)[RUNS_PER_THREAD],
TotalDecodedSizeT &total_decoded_size)
: temp_storage(PrivateStorage()), linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z))
{
InitWithRunLengths(run_values, run_lengths, total_decoded_size);
}
/**
* \brief Constructor specialised for static temporary storage, initializing using the runs' offsets.
*/
template <typename UserRunOffsetT>
HIPCUB_DEVICE __forceinline__ BlockRunLengthDecode(ItemT (&run_values)[RUNS_PER_THREAD],
UserRunOffsetT (&run_offsets)[RUNS_PER_THREAD])
: temp_storage(PrivateStorage()), linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z))
{
InitWithRunOffsets(run_values, run_offsets);
}
private:
/**
* \brief Returns the offset of the first value within \p input which compares greater than \p val. This version takes
* \p MAX_NUM_ITEMS, an upper bound of the array size, which will be used to determine the number of binary search
* iterations at compile time.
*
* \note \p input must be sorted in non-decreasing order for the result to be meaningful
* (it holds the exclusive prefix sum of the runs' lengths here).
*/
template <int MAX_NUM_ITEMS,
typename InputIteratorT,
typename OffsetT,
typename T>
HIPCUB_DEVICE __forceinline__ OffsetT StaticUpperBound(InputIteratorT input, ///< [in] Input sequence
OffsetT num_items, ///< [in] Input sequence length
T val) ///< [in] Search key
{
OffsetT lower_bound = 0;
OffsetT upper_bound = num_items;
#pragma unroll
for (int i = 0; i <= Log2<MAX_NUM_ITEMS>::VALUE; i++)
{
OffsetT mid = cub::MidPoint<OffsetT>(lower_bound, upper_bound);
// Clamp so that the unconditional body below never reads past the end of the sequence
mid = (rocprim::min)(mid, num_items - 1);
if (val < input[mid])
{
upper_bound = mid;
}
else
{
lower_bound = mid + 1;
}
}
return lower_bound;
}
/**
* \brief Stores each thread's \p run_values and \p run_offsets (the exclusive prefix sum over the runs'
* lengths) into the block's shared-memory staging arrays in a blocked arrangement, followed by a
* block-wide barrier so subsequent RunLengthDecode calls can read any run.
*/
template <typename RunOffsetT>
HIPCUB_DEVICE __forceinline__ void InitWithRunOffsets(ItemT (&run_values)[RUNS_PER_THREAD],
RunOffsetT (&run_offsets)[RUNS_PER_THREAD])
{
// Keep the runs' items and the offsets of each run's beginning in the temporary storage
RunOffsetT thread_dst_offset = static_cast<RunOffsetT>(linear_tid) * static_cast<RunOffsetT>(RUNS_PER_THREAD);
#pragma unroll
for (int i = 0; i < RUNS_PER_THREAD; i++)
{
temp_storage.runs.run_values[thread_dst_offset] = run_values[i];
temp_storage.runs.run_offsets[thread_dst_offset] = run_offsets[i];
thread_dst_offset++;
}
// Ensure run offsets and run values have been written to shared memory
CTA_SYNC();
}
/**
* \brief Computes the runs' begin offsets from the given \p run_lengths via a block-wide exclusive
* prefix sum, returns the total decoded size in \p total_decoded_size, and delegates to
* InitWithRunOffsets to populate shared memory.
*/
template <typename RunLengthT, typename TotalDecodedSizeT>
HIPCUB_DEVICE __forceinline__ void InitWithRunLengths(ItemT (&run_values)[RUNS_PER_THREAD],
RunLengthT (&run_lengths)[RUNS_PER_THREAD],
TotalDecodedSizeT &total_decoded_size)
{
// Compute the offset for the beginning of each run
DecodedOffsetT run_offsets[RUNS_PER_THREAD];
#pragma unroll
for (int i = 0; i < RUNS_PER_THREAD; i++)
{
run_offsets[i] = static_cast<DecodedOffsetT>(run_lengths[i]);
}
DecodedOffsetT decoded_size_aggregate;
RunOffsetScanT(this->temp_storage.offset_scan).ExclusiveSum(run_offsets, run_offsets, decoded_size_aggregate);
total_decoded_size = static_cast<TotalDecodedSizeT>(decoded_size_aggregate);
// Ensure the prefix scan's temporary storage can be reused (may be superfluous, but depends on scan implementation)
CTA_SYNC();
InitWithRunOffsets(run_values, run_offsets);
}
public:
/**
* \brief Run-length decodes the runs previously passed via a call to Init(...) and returns the run-length decoded
* items in a blocked arrangement to \p decoded_items. If the number of run-length decoded items exceeds the
* run-length decode buffer (i.e., <b>DECODED_ITEMS_PER_THREAD * BLOCK_THREADS</b>), only the items that fit within
* the buffer are returned. Subsequent calls to <b>RunLengthDecode</b> adjusting \p from_decoded_offset can be
* used to retrieve the remaining run-length decoded items. Calling __syncthreads() between any two calls to
* <b>RunLengthDecode</b> is not required.
* \p item_offsets can be used to retrieve each run-length decoded item's relative index within its run. E.g., the
* run-length encoded array of `3, 1, 4` with the respective run lengths of `2, 1, 3` would yield the run-length
* decoded array of `3, 3, 1, 4, 4, 4` with the relative offsets of `0, 1, 0, 0, 1, 2`.
* \smemreuse
*
* \param[out] decoded_items The run-length decoded items to be returned in a blocked arrangement
* \param[out] item_offsets The run-length decoded items' relative offset within the run they belong to
* \param[in] from_decoded_offset If invoked with from_decoded_offset that is larger than total_decoded_size results
* in undefined behavior.
*/
template <typename RelativeOffsetT>
HIPCUB_DEVICE __forceinline__ void RunLengthDecode(ItemT (&decoded_items)[DECODED_ITEMS_PER_THREAD],
RelativeOffsetT (&item_offsets)[DECODED_ITEMS_PER_THREAD],
DecodedOffsetT from_decoded_offset = 0)
{
// The (global) offset of the first item decoded by this thread
DecodedOffsetT thread_decoded_offset = from_decoded_offset + linear_tid * DECODED_ITEMS_PER_THREAD;
// The run that the first decoded item of this thread belongs to
// If this thread's <thread_decoded_offset> is already beyond the total decoded size, it will be assigned to the
// last run
RunOffsetT assigned_run =
StaticUpperBound<BLOCK_RUNS>(temp_storage.runs.run_offsets, BLOCK_RUNS, thread_decoded_offset) -
static_cast<RunOffsetT>(1U);
DecodedOffsetT assigned_run_begin = temp_storage.runs.run_offsets[assigned_run];
// If this thread is getting assigned the last run, we make sure it will not fetch any other run after this
DecodedOffsetT assigned_run_end = (assigned_run == BLOCK_RUNS - 1)
? thread_decoded_offset + DECODED_ITEMS_PER_THREAD
: temp_storage.runs.run_offsets[assigned_run + 1];
ItemT val = temp_storage.runs.run_values[assigned_run];
#pragma unroll
for (DecodedOffsetT i = 0; i < DECODED_ITEMS_PER_THREAD; i++)
{
decoded_items[i] = val;
item_offsets[i] = thread_decoded_offset - assigned_run_begin;
// Advance to the next run when the current item is the last one of the assigned run
if (thread_decoded_offset == assigned_run_end - 1)
{
// We make sure that a thread is not re-entering this conditional when being assigned to the last run already by
// extending the last run's length to all the thread's item
assigned_run++;
assigned_run_begin = temp_storage.runs.run_offsets[assigned_run];
// If this thread is getting assigned the last run, we make sure it will not fetch any other run after this
assigned_run_end = (assigned_run == BLOCK_RUNS - 1) ? thread_decoded_offset + DECODED_ITEMS_PER_THREAD
: temp_storage.runs.run_offsets[assigned_run + 1];
val = temp_storage.runs.run_values[assigned_run];
}
thread_decoded_offset++;
}
}
/**
* \brief Run-length decodes the runs previously passed via a call to Init(...) and returns the run-length decoded
* items in a blocked arrangement to \p decoded_items. If the number of run-length decoded items exceeds the
* run-length decode buffer (i.e., <b>DECODED_ITEMS_PER_THREAD * BLOCK_THREADS</b>), only the items that fit within
* the buffer are returned. Subsequent calls to <b>RunLengthDecode</b> adjusting \p from_decoded_offset can be
* used to retrieve the remaining run-length decoded items. Calling __syncthreads() between any two calls to
* <b>RunLengthDecode</b> is not required.
*
* \param[out] decoded_items The run-length decoded items to be returned in a blocked arrangement
* \param[in] from_decoded_offset If invoked with from_decoded_offset that is larger than total_decoded_size results
* in undefined behavior.
*/
HIPCUB_DEVICE __forceinline__ void RunLengthDecode(ItemT (&decoded_items)[DECODED_ITEMS_PER_THREAD],
DecodedOffsetT from_decoded_offset = 0)
{
// Convenience overload: the per-item relative offsets are computed but discarded
DecodedOffsetT item_offsets[DECODED_ITEMS_PER_THREAD];
RunLengthDecode(decoded_items, item_offsets, from_decoded_offset);
}
};
END_HIPCUB_NAMESPACE
#endif // HIPCUB_ROCPRIM_BLOCK_BLOCK_RUN_LENGTH_DECODE_HPP_
/******************************************************************************
* Copyright (c) 2010-2011, Duane Merrill. All rights reserved.
* Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved.
* Modifications Copyright (c) 2017-2020, Advanced Micro Devices, Inc. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of the NVIDIA CORPORATION nor the
* names of its contributors may be used to endorse or promote products
* derived from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
* DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
******************************************************************************/
#ifndef HIPCUB_ROCPRIM_BLOCK_BLOCK_SCAN_HPP_
#define HIPCUB_ROCPRIM_BLOCK_BLOCK_SCAN_HPP_
#include <type_traits>
#include "../config.hpp"
#include "../thread/thread_operators.cuh"
#include <cub/rocprim/block/block_scan.hpp>
BEGIN_HIPCUB_NAMESPACE
namespace detail
{
// Maps a rocprim block-scan algorithm enumerator onto its underlying integral
// value, so it can serve as an initializer for the BlockScanAlgorithm enum below.
inline constexpr
typename std::underlying_type<::rocprim::block_scan_algorithm>::type
to_BlockScanAlgorithm_enum(::rocprim::block_scan_algorithm v)
{
    return static_cast<typename std::underlying_type<::rocprim::block_scan_algorithm>::type>(v);
}
}
/// \brief hipCUB's counterparts to CUB's block-scan algorithm selectors, expressed
/// as the integral values of the corresponding rocPRIM algorithms.
/// NOTE: BLOCK_SCAN_RAKING and BLOCK_SCAN_RAKING_MEMOIZE both map to
/// rocprim's reduce_then_scan — rocPRIM provides no separate memoized variant,
/// so the two enumerators intentionally share a value.
enum BlockScanAlgorithm
{
BLOCK_SCAN_RAKING
= detail::to_BlockScanAlgorithm_enum(::rocprim::block_scan_algorithm::reduce_then_scan),
BLOCK_SCAN_RAKING_MEMOIZE
= detail::to_BlockScanAlgorithm_enum(::rocprim::block_scan_algorithm::reduce_then_scan),
BLOCK_SCAN_WARP_SCANS
= detail::to_BlockScanAlgorithm_enum(::rocprim::block_scan_algorithm::using_warp_scan)
};
/// \brief CUB-compatible BlockScan facade implemented as a thin wrapper over
/// ::rocprim::block_scan. Each member forwards to the corresponding rocprim
/// inclusive_scan/exclusive_scan overload, passing the temporary storage
/// reference held by this object. The \p ARCH parameter is accepted for
/// interface compatibility only and is ignored.
template<
typename T,
int BLOCK_DIM_X,
BlockScanAlgorithm ALGORITHM = BLOCK_SCAN_RAKING,
int BLOCK_DIM_Y = 1,
int BLOCK_DIM_Z = 1,
int ARCH = HIPCUB_ARCH /* ignored */
>
class BlockScan
: private ::rocprim::block_scan<
T,
BLOCK_DIM_X,
static_cast<::rocprim::block_scan_algorithm>(ALGORITHM),
BLOCK_DIM_Y,
BLOCK_DIM_Z
>
{
static_assert(
BLOCK_DIM_X * BLOCK_DIM_Y * BLOCK_DIM_Z > 0,
"BLOCK_DIM_X * BLOCK_DIM_Y * BLOCK_DIM_Z must be greater than 0"
);
using base_type =
typename ::rocprim::block_scan<
T,
BLOCK_DIM_X,
static_cast<::rocprim::block_scan_algorithm>(ALGORITHM),
BLOCK_DIM_Y,
BLOCK_DIM_Z
>;
// Reference to temporary storage (usually shared memory)
typename base_type::storage_type& temp_storage_;
public:
/// \brief The temporary storage type required by the underlying rocprim scan.
using TempStorage = typename base_type::storage_type;
/// \brief Constructor using private (function-local __shared__) storage.
HIPCUB_DEVICE inline
BlockScan() : temp_storage_(private_storage())
{
}
/// \brief Constructor using caller-provided temporary storage.
HIPCUB_DEVICE inline
BlockScan(TempStorage& temp_storage) : temp_storage_(temp_storage)
{
}
/// \brief Block-wide inclusive prefix sum over one item per thread.
HIPCUB_DEVICE inline
void InclusiveSum(T input, T& output)
{
base_type::inclusive_scan(input, output, temp_storage_);
}
/// \brief Inclusive prefix sum; also returns the block-wide aggregate to every thread.
HIPCUB_DEVICE inline
void InclusiveSum(T input, T& output, T& block_aggregate)
{
base_type::inclusive_scan(input, output, block_aggregate, temp_storage_);
}
/// \brief Inclusive prefix sum seeded by a block-prefix callback functor
/// (invoked by the first warp to obtain the running prefix).
template<typename BlockPrefixCallbackOp>
HIPCUB_DEVICE inline
void InclusiveSum(T input, T& output, BlockPrefixCallbackOp& block_prefix_callback_op)
{
base_type::inclusive_scan(
input, output, temp_storage_, block_prefix_callback_op, ::cub::Sum()
);
}
/// \brief Inclusive prefix sum over ITEMS_PER_THREAD items per thread (blocked arrangement).
template<int ITEMS_PER_THREAD>
HIPCUB_DEVICE inline
void InclusiveSum(T(&input)[ITEMS_PER_THREAD], T(&output)[ITEMS_PER_THREAD])
{
base_type::inclusive_scan(input, output, temp_storage_);
}
/// \brief Multi-item inclusive prefix sum; also returns the block-wide aggregate.
template<int ITEMS_PER_THREAD>
HIPCUB_DEVICE inline
void InclusiveSum(T(&input)[ITEMS_PER_THREAD], T(&output)[ITEMS_PER_THREAD],
T& block_aggregate)
{
base_type::inclusive_scan(input, output, block_aggregate, temp_storage_);
}
/// \brief Multi-item inclusive prefix sum seeded by a block-prefix callback functor.
template<int ITEMS_PER_THREAD, typename BlockPrefixCallbackOp>
HIPCUB_DEVICE inline
void InclusiveSum(T(&input)[ITEMS_PER_THREAD], T(&output)[ITEMS_PER_THREAD],
BlockPrefixCallbackOp& block_prefix_callback_op)
{
base_type::inclusive_scan(
input, output, temp_storage_, block_prefix_callback_op, ::cub::Sum()
);
}
/// \brief Inclusive scan with a user-supplied binary scan operator.
template<typename ScanOp>
HIPCUB_DEVICE inline
void InclusiveScan(T input, T& output, ScanOp scan_op)
{
base_type::inclusive_scan(input, output, temp_storage_, scan_op);
}
/// \brief Inclusive scan with a user operator; also returns the block-wide aggregate.
template<typename ScanOp>
HIPCUB_DEVICE inline
void InclusiveScan(T input, T& output, ScanOp scan_op, T& block_aggregate)
{
base_type::inclusive_scan(input, output, block_aggregate, temp_storage_, scan_op);
}
/// \brief Inclusive scan with a user operator, seeded by a block-prefix callback functor.
template<typename ScanOp, typename BlockPrefixCallbackOp>
HIPCUB_DEVICE inline
void InclusiveScan(T input, T& output, ScanOp scan_op, BlockPrefixCallbackOp& block_prefix_callback_op)
{
base_type::inclusive_scan(
input, output, temp_storage_, block_prefix_callback_op, scan_op
);
}
/// \brief Multi-item inclusive scan with a user-supplied binary scan operator.
template<int ITEMS_PER_THREAD, typename ScanOp>
HIPCUB_DEVICE inline
void InclusiveScan(T(&input)[ITEMS_PER_THREAD], T(&output)[ITEMS_PER_THREAD], ScanOp scan_op)
{
base_type::inclusive_scan(input, output, temp_storage_, scan_op);
}
/// \brief Multi-item inclusive scan with a user operator; also returns the block-wide aggregate.
template<int ITEMS_PER_THREAD, typename ScanOp>
HIPCUB_DEVICE inline
void InclusiveScan(T(&input)[ITEMS_PER_THREAD], T(&output)[ITEMS_PER_THREAD],
ScanOp scan_op, T& block_aggregate)
{
base_type::inclusive_scan(input, output, block_aggregate, temp_storage_, scan_op);
}
/// \brief Multi-item inclusive scan with a user operator, seeded by a block-prefix callback functor.
template<int ITEMS_PER_THREAD, typename ScanOp, typename BlockPrefixCallbackOp>
HIPCUB_DEVICE inline
void InclusiveScan(T(&input)[ITEMS_PER_THREAD], T(&output)[ITEMS_PER_THREAD],
ScanOp scan_op, BlockPrefixCallbackOp& block_prefix_callback_op)
{
base_type::inclusive_scan(
input, output, temp_storage_, block_prefix_callback_op, scan_op
);
}
/// \brief Block-wide exclusive prefix sum over one item per thread (identity T(0)).
HIPCUB_DEVICE inline
void ExclusiveSum(T input, T& output)
{
base_type::exclusive_scan(input, output, T(0), temp_storage_);
}
/// \brief Exclusive prefix sum; also returns the block-wide aggregate to every thread.
HIPCUB_DEVICE inline
void ExclusiveSum(T input, T& output, T& block_aggregate)
{
base_type::exclusive_scan(input, output, T(0), block_aggregate, temp_storage_);
}
/// \brief Exclusive prefix sum seeded by a block-prefix callback functor
/// (no explicit initial value — the callback supplies the running prefix).
template<typename BlockPrefixCallbackOp>
HIPCUB_DEVICE inline
void ExclusiveSum(T input, T& output, BlockPrefixCallbackOp& block_prefix_callback_op)
{
base_type::exclusive_scan(
input, output, temp_storage_, block_prefix_callback_op, ::cub::Sum()
);
}
/// \brief Exclusive prefix sum over ITEMS_PER_THREAD items per thread (blocked arrangement).
template<int ITEMS_PER_THREAD>
HIPCUB_DEVICE inline
void ExclusiveSum(T(&input)[ITEMS_PER_THREAD], T(&output)[ITEMS_PER_THREAD])
{
base_type::exclusive_scan(input, output, T(0), temp_storage_);
}
/// \brief Multi-item exclusive prefix sum; also returns the block-wide aggregate.
template<int ITEMS_PER_THREAD>
HIPCUB_DEVICE inline
void ExclusiveSum(T(&input)[ITEMS_PER_THREAD], T(&output)[ITEMS_PER_THREAD],
T& block_aggregate)
{
base_type::exclusive_scan(input, output, T(0), block_aggregate, temp_storage_);
}
/// \brief Multi-item exclusive prefix sum seeded by a block-prefix callback functor.
template<int ITEMS_PER_THREAD, typename BlockPrefixCallbackOp>
HIPCUB_DEVICE inline
void ExclusiveSum(T(&input)[ITEMS_PER_THREAD], T(&output)[ITEMS_PER_THREAD],
BlockPrefixCallbackOp& block_prefix_callback_op)
{
base_type::exclusive_scan(
input, output, temp_storage_, block_prefix_callback_op, ::cub::Sum()
);
}
/// \brief Exclusive scan with an explicit initial value and a user-supplied operator.
template<typename ScanOp>
HIPCUB_DEVICE inline
void ExclusiveScan(T input, T& output, T initial_value, ScanOp scan_op)
{
base_type::exclusive_scan(input, output, initial_value, temp_storage_, scan_op);
}
/// \brief Exclusive scan with an initial value and user operator; also returns the block-wide aggregate.
template<typename ScanOp>
HIPCUB_DEVICE inline
void ExclusiveScan(T input, T& output, T initial_value,
ScanOp scan_op, T& block_aggregate)
{
base_type::exclusive_scan(
input, output, initial_value, block_aggregate, temp_storage_, scan_op
);
}
/// \brief Exclusive scan with a user operator, seeded by a block-prefix callback functor.
template<typename ScanOp, typename BlockPrefixCallbackOp>
HIPCUB_DEVICE inline
void ExclusiveScan(T input, T& output, ScanOp scan_op,
BlockPrefixCallbackOp& block_prefix_callback_op)
{
base_type::exclusive_scan(
input, output, temp_storage_, block_prefix_callback_op, scan_op
);
}
/// \brief Multi-item exclusive scan with an explicit initial value and user operator.
template<int ITEMS_PER_THREAD, typename ScanOp>
HIPCUB_DEVICE inline
void ExclusiveScan(T(&input)[ITEMS_PER_THREAD], T(&output)[ITEMS_PER_THREAD],
T initial_value, ScanOp scan_op)
{
base_type::exclusive_scan(input, output, initial_value, temp_storage_, scan_op);
}
/// \brief Multi-item exclusive scan with an initial value and user operator; also returns the aggregate.
template<int ITEMS_PER_THREAD, typename ScanOp>
HIPCUB_DEVICE inline
void ExclusiveScan(T(&input)[ITEMS_PER_THREAD], T(&output)[ITEMS_PER_THREAD],
T initial_value, ScanOp scan_op, T& block_aggregate)
{
base_type::exclusive_scan(
input, output, initial_value, block_aggregate, temp_storage_, scan_op
);
}
/// \brief Multi-item exclusive scan with a user operator, seeded by a block-prefix callback functor.
template<int ITEMS_PER_THREAD, typename ScanOp, typename BlockPrefixCallbackOp>
HIPCUB_DEVICE inline
void ExclusiveScan(T(&input)[ITEMS_PER_THREAD], T(&output)[ITEMS_PER_THREAD],
ScanOp scan_op, BlockPrefixCallbackOp& block_prefix_callback_op)
{
base_type::exclusive_scan(
input, output, temp_storage_, block_prefix_callback_op, scan_op
);
}
private:
/// \brief Function-local shared-memory fallback used by the default constructor.
HIPCUB_DEVICE inline
TempStorage& private_storage()
{
HIPCUB_SHARED_MEMORY TempStorage private_storage;
return private_storage;
}
};
END_HIPCUB_NAMESPACE
#endif // HIPCUB_ROCPRIM_BLOCK_BLOCK_SCAN_HPP_
/******************************************************************************
* Copyright (c) 2010-2011, Duane Merrill. All rights reserved.
* Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved.
* Modifications Copyright (c) 2017-2020, Advanced Micro Devices, Inc. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of the NVIDIA CORPORATION nor the
* names of its contributors may be used to endorse or promote products
* derived from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
* DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
******************************************************************************/
#ifndef HIPCUB_ROCPRIM_BLOCK_BLOCK_SHUFFLE_HPP_
#define HIPCUB_ROCPRIM_BLOCK_BLOCK_SHUFFLE_HPP_
#include <type_traits>
#include "../config.hpp"
#include "../thread/thread_operators.cuh"
#include <cub/rocprim/block/block_shuffle.hpp>
BEGIN_HIPCUB_NAMESPACE
template <
typename T,
int BLOCK_DIM_X,
int BLOCK_DIM_Y = 1,
int BLOCK_DIM_Z = 1,
int ARCH = HIPCUB_ARCH>
class BlockShuffle : public ::rocprim::block_shuffle<
T,
BLOCK_DIM_X,
BLOCK_DIM_Y,
BLOCK_DIM_Z>
{
static_assert(
BLOCK_DIM_X * BLOCK_DIM_Y * BLOCK_DIM_Z > 0,
"BLOCK_DIM_X * BLOCK_DIM_Y * BLOCK_DIM_Z must be greater than 0"
);
using base_type =
typename ::rocprim::block_shuffle<
T,
BLOCK_DIM_X,
BLOCK_DIM_Y,
BLOCK_DIM_Z
>;
// Reference to temporary storage (usually shared memory)
typename base_type::storage_type& temp_storage_;
public:
using TempStorage = typename base_type::storage_type;
HIPCUB_DEVICE inline
BlockShuffle() : temp_storage_(private_storage())
{}
HIPCUB_DEVICE inline
BlockShuffle(TempStorage &temp_storage) ///< [in] Reference to memory allocation having layout type TempStorage
: temp_storage_(temp_storage)
{}
/**
* \brief Each <em>thread<sub>i</sub></em> obtains the \p input provided by <em>thread</em><sub><em>i</em>+<tt>distance</tt></sub>. The offset \p distance may be negative.
*
* \par
* - \smemreuse
*/
HIPCUB_DEVICE inline void Offset(
T input, ///< [in] The input item from the calling thread (<em>thread<sub>i</sub></em>)
T& output, ///< [out] The \p input item from the successor (or predecessor) thread <em>thread</em><sub><em>i</em>+<tt>distance</tt></sub> (may be aliased to \p input). This value is only updated for for <em>thread<sub>i</sub></em> when 0 <= (<em>i</em> + \p distance) < <tt>BLOCK_THREADS-1</tt>
int distance = 1) ///< [in] Offset distance (may be negative)
{
base_type::offset(input,output,distance);
}
/**
* \brief Each <em>thread<sub>i</sub></em> obtains the \p input provided by <em>thread</em><sub><em>i</em>+<tt>distance</tt></sub>.
*
* \par
* - \smemreuse
*/
HIPCUB_DEVICE inline void Rotate(
T input, ///< [in] The calling thread's input item
T& output, ///< [out] The \p input item from thread <em>thread</em><sub>(<em>i</em>+<tt>distance></tt>)%<tt><BLOCK_THREADS></tt></sub> (may be aliased to \p input). This value is not updated for <em>thread</em><sub>BLOCK_THREADS-1</sub>
unsigned int distance = 1) ///< [in] Offset distance (0 < \p distance < <tt>BLOCK_THREADS</tt>)
{
base_type::rotate(input,output,distance);
}
/**
* \brief The thread block rotates its [<em>blocked arrangement</em>](index.html#sec5sec3) of \p input items, shifting it up by one item
*
* \par
* - \blocked
* - \granularity
* - \smemreuse
*/
template <int ITEMS_PER_THREAD>
HIPCUB_DEVICE inline void Up(
T (&input)[ITEMS_PER_THREAD], ///< [in] The calling thread's input items
T (&prev)[ITEMS_PER_THREAD]) ///< [out] The corresponding predecessor items (may be aliased to \p input). The item \p prev[0] is not updated for <em>thread</em><sub>0</sub>.
{
base_type::up(input,prev);
}
/**
* \brief The thread block rotates its [<em>blocked arrangement</em>](index.html#sec5sec3) of \p input items, shifting it up by one item. All threads receive the \p input provided by <em>thread</em><sub><tt>BLOCK_THREADS-1</tt></sub>.
*
* \par
* - \blocked
* - \granularity
* - \smemreuse
*/
template <int ITEMS_PER_THREAD>
HIPCUB_DEVICE inline void Up(
T (&input)[ITEMS_PER_THREAD], ///< [in] The calling thread's input items
T (&prev)[ITEMS_PER_THREAD], ///< [out] The corresponding predecessor items (may be aliased to \p input). The item \p prev[0] is not updated for <em>thread</em><sub>0</sub>.
T &block_suffix) ///< [out] The item \p input[ITEMS_PER_THREAD-1] from <em>thread</em><sub><tt>BLOCK_THREADS-1</tt></sub>, provided to all threads
{
// Thin forwarder to rocPRIM; block_suffix is produced by the base implementation.
base_type::up(input,prev,block_suffix);
}
/**
* \brief The thread block rotates its [<em>blocked arrangement</em>](index.html#sec5sec3) of \p input items, shifting it down by one item
*
* \par
* - \blocked
* - \granularity
* - \smemreuse
*/
template <int ITEMS_PER_THREAD>
HIPCUB_DEVICE inline void Down(
T (&input)[ITEMS_PER_THREAD], ///< [in] The calling thread's input items
T (&next)[ITEMS_PER_THREAD]) ///< [out] The corresponding successor items (may be aliased to \p input). The value \p next[0] is not updated for <em>thread</em><sub>BLOCK_THREADS-1</sub>.
{
// Thin forwarder to rocPRIM's blocked shift-down implementation.
base_type::down(input,next);
}
/**
* \brief The thread block rotates its [<em>blocked arrangement</em>](index.html#sec5sec3) of input items, shifting it down by one item. All threads receive \p input[0] provided by <em>thread</em><sub><tt>0</tt></sub>.
*
* \par
* - \blocked
* - \granularity
* - \smemreuse
*/
template <int ITEMS_PER_THREAD>
HIPCUB_DEVICE inline void Down(
T (&input)[ITEMS_PER_THREAD], ///< [in] The calling thread's input items
T (&next)[ITEMS_PER_THREAD], ///< [out] The corresponding successor items (may be aliased to \p input). The value \p next[0] is not updated for <em>thread</em><sub>BLOCK_THREADS-1</sub>.
T &block_prefix) ///< [out] The item \p input[0] from <em>thread</em><sub><tt>0</tt></sub>, provided to all threads
{
// Thin forwarder to rocPRIM; block_prefix is produced by the base implementation.
base_type::down(input,next,block_prefix);
}
private:
// Returns a reference to a function-scope __shared__ TempStorage instance,
// used when the caller did not pass temporary storage to the constructor.
// All threads of the block receive a reference to the same object.
HIPCUB_DEVICE inline
TempStorage& private_storage()
{
HIPCUB_SHARED_MEMORY TempStorage private_storage;
return private_storage;
}
};
END_HIPCUB_NAMESPACE
#endif // HIPCUB_ROCPRIM_BLOCK_BLOCK_SHUFFLE_HPP_
/******************************************************************************
* Copyright (c) 2010-2011, Duane Merrill. All rights reserved.
* Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved.
* Modifications Copyright (c) 2017-2020, Advanced Micro Devices, Inc. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of the NVIDIA CORPORATION nor the
* names of its contributors may be used to endorse or promote products
* derived from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
* DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
******************************************************************************/
#ifndef HIPCUB_ROCPRIM_BLOCK_BLOCK_STORE_HPP_
#define HIPCUB_ROCPRIM_BLOCK_BLOCK_STORE_HPP_
#include <type_traits>
#include "../config.hpp"
#include "block_store_func.hpp"
#include <cub/rocprim/block/block_store.hpp>
BEGIN_HIPCUB_NAMESPACE
namespace detail
{

/// Converts a rocPRIM ::rocprim::block_store_method enumerator to its
/// underlying integer type, so that it can seed the matching hipCUB
/// BlockStoreAlgorithm enumerator below.
inline constexpr
typename std::underlying_type<::rocprim::block_store_method>::type
to_BlockStoreAlgorithm_enum(::rocprim::block_store_method v)
{
    return static_cast<typename std::underlying_type<::rocprim::block_store_method>::type>(v);
}

}
// Block-store algorithm selector for BlockStore.  Each enumerator's value is
// taken directly from the corresponding rocPRIM block_store_method enumerator.
enum BlockStoreAlgorithm
{
BLOCK_STORE_DIRECT
= detail::to_BlockStoreAlgorithm_enum(::rocprim::block_store_method::block_store_direct),
BLOCK_STORE_STRIPED
= detail::to_BlockStoreAlgorithm_enum(::rocprim::block_store_method::block_store_striped),
BLOCK_STORE_VECTORIZE
= detail::to_BlockStoreAlgorithm_enum(::rocprim::block_store_method::block_store_vectorize),
BLOCK_STORE_TRANSPOSE
= detail::to_BlockStoreAlgorithm_enum(::rocprim::block_store_method::block_store_transpose),
BLOCK_STORE_WARP_TRANSPOSE
= detail::to_BlockStoreAlgorithm_enum(::rocprim::block_store_method::block_store_warp_transpose),
// NOTE: rocPRIM has no time-sliced warp-transpose method, so this enumerator
// deliberately aliases BLOCK_STORE_WARP_TRANSPOSE (same underlying value).
BLOCK_STORE_WARP_TRANSPOSE_TIMESLICED
= detail::to_BlockStoreAlgorithm_enum(::rocprim::block_store_method::block_store_warp_transpose)
};
// BlockStore: CUB-compatible facade over ::rocprim::block_store.
// Template parameters mirror CUB's BlockStore; ARCH is accepted for API
// compatibility but ignored (see the parameter comment below).
template<
typename T,
int BLOCK_DIM_X,
int ITEMS_PER_THREAD,
BlockStoreAlgorithm ALGORITHM = BLOCK_STORE_DIRECT,
int BLOCK_DIM_Y = 1,
int BLOCK_DIM_Z = 1,
int ARCH = HIPCUB_ARCH /* ignored */
>
class BlockStore
: private ::rocprim::block_store<
T,
BLOCK_DIM_X,
ITEMS_PER_THREAD,
static_cast<::rocprim::block_store_method>(ALGORITHM),
BLOCK_DIM_Y,
BLOCK_DIM_Z
>
{
static_assert(
BLOCK_DIM_X * BLOCK_DIM_Y * BLOCK_DIM_Z > 0,
"BLOCK_DIM_X * BLOCK_DIM_Y * BLOCK_DIM_Z must be greater than 0"
);
// BlockStoreAlgorithm values are defined from rocPRIM's enum (above), so the
// static_cast back to ::rocprim::block_store_method is value-preserving.
using base_type =
typename ::rocprim::block_store<
T,
BLOCK_DIM_X,
ITEMS_PER_THREAD,
static_cast<::rocprim::block_store_method>(ALGORITHM),
BLOCK_DIM_Y,
BLOCK_DIM_Z
>;
// Reference to temporary storage (usually shared memory)
typename base_type::storage_type& temp_storage_;
public:
// Temporary storage type required by the selected algorithm; place an
// instance in shared memory and pass it to the second constructor to
// share storage with other block-level primitives.
using TempStorage = typename base_type::storage_type;
// Default constructor: binds to a function-scope __shared__ TempStorage
// obtained from private_storage().
HIPCUB_DEVICE inline
BlockStore() : temp_storage_(private_storage())
{
}
// Constructor binding to caller-provided temporary storage.
HIPCUB_DEVICE inline
BlockStore(TempStorage& temp_storage) : temp_storage_(temp_storage)
{
}
// Stores the calling thread's items through block_iter (full tile).
template<class OutputIteratorT>
HIPCUB_DEVICE inline
void Store(OutputIteratorT block_iter,
T (&items)[ITEMS_PER_THREAD])
{
base_type::store(block_iter, items, temp_storage_);
}
// Guarded store: valid_items bounds how many items the block writes.
template<class OutputIteratorT>
HIPCUB_DEVICE inline
void Store(OutputIteratorT block_iter,
T (&items)[ITEMS_PER_THREAD],
int valid_items)
{
base_type::store(block_iter, items, valid_items, temp_storage_);
}
private:
// Fallback storage used by the default constructor: every thread of the
// block receives a reference to the same __shared__ instance.
HIPCUB_DEVICE inline
TempStorage& private_storage()
{
HIPCUB_SHARED_MEMORY TempStorage private_storage;
return private_storage;
}
};
END_HIPCUB_NAMESPACE
#endif // HIPCUB_ROCPRIM_BLOCK_BLOCK_STORE_HPP_
/******************************************************************************
* Copyright (c) 2010-2011, Duane Merrill. All rights reserved.
* Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved.
* Modifications Copyright (c) 2017-2020, Advanced Micro Devices, Inc. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of the NVIDIA CORPORATION nor the
* names of its contributors may be used to endorse or promote products
* derived from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
* DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
******************************************************************************/
#ifndef HIPCUB_ROCPRIM_BLOCK_BLOCK_STORE_FUNC_HPP_
#define HIPCUB_ROCPRIM_BLOCK_BLOCK_STORE_FUNC_HPP_
#include "../config.hpp"
#include <cub/rocprim/block/block_store_func.hpp>
BEGIN_HIPCUB_NAMESPACE
/// Stores the calling thread's blocked arrangement of items through
/// block_iter; delegates to rocPRIM's block_store_direct_blocked.
template<
    typename T,
    int ITEMS_PER_THREAD,
    typename OutputIteratorT
>
HIPCUB_DEVICE inline
void StoreDirectBlocked(int linear_id,
                        OutputIteratorT block_iter,
                        T (&items)[ITEMS_PER_THREAD])
{
    ::rocprim::block_store_direct_blocked(linear_id, block_iter, items);
}
/// Guarded variant: forwards the valid_items bound to rocPRIM's
/// block_store_direct_blocked so partial tiles can be written safely.
template<
    typename T,
    int ITEMS_PER_THREAD,
    typename OutputIteratorT
>
HIPCUB_DEVICE inline
void StoreDirectBlocked(int linear_id,
                        OutputIteratorT block_iter,
                        T (&items)[ITEMS_PER_THREAD],
                        int valid_items)
{
    ::rocprim::block_store_direct_blocked(linear_id, block_iter, items, valid_items);
}
/// Vectorized blocked store; note the destination must be a raw T* (not a
/// generic iterator). Delegates to rocPRIM's
/// block_store_direct_blocked_vectorized.
template <
    typename T,
    int ITEMS_PER_THREAD
>
HIPCUB_DEVICE inline
void StoreDirectBlockedVectorized(int linear_id,
                                  T* block_iter,
                                  T (&items)[ITEMS_PER_THREAD])
{
    ::rocprim::block_store_direct_blocked_vectorized(linear_id, block_iter, items);
}
/// Stores a striped arrangement (stride BLOCK_THREADS between a thread's
/// consecutive items); delegates to rocPRIM's block_store_direct_striped.
template<
    int BLOCK_THREADS,
    typename T,
    int ITEMS_PER_THREAD,
    typename OutputIteratorT
>
HIPCUB_DEVICE inline
void StoreDirectStriped(int linear_id,
                        OutputIteratorT block_iter,
                        T (&items)[ITEMS_PER_THREAD])
{
    ::rocprim::block_store_direct_striped<BLOCK_THREADS>(linear_id, block_iter, items);
}
/// Guarded striped store: forwards the valid_items bound to rocPRIM's
/// block_store_direct_striped for partial tiles.
template<
    int BLOCK_THREADS,
    typename T,
    int ITEMS_PER_THREAD,
    typename OutputIteratorT
>
HIPCUB_DEVICE inline
void StoreDirectStriped(int linear_id,
                        OutputIteratorT block_iter,
                        T (&items)[ITEMS_PER_THREAD],
                        int valid_items)
{
    ::rocprim::block_store_direct_striped<BLOCK_THREADS>(linear_id, block_iter, items, valid_items);
}
/// Stores a warp-striped arrangement; delegates to rocPRIM's
/// block_store_direct_warp_striped.
template<
    typename T,
    int ITEMS_PER_THREAD,
    typename OutputIteratorT
>
HIPCUB_DEVICE inline
void StoreDirectWarpStriped(int linear_id,
                            OutputIteratorT block_iter,
                            T (&items)[ITEMS_PER_THREAD])
{
    ::rocprim::block_store_direct_warp_striped(linear_id, block_iter, items);
}
/// Guarded warp-striped store: forwards the valid_items bound to rocPRIM's
/// block_store_direct_warp_striped for partial tiles.
template<
    typename T,
    int ITEMS_PER_THREAD,
    typename OutputIteratorT
>
HIPCUB_DEVICE inline
void StoreDirectWarpStriped(int linear_id,
                            OutputIteratorT block_iter,
                            T (&items)[ITEMS_PER_THREAD],
                            int valid_items)
{
    ::rocprim::block_store_direct_warp_striped(linear_id, block_iter, items, valid_items);
}
END_HIPCUB_NAMESPACE
#endif // HIPCUB_ROCPRIM_BLOCK_BLOCK_STORE_FUNC_HPP_
/******************************************************************************
* Copyright (c) 2011-2020, NVIDIA CORPORATION. All rights reserved.
* Modifications Copyright (c) 2021, Advanced Micro Devices, Inc. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of the NVIDIA CORPORATION nor the
* names of its contributors may be used to endorse or promote products
* derived from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
* DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
******************************************************************************/
/**
* \file
* radix_rank_sort_operations.cuh contains common abstractions, definitions and
* operations used for radix sorting and ranking.
*/
#ifndef HIPCUB_ROCPRIM_BLOCK_RADIX_RANK_SORT_OPERATIONS_HPP_
#define HIPCUB_ROCPRIM_BLOCK_RADIX_RANK_SORT_OPERATIONS_HPP_
#include <type_traits>
#include "../config.hpp"
#include <cub/rocprim/config.hpp>
#include <cub/rocprim/type_traits.hpp>
#include <cub/rocprim/detail/various.hpp>
BEGIN_HIPCUB_NAMESPACE
/** \brief Twiddling keys for radix sort. */
template <bool IS_DESCENDING, typename KeyT>
struct RadixSortTwiddle
{
typedef Traits<KeyT> TraitsT;
typedef typename TraitsT::UnsignedBits UnsignedBits;
// Maps a raw key into twiddled (radix-sortable) form: applies the trait's
// TwiddleIn, then bit-complements when sorting in descending order.
static HIPCUB_HOST_DEVICE __forceinline__ UnsignedBits In(UnsignedBits key)
{
key = TraitsT::TwiddleIn(key);
if (IS_DESCENDING) key = ~key;
return key;
}
// Inverse of In(): undoes the complement first, then applies TwiddleOut —
// the reverse order of operations, so Out(In(k)) == k.
static HIPCUB_HOST_DEVICE __forceinline__ UnsignedBits Out(UnsignedBits key)
{
if (IS_DESCENDING) key = ~key;
key = TraitsT::TwiddleOut(key);
return key;
}
// The twiddled-out form of the all-ones bit pattern; used as a filler key
// for out-of-bounds elements.
static HIPCUB_HOST_DEVICE __forceinline__ UnsignedBits DefaultKey()
{
return Out(~UnsignedBits(0));
}
};
/** \brief Base struct for digit extractor. Contains common code to provide
special handling for floating-point -0.0.
\note This handles correctly both the case when the keys are
bitwise-complemented after twiddling for descending sort (in onesweep) as
well as when the keys are not bit-negated, but the implementation handles
descending sort separately (in other implementations in CUB). Twiddling
alone maps -0.0f to 0x7fffffff and +0.0f to 0x80000000 for float, which are
subsequent bit patterns and bitwise complements of each other. For onesweep,
both -0.0f and +0.0f are mapped to the bit pattern of +0.0f (0x80000000) for
ascending sort, and to the pattern of -0.0f (0x7fffffff) for descending
sort. For all other sorting implementations in CUB, both are always mapped
to +0.0f. Since bit patterns for both -0.0f and +0.0f are next to each other
and only one of them is used, the sorting works correctly. For double, the
same applies, but with 64-bit patterns.
*/
template <typename KeyT>
struct BaseDigitExtractor
{
typedef Traits<KeyT> TraitsT;
typedef typename TraitsT::UnsignedBits UnsignedBits;
enum
{
// Nonzero iff KeyT is a floating-point key, which needs -0.0 handling.
FLOAT_KEY = TraitsT::CATEGORY == FLOATING_POINT,
};
// Canonicalizes the twiddled bit pattern of -0.0 to that of +0.0 so both
// zeros rank identically; pass-through for non-floating-point keys. See the
// explanatory comment above this struct for the full rationale.
static __device__ __forceinline__ UnsignedBits ProcessFloatMinusZero(UnsignedBits key)
{
if (!FLOAT_KEY) {
return key;
} else {
// Twiddled bit pattern of -0.0 (sign bit only set before twiddling).
UnsignedBits TWIDDLED_MINUS_ZERO_BITS =
TraitsT::TwiddleIn(UnsignedBits(1) << UnsignedBits(8 * sizeof(UnsignedBits) - 1));
UnsignedBits TWIDDLED_ZERO_BITS = TraitsT::TwiddleIn(0);
return key == TWIDDLED_MINUS_ZERO_BITS ? TWIDDLED_ZERO_BITS : key;
}
}
};
/** \brief A wrapper type to extract digits. Uses the BFE intrinsic to extract a
 * key from a digit. */
template <typename KeyT>
struct BFEDigitExtractor : BaseDigitExtractor<KeyT>
{
using typename BaseDigitExtractor<KeyT>::UnsignedBits;
// Least-significant bit position of the digit, and its width in bits.
uint32_t bit_start, num_bits;
explicit __device__ __forceinline__ BFEDigitExtractor(
uint32_t bit_start = 0, uint32_t num_bits = 0)
: bit_start(bit_start), num_bits(num_bits)
{ }
// Extracts the digit via the bit-field-extract helper, after canonicalizing
// the -0.0 bit pattern for floating-point keys.
__device__ __forceinline__ uint32_t Digit(UnsignedBits key)
{
return BFE(this->ProcessFloatMinusZero(key), bit_start, num_bits);
}
};
/** \brief A wrapper type to extract digits. Uses a combination of shift and
 * bitwise and to extract digits. */
template <typename KeyT>
struct ShiftDigitExtractor : BaseDigitExtractor<KeyT>
{
using typename BaseDigitExtractor<KeyT>::UnsignedBits;
// Least-significant bit position of the digit, and the precomputed mask
// selecting num_bits bits.
uint32_t bit_start, mask;
/// \param bit_start least-significant bit position of the digit
/// \param num_bits  digit width in bits (expected to be <= 32)
explicit __device__ __forceinline__ ShiftDigitExtractor(
uint32_t bit_start = 0, uint32_t num_bits = 0)
// Widen to 64 bits before shifting: the previous `(1 << num_bits) - 1`
// was undefined behavior for num_bits == 31 (signed overflow) and
// num_bits == 32 (shift by the full width of int). Results for valid
// smaller widths are unchanged.
: bit_start(bit_start), mask(static_cast<uint32_t>((uint64_t{1} << num_bits) - 1))
{ }
// Extracts the digit by shift-and-mask, after canonicalizing the -0.0 bit
// pattern for floating-point keys.
__device__ __forceinline__ uint32_t Digit(UnsignedBits key)
{
return uint32_t(this->ProcessFloatMinusZero(key) >> UnsignedBits(bit_start)) & mask;
}
};
END_HIPCUB_NAMESPACE
#endif //HIPCUB_ROCPRIM_BLOCK_RADIX_RANK_SORT_OPERATIONS_HPP_
/******************************************************************************
* Copyright (c) 2010-2011, Duane Merrill. All rights reserved.
* Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved.
* Modifications Copyright (c) 2019-2020, Advanced Micro Devices, Inc. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of the NVIDIA CORPORATION nor the
* names of its contributors may be used to endorse or promote products
* derived from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
* DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
******************************************************************************/
#ifndef HIPCUB_CONFIG_HPP_
#define HIPCUB_CONFIG_HPP_
#include <cuda_runtime.h>
#define HIPCUB_NAMESPACE cub
#define BEGIN_HIPCUB_NAMESPACE \
namespace cub {
#define END_HIPCUB_NAMESPACE \
} /* hipcub */
#ifndef HIPCUB_ARCH
#define HIPCUB_ARCH 1
#endif
#define CUB_DEVICE_WARP_THREADS 64
#ifdef __CUDACC__
#define HIPCUB_ROCPRIM_API 1
#define HIPCUB_RUNTIME_FUNCTION __host__
#elif defined(__HIP_PLATFORM_NVIDIA__)
#define HIPCUB_CUB_API 1
#define HIPCUB_RUNTIME_FUNCTION CUB_RUNTIME_FUNCTION
#include <cub/util_arch.cuh>
#define HIPCUB_WARP_THREADS CUB_PTX_WARP_THREADS
#define HIPCUB_DEVICE_WARP_THREADS CUB_PTX_WARP_THREADS
#define HIPCUB_HOST_WARP_THREADS CUB_PTX_WARP_THREADS
#define HIPCUB_ARCH CUB_PTX_ARCH
BEGIN_HIPCUB_NAMESPACE
using namespace cub;
END_HIPCUB_NAMESPACE
#endif
/// Supported warp sizes
#define HIPCUB_WARP_SIZE_32 32u
#define HIPCUB_WARP_SIZE_64 64u
#define HIPCUB_MAX_WARP_SIZE HIPCUB_WARP_SIZE_64
#define HIPCUB_HOST __host__
#define HIPCUB_DEVICE __device__
#define HIPCUB_HOST_DEVICE __host__ __device__
#define HIPCUB_SHARED_MEMORY __shared__
// Helper macros to disable warnings in clang
#ifdef __clang__
#define HIPCUB_PRAGMA_TO_STR(x) _Pragma(#x)
#define HIPCUB_CLANG_SUPPRESS_WARNING_PUSH _Pragma("clang diagnostic push")
#define HIPCUB_CLANG_SUPPRESS_WARNING(w) HIPCUB_PRAGMA_TO_STR(clang diagnostic ignored w)
#define HIPCUB_CLANG_SUPPRESS_WARNING_POP _Pragma("clang diagnostic pop")
#define HIPCUB_CLANG_SUPPRESS_WARNING_WITH_PUSH(w) \
HIPCUB_CLANG_SUPPRESS_WARNING_PUSH HIPCUB_CLANG_SUPPRESS_WARNING(w)
#else // __clang__
#define HIPCUB_CLANG_SUPPRESS_WARNING_PUSH
#define HIPCUB_CLANG_SUPPRESS_WARNING(w)
#define HIPCUB_CLANG_SUPPRESS_WARNING_POP
#define HIPCUB_CLANG_SUPPRESS_WARNING_WITH_PUSH(w)
#endif // __clang__
BEGIN_HIPCUB_NAMESPACE
/// hipCUB error reporting macro (prints error messages to stderr)
#if (defined(DEBUG) || defined(_DEBUG)) && !defined(HIPCUB_STDERR)
#define HIPCUB_STDERR
#endif
/// Reports a CUDA error to stderr (only when HIPCUB_STDERR is defined) and
/// returns it unchanged, so the macro below can wrap any call expression.
/// \param error    the error code to inspect and pass through
/// \param filename source file of the failing call (from __FILE__)
/// \param line     source line of the failing call (from __LINE__)
inline
cudaError_t Debug(
    cudaError_t error,
    const char* filename,
    int line)
{
#ifdef HIPCUB_STDERR
    if (error)
    {
        fprintf(stderr, "cuda error %d [%s, %d]: %s\n", error, filename, line, cudaGetErrorString(error));
        fflush(stderr);
    }
#else
    // Silence unused-parameter warnings in the non-reporting build.
    (void)filename;
    (void)line;
#endif
    return error;
}
#ifndef cubDebug
#define cubDebug(e) cub::Debug((cudaError_t) (e), __FILE__, __LINE__)
#endif
END_HIPCUB_NAMESPACE
#endif // HIPCUB_CONFIG_HPP_
/******************************************************************************
* Copyright (c) 2010-2011, Duane Merrill. All rights reserved.
* Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved.
* Modifications Copyright (c) 2017-2020, Advanced Micro Devices, Inc. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of the NVIDIA CORPORATION nor the
* names of its contributors may be used to endorse or promote products
* derived from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
* DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
******************************************************************************/
#ifndef HIPCUB_ROCPRIM_HIPCUB_HPP_
#define HIPCUB_ROCPRIM_HIPCUB_HPP_
#include "config.hpp"
#include "version.cuh"
#include "util_allocator.cuh"
#include "util_type.cuh"
#include "util_ptx.cuh"
#include "thread/thread_operators.cuh"
// Iterator
#include "iterator/arg_index_input_iterator.cuh"
#include "iterator/cache_modified_input_iterator.cuh"
#include "iterator/cache_modified_output_iterator.cuh"
#include "iterator/constant_input_iterator.cuh"
#include "iterator/counting_input_iterator.cuh"
#include "iterator/discard_output_iterator.cuh"
#include "iterator/tex_obj_input_iterator.cuh"
#include "iterator/tex_ref_input_iterator.cuh"
#include "iterator/transform_input_iterator.cuh"
// Warp
#include "warp/warp_exchange.hpp"
#include "warp/warp_load.hpp"
#include "warp/warp_merge_sort.hpp"
#include "warp/warp_reduce.cuh"
#include "warp/warp_scan.cuh"
#include "warp/warp_store.hpp"
// Thread
#include "thread/thread_load.cuh"
#include "thread/thread_operators.cuh"
#include "thread/thread_reduce.cuh"
#include "thread/thread_scan.cuh"
#include "thread/thread_search.cuh"
#include "thread/thread_sort.hpp"
#include "thread/thread_store.cuh"
// Block
#include "block/block_discontinuity.cuh"
#include "block/block_exchange.cuh"
#include "block/block_histogram.cuh"
#include "block/block_load.cuh"
#include "block/block_radix_sort.cuh"
#include "block/block_reduce.cuh"
#include "block/block_scan.cuh"
#include "block/block_store.cuh"
// Device
#include "device/device_adjacent_difference.hpp"
#include "device/device_histogram.cuh"
#include "device/device_radix_sort.cuh"
#include "device/device_reduce.cuh"
#include "device/device_run_length_encode.cuh"
#include "device/device_scan.cuh"
#include "device/device_segmented_radix_sort.cuh"
#include "device/device_segmented_reduce.cuh"
#include "device/device_segmented_sort.hpp"
#include "device/device_select.cuh"
#include "device/device_partition.cuh"
#endif // HIPCUB_ROCPRIM_HIPCUB_HPP_
/******************************************************************************
* Copyright (c) 2011-2021, NVIDIA CORPORATION. All rights reserved.
* Modifications Copyright (c) 2022, Advanced Micro Devices, Inc. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of the NVIDIA CORPORATION nor the
* names of its contributors may be used to endorse or promote products
* derived from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
* DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
******************************************************************************/
#ifndef HIPCUB_ROCPRIM_DEVICE_DEVICE_ADJACENT_DIFFERENCE_HPP_
#define HIPCUB_ROCPRIM_DEVICE_DEVICE_ADJACENT_DIFFERENCE_HPP_
#include "../config.hpp"
#include <cub/thread/thread_operators.cuh>
#include <cub/rocprim/device/device_adjacent_difference.hpp>
BEGIN_HIPCUB_NAMESPACE
// CUB-compatible device-wide adjacent-difference interface, implemented by
// forwarding to the corresponding rocPRIM algorithms. All methods follow the
// usual two-phase CUB protocol: call once with d_temp_storage == nullptr to
// query temp_storage_bytes, allocate, then call again to run.
struct DeviceAdjacentDifference
{
// out[i] = difference_op(in[i], in[i-1]); forwards to
// ::rocprim::adjacent_difference (out-of-place).
template <typename InputIteratorT,
typename OutputIteratorT,
typename DifferenceOpT = ::cub::Difference>
static HIPCUB_RUNTIME_FUNCTION cudaError_t
SubtractLeftCopy(void *d_temp_storage,
std::size_t &temp_storage_bytes,
InputIteratorT d_input,
OutputIteratorT d_output,
std::size_t num_items,
DifferenceOpT difference_op = {},
cudaStream_t stream = 0,
bool debug_synchronous = false)
{
return (cudaError_t)::rocprim::adjacent_difference(
d_temp_storage, temp_storage_bytes, d_input, d_output,
num_items, difference_op, stream, debug_synchronous
);
}
// In-place variant of SubtractLeftCopy; forwards to
// ::rocprim::adjacent_difference_inplace.
template <typename RandomAccessIteratorT,
typename DifferenceOpT = ::cub::Difference>
static HIPCUB_RUNTIME_FUNCTION cudaError_t
SubtractLeft(void *d_temp_storage,
std::size_t &temp_storage_bytes,
RandomAccessIteratorT d_input,
std::size_t num_items,
DifferenceOpT difference_op = {},
cudaStream_t stream = 0,
bool debug_synchronous = false)
{
return (cudaError_t)::rocprim::adjacent_difference_inplace(
d_temp_storage, temp_storage_bytes, d_input,
num_items, difference_op, stream, debug_synchronous
);
}
// out[i] = difference_op(in[i], in[i+1]); forwards to
// ::rocprim::adjacent_difference_right (out-of-place).
template <typename InputIteratorT,
typename OutputIteratorT,
typename DifferenceOpT = ::cub::Difference>
static HIPCUB_RUNTIME_FUNCTION cudaError_t
SubtractRightCopy(void *d_temp_storage,
std::size_t &temp_storage_bytes,
InputIteratorT d_input,
OutputIteratorT d_output,
std::size_t num_items,
DifferenceOpT difference_op = {},
cudaStream_t stream = 0,
bool debug_synchronous = false)
{
return (cudaError_t)::rocprim::adjacent_difference_right(
d_temp_storage, temp_storage_bytes, d_input, d_output,
num_items, difference_op, stream, debug_synchronous
);
}
// In-place variant of SubtractRightCopy; forwards to
// ::rocprim::adjacent_difference_right_inplace.
template <typename RandomAccessIteratorT,
typename DifferenceOpT = ::cub::Difference>
static HIPCUB_RUNTIME_FUNCTION cudaError_t
SubtractRight(void *d_temp_storage,
std::size_t &temp_storage_bytes,
RandomAccessIteratorT d_input,
std::size_t num_items,
DifferenceOpT difference_op = {},
cudaStream_t stream = 0,
bool debug_synchronous = false)
{
return (cudaError_t)::rocprim::adjacent_difference_right_inplace(
d_temp_storage, temp_storage_bytes, d_input,
num_items, difference_op, stream, debug_synchronous
);
}
};
END_HIPCUB_NAMESPACE
#endif // HIPCUB_ROCPRIM_DEVICE_DEVICE_ADJACENT_DIFFERENCE_HPP_
/******************************************************************************
* Copyright (c) 2010-2011, Duane Merrill. All rights reserved.
* Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved.
* Modifications Copyright (c) 2017-2020, Advanced Micro Devices, Inc. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of the NVIDIA CORPORATION nor the
* names of its contributors may be used to endorse or promote products
* derived from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
* DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
******************************************************************************/
#ifndef HIPCUB_ROCPRIM_DEVICE_DEVICE_HISTOGRAM_HPP_
#define HIPCUB_ROCPRIM_DEVICE_DEVICE_HISTOGRAM_HPP_
#include "../config.hpp"
#include "../util_type.cuh"
#include <cub/rocprim/device/device_histogram.hpp>
BEGIN_HIPCUB_NAMESPACE
struct DeviceHistogram
{
// Single-channel, evenly-segmented histogram over num_samples samples;
// forwards to ::rocprim::histogram_even. Note the argument-order difference:
// rocPRIM takes (samples, count) before the histogram pointer.
template<
typename SampleIteratorT,
typename CounterT,
typename LevelT,
typename OffsetT
>
HIPCUB_RUNTIME_FUNCTION static
cudaError_t HistogramEven(void * d_temp_storage,
size_t& temp_storage_bytes,
SampleIteratorT d_samples,
CounterT * d_histogram,
int num_levels,
LevelT lower_level,
LevelT upper_level,
OffsetT num_samples,
cudaStream_t stream = 0,
bool debug_synchronous = false)
{
return (cudaError_t)::rocprim::histogram_even(
d_temp_storage, temp_storage_bytes,
d_samples, num_samples,
d_histogram,
num_levels, lower_level, upper_level,
stream, debug_synchronous
);
}
// 2D (row-major, possibly padded) variant: histograms num_rows rows of
// num_row_samples samples each, with row_stride_bytes between row starts;
// forwards to ::rocprim::histogram_even.
template<
typename SampleIteratorT,
typename CounterT,
typename LevelT,
typename OffsetT
>
HIPCUB_RUNTIME_FUNCTION static
cudaError_t HistogramEven(void * d_temp_storage,
size_t& temp_storage_bytes,
SampleIteratorT d_samples,
CounterT * d_histogram,
int num_levels,
LevelT lower_level,
LevelT upper_level,
OffsetT num_row_samples,
OffsetT num_rows,
size_t row_stride_bytes,
cudaStream_t stream = 0,
bool debug_synchronous = false)
{
return (cudaError_t)::rocprim::histogram_even(
d_temp_storage, temp_storage_bytes,
d_samples, num_row_samples, num_rows, row_stride_bytes,
d_histogram,
num_levels, lower_level, upper_level,
stream, debug_synchronous
);
}
template<
int NUM_CHANNELS,
int NUM_ACTIVE_CHANNELS,
typename SampleIteratorT,
typename CounterT,
typename LevelT,
typename OffsetT
>
HIPCUB_RUNTIME_FUNCTION static
cudaError_t MultiHistogramEven(void * d_temp_storage,
size_t& temp_storage_bytes,
SampleIteratorT d_samples,
CounterT * d_histogram[NUM_ACTIVE_CHANNELS],
int num_levels[NUM_ACTIVE_CHANNELS],
LevelT lower_level[NUM_ACTIVE_CHANNELS],
LevelT upper_level[NUM_ACTIVE_CHANNELS],
OffsetT num_pixels,
cudaStream_t stream = 0,
bool debug_synchronous = false)
{
unsigned int levels[NUM_ACTIVE_CHANNELS];
for(unsigned int channel = 0; channel < NUM_ACTIVE_CHANNELS; channel++)
{
levels[channel] = num_levels[channel];
}
return (cudaError_t)::rocprim::multi_histogram_even<NUM_CHANNELS, NUM_ACTIVE_CHANNELS>(
d_temp_storage, temp_storage_bytes,
d_samples, num_pixels,
d_histogram,
levels, lower_level, upper_level,
stream, debug_synchronous
);
}
template<
int NUM_CHANNELS,
int NUM_ACTIVE_CHANNELS,
typename SampleIteratorT,
typename CounterT,
typename LevelT,
typename OffsetT
>
HIPCUB_RUNTIME_FUNCTION static
cudaError_t MultiHistogramEven(void * d_temp_storage,
size_t& temp_storage_bytes,
SampleIteratorT d_samples,
CounterT * d_histogram[NUM_ACTIVE_CHANNELS],
int num_levels[NUM_ACTIVE_CHANNELS],
LevelT lower_level[NUM_ACTIVE_CHANNELS],
LevelT upper_level[NUM_ACTIVE_CHANNELS],
OffsetT num_row_pixels,
OffsetT num_rows,
size_t row_stride_bytes,
cudaStream_t stream = 0,
bool debug_synchronous = false)
{
unsigned int levels[NUM_ACTIVE_CHANNELS];
for(unsigned int channel = 0; channel < NUM_ACTIVE_CHANNELS; channel++)
{
levels[channel] = num_levels[channel];
}
return (cudaError_t)::rocprim::multi_histogram_even<NUM_CHANNELS, NUM_ACTIVE_CHANNELS>(
d_temp_storage, temp_storage_bytes,
d_samples, num_row_pixels, num_rows, row_stride_bytes,
d_histogram,
levels, lower_level, upper_level,
stream, debug_synchronous
);
}
template<
typename SampleIteratorT,
typename CounterT,
typename LevelT,
typename OffsetT
>
HIPCUB_RUNTIME_FUNCTION static
cudaError_t HistogramRange(void * d_temp_storage,
size_t& temp_storage_bytes,
SampleIteratorT d_samples,
CounterT * d_histogram,
int num_levels,
LevelT * d_levels,
OffsetT num_samples,
cudaStream_t stream = 0,
bool debug_synchronous = false)
{
return (cudaError_t)::rocprim::histogram_range(
d_temp_storage, temp_storage_bytes,
d_samples, num_samples,
d_histogram,
num_levels, d_levels,
stream, debug_synchronous
);
}
template<
typename SampleIteratorT,
typename CounterT,
typename LevelT,
typename OffsetT
>
HIPCUB_RUNTIME_FUNCTION static
cudaError_t HistogramRange(void * d_temp_storage,
size_t& temp_storage_bytes,
SampleIteratorT d_samples,
CounterT * d_histogram,
int num_levels,
LevelT * d_levels,
OffsetT num_row_samples,
OffsetT num_rows,
size_t row_stride_bytes,
cudaStream_t stream = 0,
bool debug_synchronous = false)
{
return (cudaError_t)::rocprim::histogram_range(
d_temp_storage, temp_storage_bytes,
d_samples, num_row_samples, num_rows, row_stride_bytes,
d_histogram,
num_levels, d_levels,
stream, debug_synchronous
);
}
template<
int NUM_CHANNELS,
int NUM_ACTIVE_CHANNELS,
typename SampleIteratorT,
typename CounterT,
typename LevelT,
typename OffsetT
>
HIPCUB_RUNTIME_FUNCTION static
cudaError_t MultiHistogramRange(void * d_temp_storage,
size_t& temp_storage_bytes,
SampleIteratorT d_samples,
CounterT * d_histogram[NUM_ACTIVE_CHANNELS],
int num_levels[NUM_ACTIVE_CHANNELS],
LevelT * d_levels[NUM_ACTIVE_CHANNELS],
OffsetT num_pixels,
cudaStream_t stream = 0,
bool debug_synchronous = false)
{
unsigned int levels[NUM_ACTIVE_CHANNELS];
for(unsigned int channel = 0; channel < NUM_ACTIVE_CHANNELS; channel++)
{
levels[channel] = num_levels[channel];
}
return (cudaError_t)::rocprim::multi_histogram_range<NUM_CHANNELS, NUM_ACTIVE_CHANNELS>(
d_temp_storage, temp_storage_bytes,
d_samples, num_pixels,
d_histogram,
levels, d_levels,
stream, debug_synchronous
);
}
template<
int NUM_CHANNELS,
int NUM_ACTIVE_CHANNELS,
typename SampleIteratorT,
typename CounterT,
typename LevelT,
typename OffsetT
>
HIPCUB_RUNTIME_FUNCTION static
cudaError_t MultiHistogramRange(void * d_temp_storage,
size_t& temp_storage_bytes,
SampleIteratorT d_samples,
CounterT * d_histogram[NUM_ACTIVE_CHANNELS],
int num_levels[NUM_ACTIVE_CHANNELS],
LevelT * d_levels[NUM_ACTIVE_CHANNELS],
OffsetT num_row_pixels,
OffsetT num_rows,
size_t row_stride_bytes,
cudaStream_t stream = 0,
bool debug_synchronous = false)
{
unsigned int levels[NUM_ACTIVE_CHANNELS];
for(unsigned int channel = 0; channel < NUM_ACTIVE_CHANNELS; channel++)
{
levels[channel] = num_levels[channel];
}
return (cudaError_t)::rocprim::multi_histogram_range<NUM_CHANNELS, NUM_ACTIVE_CHANNELS>(
d_temp_storage, temp_storage_bytes,
d_samples, num_row_pixels, num_rows, row_stride_bytes,
d_histogram,
levels, d_levels,
stream, debug_synchronous
);
}
};
END_HIPCUB_NAMESPACE
#endif // HIPCUB_ROCPRIM_DEVICE_DEVICE_HISTOGRAM_HPP_
/******************************************************************************
* Copyright (c) 2010-2011, Duane Merrill. All rights reserved.
* Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved.
* Modifications Copyright (c) 2017-2021, Advanced Micro Devices, Inc. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of the NVIDIA CORPORATION nor the
* names of its contributors may be used to endorse or promote products
* derived from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
* DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
******************************************************************************/
#ifndef HIPCUB_ROCPRIM_DEVICE_DEVICE_MERGE_SORT_HPP_
#define HIPCUB_ROCPRIM_DEVICE_DEVICE_MERGE_SORT_HPP_
#include "../config.hpp"
#include "../util_type.cuh"
#include <cub/rocprim/device/device_merge_sort.hpp>
BEGIN_HIPCUB_NAMESPACE
/// \brief CUB-compatible device-wide merge sort, forwarding to
/// ::rocprim::merge_sort with custom comparison operators.
///
/// Standard two-phase protocol: when \p d_temp_storage is NULL the required
/// size is written to \p temp_storage_bytes and no sorting is performed.
/// In-place variants pass the same iterator as both input and output.
struct DeviceMergeSort
{
    /// \brief Sorts keys with associated values in place using \p compare_op.
    template<typename KeyIteratorT, typename ValueIteratorT, typename OffsetT, typename CompareOpT>
    HIPCUB_RUNTIME_FUNCTION static cudaError_t SortPairs(void * d_temp_storage,
                                                         std::size_t & temp_storage_bytes,
                                                         KeyIteratorT d_keys,
                                                         ValueIteratorT d_items,
                                                         OffsetT num_items,
                                                         CompareOpT compare_op,
                                                         cudaStream_t stream = 0,
                                                         bool debug_synchronous = false)
    {
        // In-place: the same iterators serve as both input and output.
        return (cudaError_t)::rocprim::merge_sort(d_temp_storage,
                                                  temp_storage_bytes,
                                                  d_keys,
                                                  d_keys,
                                                  d_items,
                                                  d_items,
                                                  num_items,
                                                  compare_op,
                                                  stream,
                                                  debug_synchronous);
    }
    /// \brief Sorts keys with associated values out of place: inputs are read
    /// from the *_input iterators and the sorted result is written to the
    /// *_output iterators.
    template<typename KeyInputIteratorT,
             typename ValueInputIteratorT,
             typename KeyIteratorT,
             typename ValueIteratorT,
             typename OffsetT,
             typename CompareOpT>
    HIPCUB_RUNTIME_FUNCTION static cudaError_t SortPairsCopy(void * d_temp_storage,
                                                             std::size_t & temp_storage_bytes,
                                                             KeyInputIteratorT d_input_keys,
                                                             ValueInputIteratorT d_input_items,
                                                             KeyIteratorT d_output_keys,
                                                             ValueIteratorT d_output_items,
                                                             OffsetT num_items,
                                                             CompareOpT compare_op,
                                                             cudaStream_t stream = 0,
                                                             bool debug_synchronous = false)
    {
        return (cudaError_t)::rocprim::merge_sort(d_temp_storage,
                                                  temp_storage_bytes,
                                                  d_input_keys,
                                                  d_output_keys,
                                                  d_input_items,
                                                  d_output_items,
                                                  num_items,
                                                  compare_op,
                                                  stream,
                                                  debug_synchronous);
    }
    /// \brief Sorts keys in place (no associated values).
    template<typename KeyIteratorT, typename OffsetT, typename CompareOpT>
    HIPCUB_RUNTIME_FUNCTION static cudaError_t SortKeys(void * d_temp_storage,
                                                        std::size_t & temp_storage_bytes,
                                                        KeyIteratorT d_keys,
                                                        OffsetT num_items,
                                                        CompareOpT compare_op,
                                                        cudaStream_t stream = 0,
                                                        bool debug_synchronous = false)
    {
        return (cudaError_t)::rocprim::merge_sort(
            d_temp_storage, temp_storage_bytes,
            d_keys, d_keys, num_items,
            compare_op, stream, debug_synchronous
        );
    }
    /// \brief Sorts keys out of place (no associated values).
    template<typename KeyInputIteratorT,
             typename KeyIteratorT,
             typename OffsetT,
             typename CompareOpT>
    HIPCUB_RUNTIME_FUNCTION static cudaError_t SortKeysCopy(void * d_temp_storage,
                                                            std::size_t & temp_storage_bytes,
                                                            KeyInputIteratorT d_input_keys,
                                                            KeyIteratorT d_output_keys,
                                                            OffsetT num_items,
                                                            CompareOpT compare_op,
                                                            cudaStream_t stream = 0,
                                                            bool debug_synchronous = false)
    {
        return (cudaError_t)::rocprim::merge_sort(
            d_temp_storage, temp_storage_bytes,
            d_input_keys, d_output_keys, num_items,
            compare_op, stream, debug_synchronous
        );
    }
    /// \brief Stable in-place key/value sort.
    /// NOTE(review): this forwards exactly like SortPairs, so stability is
    /// presumably a guarantee of rocprim::merge_sort itself — confirm against
    /// the rocPRIM documentation.
    template <typename KeyIteratorT,
              typename ValueIteratorT,
              typename OffsetT,
              typename CompareOpT>
    HIPCUB_RUNTIME_FUNCTION static cudaError_t
    StableSortPairs(void *d_temp_storage,
                    std::size_t &temp_storage_bytes,
                    KeyIteratorT d_keys,
                    ValueIteratorT d_items,
                    OffsetT num_items,
                    CompareOpT compare_op,
                    cudaStream_t stream = 0,
                    bool debug_synchronous = false)
    {
        return (cudaError_t)::rocprim::merge_sort(d_temp_storage,
                                                  temp_storage_bytes,
                                                  d_keys,
                                                  d_keys,
                                                  d_items,
                                                  d_items,
                                                  num_items,
                                                  compare_op,
                                                  stream,
                                                  debug_synchronous);
    }
    /// \brief Stable in-place key-only sort (same forwarding as SortKeys).
    template<typename KeyIteratorT, typename OffsetT, typename CompareOpT>
    HIPCUB_RUNTIME_FUNCTION static cudaError_t StableSortKeys(void * d_temp_storage,
                                                              std::size_t & temp_storage_bytes,
                                                              KeyIteratorT d_keys,
                                                              OffsetT num_items,
                                                              CompareOpT compare_op,
                                                              cudaStream_t stream = 0,
                                                              bool debug_synchronous = false)
    {
        return (cudaError_t)::rocprim::merge_sort(
            d_temp_storage, temp_storage_bytes,
            d_keys, d_keys, num_items,
            compare_op, stream, debug_synchronous
        );
    }
};
END_HIPCUB_NAMESPACE
#endif // HIPCUB_ROCPRIM_DEVICE_DEVICE_MERGE_SORT_HPP_
/******************************************************************************
* Copyright (c) 2010-2011, Duane Merrill. All rights reserved.
* Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved.
* Modifications Copyright (c) 2017-2020, Advanced Micro Devices, Inc. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of the NVIDIA CORPORATION nor the
* names of its contributors may be used to endorse or promote products
* derived from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
* DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
******************************************************************************/
#ifndef HIPCUB_ROCPRIM_DEVICE_PARTITION_HPP_
#define HIPCUB_ROCPRIM_DEVICE_PARTITION_HPP_
#include "../config.hpp"
#include <cub/rocprim/device/device_partition.hpp>
BEGIN_HIPCUB_NAMESPACE
/// \brief CUB-compatible device-wide partitioning, forwarding to rocPRIM.
///
/// Standard two-phase protocol: when \p d_temp_storage is NULL the required
/// size is written to \p temp_storage_bytes and no work is done.
struct DevicePartition
{
    /// \brief Partitions items into selected/unselected according to per-item flags.
    template <
        typename InputIteratorT,
        typename FlagIterator,
        typename OutputIteratorT,
        typename NumSelectedIteratorT>
    HIPCUB_RUNTIME_FUNCTION __forceinline__
    static cudaError_t Flagged(
        void* d_temp_storage,                ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
        size_t &temp_storage_bytes,          ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
        InputIteratorT d_in,                 ///< [in] Pointer to the input sequence of data items
        FlagIterator d_flags,                ///< [in] Pointer to the input sequence of selection flags
        OutputIteratorT d_out,               ///< [out] Pointer to the output sequence of partitioned data items
        NumSelectedIteratorT d_num_selected_out, ///< [out] Pointer to the output total number of items selected (i.e., the offset of the unselected partition)
        int num_items,                       ///< [in] Total number of items to select from
        cudaStream_t stream = 0,             ///< [in] <b>[optional]</b> hip stream to launch kernels within. Default is stream<sub>0</sub>.
        bool debug_synchronous = false)      ///< [in] <b>[optional]</b> Whether or not to synchronize the stream after every kernel launch to check for errors. May cause significant slowdown. Default is \p false.
    {
        return (cudaError_t)rocprim::partition(
            d_temp_storage,
            temp_storage_bytes,
            d_in,
            d_flags,
            d_out,
            d_num_selected_out,
            num_items,
            stream,
            debug_synchronous);
    }
    /// \brief Partitions items into selected/unselected according to a unary predicate.
    template <
        typename InputIteratorT,
        typename OutputIteratorT,
        typename NumSelectedIteratorT,
        typename SelectOp>
    HIPCUB_RUNTIME_FUNCTION __forceinline__
    static cudaError_t If(
        void* d_temp_storage,                ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
        size_t &temp_storage_bytes,          ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
        InputIteratorT d_in,                 ///< [in] Pointer to the input sequence of data items
        OutputIteratorT d_out,               ///< [out] Pointer to the output sequence of partitioned data items
        NumSelectedIteratorT d_num_selected_out, ///< [out] Pointer to the output total number of items selected (i.e., the offset of the unselected partition)
        int num_items,                       ///< [in] Total number of items to select from
        SelectOp select_op,                  ///< [in] Unary selection operator
        cudaStream_t stream = 0,             ///< [in] <b>[optional]</b> hip stream to launch kernels within. Default is stream<sub>0</sub>.
        bool debug_synchronous = false)      ///< [in] <b>[optional]</b> Whether or not to synchronize the stream after every kernel launch to check for errors. May cause significant slowdown. Default is \p false.
    {
        return (cudaError_t)rocprim::partition(
            d_temp_storage,
            temp_storage_bytes,
            d_in,
            d_out,
            d_num_selected_out,
            num_items,
            select_op,
            stream,
            debug_synchronous);
    }
    /// \brief Three-way partition: items satisfying \p select_first_part_op go to
    /// \p d_first_part_out, remaining items satisfying \p select_second_part_op go to
    /// \p d_second_part_out, and the rest go to \p d_unselected_out.
    /// \p d_num_selected_out receives the selection counts.
    template <typename InputIteratorT,
              typename FirstOutputIteratorT,
              typename SecondOutputIteratorT,
              typename UnselectedOutputIteratorT,
              typename NumSelectedIteratorT,
              typename SelectFirstPartOp,
              typename SelectSecondPartOp>
    HIPCUB_RUNTIME_FUNCTION __forceinline__ static cudaError_t
    If(void *d_temp_storage,
       std::size_t &temp_storage_bytes,
       InputIteratorT d_in,
       FirstOutputIteratorT d_first_part_out,
       SecondOutputIteratorT d_second_part_out,
       UnselectedOutputIteratorT d_unselected_out,
       NumSelectedIteratorT d_num_selected_out,
       int num_items,
       SelectFirstPartOp select_first_part_op,
       SelectSecondPartOp select_second_part_op,
       cudaStream_t stream = 0,
       bool debug_synchronous = false)
    {
        return (cudaError_t)rocprim::partition_three_way(
            d_temp_storage,
            temp_storage_bytes,
            d_in,
            d_first_part_out,
            d_second_part_out,
            d_unselected_out,
            d_num_selected_out,
            num_items,
            select_first_part_op,
            select_second_part_op,
            stream,
            debug_synchronous
        );
    }
};
END_HIPCUB_NAMESPACE
#endif
/******************************************************************************
* Copyright (c) 2010-2011, Duane Merrill. All rights reserved.
* Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved.
* Modifications Copyright (c) 2017-2020, Advanced Micro Devices, Inc. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of the NVIDIA CORPORATION nor the
* names of its contributors may be used to endorse or promote products
* derived from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
* DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
******************************************************************************/
#ifndef HIPCUB_ROCPRIM_DEVICE_DEVICE_RADIX_SORT_HPP_
#define HIPCUB_ROCPRIM_DEVICE_DEVICE_RADIX_SORT_HPP_
#include "../config.hpp"
#include "../util_type.cuh"
#include <cub/rocprim/device/device_radix_sort.hpp>
BEGIN_HIPCUB_NAMESPACE
/// \brief CUB-compatible device-wide radix sort, forwarding to rocPRIM.
///
/// Standard two-phase protocol: when \p d_temp_storage is NULL the required
/// size is written to \p temp_storage_bytes and no sorting is performed.
/// \p begin_bit / \p end_bit restrict the sort to a sub-range of key bits
/// (defaults cover the full key). DoubleBuffer overloads convert the hipCUB
/// DoubleBuffer to a rocprim::double_buffer, sort, and write the (possibly
/// swapped) current-buffer selection back into the caller's DoubleBuffer.
struct DeviceRadixSort
{
    /// \brief Ascending key/value sort from separate input to output arrays.
    template<typename KeyT, typename ValueT, typename NumItemsT>
    HIPCUB_RUNTIME_FUNCTION static
    cudaError_t SortPairs(void * d_temp_storage,
                          size_t& temp_storage_bytes,
                          const KeyT * d_keys_in,
                          KeyT * d_keys_out,
                          const ValueT * d_values_in,
                          ValueT * d_values_out,
                          NumItemsT num_items,
                          int begin_bit = 0,
                          int end_bit = sizeof(KeyT) * 8,
                          cudaStream_t stream = 0,
                          bool debug_synchronous = false)
    {
        return (cudaError_t)::rocprim::radix_sort_pairs(
            d_temp_storage, temp_storage_bytes,
            d_keys_in, d_keys_out, d_values_in, d_values_out, num_items,
            begin_bit, end_bit,
            stream, debug_synchronous
        );
    }
    /// \brief Ascending key/value sort using double buffers (ping-pong storage).
    template<typename KeyT, typename ValueT, typename NumItemsT>
    HIPCUB_RUNTIME_FUNCTION static
    cudaError_t SortPairs(void * d_temp_storage,
                          size_t& temp_storage_bytes,
                          DoubleBuffer<KeyT>& d_keys,
                          DoubleBuffer<ValueT>& d_values,
                          NumItemsT num_items,
                          int begin_bit = 0,
                          int end_bit = sizeof(KeyT) * 8,
                          cudaStream_t stream = 0,
                          bool debug_synchronous = false)
    {
        // Round-trip through rocPRIM's double_buffer so the "current"
        // selector is propagated back to the caller after sorting.
        ::rocprim::double_buffer<KeyT> d_keys_db = detail::to_double_buffer(d_keys);
        ::rocprim::double_buffer<ValueT> d_values_db = detail::to_double_buffer(d_values);
        cudaError_t error = (cudaError_t)::rocprim::radix_sort_pairs(
            d_temp_storage, temp_storage_bytes,
            d_keys_db, d_values_db, num_items,
            begin_bit, end_bit,
            stream, debug_synchronous
        );
        detail::update_double_buffer(d_keys, d_keys_db);
        detail::update_double_buffer(d_values, d_values_db);
        return error;
    }
    /// \brief Descending key/value sort from separate input to output arrays.
    template<typename KeyT, typename ValueT, typename NumItemsT>
    HIPCUB_RUNTIME_FUNCTION static
    cudaError_t SortPairsDescending(void * d_temp_storage,
                                    size_t& temp_storage_bytes,
                                    const KeyT * d_keys_in,
                                    KeyT * d_keys_out,
                                    const ValueT * d_values_in,
                                    ValueT * d_values_out,
                                    NumItemsT num_items,
                                    int begin_bit = 0,
                                    int end_bit = sizeof(KeyT) * 8,
                                    cudaStream_t stream = 0,
                                    bool debug_synchronous = false)
    {
        return (cudaError_t)::rocprim::radix_sort_pairs_desc(
            d_temp_storage, temp_storage_bytes,
            d_keys_in, d_keys_out, d_values_in, d_values_out, num_items,
            begin_bit, end_bit,
            stream, debug_synchronous
        );
    }
    /// \brief Descending key/value sort using double buffers.
    template<typename KeyT, typename ValueT, typename NumItemsT>
    HIPCUB_RUNTIME_FUNCTION static
    cudaError_t SortPairsDescending(void * d_temp_storage,
                                    size_t& temp_storage_bytes,
                                    DoubleBuffer<KeyT>& d_keys,
                                    DoubleBuffer<ValueT>& d_values,
                                    NumItemsT num_items,
                                    int begin_bit = 0,
                                    int end_bit = sizeof(KeyT) * 8,
                                    cudaStream_t stream = 0,
                                    bool debug_synchronous = false)
    {
        ::rocprim::double_buffer<KeyT> d_keys_db = detail::to_double_buffer(d_keys);
        ::rocprim::double_buffer<ValueT> d_values_db = detail::to_double_buffer(d_values);
        cudaError_t error = (cudaError_t)::rocprim::radix_sort_pairs_desc(
            d_temp_storage, temp_storage_bytes,
            d_keys_db, d_values_db, num_items,
            begin_bit, end_bit,
            stream, debug_synchronous
        );
        detail::update_double_buffer(d_keys, d_keys_db);
        detail::update_double_buffer(d_values, d_values_db);
        return error;
    }
    /// \brief Ascending key-only sort from separate input to output arrays.
    template<typename KeyT, typename NumItemsT>
    HIPCUB_RUNTIME_FUNCTION static
    cudaError_t SortKeys(void * d_temp_storage,
                         size_t& temp_storage_bytes,
                         const KeyT * d_keys_in,
                         KeyT * d_keys_out,
                         NumItemsT num_items,
                         int begin_bit = 0,
                         int end_bit = sizeof(KeyT) * 8,
                         cudaStream_t stream = 0,
                         bool debug_synchronous = false)
    {
        return (cudaError_t)::rocprim::radix_sort_keys(
            d_temp_storage, temp_storage_bytes,
            d_keys_in, d_keys_out, num_items,
            begin_bit, end_bit,
            stream, debug_synchronous
        );
    }
    /// \brief Ascending key-only sort using a double buffer.
    template<typename KeyT, typename NumItemsT>
    HIPCUB_RUNTIME_FUNCTION static
    cudaError_t SortKeys(void * d_temp_storage,
                         size_t& temp_storage_bytes,
                         DoubleBuffer<KeyT>& d_keys,
                         NumItemsT num_items,
                         int begin_bit = 0,
                         int end_bit = sizeof(KeyT) * 8,
                         cudaStream_t stream = 0,
                         bool debug_synchronous = false)
    {
        ::rocprim::double_buffer<KeyT> d_keys_db = detail::to_double_buffer(d_keys);
        cudaError_t error = (cudaError_t)::rocprim::radix_sort_keys(
            d_temp_storage, temp_storage_bytes,
            d_keys_db, num_items,
            begin_bit, end_bit,
            stream, debug_synchronous
        );
        detail::update_double_buffer(d_keys, d_keys_db);
        return error;
    }
    /// \brief Descending key-only sort from separate input to output arrays.
    template<typename KeyT, typename NumItemsT>
    HIPCUB_RUNTIME_FUNCTION static
    cudaError_t SortKeysDescending(void * d_temp_storage,
                                   size_t& temp_storage_bytes,
                                   const KeyT * d_keys_in,
                                   KeyT * d_keys_out,
                                   NumItemsT num_items,
                                   int begin_bit = 0,
                                   int end_bit = sizeof(KeyT) * 8,
                                   cudaStream_t stream = 0,
                                   bool debug_synchronous = false)
    {
        return (cudaError_t)::rocprim::radix_sort_keys_desc(
            d_temp_storage, temp_storage_bytes,
            d_keys_in, d_keys_out, num_items,
            begin_bit, end_bit,
            stream, debug_synchronous
        );
    }
    /// \brief Descending key-only sort using a double buffer.
    template<typename KeyT, typename NumItemsT>
    HIPCUB_RUNTIME_FUNCTION static
    cudaError_t SortKeysDescending(void * d_temp_storage,
                                   size_t& temp_storage_bytes,
                                   DoubleBuffer<KeyT>& d_keys,
                                   NumItemsT num_items,
                                   int begin_bit = 0,
                                   int end_bit = sizeof(KeyT) * 8,
                                   cudaStream_t stream = 0,
                                   bool debug_synchronous = false)
    {
        ::rocprim::double_buffer<KeyT> d_keys_db = detail::to_double_buffer(d_keys);
        cudaError_t error = (cudaError_t)::rocprim::radix_sort_keys_desc(
            d_temp_storage, temp_storage_bytes,
            d_keys_db, num_items,
            begin_bit, end_bit,
            stream, debug_synchronous
        );
        detail::update_double_buffer(d_keys, d_keys_db);
        return error;
    }
};
END_HIPCUB_NAMESPACE
#endif // HIPCUB_ROCPRIM_DEVICE_DEVICE_RADIX_SORT_HPP_
/******************************************************************************
* Copyright (c) 2010-2011, Duane Merrill. All rights reserved.
* Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved.
* Modifications Copyright (c) 2017-2020, Advanced Micro Devices, Inc. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of the NVIDIA CORPORATION nor the
* names of its contributors may be used to endorse or promote products
* derived from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
* DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
******************************************************************************/
#ifndef HIPCUB_ROCPRIM_DEVICE_DEVICE_REDUCE_HPP_
#define HIPCUB_ROCPRIM_DEVICE_DEVICE_REDUCE_HPP_
#include <cstring> // memcpy (well-defined bit reinterpretation in detail::get_lowest_value / get_max_value)
#include <iterator>
#include <limits>
#include <cuda_fp16.h> // __half
#include <thrust/system/cuda/cuda_bfloat16.h> // hip_bfloat16
#include "../config.hpp"
#include "../iterator/arg_index_input_iterator.cuh"
#include "../thread/thread_operators.cuh"
#include <cub/rocprim/device/device_reduce.hpp>
#include <cub/rocprim/device/device_reduce_by_key.hpp>
BEGIN_HIPCUB_NAMESPACE
namespace detail
{

/// Returns the lowest finite value of T, used as the identity for Max /
/// ArgMax reductions. The generic version relies on std::numeric_limits.
template<class T>
inline
T get_lowest_value()
{
    return std::numeric_limits<T>::lowest();
}

/// Specialization for __half, for which std::numeric_limits is not
/// specialized: builds the lowest finite half (-65504) from its IEEE
/// binary16 bit pattern 0xfbff.
template<>
inline
__half get_lowest_value<__half>()
{
    // memcpy (not reinterpret_cast of an unsigned short*) so the bit
    // reinterpretation is well-defined — the original cast violated
    // strict aliasing.
    unsigned short lowest_half = 0xfbff;
    __half lowest_value;
    memcpy(&lowest_value, &lowest_half, sizeof(lowest_value));
    return lowest_value;
}

/// Specialization for cuda_bfloat16: approximately the lowest finite
/// bfloat16 value, constructed from a float literal.
template<>
inline
cuda_bfloat16 get_lowest_value<cuda_bfloat16>()
{
    return cuda_bfloat16(-3.38953138925e+38f);
}

/// Returns the largest finite value of T, used as the identity for Min /
/// ArgMin reductions. The generic version relies on std::numeric_limits.
template<class T>
inline
T get_max_value()
{
    return std::numeric_limits<T>::max();
}

/// Specialization for __half: builds the largest finite half (65504) from
/// its IEEE binary16 bit pattern 0x7bff.
template<>
inline
__half get_max_value<__half>()
{
    // Well-defined bit reinterpretation via memcpy (see get_lowest_value).
    unsigned short max_half = 0x7bff;
    __half max_value;
    memcpy(&max_value, &max_half, sizeof(max_value));
    return max_value;
}

/// Specialization for cuda_bfloat16: approximately the largest finite
/// bfloat16 value.
template<>
inline
cuda_bfloat16 get_max_value<cuda_bfloat16>()
{
    return cuda_bfloat16(3.38953138925e+38f);
}

} // end detail namespace
/// \brief CUB-compatible device-wide reductions, forwarding to rocPRIM.
///
/// Standard two-phase protocol: when \p d_temp_storage is NULL the required
/// size is written to \p temp_storage_bytes and no work is done.
class DeviceReduce
{
public:
    /// \brief Generic reduction of \p num_items items with a user-supplied
    /// binary operator and initial value \p init. The operator is wrapped in
    /// convert_result_type so intermediate results use the output value type.
    template <
        typename InputIteratorT,
        typename OutputIteratorT,
        typename ReduceOpT,
        typename T
    >
    HIPCUB_RUNTIME_FUNCTION static
    cudaError_t Reduce(void *d_temp_storage,
                       size_t &temp_storage_bytes,
                       InputIteratorT d_in,
                       OutputIteratorT d_out,
                       int num_items,
                       ReduceOpT reduction_op,
                       T init,
                       cudaStream_t stream = 0,
                       bool debug_synchronous = false)
    {
        return (cudaError_t)::rocprim::reduce(
            d_temp_storage, temp_storage_bytes,
            d_in, d_out, init, num_items,
            ::cub::detail::convert_result_type<InputIteratorT, OutputIteratorT>(reduction_op),
            stream, debug_synchronous
        );
    }
    /// \brief Sum reduction; identity is T(0) of the input value type.
    template <
        typename InputIteratorT,
        typename OutputIteratorT
    >
    HIPCUB_RUNTIME_FUNCTION static
    cudaError_t Sum(void *d_temp_storage,
                    size_t &temp_storage_bytes,
                    InputIteratorT d_in,
                    OutputIteratorT d_out,
                    int num_items,
                    cudaStream_t stream = 0,
                    bool debug_synchronous = false)
    {
        using T = typename std::iterator_traits<InputIteratorT>::value_type;
        return Reduce(
            d_temp_storage, temp_storage_bytes,
            d_in, d_out, num_items, ::cub::Sum(), T(0),
            stream, debug_synchronous
        );
    }
    /// \brief Minimum reduction; identity is the largest finite value of the
    /// input value type (detail::get_max_value handles __half / bfloat16).
    template <
        typename InputIteratorT,
        typename OutputIteratorT
    >
    HIPCUB_RUNTIME_FUNCTION static
    cudaError_t Min(void *d_temp_storage,
                    size_t &temp_storage_bytes,
                    InputIteratorT d_in,
                    OutputIteratorT d_out,
                    int num_items,
                    cudaStream_t stream = 0,
                    bool debug_synchronous = false)
    {
        using T = typename std::iterator_traits<InputIteratorT>::value_type;
        return Reduce(
            d_temp_storage, temp_storage_bytes,
            d_in, d_out, num_items, ::cub::Min(), detail::get_max_value<T>(),
            stream, debug_synchronous
        );
    }
    /// \brief Finds the (index, value) pair of the minimum item. The input is
    /// wrapped in an ArgIndexInputIterator so the reduction runs over
    /// KeyValuePair items; the result is written to \p d_out.
    template <
        typename InputIteratorT,
        typename OutputIteratorT
    >
    HIPCUB_RUNTIME_FUNCTION static
    cudaError_t ArgMin(void *d_temp_storage,
                       size_t &temp_storage_bytes,
                       InputIteratorT d_in,
                       OutputIteratorT d_out,
                       int num_items,
                       cudaStream_t stream = 0,
                       bool debug_synchronous = false)
    {
        using OffsetT = int;
        using T = typename std::iterator_traits<InputIteratorT>::value_type;
        using O = typename std::iterator_traits<OutputIteratorT>::value_type;
        // If the output iterator erases its value type (void), fall back to
        // KeyValuePair<int, T> as the tuple type.
        using OutputTupleT =
            typename std::conditional<
                std::is_same<O, void>::value,
                KeyValuePair<OffsetT, T>,
                O
            >::type;
        using OutputValueT = typename OutputTupleT::Value;
        using IteratorT = ArgIndexInputIterator<InputIteratorT, OffsetT, OutputValueT>;
        IteratorT d_indexed_in(d_in);
        // Initial pair: key 1 with the max value — presumably the
        // out-of-bounds sentinel returned for empty input, mirroring CUB's
        // ArgMin convention; verify against upstream CUB.
        OutputTupleT init(1, detail::get_max_value<T>());
        return Reduce(
            d_temp_storage, temp_storage_bytes,
            d_indexed_in, d_out, num_items, ::cub::ArgMin(), init,
            stream, debug_synchronous
        );
    }
    /// \brief Maximum reduction; identity is the lowest finite value of the
    /// input value type (detail::get_lowest_value handles __half / bfloat16).
    template <
        typename InputIteratorT,
        typename OutputIteratorT
    >
    HIPCUB_RUNTIME_FUNCTION static
    cudaError_t Max(void *d_temp_storage,
                    size_t &temp_storage_bytes,
                    InputIteratorT d_in,
                    OutputIteratorT d_out,
                    int num_items,
                    cudaStream_t stream = 0,
                    bool debug_synchronous = false)
    {
        using T = typename std::iterator_traits<InputIteratorT>::value_type;
        return Reduce(
            d_temp_storage, temp_storage_bytes,
            d_in, d_out, num_items, ::cub::Max(), detail::get_lowest_value<T>(),
            stream, debug_synchronous
        );
    }
    /// \brief Finds the (index, value) pair of the maximum item; mirrors
    /// ArgMin with the lowest-value sentinel.
    template <
        typename InputIteratorT,
        typename OutputIteratorT
    >
    HIPCUB_RUNTIME_FUNCTION static
    cudaError_t ArgMax(void *d_temp_storage,
                       size_t &temp_storage_bytes,
                       InputIteratorT d_in,
                       OutputIteratorT d_out,
                       int num_items,
                       cudaStream_t stream = 0,
                       bool debug_synchronous = false)
    {
        using OffsetT = int;
        using T = typename std::iterator_traits<InputIteratorT>::value_type;
        using O = typename std::iterator_traits<OutputIteratorT>::value_type;
        using OutputTupleT =
            typename std::conditional<
                std::is_same<O, void>::value,
                KeyValuePair<OffsetT, T>,
                O
            >::type;
        using OutputValueT = typename OutputTupleT::Value;
        using IteratorT = ArgIndexInputIterator<InputIteratorT, OffsetT, OutputValueT>;
        IteratorT d_indexed_in(d_in);
        OutputTupleT init(1, detail::get_lowest_value<T>());
        return Reduce(
            d_temp_storage, temp_storage_bytes,
            d_indexed_in, d_out, num_items, ::cub::ArgMax(), init,
            stream, debug_synchronous
        );
    }
    /// \brief Segmented reduction over runs of consecutive equal keys:
    /// writes one unique key and one aggregate per run, and the number of
    /// runs to \p d_num_runs_out. Key equality uses rocprim::equal_to.
    template<
        typename KeysInputIteratorT,
        typename UniqueOutputIteratorT,
        typename ValuesInputIteratorT,
        typename AggregatesOutputIteratorT,
        typename NumRunsOutputIteratorT,
        typename ReductionOpT
    >
    HIPCUB_RUNTIME_FUNCTION static
    cudaError_t ReduceByKey(void * d_temp_storage,
                            size_t& temp_storage_bytes,
                            KeysInputIteratorT d_keys_in,
                            UniqueOutputIteratorT d_unique_out,
                            ValuesInputIteratorT d_values_in,
                            AggregatesOutputIteratorT d_aggregates_out,
                            NumRunsOutputIteratorT d_num_runs_out,
                            ReductionOpT reduction_op,
                            int num_items,
                            cudaStream_t stream = 0,
                            bool debug_synchronous = false)
    {
        using key_compare_op =
            ::rocprim::equal_to<typename std::iterator_traits<KeysInputIteratorT>::value_type>;
        return (cudaError_t)::rocprim::reduce_by_key(
            d_temp_storage, temp_storage_bytes,
            d_keys_in, d_values_in, num_items,
            d_unique_out, d_aggregates_out, d_num_runs_out,
            ::cub::detail::convert_result_type<ValuesInputIteratorT, AggregatesOutputIteratorT>(reduction_op),
            key_compare_op(),
            stream, debug_synchronous
        );
    }
};
END_HIPCUB_NAMESPACE
#endif // HIPCUB_ROCPRIM_DEVICE_DEVICE_REDUCE_HPP_
/******************************************************************************
* Copyright (c) 2010-2011, Duane Merrill. All rights reserved.
* Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved.
* Modifications Copyright (c) 2017-2020, Advanced Micro Devices, Inc. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of the NVIDIA CORPORATION nor the
* names of its contributors may be used to endorse or promote products
* derived from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
* DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
******************************************************************************/
#ifndef HIPCUB_ROCPRIM_DEVICE_DEVICE_RUN_LENGTH_ENCODE_HPP_
#define HIPCUB_ROCPRIM_DEVICE_DEVICE_RUN_LENGTH_ENCODE_HPP_
#include "../config.hpp"
#include <cub/rocprim/device/device_run_length_encode.hpp>
BEGIN_HIPCUB_NAMESPACE
/// Device-wide run-length encoding: a thin hipCUB façade over rocPRIM's
/// run-length-encode algorithms. Both entry points follow the two-phase
/// protocol: when d_temp_storage is NULL only temp_storage_bytes is written;
/// call again with allocated storage to run the algorithm.
class DeviceRunLengthEncode
{
public:
    /// Compacts runs of consecutive equal items: unique values go to
    /// d_unique_out, run lengths to d_counts_out, and the number of runs
    /// (a single count) to d_num_runs_out.
    template<
        typename InputIteratorT,
        typename UniqueOutputIteratorT,
        typename LengthsOutputIteratorT,
        typename NumRunsOutputIteratorT
    >
    HIPCUB_RUNTIME_FUNCTION static
    cudaError_t Encode(void * d_temp_storage,
                       size_t& temp_storage_bytes,
                       InputIteratorT d_in,
                       UniqueOutputIteratorT d_unique_out,
                       LengthsOutputIteratorT d_counts_out,
                       NumRunsOutputIteratorT d_num_runs_out,
                       int num_items,
                       cudaStream_t stream = 0,
                       bool debug_synchronous = false)
    {
        const auto status = ::rocprim::run_length_encode(
            d_temp_storage, temp_storage_bytes,
            d_in, num_items,
            d_unique_out, d_counts_out, d_num_runs_out,
            stream, debug_synchronous);
        // rocPRIM reports its own error code; expose it through this API's
        // cudaError_t alias.
        return static_cast<cudaError_t>(status);
    }

    /// Identifies non-trivial runs: run start offsets go to d_offsets_out,
    /// run lengths to d_lengths_out, and the run count to d_num_runs_out.
    template<
        typename InputIteratorT,
        typename OffsetsOutputIteratorT,
        typename LengthsOutputIteratorT,
        typename NumRunsOutputIteratorT
    >
    HIPCUB_RUNTIME_FUNCTION static
    cudaError_t NonTrivialRuns(void * d_temp_storage,
                               size_t& temp_storage_bytes,
                               InputIteratorT d_in,
                               OffsetsOutputIteratorT d_offsets_out,
                               LengthsOutputIteratorT d_lengths_out,
                               NumRunsOutputIteratorT d_num_runs_out,
                               int num_items,
                               cudaStream_t stream = 0,
                               bool debug_synchronous = false)
    {
        const auto status = ::rocprim::run_length_encode_non_trivial_runs(
            d_temp_storage, temp_storage_bytes,
            d_in, num_items,
            d_offsets_out, d_lengths_out, d_num_runs_out,
            stream, debug_synchronous);
        return static_cast<cudaError_t>(status);
    }
};
END_HIPCUB_NAMESPACE
#endif // HIPCUB_ROCPRIM_DEVICE_DEVICE_RUN_LENGTH_ENCODE_HPP_
/******************************************************************************
* Copyright (c) 2010-2011, Duane Merrill. All rights reserved.
* Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved.
* Modifications Copyright (c) 2017-2020, Advanced Micro Devices, Inc. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of the NVIDIA CORPORATION nor the
* names of its contributors may be used to endorse or promote products
* derived from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
* DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
******************************************************************************/
#ifndef HIPCUB_ROCPRIM_DEVICE_DEVICE_SCAN_HPP_
#define HIPCUB_ROCPRIM_DEVICE_DEVICE_SCAN_HPP_
#include <iostream>
#include "../config.hpp"
#include "../thread/thread_operators.cuh"
#include <cub/rocprim/device/device_scan.hpp>
#include <cub/rocprim/device/device_scan_by_key.hpp>
BEGIN_HIPCUB_NAMESPACE
/// Device-wide prefix scans: a thin hipCUB façade over rocPRIM's scan and
/// scan-by-key algorithms. Every entry point follows the two-phase protocol:
/// when d_temp_storage is NULL only temp_storage_bytes is written; call again
/// with allocated storage to run the scan.
class DeviceScan
{
public:
    /// Inclusive prefix sum of d_in into d_out.
    template <
        typename InputIteratorT,
        typename OutputIteratorT
    >
    HIPCUB_RUNTIME_FUNCTION static
    cudaError_t InclusiveSum(void *d_temp_storage,
                             size_t &temp_storage_bytes,
                             InputIteratorT d_in,
                             OutputIteratorT d_out,
                             size_t num_items,
                             cudaStream_t stream = 0,
                             bool debug_synchronous = false)
    {
        // A sum is just an inclusive scan with the addition functor.
        return InclusiveScan(
            d_temp_storage, temp_storage_bytes,
            d_in, d_out, ::cub::Sum(), num_items,
            stream, debug_synchronous
        );
    }

    /// Inclusive prefix scan of d_in into d_out using scan_op.
    template <
        typename InputIteratorT,
        typename OutputIteratorT,
        typename ScanOpT
    >
    HIPCUB_RUNTIME_FUNCTION static
    cudaError_t InclusiveScan(void *d_temp_storage,
                              size_t &temp_storage_bytes,
                              InputIteratorT d_in,
                              OutputIteratorT d_out,
                              ScanOpT scan_op,
                              size_t num_items,
                              cudaStream_t stream = 0,
                              bool debug_synchronous = false)
    {
        const auto status = ::rocprim::inclusive_scan(
            d_temp_storage, temp_storage_bytes,
            d_in, d_out, num_items,
            scan_op,
            stream, debug_synchronous);
        return static_cast<cudaError_t>(status);
    }

    /// Exclusive prefix sum of d_in into d_out, seeded with a zero of the
    /// input's value type.
    template <
        typename InputIteratorT,
        typename OutputIteratorT
    >
    HIPCUB_RUNTIME_FUNCTION static
    cudaError_t ExclusiveSum(void *d_temp_storage,
                             size_t &temp_storage_bytes,
                             InputIteratorT d_in,
                             OutputIteratorT d_out,
                             size_t num_items,
                             cudaStream_t stream = 0,
                             bool debug_synchronous = false)
    {
        using input_t = typename std::iterator_traits<InputIteratorT>::value_type;
        return ExclusiveScan(
            d_temp_storage, temp_storage_bytes,
            d_in, d_out, ::cub::Sum(), input_t(0), num_items,
            stream, debug_synchronous
        );
    }

    /// Exclusive prefix scan of d_in into d_out using scan_op, seeded with
    /// init_value.
    template <
        typename InputIteratorT,
        typename OutputIteratorT,
        typename ScanOpT,
        typename InitValueT
    >
    HIPCUB_RUNTIME_FUNCTION static
    cudaError_t ExclusiveScan(void *d_temp_storage,
                              size_t &temp_storage_bytes,
                              InputIteratorT d_in,
                              OutputIteratorT d_out,
                              ScanOpT scan_op,
                              InitValueT init_value,
                              size_t num_items,
                              cudaStream_t stream = 0,
                              bool debug_synchronous = false)
    {
        const auto status = ::rocprim::exclusive_scan(
            d_temp_storage, temp_storage_bytes,
            d_in, d_out, init_value, num_items,
            scan_op,
            stream, debug_synchronous);
        return static_cast<cudaError_t>(status);
    }

    /// Exclusive prefix scan whose initial value is supplied later through a
    /// FutureValue (resolved on the device at scan time).
    template <
        typename InputIteratorT,
        typename OutputIteratorT,
        typename ScanOpT,
        typename InitValueT,
        typename InitValueIterT = InitValueT*
    >
    HIPCUB_RUNTIME_FUNCTION static
    cudaError_t ExclusiveScan(void *d_temp_storage,
                              size_t &temp_storage_bytes,
                              InputIteratorT d_in,
                              OutputIteratorT d_out,
                              ScanOpT scan_op,
                              FutureValue<InitValueT, InitValueIterT> init_value,
                              int num_items,
                              cudaStream_t stream = 0,
                              bool debug_synchronous = false)
    {
        // The FutureValue wrapper is forwarded to rocPRIM as-is.
        const auto status = ::rocprim::exclusive_scan(
            d_temp_storage, temp_storage_bytes,
            d_in, d_out, init_value, num_items,
            scan_op,
            stream, debug_synchronous);
        return static_cast<cudaError_t>(status);
    }

    /// Exclusive prefix sum of d_values_in into d_values_out, restarting at
    /// each new key run (runs delimited by equality_op on d_keys_in).
    template <
        typename KeysInputIteratorT,
        typename ValuesInputIteratorT,
        typename ValuesOutputIteratorT,
        typename EqualityOpT = ::cub::Equality
    >
    HIPCUB_RUNTIME_FUNCTION static
    cudaError_t ExclusiveSumByKey(void *d_temp_storage,
                                  size_t &temp_storage_bytes,
                                  KeysInputIteratorT d_keys_in,
                                  ValuesInputIteratorT d_values_in,
                                  ValuesOutputIteratorT d_values_out,
                                  int num_items,
                                  EqualityOpT equality_op = EqualityOpT(),
                                  cudaStream_t stream = 0,
                                  bool debug_synchronous = false)
    {
        using value_t = typename std::iterator_traits<ValuesInputIteratorT>::value_type;
        // Zero of the input value type seeds each key run.
        const auto status = ::rocprim::exclusive_scan_by_key(
            d_temp_storage, temp_storage_bytes,
            d_keys_in, d_values_in, d_values_out,
            static_cast<value_t>(0), static_cast<size_t>(num_items),
            ::cub::Sum(), equality_op, stream, debug_synchronous);
        return static_cast<cudaError_t>(status);
    }

    /// Exclusive prefix scan by key with a caller-provided operator and
    /// initial value (the scan restarts at init_value for each key run).
    template <
        typename KeysInputIteratorT,
        typename ValuesInputIteratorT,
        typename ValuesOutputIteratorT,
        typename ScanOpT,
        typename InitValueT,
        typename EqualityOpT = ::cub::Equality
    >
    HIPCUB_RUNTIME_FUNCTION static
    cudaError_t ExclusiveScanByKey(void *d_temp_storage,
                                   size_t &temp_storage_bytes,
                                   KeysInputIteratorT d_keys_in,
                                   ValuesInputIteratorT d_values_in,
                                   ValuesOutputIteratorT d_values_out,
                                   ScanOpT scan_op,
                                   InitValueT init_value,
                                   int num_items,
                                   EqualityOpT equality_op = EqualityOpT(),
                                   cudaStream_t stream = 0,
                                   bool debug_synchronous = false)
    {
        const auto status = ::rocprim::exclusive_scan_by_key(
            d_temp_storage, temp_storage_bytes,
            d_keys_in, d_values_in, d_values_out,
            init_value, static_cast<size_t>(num_items),
            scan_op, equality_op, stream, debug_synchronous);
        return static_cast<cudaError_t>(status);
    }

    /// Inclusive prefix sum by key (sum restarts at each new key run).
    template <
        typename KeysInputIteratorT,
        typename ValuesInputIteratorT,
        typename ValuesOutputIteratorT,
        typename EqualityOpT = ::cub::Equality
    >
    HIPCUB_RUNTIME_FUNCTION static
    cudaError_t InclusiveSumByKey(void *d_temp_storage,
                                  size_t &temp_storage_bytes,
                                  KeysInputIteratorT d_keys_in,
                                  ValuesInputIteratorT d_values_in,
                                  ValuesOutputIteratorT d_values_out,
                                  int num_items,
                                  EqualityOpT equality_op = EqualityOpT(),
                                  cudaStream_t stream = 0,
                                  bool debug_synchronous = false)
    {
        const auto status = ::rocprim::inclusive_scan_by_key(
            d_temp_storage, temp_storage_bytes,
            d_keys_in, d_values_in, d_values_out,
            static_cast<size_t>(num_items), ::cub::Sum(),
            equality_op, stream, debug_synchronous);
        return static_cast<cudaError_t>(status);
    }

    /// Inclusive prefix scan by key with a caller-provided operator.
    template <
        typename KeysInputIteratorT,
        typename ValuesInputIteratorT,
        typename ValuesOutputIteratorT,
        typename ScanOpT,
        typename EqualityOpT = ::cub::Equality
    >
    HIPCUB_RUNTIME_FUNCTION static
    cudaError_t InclusiveScanByKey(void *d_temp_storage,
                                   size_t &temp_storage_bytes,
                                   KeysInputIteratorT d_keys_in,
                                   ValuesInputIteratorT d_values_in,
                                   ValuesOutputIteratorT d_values_out,
                                   ScanOpT scan_op,
                                   int num_items,
                                   EqualityOpT equality_op = EqualityOpT(),
                                   cudaStream_t stream = 0,
                                   bool debug_synchronous = false)
    {
        const auto status = ::rocprim::inclusive_scan_by_key(
            d_temp_storage, temp_storage_bytes,
            d_keys_in, d_values_in, d_values_out,
            static_cast<size_t>(num_items), scan_op,
            equality_op, stream, debug_synchronous);
        return static_cast<cudaError_t>(status);
    }
};
END_HIPCUB_NAMESPACE
#endif // HIPCUB_ROCPRIM_DEVICE_DEVICE_SCAN_HPP_
/******************************************************************************
* Copyright (c) 2010-2011, Duane Merrill. All rights reserved.
* Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved.
* Modifications Copyright (c) 2017-2020, Advanced Micro Devices, Inc. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of the NVIDIA CORPORATION nor the
* names of its contributors may be used to endorse or promote products
* derived from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
* DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
******************************************************************************/
#ifndef HIPCUB_ROCPRIM_DEVICE_DEVICE_SEGMENTED_RADIX_SORT_HPP_
#define HIPCUB_ROCPRIM_DEVICE_DEVICE_SEGMENTED_RADIX_SORT_HPP_
#include "../config.hpp"
#include "../util_type.cuh"
#include <cub/rocprim/device/device_segmented_radix_sort.hpp>
BEGIN_HIPCUB_NAMESPACE
// DeviceSegmentedRadixSort: hipCUB wrapper forwarding segmented radix sorts
// to rocPRIM. Segment i covers item indices [d_begin_offsets[i],
// d_end_offsets[i]). begin_bit/end_bit select the key bit range to sort on
// (defaults cover the whole key). All methods follow the two-phase protocol:
// a NULL d_temp_storage only sets temp_storage_bytes.
struct DeviceSegmentedRadixSort
{
// Ascending key-value sort per segment (out-of-place pointer interface).
template<typename KeyT, typename ValueT, typename OffsetIteratorT>
HIPCUB_RUNTIME_FUNCTION static
cudaError_t SortPairs(void * d_temp_storage,
size_t& temp_storage_bytes,
const KeyT * d_keys_in,
KeyT * d_keys_out,
const ValueT * d_values_in,
ValueT * d_values_out,
int num_items,
int num_segments,
OffsetIteratorT d_begin_offsets,
OffsetIteratorT d_end_offsets,
int begin_bit = 0,
int end_bit = sizeof(KeyT) * 8,
cudaStream_t stream = 0,
bool debug_synchronous = false)
{
// Cast exposes rocPRIM's error code through this API's cudaError_t alias.
return (cudaError_t)::rocprim::segmented_radix_sort_pairs(
d_temp_storage, temp_storage_bytes,
d_keys_in, d_keys_out, d_values_in, d_values_out, num_items,
num_segments, d_begin_offsets, d_end_offsets,
begin_bit, end_bit,
stream, debug_synchronous
);
}
// Ascending key-value sort per segment, DoubleBuffer (ping-pong) interface:
// sorts in place across the two buffers and updates d_keys/d_values so the
// caller can read back which buffer holds the result.
template<typename KeyT, typename ValueT, typename OffsetIteratorT>
HIPCUB_RUNTIME_FUNCTION static
cudaError_t SortPairs(void * d_temp_storage,
size_t& temp_storage_bytes,
DoubleBuffer<KeyT>& d_keys,
DoubleBuffer<ValueT>& d_values,
int num_items,
int num_segments,
OffsetIteratorT d_begin_offsets,
OffsetIteratorT d_end_offsets,
int begin_bit = 0,
int end_bit = sizeof(KeyT) * 8,
cudaStream_t stream = 0,
bool debug_synchronous = false)
{
// Bridge hipCUB DoubleBuffers to rocPRIM double_buffers for the call.
::rocprim::double_buffer<KeyT> d_keys_db = detail::to_double_buffer(d_keys);
::rocprim::double_buffer<ValueT> d_values_db = detail::to_double_buffer(d_values);
cudaError_t error = (cudaError_t)::rocprim::segmented_radix_sort_pairs(
d_temp_storage, temp_storage_bytes,
d_keys_db, d_values_db, num_items,
num_segments, d_begin_offsets, d_end_offsets,
begin_bit, end_bit,
stream, debug_synchronous
);
// Propagate the sort's buffer state back to the caller's DoubleBuffers
// (runs even on error; presumably a no-op in the size-query phase).
detail::update_double_buffer(d_keys, d_keys_db);
detail::update_double_buffer(d_values, d_values_db);
return error;
}
// Descending key-value sort per segment (pointer interface).
template<typename KeyT, typename ValueT, typename OffsetIteratorT>
HIPCUB_RUNTIME_FUNCTION static
cudaError_t SortPairsDescending(void * d_temp_storage,
size_t& temp_storage_bytes,
const KeyT * d_keys_in,
KeyT * d_keys_out,
const ValueT * d_values_in,
ValueT * d_values_out,
int num_items,
int num_segments,
OffsetIteratorT d_begin_offsets,
OffsetIteratorT d_end_offsets,
int begin_bit = 0,
int end_bit = sizeof(KeyT) * 8,
cudaStream_t stream = 0,
bool debug_synchronous = false)
{
return (cudaError_t)::rocprim::segmented_radix_sort_pairs_desc(
d_temp_storage, temp_storage_bytes,
d_keys_in, d_keys_out, d_values_in, d_values_out, num_items,
num_segments, d_begin_offsets, d_end_offsets,
begin_bit, end_bit,
stream, debug_synchronous
);
}
// Descending key-value sort per segment, DoubleBuffer interface.
template<typename KeyT, typename ValueT, typename OffsetIteratorT>
HIPCUB_RUNTIME_FUNCTION static
cudaError_t SortPairsDescending(void * d_temp_storage,
size_t& temp_storage_bytes,
DoubleBuffer<KeyT>& d_keys,
DoubleBuffer<ValueT>& d_values,
int num_items,
int num_segments,
OffsetIteratorT d_begin_offsets,
OffsetIteratorT d_end_offsets,
int begin_bit = 0,
int end_bit = sizeof(KeyT) * 8,
cudaStream_t stream = 0,
bool debug_synchronous = false)
{
::rocprim::double_buffer<KeyT> d_keys_db = detail::to_double_buffer(d_keys);
::rocprim::double_buffer<ValueT> d_values_db = detail::to_double_buffer(d_values);
cudaError_t error = (cudaError_t)::rocprim::segmented_radix_sort_pairs_desc(
d_temp_storage, temp_storage_bytes,
d_keys_db, d_values_db, num_items,
num_segments, d_begin_offsets, d_end_offsets,
begin_bit, end_bit,
stream, debug_synchronous
);
detail::update_double_buffer(d_keys, d_keys_db);
detail::update_double_buffer(d_values, d_values_db);
return error;
}
// Ascending keys-only sort per segment (pointer interface).
template<typename KeyT, typename OffsetIteratorT>
HIPCUB_RUNTIME_FUNCTION static
cudaError_t SortKeys(void * d_temp_storage,
size_t& temp_storage_bytes,
const KeyT * d_keys_in,
KeyT * d_keys_out,
int num_items,
int num_segments,
OffsetIteratorT d_begin_offsets,
OffsetIteratorT d_end_offsets,
int begin_bit = 0,
int end_bit = sizeof(KeyT) * 8,
cudaStream_t stream = 0,
bool debug_synchronous = false)
{
return (cudaError_t)::rocprim::segmented_radix_sort_keys(
d_temp_storage, temp_storage_bytes,
d_keys_in, d_keys_out, num_items,
num_segments, d_begin_offsets, d_end_offsets,
begin_bit, end_bit,
stream, debug_synchronous
);
}
// Ascending keys-only sort per segment, DoubleBuffer interface.
template<typename KeyT, typename OffsetIteratorT>
HIPCUB_RUNTIME_FUNCTION static
cudaError_t SortKeys(void * d_temp_storage,
size_t& temp_storage_bytes,
DoubleBuffer<KeyT>& d_keys,
int num_items,
int num_segments,
OffsetIteratorT d_begin_offsets,
OffsetIteratorT d_end_offsets,
int begin_bit = 0,
int end_bit = sizeof(KeyT) * 8,
cudaStream_t stream = 0,
bool debug_synchronous = false)
{
::rocprim::double_buffer<KeyT> d_keys_db = detail::to_double_buffer(d_keys);
cudaError_t error = (cudaError_t)::rocprim::segmented_radix_sort_keys(
d_temp_storage, temp_storage_bytes,
d_keys_db, num_items,
num_segments, d_begin_offsets, d_end_offsets,
begin_bit, end_bit,
stream, debug_synchronous
);
detail::update_double_buffer(d_keys, d_keys_db);
return error;
}
// Descending keys-only sort per segment (pointer interface).
template<typename KeyT, typename OffsetIteratorT>
HIPCUB_RUNTIME_FUNCTION static
cudaError_t SortKeysDescending(void * d_temp_storage,
size_t& temp_storage_bytes,
const KeyT * d_keys_in,
KeyT * d_keys_out,
int num_items,
int num_segments,
OffsetIteratorT d_begin_offsets,
OffsetIteratorT d_end_offsets,
int begin_bit = 0,
int end_bit = sizeof(KeyT) * 8,
cudaStream_t stream = 0,
bool debug_synchronous = false)
{
return (cudaError_t)::rocprim::segmented_radix_sort_keys_desc(
d_temp_storage, temp_storage_bytes,
d_keys_in, d_keys_out, num_items,
num_segments, d_begin_offsets, d_end_offsets,
begin_bit, end_bit,
stream, debug_synchronous
);
}
// Descending keys-only sort per segment, DoubleBuffer interface.
template<typename KeyT, typename OffsetIteratorT>
HIPCUB_RUNTIME_FUNCTION static
cudaError_t SortKeysDescending(void * d_temp_storage,
size_t& temp_storage_bytes,
DoubleBuffer<KeyT>& d_keys,
int num_items,
int num_segments,
OffsetIteratorT d_begin_offsets,
OffsetIteratorT d_end_offsets,
int begin_bit = 0,
int end_bit = sizeof(KeyT) * 8,
cudaStream_t stream = 0,
bool debug_synchronous = false)
{
::rocprim::double_buffer<KeyT> d_keys_db = detail::to_double_buffer(d_keys);
cudaError_t error = (cudaError_t)::rocprim::segmented_radix_sort_keys_desc(
d_temp_storage, temp_storage_bytes,
d_keys_db, num_items,
num_segments, d_begin_offsets, d_end_offsets,
begin_bit, end_bit,
stream, debug_synchronous
);
detail::update_double_buffer(d_keys, d_keys_db);
return error;
}
};
END_HIPCUB_NAMESPACE
#endif // HIPCUB_ROCPRIM_DEVICE_DEVICE_SEGMENTED_RADIX_SORT_HPP_
/******************************************************************************
* Copyright (c) 2010-2011, Duane Merrill. All rights reserved.
* Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved.
* Modifications Copyright (c) 2017-2020, Advanced Micro Devices, Inc. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of the NVIDIA CORPORATION nor the
* names of its contributors may be used to endorse or promote products
* derived from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
* DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
******************************************************************************/
#ifndef HIPCUB_ROCPRIM_DEVICE_DEVICE_SEGMENTED_REDUCE_HPP_
#define HIPCUB_ROCPRIM_DEVICE_DEVICE_SEGMENTED_REDUCE_HPP_
#include <limits>
#include <iterator>
#include "../config.hpp"
#include "../thread/thread_operators.cuh"
#include "../iterator/arg_index_input_iterator.cuh"
#include <cub/rocprim/device/device_segmented_reduce.hpp>
BEGIN_HIPCUB_NAMESPACE
// DeviceSegmentedReduce: hipCUB wrapper forwarding per-segment reductions to
// rocPRIM. Segment i covers item indices [d_begin_offsets[i],
// d_end_offsets[i]); one reduced value is written per segment. All methods
// follow the two-phase protocol: a NULL d_temp_storage only sets
// temp_storage_bytes.
struct DeviceSegmentedReduce
{
// Generic per-segment reduction with a caller-provided operator and
// initial value (the initial value is also the result for empty segments).
template<
typename InputIteratorT,
typename OutputIteratorT,
typename OffsetIteratorT,
typename ReductionOp,
typename T
>
HIPCUB_RUNTIME_FUNCTION static
cudaError_t Reduce(void * d_temp_storage,
size_t& temp_storage_bytes,
InputIteratorT d_in,
OutputIteratorT d_out,
int num_segments,
OffsetIteratorT d_begin_offsets,
OffsetIteratorT d_end_offsets,
ReductionOp reduction_op,
T initial_value,
cudaStream_t stream = 0,
bool debug_synchronous = false)
{
// convert_result_type adapts the operator so intermediate results use the
// output iterator's value type; cast exposes rocPRIM's error code as the
// cudaError_t alias used by this API.
return (cudaError_t)::rocprim::segmented_reduce(
d_temp_storage, temp_storage_bytes,
d_in, d_out,
num_segments, d_begin_offsets, d_end_offsets,
::cub::detail::convert_result_type<InputIteratorT, OutputIteratorT>(reduction_op),
initial_value,
stream, debug_synchronous
);
}
// Per-segment sum; empty segments produce a value-initialized input_type().
template<
typename InputIteratorT,
typename OutputIteratorT,
typename OffsetIteratorT
>
HIPCUB_RUNTIME_FUNCTION static
cudaError_t Sum(void * d_temp_storage,
size_t& temp_storage_bytes,
InputIteratorT d_in,
OutputIteratorT d_out,
int num_segments,
OffsetIteratorT d_begin_offsets,
OffsetIteratorT d_end_offsets,
cudaStream_t stream = 0,
bool debug_synchronous = false)
{
using input_type = typename std::iterator_traits<InputIteratorT>::value_type;
return Reduce(
d_temp_storage, temp_storage_bytes,
d_in, d_out,
num_segments, d_begin_offsets, d_end_offsets,
::cub::Sum(), input_type(),
stream, debug_synchronous
);
}
// Per-segment minimum; empty segments produce numeric_limits::max().
// NOTE(review): uses std::numeric_limits rather than a detail helper like
// detail::get_lowest_value used by the non-segmented DeviceReduce in this
// file — may not cover non-standard GPU types (e.g. half); confirm.
template<
typename InputIteratorT,
typename OutputIteratorT,
typename OffsetIteratorT
>
HIPCUB_RUNTIME_FUNCTION static
cudaError_t Min(void * d_temp_storage,
size_t& temp_storage_bytes,
InputIteratorT d_in,
OutputIteratorT d_out,
int num_segments,
OffsetIteratorT d_begin_offsets,
OffsetIteratorT d_end_offsets,
cudaStream_t stream = 0,
bool debug_synchronous = false)
{
using input_type = typename std::iterator_traits<InputIteratorT>::value_type;
return Reduce(
d_temp_storage, temp_storage_bytes,
d_in, d_out,
num_segments, d_begin_offsets, d_end_offsets,
::cub::Min(), std::numeric_limits<input_type>::max(),
stream, debug_synchronous
);
}
// Per-segment argmin: writes a (index, value) pair per segment. The input
// is wrapped in an index-attaching iterator and reduced with ArgMin.
template<
typename InputIteratorT,
typename OutputIteratorT,
typename OffsetIteratorT
>
HIPCUB_RUNTIME_FUNCTION static
cudaError_t ArgMin(void * d_temp_storage,
size_t& temp_storage_bytes,
InputIteratorT d_in,
OutputIteratorT d_out,
int num_segments,
OffsetIteratorT d_begin_offsets,
OffsetIteratorT d_end_offsets,
cudaStream_t stream = 0,
bool debug_synchronous = false)
{
using OffsetT = int;
using T = typename std::iterator_traits<InputIteratorT>::value_type;
using O = typename std::iterator_traits<OutputIteratorT>::value_type;
// When the output iterator exposes no value type, fall back to a
// KeyValuePair of (offset, input value).
using OutputTupleT = typename std::conditional<
std::is_same<O, void>::value,
KeyValuePair<OffsetT, T>,
O
>::type;
using OutputValueT = typename OutputTupleT::Value;
using IteratorT = ArgIndexInputIterator<InputIteratorT, OffsetT, OutputValueT>;
IteratorT d_indexed_in(d_in);
// Initial pair: index 1 with the maximum representable value, so empty
// segments yield this sentinel pair.
const OutputTupleT init(1, std::numeric_limits<T>::max());
return Reduce(
d_temp_storage, temp_storage_bytes,
d_indexed_in, d_out,
num_segments, d_begin_offsets, d_end_offsets,
::cub::ArgMin(), init,
stream, debug_synchronous
);
}
// Per-segment maximum; empty segments produce numeric_limits::lowest().
template<
typename InputIteratorT,
typename OutputIteratorT,
typename OffsetIteratorT
>
HIPCUB_RUNTIME_FUNCTION static
cudaError_t Max(void * d_temp_storage,
size_t& temp_storage_bytes,
InputIteratorT d_in,
OutputIteratorT d_out,
int num_segments,
OffsetIteratorT d_begin_offsets,
OffsetIteratorT d_end_offsets,
cudaStream_t stream = 0,
bool debug_synchronous = false)
{
using input_type = typename std::iterator_traits<InputIteratorT>::value_type;
return Reduce(
d_temp_storage, temp_storage_bytes,
d_in, d_out,
num_segments, d_begin_offsets, d_end_offsets,
::cub::Max(), std::numeric_limits<input_type>::lowest(),
stream, debug_synchronous
);
}
// Per-segment argmax: writes a (index, value) pair per segment; mirrors
// ArgMin with the lowest representable value as the sentinel.
template<
typename InputIteratorT,
typename OutputIteratorT,
typename OffsetIteratorT
>
HIPCUB_RUNTIME_FUNCTION static
cudaError_t ArgMax(void * d_temp_storage,
size_t& temp_storage_bytes,
InputIteratorT d_in,
OutputIteratorT d_out,
int num_segments,
OffsetIteratorT d_begin_offsets,
OffsetIteratorT d_end_offsets,
cudaStream_t stream = 0,
bool debug_synchronous = false)
{
using OffsetT = int;
using T = typename std::iterator_traits<InputIteratorT>::value_type;
using O = typename std::iterator_traits<OutputIteratorT>::value_type;
using OutputTupleT = typename std::conditional<
std::is_same<O, void>::value,
KeyValuePair<OffsetT, T>,
O
>::type;
using OutputValueT = typename OutputTupleT::Value;
using IteratorT = ArgIndexInputIterator<InputIteratorT, OffsetT, OutputValueT>;
IteratorT d_indexed_in(d_in);
const OutputTupleT init(1, std::numeric_limits<T>::lowest());
return Reduce(
d_temp_storage, temp_storage_bytes,
d_indexed_in, d_out,
num_segments, d_begin_offsets, d_end_offsets,
::cub::ArgMax(), init,
stream, debug_synchronous
);
}
};
END_HIPCUB_NAMESPACE
#endif // HIPCUB_ROCPRIM_DEVICE_DEVICE_SEGMENTED_REDUCE_HPP_
/******************************************************************************
* Copyright (c) 2010-2011, Duane Merrill. All rights reserved.
* Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved.
* Modifications Copyright (c) 2017-2020, Advanced Micro Devices, Inc. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of the NVIDIA CORPORATION nor the
* names of its contributors may be used to endorse or promote products
* derived from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
* DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
******************************************************************************/
#ifndef HIPCUB_ROCPRIM_DEVICE_DEVICE_SEGMENTED_SORT_HPP_
#define HIPCUB_ROCPRIM_DEVICE_DEVICE_SEGMENTED_SORT_HPP_
#include "../config.hpp"
#include "../util_type.cuh"
#include <cub/rocprim/device/device_segmented_radix_sort.hpp>
BEGIN_HIPCUB_NAMESPACE
struct DeviceSegmentedSort
{
/// Ascending per-segment (key, value) sort, pointer interface. Forwards to
/// rocPRIM's segmented radix sort over the full bit range of KeyT.
/// Two-phase protocol: a NULL d_temp_storage only sets temp_storage_bytes.
template<typename KeyT, typename ValueT, typename OffsetIteratorT>
HIPCUB_RUNTIME_FUNCTION static
cudaError_t SortPairs(void * d_temp_storage,
                      size_t& temp_storage_bytes,
                      const KeyT * d_keys_in,
                      KeyT * d_keys_out,
                      const ValueT * d_values_in,
                      ValueT * d_values_out,
                      int num_items,
                      int num_segments,
                      OffsetIteratorT d_begin_offsets,
                      OffsetIteratorT d_end_offsets,
                      cudaStream_t stream = 0,
                      bool debug_synchronous = false)
{
    const auto status = ::rocprim::segmented_radix_sort_pairs(
        d_temp_storage, temp_storage_bytes,
        d_keys_in, d_keys_out, d_values_in, d_values_out, num_items,
        num_segments, d_begin_offsets, d_end_offsets,
        0, sizeof(KeyT) * 8, // sort on all key bits
        stream, debug_synchronous);
    return static_cast<cudaError_t>(status);
}
/// Ascending per-segment (key, value) sort, DoubleBuffer (ping-pong)
/// interface. Bridges the hipCUB DoubleBuffers to rocPRIM double_buffers,
/// sorts over the full key bit range, then propagates the resulting buffer
/// state back via detail::update_double_buffer.
template<typename KeyT, typename ValueT, typename OffsetIteratorT>
HIPCUB_RUNTIME_FUNCTION static
cudaError_t SortPairs(void * d_temp_storage,
                      size_t& temp_storage_bytes,
                      DoubleBuffer<KeyT>& d_keys,
                      DoubleBuffer<ValueT>& d_values,
                      int num_items,
                      int num_segments,
                      OffsetIteratorT d_begin_offsets,
                      OffsetIteratorT d_end_offsets,
                      cudaStream_t stream = 0,
                      bool debug_synchronous = false)
{
    auto keys_buffer = detail::to_double_buffer(d_keys);
    auto values_buffer = detail::to_double_buffer(d_values);
    const auto status = ::rocprim::segmented_radix_sort_pairs(
        d_temp_storage, temp_storage_bytes,
        keys_buffer, values_buffer, num_items,
        num_segments, d_begin_offsets, d_end_offsets,
        0, sizeof(KeyT) * 8, // sort on all key bits
        stream, debug_synchronous);
    detail::update_double_buffer(d_keys, keys_buffer);
    detail::update_double_buffer(d_values, values_buffer);
    return static_cast<cudaError_t>(status);
}
/// Descending per-segment (key, value) sort, pointer interface. Forwards to
/// rocPRIM's descending segmented radix sort over the full key bit range.
/// Two-phase protocol: a NULL d_temp_storage only sets temp_storage_bytes.
template<typename KeyT, typename ValueT, typename OffsetIteratorT>
HIPCUB_RUNTIME_FUNCTION static
cudaError_t SortPairsDescending(void * d_temp_storage,
                                size_t& temp_storage_bytes,
                                const KeyT * d_keys_in,
                                KeyT * d_keys_out,
                                const ValueT * d_values_in,
                                ValueT * d_values_out,
                                int num_items,
                                int num_segments,
                                OffsetIteratorT d_begin_offsets,
                                OffsetIteratorT d_end_offsets,
                                cudaStream_t stream = 0,
                                bool debug_synchronous = false)
{
    const auto status = ::rocprim::segmented_radix_sort_pairs_desc(
        d_temp_storage, temp_storage_bytes,
        d_keys_in, d_keys_out, d_values_in, d_values_out, num_items,
        num_segments, d_begin_offsets, d_end_offsets,
        0, sizeof(KeyT) * 8, // sort on all key bits
        stream, debug_synchronous);
    return static_cast<cudaError_t>(status);
}
/// Descending per-segment (key, value) sort, DoubleBuffer interface.
/// Bridges to rocPRIM double_buffers, sorts over the full key bit range,
/// then propagates the resulting buffer state back to the caller.
template<typename KeyT, typename ValueT, typename OffsetIteratorT>
HIPCUB_RUNTIME_FUNCTION static
cudaError_t SortPairsDescending(void * d_temp_storage,
                                size_t& temp_storage_bytes,
                                DoubleBuffer<KeyT>& d_keys,
                                DoubleBuffer<ValueT>& d_values,
                                int num_items,
                                int num_segments,
                                OffsetIteratorT d_begin_offsets,
                                OffsetIteratorT d_end_offsets,
                                cudaStream_t stream = 0,
                                bool debug_synchronous = false)
{
    auto keys_buffer = detail::to_double_buffer(d_keys);
    auto values_buffer = detail::to_double_buffer(d_values);
    const auto status = ::rocprim::segmented_radix_sort_pairs_desc(
        d_temp_storage, temp_storage_bytes,
        keys_buffer, values_buffer, num_items,
        num_segments, d_begin_offsets, d_end_offsets,
        0, sizeof(KeyT) * 8, // sort on all key bits
        stream, debug_synchronous);
    detail::update_double_buffer(d_keys, keys_buffer);
    detail::update_double_buffer(d_values, values_buffer);
    return static_cast<cudaError_t>(status);
}
/// \brief Sorts segmented keys into ascending order (contiguous array
/// interface).
///
/// Thin forwarding wrapper around rocPRIM's ascending segmented radix
/// sort, covering the full key bit range [0, sizeof(KeyT) * 8).
template<typename KeyT, typename OffsetIteratorT>
HIPCUB_RUNTIME_FUNCTION static
cudaError_t SortKeys(void * d_temp_storage,
                     size_t& temp_storage_bytes,
                     const KeyT * d_keys_in,
                     KeyT * d_keys_out,
                     int num_items,
                     int num_segments,
                     OffsetIteratorT d_begin_offsets,
                     OffsetIteratorT d_end_offsets,
                     cudaStream_t stream = 0,
                     bool debug_synchronous = false)
{
    return static_cast<cudaError_t>(
        ::rocprim::segmented_radix_sort_keys(
            d_temp_storage, temp_storage_bytes,
            d_keys_in, d_keys_out,
            num_items, num_segments,
            d_begin_offsets, d_end_offsets,
            0, sizeof(KeyT) * 8,
            stream, debug_synchronous));
}
/// \brief Sorts segmented keys into ascending order (DoubleBuffer
/// interface).
///
/// Converts the hipCUB double buffer to the rocPRIM equivalent for the
/// call, then writes the resulting buffer state back to the caller.
template<typename KeyT, typename OffsetIteratorT>
HIPCUB_RUNTIME_FUNCTION static
cudaError_t SortKeys(void * d_temp_storage,
                     size_t& temp_storage_bytes,
                     DoubleBuffer<KeyT>& d_keys,
                     int num_items,
                     int num_segments,
                     OffsetIteratorT d_begin_offsets,
                     OffsetIteratorT d_end_offsets,
                     cudaStream_t stream = 0,
                     bool debug_synchronous = false)
{
    ::rocprim::double_buffer<KeyT> keys_buffer = detail::to_double_buffer(d_keys);
    const cudaError_t result = static_cast<cudaError_t>(
        ::rocprim::segmented_radix_sort_keys(
            d_temp_storage, temp_storage_bytes,
            keys_buffer,
            num_items, num_segments,
            d_begin_offsets, d_end_offsets,
            0, sizeof(KeyT) * 8,
            stream, debug_synchronous));
    // Propagate the (possibly swapped) buffer selection back to the caller.
    detail::update_double_buffer(d_keys, keys_buffer);
    return result;
}
/// \brief Sorts segmented keys into descending order (contiguous array
/// interface).
///
/// Thin forwarding wrapper around rocPRIM's descending segmented radix
/// sort, covering the full key bit range [0, sizeof(KeyT) * 8).
template<typename KeyT, typename OffsetIteratorT>
HIPCUB_RUNTIME_FUNCTION static
cudaError_t SortKeysDescending(void * d_temp_storage,
                               size_t& temp_storage_bytes,
                               const KeyT * d_keys_in,
                               KeyT * d_keys_out,
                               int num_items,
                               int num_segments,
                               OffsetIteratorT d_begin_offsets,
                               OffsetIteratorT d_end_offsets,
                               cudaStream_t stream = 0,
                               bool debug_synchronous = false)
{
    return static_cast<cudaError_t>(
        ::rocprim::segmented_radix_sort_keys_desc(
            d_temp_storage, temp_storage_bytes,
            d_keys_in, d_keys_out,
            num_items, num_segments,
            d_begin_offsets, d_end_offsets,
            0, sizeof(KeyT) * 8,
            stream, debug_synchronous));
}
/// \brief Sorts segmented keys into descending order (DoubleBuffer
/// interface).
///
/// Converts the hipCUB double buffer to the rocPRIM equivalent for the
/// call, then writes the resulting buffer state back to the caller.
template<typename KeyT, typename OffsetIteratorT>
HIPCUB_RUNTIME_FUNCTION static
cudaError_t SortKeysDescending(void * d_temp_storage,
                               size_t& temp_storage_bytes,
                               DoubleBuffer<KeyT>& d_keys,
                               int num_items,
                               int num_segments,
                               OffsetIteratorT d_begin_offsets,
                               OffsetIteratorT d_end_offsets,
                               cudaStream_t stream = 0,
                               bool debug_synchronous = false)
{
    ::rocprim::double_buffer<KeyT> keys_buffer = detail::to_double_buffer(d_keys);
    const cudaError_t result = static_cast<cudaError_t>(
        ::rocprim::segmented_radix_sort_keys_desc(
            d_temp_storage, temp_storage_bytes,
            keys_buffer,
            num_items, num_segments,
            d_begin_offsets, d_end_offsets,
            0, sizeof(KeyT) * 8,
            stream, debug_synchronous));
    // Propagate the (possibly swapped) buffer selection back to the caller.
    detail::update_double_buffer(d_keys, keys_buffer);
    return result;
}
/// \brief Stable variant of SortPairs (contiguous array interface).
///
/// Forwards directly to SortPairs: the underlying radix sort is
/// inherently stable, so no extra work is needed.
template<typename KeyT, typename ValueT, typename OffsetIteratorT>
HIPCUB_RUNTIME_FUNCTION static
cudaError_t StableSortPairs(void * d_temp_storage,
                            size_t& temp_storage_bytes,
                            const KeyT * d_keys_in,
                            KeyT * d_keys_out,
                            const ValueT * d_values_in,
                            ValueT * d_values_out,
                            int num_items,
                            int num_segments,
                            OffsetIteratorT d_begin_offsets,
                            OffsetIteratorT d_end_offsets,
                            cudaStream_t stream = 0,
                            bool debug_synchronous = false)
{
    return SortPairs(d_temp_storage,
                     temp_storage_bytes,
                     d_keys_in,
                     d_keys_out,
                     d_values_in,
                     d_values_out,
                     num_items,
                     num_segments,
                     d_begin_offsets,
                     d_end_offsets,
                     stream,
                     debug_synchronous);
}
/// \brief Stable variant of SortPairs (DoubleBuffer interface).
///
/// Forwards directly to SortPairs: the underlying radix sort is
/// inherently stable, so no extra work is needed.
template<typename KeyT, typename ValueT, typename OffsetIteratorT>
HIPCUB_RUNTIME_FUNCTION static
cudaError_t StableSortPairs(void * d_temp_storage,
                            size_t& temp_storage_bytes,
                            DoubleBuffer<KeyT>& d_keys,
                            DoubleBuffer<ValueT>& d_values,
                            int num_items,
                            int num_segments,
                            OffsetIteratorT d_begin_offsets,
                            OffsetIteratorT d_end_offsets,
                            cudaStream_t stream = 0,
                            bool debug_synchronous = false)
{
    return SortPairs(d_temp_storage,
                     temp_storage_bytes,
                     d_keys,
                     d_values,
                     num_items,
                     num_segments,
                     d_begin_offsets,
                     d_end_offsets,
                     stream,
                     debug_synchronous);
}
/// \brief Stable variant of SortPairsDescending (contiguous array
/// interface).
///
/// Forwards directly to SortPairsDescending: the underlying radix sort
/// is inherently stable, so no extra work is needed.
template<typename KeyT, typename ValueT, typename OffsetIteratorT>
HIPCUB_RUNTIME_FUNCTION static
cudaError_t StableSortPairsDescending(void * d_temp_storage,
                                      size_t& temp_storage_bytes,
                                      const KeyT * d_keys_in,
                                      KeyT * d_keys_out,
                                      const ValueT * d_values_in,
                                      ValueT * d_values_out,
                                      int num_items,
                                      int num_segments,
                                      OffsetIteratorT d_begin_offsets,
                                      OffsetIteratorT d_end_offsets,
                                      cudaStream_t stream = 0,
                                      bool debug_synchronous = false)
{
    return SortPairsDescending(d_temp_storage,
                               temp_storage_bytes,
                               d_keys_in,
                               d_keys_out,
                               d_values_in,
                               d_values_out,
                               num_items,
                               num_segments,
                               d_begin_offsets,
                               d_end_offsets,
                               stream,
                               debug_synchronous);
}
/// \brief Stable variant of SortPairsDescending (DoubleBuffer interface).
///
/// Forwards directly to SortPairsDescending: the underlying radix sort
/// is inherently stable, so no extra work is needed.
template<typename KeyT, typename ValueT, typename OffsetIteratorT>
HIPCUB_RUNTIME_FUNCTION static
cudaError_t StableSortPairsDescending(void * d_temp_storage,
                                      size_t& temp_storage_bytes,
                                      DoubleBuffer<KeyT>& d_keys,
                                      DoubleBuffer<ValueT>& d_values,
                                      int num_items,
                                      int num_segments,
                                      OffsetIteratorT d_begin_offsets,
                                      OffsetIteratorT d_end_offsets,
                                      cudaStream_t stream = 0,
                                      bool debug_synchronous = false)
{
    return SortPairsDescending(d_temp_storage,
                               temp_storage_bytes,
                               d_keys,
                               d_values,
                               num_items,
                               num_segments,
                               d_begin_offsets,
                               d_end_offsets,
                               stream,
                               debug_synchronous);
}
/// \brief Stable variant of SortKeys (contiguous array interface).
///
/// Forwards directly to SortKeys: the underlying radix sort is
/// inherently stable, so no extra work is needed.
template<typename KeyT, typename OffsetIteratorT>
HIPCUB_RUNTIME_FUNCTION static
cudaError_t StableSortKeys(void * d_temp_storage,
                           size_t& temp_storage_bytes,
                           const KeyT * d_keys_in,
                           KeyT * d_keys_out,
                           int num_items,
                           int num_segments,
                           OffsetIteratorT d_begin_offsets,
                           OffsetIteratorT d_end_offsets,
                           cudaStream_t stream = 0,
                           bool debug_synchronous = false)
{
    return SortKeys(d_temp_storage,
                    temp_storage_bytes,
                    d_keys_in,
                    d_keys_out,
                    num_items,
                    num_segments,
                    d_begin_offsets,
                    d_end_offsets,
                    stream,
                    debug_synchronous);
}
/// \brief Stable variant of SortKeys (DoubleBuffer interface).
///
/// Forwards directly to SortKeys: the underlying radix sort is
/// inherently stable, so no extra work is needed.
template<typename KeyT, typename OffsetIteratorT>
HIPCUB_RUNTIME_FUNCTION static
cudaError_t StableSortKeys(void * d_temp_storage,
                           size_t& temp_storage_bytes,
                           DoubleBuffer<KeyT>& d_keys,
                           int num_items,
                           int num_segments,
                           OffsetIteratorT d_begin_offsets,
                           OffsetIteratorT d_end_offsets,
                           cudaStream_t stream = 0,
                           bool debug_synchronous = false)
{
    return SortKeys(d_temp_storage,
                    temp_storage_bytes,
                    d_keys,
                    num_items,
                    num_segments,
                    d_begin_offsets,
                    d_end_offsets,
                    stream,
                    debug_synchronous);
}
/// \brief Stable variant of SortKeysDescending (contiguous array
/// interface).
///
/// Forwards directly to SortKeysDescending: the underlying radix sort
/// is inherently stable, so no extra work is needed.
template<typename KeyT, typename OffsetIteratorT>
HIPCUB_RUNTIME_FUNCTION static
cudaError_t StableSortKeysDescending(void * d_temp_storage,
                                     size_t& temp_storage_bytes,
                                     const KeyT * d_keys_in,
                                     KeyT * d_keys_out,
                                     int num_items,
                                     int num_segments,
                                     OffsetIteratorT d_begin_offsets,
                                     OffsetIteratorT d_end_offsets,
                                     cudaStream_t stream = 0,
                                     bool debug_synchronous = false)
{
    return SortKeysDescending(d_temp_storage,
                              temp_storage_bytes,
                              d_keys_in,
                              d_keys_out,
                              num_items,
                              num_segments,
                              d_begin_offsets,
                              d_end_offsets,
                              stream,
                              debug_synchronous);
}
/// \brief Stable variant of SortKeysDescending (DoubleBuffer interface).
///
/// Forwards directly to SortKeysDescending: the underlying radix sort
/// is inherently stable, so no extra work is needed.
template<typename KeyT, typename OffsetIteratorT>
HIPCUB_RUNTIME_FUNCTION static
cudaError_t StableSortKeysDescending(void * d_temp_storage,
                                     size_t& temp_storage_bytes,
                                     DoubleBuffer<KeyT>& d_keys,
                                     int num_items,
                                     int num_segments,
                                     OffsetIteratorT d_begin_offsets,
                                     OffsetIteratorT d_end_offsets,
                                     cudaStream_t stream = 0,
                                     bool debug_synchronous = false)
{
    return SortKeysDescending(d_temp_storage,
                              temp_storage_bytes,
                              d_keys,
                              num_items,
                              num_segments,
                              d_begin_offsets,
                              d_end_offsets,
                              stream,
                              debug_synchronous);
}
};
END_HIPCUB_NAMESPACE
#endif // HIPCUB_ROCPRIM_DEVICE_DEVICE_SEGMENTED_SORT_HPP_
/******************************************************************************
* Copyright (c) 2010-2011, Duane Merrill. All rights reserved.
* Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved.
* Modifications Copyright (c) 2017-2020, Advanced Micro Devices, Inc. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of the NVIDIA CORPORATION nor the
* names of its contributors may be used to endorse or promote products
* derived from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
* DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
******************************************************************************/
#ifndef HIPCUB_ROCPRIM_DEVICE_DEVICE_SELECT_HPP_
#define HIPCUB_ROCPRIM_DEVICE_DEVICE_SELECT_HPP_
#include "../config.hpp"
#include "../thread/thread_operators.cuh"
#include <cub/rocprim/device/device_select.hpp>
BEGIN_HIPCUB_NAMESPACE
/// \brief Device-wide selection and compaction operations, implemented
/// as thin wrappers over rocPRIM's select / unique algorithms.
///
/// All methods follow the CUB two-phase convention: when d_temp_storage
/// is null, only temp_storage_bytes is computed; otherwise the
/// algorithm runs on the given stream.
class DeviceSelect
{
public:
    /// \brief Compacts items from d_in into d_out wherever the
    /// corresponding flag in d_flags is set; the count of selected
    /// items is written through d_num_selected_out.
    template <
        typename InputIteratorT,
        typename FlagIterator,
        typename OutputIteratorT,
        typename NumSelectedIteratorT
    >
    HIPCUB_RUNTIME_FUNCTION static
    cudaError_t Flagged(void *d_temp_storage,
                        size_t &temp_storage_bytes,
                        InputIteratorT d_in,
                        FlagIterator d_flags,
                        OutputIteratorT d_out,
                        NumSelectedIteratorT d_num_selected_out,
                        int num_items,
                        cudaStream_t stream = 0,
                        bool debug_synchronous = false)
    {
        return static_cast<cudaError_t>(
            ::rocprim::select(
                d_temp_storage, temp_storage_bytes,
                d_in, d_flags, d_out,
                d_num_selected_out, num_items,
                stream, debug_synchronous));
    }

    /// \brief Compacts items from d_in into d_out wherever
    /// select_op(item) is true; the count of selected items is written
    /// through d_num_selected_out.
    template <
        typename InputIteratorT,
        typename OutputIteratorT,
        typename NumSelectedIteratorT,
        typename SelectOp
    >
    HIPCUB_RUNTIME_FUNCTION static
    cudaError_t If(void *d_temp_storage,
                   size_t &temp_storage_bytes,
                   InputIteratorT d_in,
                   OutputIteratorT d_out,
                   NumSelectedIteratorT d_num_selected_out,
                   int num_items,
                   SelectOp select_op,
                   cudaStream_t stream = 0,
                   bool debug_synchronous = false)
    {
        return static_cast<cudaError_t>(
            ::rocprim::select(
                d_temp_storage, temp_storage_bytes,
                d_in, d_out,
                d_num_selected_out, num_items,
                select_op,
                stream, debug_synchronous));
    }

    /// \brief Copies one representative of each run of consecutive
    /// equal items (compared with cub::Equality) from d_in to d_out;
    /// the count of kept items is written through d_num_selected_out.
    template <
        typename InputIteratorT,
        typename OutputIteratorT,
        typename NumSelectedIteratorT
    >
    HIPCUB_RUNTIME_FUNCTION static
    cudaError_t Unique(void *d_temp_storage,
                       size_t &temp_storage_bytes,
                       InputIteratorT d_in,
                       OutputIteratorT d_out,
                       NumSelectedIteratorT d_num_selected_out,
                       int num_items,
                       cudaStream_t stream = 0,
                       bool debug_synchronous = false)
    {
        return static_cast<cudaError_t>(
            ::rocprim::unique(
                d_temp_storage, temp_storage_bytes,
                d_in, d_out,
                d_num_selected_out, num_items,
                cub::Equality(),
                stream, debug_synchronous));
    }

    /// \brief Key-value variant of Unique: keeps one key-value pair per
    /// run of consecutive equal keys (compared with cub::Equality); the
    /// count of kept pairs is written through d_num_selected_out.
    template <
        typename KeyIteratorT,
        typename ValueIteratorT,
        typename OutputKeyIteratorT,
        typename OutputValueIteratorT,
        typename NumSelectedIteratorT
    >
    HIPCUB_RUNTIME_FUNCTION static
    cudaError_t UniqueByKey(void *d_temp_storage,
                            size_t &temp_storage_bytes,
                            KeyIteratorT d_keys_input,
                            ValueIteratorT d_values_input,
                            OutputKeyIteratorT d_keys_output,
                            OutputValueIteratorT d_values_output,
                            NumSelectedIteratorT d_num_selected_out,
                            int num_items,
                            cudaStream_t stream = 0,
                            bool debug_synchronous = false)
    {
        return static_cast<cudaError_t>(
            ::rocprim::unique_by_key(
                d_temp_storage, temp_storage_bytes,
                d_keys_input, d_values_input,
                d_keys_output, d_values_output,
                d_num_selected_out, num_items,
                cub::Equality(),
                stream, debug_synchronous));
    }
};
END_HIPCUB_NAMESPACE
#endif // HIPCUB_ROCPRIM_DEVICE_DEVICE_SELECT_HPP_
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment