* \brief BlockRadixRank provides operations for ranking unsigned integer types within a CUDA thread block.
* \ingroup BlockModule
*
* \tparam BLOCK_DIM_X The thread block length in threads along the X dimension
* \tparam RADIX_BITS The number of radix bits per digit place
* \tparam IS_DESCENDING Whether or not the sorted-order is high-to-low
* \tparam MEMOIZE_OUTER_SCAN <b>[optional]</b> Whether or not to buffer outer raking scan partials to incur fewer shared memory reads at the expense of higher register pressure (default: true for architectures SM35 and newer, false otherwise). See BlockScanAlgorithm::BLOCK_SCAN_RAKING_MEMOIZE for more details.
* \tparam INNER_SCAN_ALGORITHM <b>[optional]</b> The cub::BlockScanAlgorithm algorithm to use (default: cub::BLOCK_SCAN_WARP_SCANS)
* \brief Rank keys. For the lower \p RADIX_DIGITS threads, the exclusive prefix sums of the digits tracked by the corresponding thread are also provided.
*/
template<
typenameUnsignedBits,
intKEYS_PER_THREAD,
typenameDigitExtractorT>
HIPCUB_DEVICEinlinevoidRankKeys(
UnsignedBits(&keys)[KEYS_PER_THREAD],///< [in] Keys for this tile
int(&ranks)[KEYS_PER_THREAD],///< [out] For each key, the local rank within the tile (out parameter)
DigitExtractorTdigit_extractor,///< [in] The digit extractor
int(&exclusive_digit_prefix)[BINS_TRACKED_PER_THREAD])///< [out] The exclusive prefix sum for the digits [(threadIdx.x * BINS_TRACKED_PER_THREAD) ... (threadIdx.x * BINS_TRACKED_PER_THREAD) + BINS_TRACKED_PER_THREAD - 1]
{
// Rank keys
RankKeys(keys,ranks,digit_extractor);
// Get the inclusive and exclusive digit totals corresponding to the calling thread.
// Seed ranks with counter values from previous warps
#pragma unroll
for(intITEM=0;ITEM<KEYS_PER_THREAD;++ITEM)
ranks[ITEM]+=*digit_counters[ITEM];
}
/**
* \brief Rank keys. For the lower \p RADIX_DIGITS threads, the exclusive prefix sums of the digits tracked by the corresponding thread are also provided.
*/
template<
typenameUnsignedBits,
intKEYS_PER_THREAD,
typenameDigitExtractorT>
__device____forceinline__voidRankKeys(
UnsignedBits(&keys)[KEYS_PER_THREAD],///< [in] Keys for this tile
int(&ranks)[KEYS_PER_THREAD],///< [out] For each key, the local rank within the tile (out parameter)
DigitExtractorTdigit_extractor,///< [in] The digit extractor
int(&exclusive_digit_prefix)[BINS_TRACKED_PER_THREAD])///< [out] The exclusive prefix sum for the digits [(threadIdx.x * BINS_TRACKED_PER_THREAD) ... (threadIdx.x * BINS_TRACKED_PER_THREAD) + BINS_TRACKED_PER_THREAD - 1]
/// Never use a raking thread that will have no valid data (e.g., when BLOCK_THREADS is 62 and SEGMENT_LENGTH is 2, we should only use 31 raking threads)