template <
    typename ValueT,                    ///< Matrix and vector value type
    typename OffsetT>                   ///< Signed integer type for sequence offsets
struct SpmvParams
{
    ValueT*     d_values;               ///< Pointer to the array of \p num_nonzeros values of the corresponding nonzero elements of matrix <b>A</b>.
    OffsetT*    d_row_end_offsets;      ///< Pointer to the array of \p num_rows offsets demarcating the end of every row in \p d_column_indices and \p d_values
    OffsetT*    d_column_indices;       ///< Pointer to the array of \p num_nonzeros column-indices of the corresponding nonzero elements of matrix <b>A</b>. (Indices are zero-based.)
    ValueT*     d_vector_x;             ///< Pointer to the array of \p num_cols values corresponding to the dense input vector <em>x</em>
    ValueT*     d_vector_y;             ///< Pointer to the array of \p num_rows values corresponding to the dense output vector <em>y</em>
    int         num_rows;               ///< Number of rows of matrix <b>A</b>.
    int         num_cols;               ///< Number of columns of matrix <b>A</b>.
    int         num_nonzeros;           ///< Number of nonzero elements of matrix <b>A</b>.
    ValueT      alpha;                  ///< Alpha multiplicand
    ValueT      beta;                   ///< Beta addend-multiplicand
};
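// Illustrative sketch (not part of the library source): how the fields above
// map onto the CSR representation of the 3x4 matrix
//
//     A = [ 1 0 2 0 ]
//         [ 0 0 3 0 ]
//         [ 4 5 0 6 ]
//
// where d_values, d_row_offsets, d_column_indices, d_x, and d_y are hypothetical
// device copies of the arrays shown.
//
//     float values[6]         = { 1, 2, 3, 4, 5, 6 };
//     int   column_indices[6] = { 0, 2, 2, 0, 1, 3 };    // zero-based
//     int   row_offsets[4]    = { 0, 2, 3, 6 };          // num_rows + 1 entries
//
//     SpmvParams<float, int> params;
//     params.d_values          = d_values;
//     params.d_row_end_offsets = d_row_offsets + 1;       // row ends: {2, 3, 6}
//     params.d_column_indices  = d_column_indices;
//     params.d_vector_x        = d_x;                     // num_cols = 4 entries
//     params.d_vector_y        = d_y;                     // num_rows = 3 entries
//     params.num_rows          = 3;
//     params.num_cols          = 4;
//     params.num_nonzeros      = 6;
//     params.alpha             = 1.0f;                    // plain y = A*x
//     params.beta              = 0.0f;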
void* d_temp_storage,               ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
size_t& temp_storage_bytes,         ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
ValueT* d_values,                   ///< [in] Pointer to the array of \p num_nonzeros values of the corresponding nonzero elements of matrix <b>A</b>.
int* d_row_offsets,                 ///< [in] Pointer to the array of \p num_rows + 1 offsets demarcating the start of every row in \p d_column_indices and \p d_values (with the final entry being equal to \p num_nonzeros)
int* d_column_indices,              ///< [in] Pointer to the array of \p num_nonzeros column-indices of the corresponding nonzero elements of matrix <b>A</b>. (Indices are zero-based.)
ValueT* d_vector_x,                 ///< [in] Pointer to the array of \p num_cols values corresponding to the dense input vector <em>x</em>
ValueT* d_vector_y,                 ///< [out] Pointer to the array of \p num_rows values corresponding to the dense output vector <em>y</em>
int num_rows,                       ///< [in] Number of rows of matrix <b>A</b>.
int num_cols,                       ///< [in] Number of columns of matrix <b>A</b>.
int num_nonzeros,                   ///< [in] Number of nonzero elements of matrix <b>A</b>.
cudaStream_t stream = 0,            ///< [in] <b>[optional]</b> CUDA stream to launch kernels within. Default is stream<sub>0</sub>.
bool debug_synchronous = false)     ///< [in] <b>[optional]</b> Whether or not to synchronize the stream after every kernel launch to check for errors. May cause significant slowdown. Default is \p false.
{
SpmvParams<ValueT, int> spmv_params;
spmv_params.d_values = d_values;
spmv_params.d_row_end_offsets = d_row_offsets + 1;
spmv_params.d_column_indices = d_column_indices;
spmv_params.d_vector_x = d_vector_x;
spmv_params.d_vector_y = d_vector_y;
spmv_params.num_rows = num_rows;
spmv_params.num_cols = num_cols;
spmv_params.num_nonzeros = num_nonzeros;
spmv_params.alpha = 1.0;
spmv_params.beta = 0.0;
cudaError_t status;
if (d_temp_storage == nullptr)
{
// Make sure user won't try to allocate 0 bytes memory, because
// hipMalloc will return nullptr when size is zero.
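// Editorial sketch (not part of the library source): the standard two-phase
// temp-storage idiom this branch supports, assuming the enclosing function is a
// CsrMV-style entry point with the parameter list above. All names below are
// illustrative.
//
//     void*  d_temp_storage = NULL;
//     size_t temp_storage_bytes = 0;
//
//     // First call with d_temp_storage == NULL only reports the required size
//     CsrMV(d_temp_storage, temp_storage_bytes, d_values, d_row_offsets,
//           d_column_indices, d_x, d_y, num_rows, num_cols, num_nonzeros);
//
//     // Allocate that many bytes, then call again to perform y = A*x
//     cudaMalloc(&d_temp_storage, temp_storage_bytes);
//     CsrMV(d_temp_storage, temp_storage_bytes, d_values, d_row_offsets,
//           d_column_indices, d_x, d_y, num_rows, num_cols, num_nonzeros);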
/**
* \brief cub::GridMappingStrategy enumerates alternative strategies for mapping constant-sized tiles of device-wide data onto a grid of CUDA thread blocks.
*/
enum GridMappingStrategy
{
/**
* \brief An a "raking" access pattern in which each thread block is
* assigned a consecutive sequence of input tiles
*
* \par Overview
* The input is evenly partitioned into \p p segments, where \p p is
* constant and corresponds loosely to the number of thread blocks that may
* actively reside on the target device. Each segment is comprised of
* consecutive tiles, where a tile is a small, constant-sized unit of input
* to be processed to completion before the thread block terminates or
* obtains more work. The kernel invokes \p p thread blocks, each
* of which iteratively consumes a segment of <em>n</em>/<em>p</em> elements
* in tile-size increments.
*/
GRID_MAPPING_RAKE,
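/*
 * Editorial sketch (not part of the library source): the per-block consumption
 * loop that a raking mapping implies. TILE_ITEMS, num_items, and ConsumeTile()
 * are illustrative names, not library entities.
 *
 *     // Each block rakes a contiguous segment of roughly n/p items
 *     OffsetT items_per_block = (num_items + gridDim.x - 1) / gridDim.x;
 *     OffsetT block_begin     = blockIdx.x * items_per_block;
 *     OffsetT block_end       = min(block_begin + items_per_block, (OffsetT) num_items);
 *     for (OffsetT tile = block_begin; tile < block_end; tile += TILE_ITEMS)
 *         ConsumeTile(tile);      // process one constant-sized tile to completion
 */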
/**
* \brief An a "strip mining" access pattern in which the input tiles assigned
* to each thread block are separated by a stride equal to the the extent of
* the grid.
*
* \par Overview
* The input is evenly partitioned into \p p sets, where \p p is
* constant and corresponds loosely to the number of thread blocks that may
* actively reside on the target device. Each set is comprised of
* data tiles separated by a stride of \p p tiles, where a tile is a small,
* constant-sized unit of input to be processed to completion before the
* thread block terminates or obtains more work. The kernel invokes \p p
* thread blocks, each of which iteratively consumes a set of
* <em>n</em>/<em>p</em> elements in tile-size increments.
*/
GRID_MAPPING_STRIP_MINE,
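/*
 * Editorial sketch (not part of the library source): the per-block consumption
 * loop that a strip-mined mapping implies, i.e. a grid-stride loop over tiles.
 * TILE_ITEMS, num_items, and ConsumeTile() are illustrative names.
 *
 *     // Tiles assigned to a block are separated by the extent of the grid
 *     for (OffsetT tile = blockIdx.x * TILE_ITEMS;
 *          tile < num_items;
 *          tile += gridDim.x * TILE_ITEMS)
 *         ConsumeTile(tile);      // process one constant-sized tile to completion
 */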
/**
* \brief A dynamic "queue-based" strategy for assigning input tiles to thread blocks.
*
* \par Overview
* The input is treated as a queue to be dynamically consumed by a grid of
* thread blocks. Work is atomically dequeued in tiles, where a tile is a
* unit of input to be processed to completion before the thread block
* terminates or obtains more work. The grid size \p p is constant,
* loosely corresponding to the number of thread blocks that may actively
* reside on the target device.
*/
GRID_MAPPING_DYNAMIC,
};


/**
* \brief GridQueue is a descriptor utility for dynamic queue management.
*
* \par Overview
* GridQueue descriptors provide abstractions for "filling" or
* "draining" globally-shared vectors.
*
* \par
* A "filling" GridQueue works by atomically-adding to a zero-initialized counter,
* returning a unique offset for the calling thread to write its items.
* The GridQueue maintains the total "fill-size". The fill counter must be reset
* using GridQueue::ResetFill by the host or kernel instance prior to the kernel instance that
* will be filling.
*
* \par
* Similarly, a "draining" GridQueue works by works by atomically-incrementing a
* zero-initialized counter, returning a unique offset for the calling thread to
* read its items. Threads can safely drain until the array's logical fill-size is
* exceeded. The drain counter must be reset using GridQueue::ResetDrain or
* GridQueue::FillAndResetDrain by the host or kernel instance prior to the kernel instance that
* will be draining. (For dynamic work distribution of existing data, the corresponding fill-size
* is simply the number of elements in the array.)
*
* \par
* Iterative work management can be implemented simply with a pair of flip-flopping
* work buffers, each with an associated set of fill and drain GridQueue descriptors.
*
* \tparam OffsetT Signed integer type for global offsets
*/
template <typename OffsetT>
class GridQueue
{
private:
/// Counter indices
enum
{
FILL = 0,
DRAIN = 1,
};
/// Pair of counters
OffsetT* d_counters;
public:
/// Returns the device allocation size in bytes needed to construct a GridQueue instance
__host__ __device__ __forceinline__
static size_t AllocationSize()
{
    return sizeof(OffsetT) * 2;
}
/// Constructs an invalid GridQueue descriptor
__host__ __device__ __forceinline__ GridQueue()
:
d_counters(NULL)
{}
/// Constructs a GridQueue descriptor around the device storage allocation
__host__ __device__ __forceinline__ GridQueue(
    void* d_storage)    ///< Device allocation to back the GridQueue. Must be at least as big as <tt>AllocationSize()</tt>.
:
d_counters((OffsetT*)d_storage)
{}
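/*
 * Editorial sketch (not part of the library source): backing a GridQueue with
 * device storage from the host, using only the members shown above.
 * d_queue_storage is an illustrative name.
 *
 *     void* d_queue_storage = NULL;
 *     cudaMalloc(&d_queue_storage, GridQueue<int>::AllocationSize());
 *     GridQueue<int> queue(d_queue_storage);      // wraps the pair of counters
 */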
/// This operation sets the fill-size and resets the drain counter, preparing the GridQueue for draining in the next kernel instance. To be called by the host or by a kernel prior to that which will be draining.
/// This operation resets the drain so that it may advance to meet the existing fill-size. To be called by the host or by a kernel prior to that which will be draining.
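/*
 * Editorial sketch (not part of the library source) of the draining pattern the
 * comments above describe. It assumes a device-side member Drain(num_items) that
 * atomically advances the drain counter and returns the caller's read offset, per
 * the class overview; that member is not shown in this excerpt, and the kernel,
 * TILE_ITEMS, and fill_size are illustrative.
 *
 *     __global__ void DrainKernel(GridQueue<int> queue, int* d_in, int fill_size)
 *     {
 *         const int TILE_ITEMS = 128;
 *         __shared__ int tile_offset;
 *         while (true)
 *         {
 *             if (threadIdx.x == 0)
 *                 tile_offset = queue.Drain(TILE_ITEMS);   // atomically claim a tile
 *             __syncthreads();
 *             if (tile_offset >= fill_size)
 *                 break;                                   // queue exhausted
 *             int tile_items = min(TILE_ITEMS, fill_size - tile_offset);
 *             // ... process d_in[tile_offset .. tile_offset + tile_items) ...
 *             __syncthreads();
 *         }
 *     }
 */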