Commit 0211193c authored by zhuwenwen

initial llama
/******************************************************************************
* Copyright (c) 2010-2011, Duane Merrill. All rights reserved.
* Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved.
* Modifications Copyright (c) 2017-2021, Advanced Micro Devices, Inc. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of the NVIDIA CORPORATION nor the
* names of its contributors may be used to endorse or promote products
* derived from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
* DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
******************************************************************************/
#ifndef HIPCUB_ROCPRIM_DEVICE_DEVICE_SPMV_HPP_
#define HIPCUB_ROCPRIM_DEVICE_DEVICE_SPMV_HPP_
#include "../config.hpp"
#include "../iterator/tex_ref_input_iterator.cuh"
BEGIN_HIPCUB_NAMESPACE
class DeviceSpmv
{
public:
template <
typename ValueT, ///< Matrix and vector value type
typename OffsetT> ///< Signed integer type for sequence offsets
struct SpmvParams
{
ValueT* d_values; ///< Pointer to the array of \p num_nonzeros values of the corresponding nonzero elements of matrix <b>A</b>.
OffsetT* d_row_end_offsets; ///< Pointer to the array of \p m offsets demarcating the end of every row in \p d_column_indices and \p d_values
OffsetT* d_column_indices; ///< Pointer to the array of \p num_nonzeros column-indices of the corresponding nonzero elements of matrix <b>A</b>. (Indices are zero-based.)
ValueT* d_vector_x; ///< Pointer to the array of \p num_cols values corresponding to the dense input vector <em>x</em>
ValueT* d_vector_y; ///< Pointer to the array of \p num_rows values corresponding to the dense output vector <em>y</em>
int num_rows; ///< Number of rows of matrix <b>A</b>.
int num_cols; ///< Number of columns of matrix <b>A</b>.
int num_nonzeros; ///< Number of nonzero elements of matrix <b>A</b>.
ValueT alpha; ///< Alpha multiplicand
ValueT beta; ///< Beta addend-multiplicand
TexRefInputIterator<ValueT, 66778899, OffsetT> t_vector_x; ///< Texture iterator wrapping the dense input vector <em>x</em>
};
static constexpr uint32_t CsrMVKernel_MaxThreads = 256;
template <typename ValueT>
static __global__ void
CsrMVKernel(SpmvParams<ValueT, int> spmv_params)
{
__shared__ ValueT partial;
const int32_t row_id = hipBlockIdx_x;
if(threadIdx.x == 0)
{
partial = spmv_params.beta * spmv_params.d_vector_y[row_id];
}
__syncthreads();
// Offsets of this row's first nonzero and one-past its last nonzero
const int32_t row_offset = (row_id == 0) ? 0 : spmv_params.d_row_end_offsets[row_id - 1];
const int32_t row_end = spmv_params.d_row_end_offsets[row_id];
// Each thread strides over the row's nonzeros, accumulating the scaled
// products into the block-shared partial sum
for(int32_t offset = row_offset + threadIdx.x; offset < row_end; offset += blockDim.x)
{
ValueT t_value =
spmv_params.alpha *
spmv_params.d_values[offset] *
spmv_params.d_vector_x[spmv_params.d_column_indices[offset]];
atomicAdd(&partial, t_value);
}
// All threads must finish accumulating before thread 0 writes the row result
__syncthreads();
if(threadIdx.x == 0)
{
spmv_params.d_vector_y[row_id] = partial;
}
}
template <typename ValueT>
HIPCUB_RUNTIME_FUNCTION
static cudaError_t CsrMV(
void* d_temp_storage, ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
size_t& temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
ValueT* d_values, ///< [in] Pointer to the array of \p num_nonzeros values of the corresponding nonzero elements of matrix <b>A</b>.
int* d_row_offsets, ///< [in] Pointer to the array of \p m + 1 offsets demarcating the start of every row in \p d_column_indices and \p d_values (with the final entry being equal to \p num_nonzeros)
int* d_column_indices, ///< [in] Pointer to the array of \p num_nonzeros column-indices of the corresponding nonzero elements of matrix <b>A</b>. (Indices are zero-based.)
ValueT* d_vector_x, ///< [in] Pointer to the array of \p num_cols values corresponding to the dense input vector <em>x</em>
ValueT* d_vector_y, ///< [out] Pointer to the array of \p num_rows values corresponding to the dense output vector <em>y</em>
int num_rows, ///< [in] number of rows of matrix <b>A</b>.
int num_cols, ///< [in] number of columns of matrix <b>A</b>.
int num_nonzeros, ///< [in] number of nonzero elements of matrix <b>A</b>.
cudaStream_t stream = 0, ///< [in] <b>[optional]</b> hip stream to launch kernels within. Default is stream<sub>0</sub>.
bool debug_synchronous = false) ///< [in] <b>[optional]</b> Whether or not to synchronize the stream after every kernel launch to check for errors. May cause significant slowdown. Default is \p false.
{
SpmvParams<ValueT, int> spmv_params;
spmv_params.d_values = d_values;
spmv_params.d_row_end_offsets = d_row_offsets + 1;
spmv_params.d_column_indices = d_column_indices;
spmv_params.d_vector_x = d_vector_x;
spmv_params.d_vector_y = d_vector_y;
spmv_params.num_rows = num_rows;
spmv_params.num_cols = num_cols;
spmv_params.num_nonzeros = num_nonzeros;
spmv_params.alpha = 1.0;
spmv_params.beta = 0.0;
cudaError_t status;
if(d_temp_storage == nullptr)
{
// Make sure user won't try to allocate 0 bytes memory, because
// hipMalloc will return nullptr when size is zero.
temp_storage_bytes = 4;
return cudaSuccess;
}
else
{
size_t block_size = min(static_cast<uint32_t>(num_cols), DeviceSpmv::CsrMVKernel_MaxThreads);
size_t grid_size = num_rows;
CsrMVKernel<<<grid_size, block_size, 0, stream>>>(spmv_params);
status = cudaGetLastError();
}
return status;
}
};
END_HIPCUB_NAMESPACE
#endif // HIPCUB_ROCPRIM_DEVICE_DEVICE_SPMV_HPP_
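// ---------------------------------------------------------------------------
// Usage sketch (illustrative annotation, not part of the committed sources):
// CsrMV follows the usual CUB two-phase temporary-storage idiom -- a first
// call with a null d_temp_storage only reports the required allocation size,
// and a second call launches the kernel. The d_* buffers below are
// hypothetical, assumed device-resident and already populated; alpha = 1 and
// beta = 0 are hard-coded by this backend.
//
//   size_t temp_storage_bytes = 0;
//   void* d_temp_storage = nullptr;
//   hipcub::DeviceSpmv::CsrMV(d_temp_storage, temp_storage_bytes,
//                             d_values, d_row_offsets, d_column_indices,
//                             d_x, d_y, num_rows, num_cols, num_nonzeros);
//   cudaMalloc(&d_temp_storage, temp_storage_bytes);
//   hipcub::DeviceSpmv::CsrMV(d_temp_storage, temp_storage_bytes,
//                             d_values, d_row_offsets, d_column_indices,
//                             d_x, d_y, num_rows, num_cols, num_nonzeros);
//   cudaFree(d_temp_storage);
// ---------------------------------------------------------------------------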
/******************************************************************************
* Copyright (c) 2011, Duane Merrill. All rights reserved.
* Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved.
* Modifications Copyright (c) 2021, Advanced Micro Devices, Inc. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of the NVIDIA CORPORATION nor the
* names of its contributors may be used to endorse or promote products
* derived from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
* DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
******************************************************************************/
#ifndef HIPCUB_ROCPRIM_GRID_GRID_BARRIER_HPP_
#define HIPCUB_ROCPRIM_GRID_GRID_BARRIER_HPP_
#include <type_traits>
#include "../config.hpp"
#include "../thread/thread_load.cuh"
BEGIN_HIPCUB_NAMESPACE
/**
* \addtogroup GridModule
* @{
*/
/**
* \brief GridBarrier implements a software global barrier among thread blocks within a CUDA grid
*/
class GridBarrier
{
protected :
typedef unsigned int SyncFlag;
// Counters in global device memory
SyncFlag* d_sync;
public:
/**
* Constructor
*/
GridBarrier() : d_sync(NULL) {}
/**
 * Synchronize
 */
__device__ __forceinline__ void Sync() const
{
volatile SyncFlag *d_vol_sync = d_sync;
// Threadfence and syncthreads to make sure global writes are visible before
// thread-0 reports in with its sync counter
__threadfence();
__syncthreads();
if (blockIdx.x == 0)
{
// Report in ourselves
if (threadIdx.x == 0)
{
d_vol_sync[blockIdx.x] = 1;
}
__syncthreads();
// Wait for everyone else to report in
for (uint32_t peer_block = threadIdx.x; peer_block < gridDim.x; peer_block += blockDim.x)
{
while (ThreadLoad<LOAD_CG>(d_sync + peer_block) == 0)
{
__threadfence_block();
}
}
__syncthreads();
// Let everyone know it's safe to proceed
for (uint32_t peer_block = threadIdx.x; peer_block < gridDim.x; peer_block += blockDim.x)
{
d_vol_sync[peer_block] = 0;
}
}
else
{
if (threadIdx.x == 0)
{
// Report in
d_vol_sync[blockIdx.x] = 1;
// Wait for acknowledgment
while (ThreadLoad<LOAD_CG>(d_sync + blockIdx.x) == 1)
{
__threadfence_block();
}
}
__syncthreads();
}
}
};
/**
* \brief GridBarrierLifetime extends GridBarrier to provide lifetime management of the temporary device storage needed for cooperation.
*
* Uses RAII for lifetime, i.e., device resources are reclaimed when
* the destructor is called.
*/
class GridBarrierLifetime : public GridBarrier
{
protected:
// Number of bytes backed by d_sync
size_t sync_bytes;
public:
/**
* Constructor
*/
GridBarrierLifetime() : GridBarrier(), sync_bytes(0) {}
/**
* DeviceFrees and resets the progress counters
*/
cudaError_t HostReset()
{
cudaError_t retval = cudaSuccess;
if (d_sync)
{
retval = cudaFree(d_sync);
d_sync = NULL;
}
sync_bytes = 0;
return retval;
}
/**
* Destructor
*/
virtual ~GridBarrierLifetime()
{
HostReset();
}
/**
* Sets up the progress counters for the next kernel launch (lazily
* allocating and initializing them if necessary)
*/
cudaError_t Setup(int sweep_grid_size)
{
cudaError_t retval = cudaSuccess;
do {
size_t new_sync_bytes = sweep_grid_size * sizeof(SyncFlag);
if (new_sync_bytes > sync_bytes)
{
if (d_sync)
{
if ((retval = cudaFree(d_sync))) break;
}
sync_bytes = new_sync_bytes;
// Allocate and initialize to zero
if ((retval = cudaMalloc((void**) &d_sync, sync_bytes))) break;
if ((retval = cudaMemset(d_sync, 0, new_sync_bytes))) break;
}
} while (0);
return retval;
}
};
END_HIPCUB_NAMESPACE
#endif // HIPCUB_ROCPRIM_GRID_GRID_BARRIER_HPP_
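// ---------------------------------------------------------------------------
// Usage sketch (illustrative annotation, not part of the committed sources):
// a GridBarrierLifetime is sized for the launch on the host, then every block
// of the kernel can rendezvous at Sync(). This is only safe when all blocks
// of the grid are co-resident on the device. Kernel and variable names are
// hypothetical.
//
//   __global__ void TwoPhaseKernel(hipcub::GridBarrier barrier)
//   {
//       // ... phase 1: write results visible to other blocks ...
//       barrier.Sync();  // every block waits here
//       // ... phase 2: read results produced by other blocks ...
//   }
//
//   hipcub::GridBarrierLifetime barrier;
//   barrier.Setup(grid_size);  // lazily allocates and zeroes the sync flags
//   TwoPhaseKernel<<<grid_size, block_size>>>(barrier);
// ---------------------------------------------------------------------------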
/******************************************************************************
* Copyright (c) 2011, Duane Merrill. All rights reserved.
* Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved.
* Modifications Copyright (c) 2021, Advanced Micro Devices, Inc. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of the NVIDIA CORPORATION nor the
* names of its contributors may be used to endorse or promote products
* derived from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
* DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
******************************************************************************/
#ifndef HIPCUB_ROCPRIM_GRID_GRID_EVEN_SHARE_HPP_
#define HIPCUB_ROCPRIM_GRID_GRID_EVEN_SHARE_HPP_
#include <type_traits>
#include "../config.hpp"
#include "grid_mapping.cuh"
#include "../util_type.cuh"
BEGIN_HIPCUB_NAMESPACE
/**
* \addtogroup GridModule
* @{
*/
/**
* \brief GridEvenShare is a descriptor utility for distributing input among
* CUDA thread blocks in an "even-share" fashion. Each thread block gets roughly
* the same number of input tiles.
*
* \par Overview
* Each thread block is assigned a consecutive sequence of input tiles. To help
* preserve alignment and eliminate the overhead of guarded loads for all but the
* last thread block, GridEvenShare assigns one of three different amounts of
* work to a given thread block: "big", "normal", or "last". The "big" workloads
* are one scheduling grain larger than "normal". The "last" work unit for the
* last thread block may be partially-full if the input is not an even multiple of
* the scheduling grain size.
*
* \par
* Before invoking a child grid, a parent thread will typically construct an
* instance of GridEvenShare. The instance can be passed to child thread blocks
* which can initialize their per-thread block offsets using \p BlockInit().
*/
template <typename OffsetT>
struct GridEvenShare
{
private:
int total_tiles;
int big_shares;
OffsetT big_share_items;
OffsetT normal_share_items;
OffsetT normal_base_offset;
public:
/// Total number of input items
OffsetT num_items;
/// Grid size in thread blocks
int grid_size;
/// OffsetT into input marking the beginning of the owning thread block's segment of input tiles
OffsetT block_offset;
/// OffsetT into input marking the end (one-past) of the owning thread block's segment of input tiles
OffsetT block_end;
/// Stride between input tiles
OffsetT block_stride;
/**
* \brief Constructor.
*/
__host__ __device__ __forceinline__ GridEvenShare() :
total_tiles(0),
big_shares(0),
big_share_items(0),
normal_share_items(0),
normal_base_offset(0),
num_items(0),
grid_size(0),
block_offset(0),
block_end(0),
block_stride(0)
{}
/**
* \brief Dispatch initializer. To be called prior to kernel launch.
*/
__host__ __device__ __forceinline__ void DispatchInit(
OffsetT num_items_, ///< Total number of input items
int max_grid_size, ///< Maximum grid size allowable (actual grid size may be less if not warranted by the number of input items)
int tile_items) ///< Number of data items per input tile
{
this->block_offset = num_items_; // Initialize past-the-end
this->block_end = num_items_; // Initialize past-the-end
this->num_items = num_items_;
this->total_tiles = static_cast<int>(cub::DivideAndRoundUp(num_items_, tile_items));
this->grid_size = min(total_tiles, max_grid_size);
int avg_tiles_per_block = total_tiles / grid_size;
// leftover grains go to big blocks:
this->big_shares = total_tiles - (avg_tiles_per_block * grid_size);
this->normal_share_items = avg_tiles_per_block * tile_items;
this->normal_base_offset = big_shares * tile_items;
this->big_share_items = normal_share_items + tile_items;
}
/**
* \brief Initializes ranges for the specified thread block index. Specialized
* for a "raking" access pattern in which each thread block is assigned a
* consecutive sequence of input tiles.
*/
template <int TILE_ITEMS>
__device__ __forceinline__ void BlockInit(
int block_id,
Int2Type<GRID_MAPPING_RAKE> /*strategy_tag*/)
{
block_stride = TILE_ITEMS;
if (block_id < big_shares)
{
// This thread block gets a big share of grains (avg_tiles_per_block + 1)
block_offset = (block_id * big_share_items);
block_end = block_offset + big_share_items;
}
else if (block_id < total_tiles)
{
// This thread block gets a normal share of grains (avg_tiles_per_block)
block_offset = normal_base_offset + (block_id * normal_share_items);
block_end = min(num_items, block_offset + normal_share_items);
}
// Else default past-the-end
}
/**
 * \brief Block-initialization, specialized for a "strip mining" access
 * pattern in which the input tiles assigned to each thread block are
 * separated by a stride equal to the extent of the grid.
 */
template <int TILE_ITEMS>
__device__ __forceinline__ void BlockInit(
int block_id,
Int2Type<GRID_MAPPING_STRIP_MINE> /*strategy_tag*/)
{
block_stride = grid_size * TILE_ITEMS;
block_offset = (block_id * TILE_ITEMS);
block_end = num_items;
}
/**
 * \brief Block-initialization, dispatching on the grid mapping strategy
 * given by \p STRATEGY.
 */
template <
int TILE_ITEMS,
GridMappingStrategy STRATEGY>
__device__ __forceinline__ void BlockInit()
{
BlockInit<TILE_ITEMS>(blockIdx.x, Int2Type<STRATEGY>());
}
/**
* \brief Block-initialization, specialized for a "raking" access
* pattern in which each thread block is assigned a consecutive sequence
* of input tiles.
*/
template <int TILE_ITEMS>
__device__ __forceinline__ void BlockInit(
OffsetT block_offset, ///< [in] Threadblock begin offset (inclusive)
OffsetT block_end) ///< [in] Threadblock end offset (exclusive)
{
this->block_offset = block_offset;
this->block_end = block_end;
this->block_stride = TILE_ITEMS;
}
};
/** @} */ // end group GridModule
END_HIPCUB_NAMESPACE
#endif // HIPCUB_ROCPRIM_GRID_GRID_EVEN_SHARE_HPP_
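// ---------------------------------------------------------------------------
// Usage sketch (illustrative annotation, not part of the committed sources):
// the host partitions the input once with DispatchInit(), passes the
// descriptor to the kernel by value, and each block recovers its tile range
// via BlockInit(). TILE_ITEMS, ConsumeTile and the kernel name are
// hypothetical.
//
//   static const int TILE_ITEMS = 512;
//
//   __global__ void ConsumeKernel(hipcub::GridEvenShare<int> even_share)
//   {
//       even_share.BlockInit<TILE_ITEMS, hipcub::GRID_MAPPING_RAKE>();
//       for(int offset = even_share.block_offset;
//           offset < even_share.block_end;
//           offset += even_share.block_stride)
//       {
//           ConsumeTile(offset);  // process one tile starting at offset
//       }
//   }
//
//   hipcub::GridEvenShare<int> even_share;
//   even_share.DispatchInit(num_items, max_grid_size, TILE_ITEMS);
//   ConsumeKernel<<<even_share.grid_size, block_size>>>(even_share);
// ---------------------------------------------------------------------------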
/******************************************************************************
* Copyright (c) 2011, Duane Merrill. All rights reserved.
* Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of the NVIDIA CORPORATION nor the
* names of its contributors may be used to endorse or promote products
* derived from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
* DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
******************************************************************************/
#ifndef HIPCUB_ROCPRIM_GRID_GRID_MAPPING_HPP_
#define HIPCUB_ROCPRIM_GRID_GRID_MAPPING_HPP_
#include <type_traits>
#include "../config.hpp"
#include "../thread/thread_load.cuh"
BEGIN_HIPCUB_NAMESPACE
/**
* \addtogroup GridModule
* @{
*/
/******************************************************************************
* Mapping policies
*****************************************************************************/
/**
* \brief GridMappingStrategy enumerates alternative strategies for mapping constant-sized tiles of device-wide data onto a grid of CUDA thread blocks.
*/
enum GridMappingStrategy
{
/**
* \brief An a "raking" access pattern in which each thread block is
* assigned a consecutive sequence of input tiles
*
* \par Overview
* The input is evenly partitioned into \p p segments, where \p p is
* constant and corresponds loosely to the number of thread blocks that may
* actively reside on the target device. Each segment is comprised of
* consecutive tiles, where a tile is a small, constant-sized unit of input
* to be processed to completion before the thread block terminates or
* obtains more work. The kernel invokes \p p thread blocks, each
* of which iteratively consumes a segment of <em>n</em>/<em>p</em> elements
* in tile-size increments.
*/
GRID_MAPPING_RAKE,
/**
* \brief An a "strip mining" access pattern in which the input tiles assigned
* to each thread block are separated by a stride equal to the the extent of
* the grid.
*
* \par Overview
* The input is evenly partitioned into \p p sets, where \p p is
* constant and corresponds loosely to the number of thread blocks that may
* actively reside on the target device. Each set is comprised of
* data tiles separated by a stride of \p p tiles, where a tile is a small,
* constant-sized unit of input to be processed to completion before the
* thread block terminates or obtains more work. The kernel invokes \p p
* thread blocks, each of which iteratively consumes a segment of
* <em>n</em>/<em>p</em> elements in tile-size increments.
*/
GRID_MAPPING_STRIP_MINE,
/**
* \brief A dynamic "queue-based" strategy for assigning input tiles to thread blocks.
*
* \par Overview
* The input is treated as a queue to be dynamically consumed by a grid of
* thread blocks. Work is atomically dequeued in tiles, where a tile is a
* unit of input to be processed to completion before the thread block
* terminates or obtains more work. The grid size \p p is constant,
* loosely corresponding to the number of thread blocks that may actively
* reside on the target device.
*/
GRID_MAPPING_DYNAMIC,
};
/** @} */ // end group GridModule
END_HIPCUB_NAMESPACE
#endif // HIPCUB_ROCPRIM_GRID_GRID_MAPPING_HPP_
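// ---------------------------------------------------------------------------
// Sketch of the two static mappings above, from one thread block's point of
// view (hypothetical helpers, tile-granular offsets):
//
//   // GRID_MAPPING_RAKE: block b consumes one consecutive segment of tiles
//   for(offset = segment_begin(b); offset < segment_end(b); offset += TILE_ITEMS)
//       consume_tile(offset);
//
//   // GRID_MAPPING_STRIP_MINE: block b's tiles are strided by the whole grid
//   for(offset = b * TILE_ITEMS; offset < num_items; offset += grid_size * TILE_ITEMS)
//       consume_tile(offset);
// ---------------------------------------------------------------------------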
/******************************************************************************
* Copyright (c) 2011, Duane Merrill. All rights reserved.
* Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved.
* Modifications Copyright (c) 2021, Advanced Micro Devices, Inc. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of the NVIDIA CORPORATION nor the
* names of its contributors may be used to endorse or promote products
* derived from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
* DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
******************************************************************************/
#ifndef HIPCUB_ROCPRIM_GRID_GRID_QUEUE_HPP_
#define HIPCUB_ROCPRIM_GRID_GRID_QUEUE_HPP_
#include <type_traits>
#include "../config.hpp"
BEGIN_HIPCUB_NAMESPACE
/**
* \addtogroup GridModule
* @{
*/
/**
* \brief GridQueue is a descriptor utility for dynamic queue management.
*
* \par Overview
* GridQueue descriptors provide abstractions for "filling" or
* "draining" globally-shared vectors.
*
* \par
* A "filling" GridQueue works by atomically-adding to a zero-initialized counter,
* returning a unique offset for the calling thread to write its items.
* The GridQueue maintains the total "fill-size". The fill counter must be reset
* using GridQueue::ResetFill by the host or kernel instance prior to the kernel instance that
* will be filling.
*
* \par
* Similarly, a "draining" GridQueue works by works by atomically-incrementing a
* zero-initialized counter, returning a unique offset for the calling thread to
* read its items. Threads can safely drain until the array's logical fill-size is
* exceeded. The drain counter must be reset using GridQueue::ResetDrain or
* GridQueue::FillAndResetDrain by the host or kernel instance prior to the kernel instance that
* will be draining. (For dynamic work distribution of existing data, the corresponding fill-size
* is simply the number of elements in the array.)
*
* \par
* Iterative work management can be implemented simply with a pair of flip-flopping
* work buffers, each with an associated set of fill and drain GridQueue descriptors.
*
* \tparam OffsetT Signed integer type for global offsets
*/
template <typename OffsetT>
class GridQueue
{
private:
/// Counter indices
enum
{
FILL = 0,
DRAIN = 1,
};
/// Pair of counters
OffsetT *d_counters;
public:
/// Returns the device allocation size in bytes needed to construct a GridQueue instance
__host__ __device__ __forceinline__
static size_t AllocationSize()
{
return sizeof(OffsetT) * 2;
}
/// Constructs an invalid GridQueue descriptor
__host__ __device__ __forceinline__ GridQueue()
:
d_counters(NULL)
{}
/// Constructs a GridQueue descriptor around the device storage allocation
__host__ __device__ __forceinline__ GridQueue(
void *d_storage) ///< Device allocation to back the GridQueue. Must be at least as big as <tt>AllocationSize()</tt>.
:
d_counters((OffsetT*) d_storage)
{}
/// This operation sets the fill-size and resets the drain counter, preparing the GridQueue for draining in the next kernel instance. To be called by the host or by a kernel prior to that which will be draining.
HIPCUB_DEVICE cudaError_t FillAndResetDrain(
OffsetT fill_size,
cudaStream_t stream = 0)
{
cudaError_t result = cudaErrorUnknown;
(void)stream;
d_counters[FILL] = fill_size;
d_counters[DRAIN] = 0;
result = cudaSuccess;
return result;
}
HIPCUB_HOST cudaError_t FillAndResetDrain(
OffsetT fill_size,
cudaStream_t stream = 0)
{
cudaError_t result = cudaErrorUnknown;
OffsetT counters[2];
counters[FILL] = fill_size;
counters[DRAIN] = 0;
result = CubDebug(cudaMemcpyAsync(d_counters, counters, sizeof(OffsetT) * 2, cudaMemcpyHostToDevice, stream));
return result;
}
/// This operation resets the drain so that it may advance to meet the existing fill-size. To be called by the host or by a kernel prior to that which will be draining.
HIPCUB_DEVICE cudaError_t ResetDrain(cudaStream_t stream = 0)
{
cudaError_t result = cudaErrorUnknown;
(void)stream;
d_counters[DRAIN] = 0;
result = cudaSuccess;
return result;
}
HIPCUB_HOST cudaError_t ResetDrain(cudaStream_t stream = 0)
{
cudaError_t result = cudaErrorUnknown;
result = CubDebug(cudaMemsetAsync(d_counters + DRAIN, 0, sizeof(OffsetT), stream));
return result;
}
/// This operation resets the fill counter. To be called by the host or by a kernel prior to that which will be filling.
HIPCUB_DEVICE cudaError_t ResetFill(cudaStream_t stream = 0)
{
cudaError_t result = cudaErrorUnknown;
(void)stream;
d_counters[FILL] = 0;
result = cudaSuccess;
return result;
}
HIPCUB_HOST cudaError_t ResetFill(cudaStream_t stream = 0)
{
cudaError_t result = cudaErrorUnknown;
result = CubDebug(cudaMemsetAsync(d_counters + FILL, 0, sizeof(OffsetT), stream));
return result;
}
/// Returns the fill-size established by the parent or by the previous kernel.
HIPCUB_DEVICE cudaError_t FillSize(
OffsetT &fill_size,
cudaStream_t stream = 0)
{
cudaError_t result = cudaErrorUnknown;
(void)stream;
fill_size = d_counters[FILL];
result = cudaSuccess;
return result;
}
HIPCUB_HOST cudaError_t FillSize(
OffsetT &fill_size,
cudaStream_t stream = 0)
{
cudaError_t result = cudaErrorUnknown;
result = CubDebug(cudaMemcpyAsync(&fill_size, d_counters + FILL, sizeof(OffsetT), cudaMemcpyDeviceToHost, stream));
return result;
}
/// Drain \p num_items from the queue. Returns offset from which to read items. To be called from a CUDA kernel.
HIPCUB_DEVICE OffsetT Drain(OffsetT num_items)
{
return atomicAdd(d_counters + DRAIN, num_items);
}
/// Fill \p num_items into the queue. Returns offset from which to write items. To be called from a CUDA kernel.
HIPCUB_DEVICE OffsetT Fill(OffsetT num_items)
{
return atomicAdd(d_counters + FILL, num_items);
}
};
#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document
/**
* Reset grid queue (call with 1 block of 1 thread)
*/
template <typename OffsetT>
__global__ void FillAndResetDrainKernel(
GridQueue<OffsetT> grid_queue,
OffsetT num_items)
{
grid_queue.FillAndResetDrain(num_items);
}
#endif // DOXYGEN_SHOULD_SKIP_THIS
/** @} */ // end group GridModule
END_HIPCUB_NAMESPACE
#endif // HIPCUB_ROCPRIM_GRID_GRID_QUEUE_HPP_
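// ---------------------------------------------------------------------------
// Usage sketch (illustrative annotation, not part of the committed sources):
// the host establishes the fill-size, then worker blocks dequeue tiles until
// the queue is exhausted. TILE_ITEMS and the kernel name are hypothetical.
//
//   __global__ void DrainKernel(hipcub::GridQueue<int> queue, int num_items)
//   {
//       __shared__ int block_offset;
//       while(true)
//       {
//           if(threadIdx.x == 0)
//           {
//               block_offset = queue.Drain(TILE_ITEMS);  // dequeue one tile
//           }
//           __syncthreads();
//           if(block_offset >= num_items) break;  // queue exhausted
//           // ... process items in [block_offset, min(block_offset + TILE_ITEMS, num_items)) ...
//           __syncthreads();  // keep block_offset stable until all threads are done
//       }
//   }
//
//   void* d_queue_storage;  // at least GridQueue<int>::AllocationSize() bytes
//   cudaMalloc(&d_queue_storage, hipcub::GridQueue<int>::AllocationSize());
//   hipcub::GridQueue<int> queue(d_queue_storage);
//   queue.FillAndResetDrain(num_items);  // host overload: async copy of both counters
//   DrainKernel<<<grid_size, block_size>>>(queue, num_items);
// ---------------------------------------------------------------------------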
/******************************************************************************
* Copyright (c) 2011, Duane Merrill. All rights reserved.
* Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved.
* Modifications Copyright (c) 2017-2020, Advanced Micro Devices, Inc. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of the NVIDIA CORPORATION nor the
* names of its contributors may be used to endorse or promote products
* derived from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
* DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
******************************************************************************/
#ifndef HIPCUB_ROCPRIM_ITERATOR_ARG_INDEX_INPUT_ITERATOR_HPP_
#define HIPCUB_ROCPRIM_ITERATOR_ARG_INDEX_INPUT_ITERATOR_HPP_
#include <iterator>
#include <iostream>
#include "../config.hpp"
#include <cub/rocprim/iterator/arg_index_iterator.hpp>
#if (THRUST_VERSION >= 100700)
// This iterator is compatible with Thrust API 1.7 and newer
#include <thrust/iterator/iterator_facade.h>
#include <thrust/iterator/iterator_traits.h>
#endif // THRUST_VERSION
BEGIN_HIPCUB_NAMESPACE
#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document
template<
typename InputIterator,
typename Difference = std::ptrdiff_t,
typename Value = typename std::iterator_traits<InputIterator>::value_type
>
using ArgIndexInputIterator = ::rocprim::arg_index_iterator<InputIterator, Difference, Value>;
#endif
END_HIPCUB_NAMESPACE
#endif // HIPCUB_ROCPRIM_ITERATOR_ARG_INDEX_INPUT_ITERATOR_HPP_
/******************************************************************************
* Copyright (c) 2010-2011, Duane Merrill. All rights reserved.
* Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved.
* Modifications Copyright (c) 2017-2020, Advanced Micro Devices, Inc. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of the NVIDIA CORPORATION nor the
* names of its contributors may be used to endorse or promote products
* derived from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
* DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
******************************************************************************/
#ifndef HIPCUB_ROCPRIM_ITERATOR_CACHE_MODIFIED_INPUT_ITERATOR_HPP_
#define HIPCUB_ROCPRIM_ITERATOR_CACHE_MODIFIED_INPUT_ITERATOR_HPP_
#include <iterator>
#include <iostream>
#include "../thread/thread_load.cuh"
#include "../util_type.cuh"
#if (THRUST_VERSION >= 100700)
// This iterator is compatible with Thrust API 1.7 and newer
#include <thrust/iterator/iterator_facade.h>
#include <thrust/iterator/iterator_traits.h>
#endif // THRUST_VERSION
BEGIN_HIPCUB_NAMESPACE
template <
CacheLoadModifier MODIFIER,
typename ValueType,
typename OffsetT = ptrdiff_t>
class CacheModifiedInputIterator
{
public:
// Required iterator traits
typedef CacheModifiedInputIterator self_type; ///< My own type
typedef OffsetT difference_type; ///< Type to express the result of subtracting one iterator from another
typedef ValueType value_type; ///< The type of the element the iterator can point to
typedef ValueType* pointer; ///< The type of a pointer to an element the iterator can point to
typedef ValueType reference; ///< The type of a reference to an element the iterator can point to
typedef std::random_access_iterator_tag iterator_category; ///< The iterator category
public:
/// Wrapped native pointer
ValueType* ptr;
/// Constructor
__host__ __device__ __forceinline__ CacheModifiedInputIterator(
ValueType* ptr) ///< Native pointer to wrap
:
ptr(const_cast<typename std::remove_cv<ValueType>::type *>(ptr))
{}
/// Postfix increment
__host__ __device__ __forceinline__ self_type operator++(int)
{
self_type retval = *this;
ptr++;
return retval;
}
/// Prefix increment
__host__ __device__ __forceinline__ self_type operator++()
{
ptr++;
return *this;
}
/// Indirection
__device__ __forceinline__ reference operator*() const
{
return ThreadLoad<MODIFIER>(ptr);
}
/// Addition
template <typename Distance>
__host__ __device__ __forceinline__ self_type operator+(Distance n) const
{
self_type retval(ptr + n);
return retval;
}
/// Addition assignment
template <typename Distance>
__host__ __device__ __forceinline__ self_type& operator+=(Distance n)
{
ptr += n;
return *this;
}
/// Subtraction
template <typename Distance>
__host__ __device__ __forceinline__ self_type operator-(Distance n) const
{
self_type retval(ptr - n);
return retval;
}
/// Subtraction assignment
template <typename Distance>
__host__ __device__ __forceinline__ self_type& operator-=(Distance n)
{
ptr -= n;
return *this;
}
/// Distance
__host__ __device__ __forceinline__ difference_type operator-(self_type other) const
{
return ptr - other.ptr;
}
/// Array subscript
template <typename Distance>
__device__ __forceinline__ reference operator[](Distance n) const
{
return ThreadLoad<MODIFIER>(ptr + n);
}
/// Structure dereference
__device__ __forceinline__ pointer operator->()
{
return &ThreadLoad<MODIFIER>(ptr);
}
/// Equal to
__host__ __device__ __forceinline__ bool operator==(const self_type& rhs)
{
return (ptr == rhs.ptr);
}
/// Not equal to
__host__ __device__ __forceinline__ bool operator!=(const self_type& rhs)
{
return (ptr != rhs.ptr);
}
#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document
/// ostream operator
friend std::ostream& operator<<(std::ostream& os, const self_type& /*itr*/)
{
return os;
}
#endif
};
END_HIPCUB_NAMESPACE
#endif // HIPCUB_ROCPRIM_ITERATOR_CACHE_MODIFIED_INPUT_ITERATOR_HPP_
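// ---------------------------------------------------------------------------
// Usage sketch (illustrative annotation, not part of the committed sources):
// wrapping a raw device pointer so that every dereference goes through
// ThreadLoad with the chosen cache modifier (LOAD_CG is used here as an
// example). d_in and d_out are hypothetical device buffers.
//
//   __global__ void ScaleKernel(float* d_in, float* d_out, int n)
//   {
//       hipcub::CacheModifiedInputIterator<hipcub::LOAD_CG, float> in(d_in);
//       int i = blockIdx.x * blockDim.x + threadIdx.x;
//       if(i < n)
//       {
//           d_out[i] = in[i] * 2.0f;  // in[i] issues ThreadLoad<LOAD_CG>
//       }
//   }
// ---------------------------------------------------------------------------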
/******************************************************************************
* Copyright (c) 2010-2011, Duane Merrill. All rights reserved.
* Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved.
* Modifications Copyright (c) 2017-2020, Advanced Micro Devices, Inc. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of the NVIDIA CORPORATION nor the
* names of its contributors may be used to endorse or promote products
* derived from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
* DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
******************************************************************************/
#ifndef HIPCUB_ROCPRIM_ITERATOR_CACHE_MODIFIED_OUTPUT_ITERATOR_HPP_
#define HIPCUB_ROCPRIM_ITERATOR_CACHE_MODIFIED_OUTPUT_ITERATOR_HPP_
#include <iterator>
#include <iostream>
#include "../thread/thread_load.cuh"
#include "../thread/thread_store.cuh"
#include "../util_type.cuh"
#if (THRUST_VERSION >= 100700)
// This iterator is compatible with Thrust API 1.7 and newer
#include <thrust/iterator/iterator_facade.h>
#include <thrust/iterator/iterator_traits.h>
#endif // THRUST_VERSION
BEGIN_HIPCUB_NAMESPACE
template <
CacheStoreModifier MODIFIER,
typename ValueType,
typename OffsetT = ptrdiff_t>
class CacheModifiedOutputIterator
{
private:
// Proxy object
struct Reference
{
ValueType* ptr;
/// Constructor
__host__ __device__ __forceinline__ Reference(ValueType* ptr) : ptr(ptr) {}
/// Assignment
__device__ __forceinline__ ValueType operator =(ValueType val)
{
ThreadStore<MODIFIER>(ptr, val);
return val;
}
};
public:
// Required iterator traits
typedef CacheModifiedOutputIterator self_type; ///< My own type
typedef OffsetT difference_type; ///< Type to express the result of subtracting one iterator from another
typedef void value_type; ///< The type of the element the iterator can point to
typedef void pointer; ///< The type of a pointer to an element the iterator can point to
typedef Reference reference; ///< The type of a reference to an element the iterator can point to
typedef std::random_access_iterator_tag iterator_category; ///< The iterator category
private:
ValueType* ptr;
public:
/// Constructor
template <typename QualifiedValueType>
__host__ __device__ __forceinline__ CacheModifiedOutputIterator(
QualifiedValueType* ptr) ///< Native pointer to wrap
:
ptr(const_cast<typename std::remove_cv<QualifiedValueType>::type *>(ptr))
{}
/// Postfix increment
__host__ __device__ __forceinline__ self_type operator++(int)
{
self_type retval = *this;
ptr++;
return retval;
}
/// Prefix increment
__host__ __device__ __forceinline__ self_type operator++()
{
ptr++;
return *this;
}
/// Indirection
__host__ __device__ __forceinline__ reference operator*() const
{
return Reference(ptr);
}
/// Addition
template <typename Distance>
__host__ __device__ __forceinline__ self_type operator+(Distance n) const
{
self_type retval(ptr + n);
return retval;
}
/// Addition assignment
template <typename Distance>
__host__ __device__ __forceinline__ self_type& operator+=(Distance n)
{
ptr += n;
return *this;
}
/// Subtraction
template <typename Distance>
__host__ __device__ __forceinline__ self_type operator-(Distance n) const
{
self_type retval(ptr - n);
return retval;
}
/// Subtraction assignment
template <typename Distance>
__host__ __device__ __forceinline__ self_type& operator-=(Distance n)
{
ptr -= n;
return *this;
}
/// Distance
__host__ __device__ __forceinline__ difference_type operator-(self_type other) const
{
return ptr - other.ptr;
}
/// Array subscript
template <typename Distance>
__host__ __device__ __forceinline__ reference operator[](Distance n) const
{
return Reference(ptr + n);
}
/// Equal to
__host__ __device__ __forceinline__ bool operator==(const self_type& rhs)
{
return (ptr == rhs.ptr);
}
/// Not equal to
__host__ __device__ __forceinline__ bool operator!=(const self_type& rhs)
{
return (ptr != rhs.ptr);
}
#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document
/// ostream operator
friend std::ostream& operator<<(std::ostream& os, const self_type& itr)
{
(void)itr;
return os;
}
#endif
};
END_HIPCUB_NAMESPACE
#endif // HIPCUB_ROCPRIM_ITERATOR_CACHE_MODIFIED_OUTPUT_ITERATOR_HPP_
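// ---------------------------------------------------------------------------
// Usage sketch (illustrative annotation, not part of the committed sources):
// assignments through the iterator's proxy Reference become ThreadStore calls
// with the chosen modifier (STORE_WB is used here as an example). d_out is a
// hypothetical device buffer.
//
//   __global__ void FillKernel(float* d_out, int n)
//   {
//       hipcub::CacheModifiedOutputIterator<hipcub::STORE_WB, float> out(d_out);
//       int i = blockIdx.x * blockDim.x + threadIdx.x;
//       if(i < n)
//       {
//           out[i] = float(i);  // proxy assignment issues ThreadStore<STORE_WB>
//       }
//   }
// ---------------------------------------------------------------------------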
/******************************************************************************
* Copyright (c) 2010-2011, Duane Merrill. All rights reserved.
* Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved.
* Modifications Copyright (c) 2017-2020, Advanced Micro Devices, Inc. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of the NVIDIA CORPORATION nor the
* names of its contributors may be used to endorse or promote products
* derived from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
* DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
******************************************************************************/
#ifndef HIPCUB_ROCPRIM_ITERATOR_CONSTANT_INPUT_ITERATOR_HPP_
#define HIPCUB_ROCPRIM_ITERATOR_CONSTANT_INPUT_ITERATOR_HPP_
#include <iterator>
#include <iostream>
#include "../config.hpp"
#include <cub/rocprim/iterator/constant_iterator.hpp>
#if (THRUST_VERSION >= 100700)
// This iterator is compatible with Thrust API 1.7 and newer
#include <thrust/iterator/iterator_facade.h>
#include <thrust/iterator/iterator_traits.h>
#endif // THRUST_VERSION
BEGIN_HIPCUB_NAMESPACE
#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document
template<
typename ValueType,
typename OffsetT = std::ptrdiff_t
>
using ConstantInputIterator = ::rocprim::constant_iterator<ValueType, OffsetT>;
#endif
END_HIPCUB_NAMESPACE
#endif // HIPCUB_ROCPRIM_ITERATOR_CONSTANT_INPUT_ITERATOR_HPP_
/******************************************************************************
* Copyright (c) 2010-2011, Duane Merrill. All rights reserved.
* Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved.
* Modifications Copyright (c) 2017-2020, Advanced Micro Devices, Inc. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of the NVIDIA CORPORATION nor the
* names of its contributors may be used to endorse or promote products
* derived from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
* DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
******************************************************************************/
#ifndef HIPCUB_ROCPRIM_ITERATOR_COUNTING_INPUT_ITERATOR_HPP_
#define HIPCUB_ROCPRIM_ITERATOR_COUNTING_INPUT_ITERATOR_HPP_
#include <iterator>
#include <iostream>
#include "../config.hpp"
#include <cub/rocprim/iterator/counting_iterator.hpp>
#if (THRUST_VERSION >= 100700)
// This iterator is compatible with Thrust API 1.7 and newer
#include <thrust/iterator/iterator_facade.h>
#include <thrust/iterator/iterator_traits.h>
#endif // THRUST_VERSION
BEGIN_HIPCUB_NAMESPACE
#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document
template<
typename ValueType,
typename OffsetT = std::ptrdiff_t
>
using CountingInputIterator = ::rocprim::counting_iterator<ValueType, OffsetT>;
#endif
END_HIPCUB_NAMESPACE
#endif // HIPCUB_ROCPRIM_ITERATOR_COUNTING_INPUT_ITERATOR_HPP_
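// ---------------------------------------------------------------------------
// Usage sketch (illustrative annotation, not part of the committed sources):
// since the alias maps onto rocprim::counting_iterator, an index sequence can
// feed device algorithms without materializing an index buffer.
//
//   hipcub::CountingInputIterator<int> itr(0);  // yields 0, 1, 2, ...
//   int a = itr[0];    // 0
//   int b = itr[10];   // 10
// ---------------------------------------------------------------------------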
/******************************************************************************
* Copyright (c) 2010-2011, Duane Merrill. All rights reserved.
* Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved.
* Modifications Copyright (c) 2020, Advanced Micro Devices, Inc. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of the NVIDIA CORPORATION nor the
* names of its contributors may be used to endorse or promote products
* derived from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
* DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
******************************************************************************/
#ifndef HIPCUB_ROCPRIM_ITERATOR_DISCARD_OUTPUT_ITERATOR_HPP_
#define HIPCUB_ROCPRIM_ITERATOR_DISCARD_OUTPUT_ITERATOR_HPP_
#include <iterator>
#include <iostream>
#include "../config.hpp"
#if (THRUST_VERSION >= 100700)
// This iterator is compatible with Thrust API 1.7 and newer
#include <thrust/iterator/iterator_facade.h>
#include <thrust/iterator/iterator_traits.h>
#endif // THRUST_VERSION
BEGIN_HIPCUB_NAMESPACE
/**
* \addtogroup UtilIterator
* @{
*/
/**
* \brief A discard iterator
*/
template <typename OffsetT = ptrdiff_t>
class DiscardOutputIterator
{
public:
// Required iterator traits
typedef DiscardOutputIterator self_type; ///< My own type
typedef OffsetT difference_type; ///< Type to express the result of subtracting one iterator from another
typedef void value_type; ///< The type of the element the iterator can point to
typedef void pointer; ///< The type of a pointer to an element the iterator can point to
typedef void reference; ///< The type of a reference to an element the iterator can point to
#if (THRUST_VERSION >= 100700)
// Use Thrust's iterator categories so we can use these iterators in Thrust 1.7 (or newer) methods
typedef typename thrust::detail::iterator_facade_category<
thrust::any_system_tag,
thrust::random_access_traversal_tag,
value_type,
reference
>::type iterator_category; ///< The iterator category
#else
typedef std::random_access_iterator_tag iterator_category; ///< The iterator category
#endif // THRUST_VERSION
private:
OffsetT offset;
public:
/// Constructor
__host__ __device__ __forceinline__ DiscardOutputIterator(
OffsetT offset = 0) ///< Base offset
:
offset(offset)
{}
/// Postfix increment
__host__ __device__ __forceinline__ self_type operator++(int)
{
self_type retval = *this;
offset++;
return retval;
}
/// Prefix increment
__host__ __device__ __forceinline__ self_type operator++()
{
offset++;
return *this;
}
/// Indirection
__host__ __device__ __forceinline__ self_type& operator*()
{
// return self reference, which can be assigned to anything
return *this;
}
/// Addition
template <typename Distance>
__host__ __device__ __forceinline__ self_type operator+(Distance n) const
{
self_type retval(offset + n);
return retval;
}
/// Addition assignment
template <typename Distance>
__host__ __device__ __forceinline__ self_type& operator+=(Distance n)
{
offset += n;
return *this;
}
/// Subtraction
template <typename Distance>
__host__ __device__ __forceinline__ self_type operator-(Distance n) const
{
self_type retval(offset - n);
return retval;
}
/// Subtraction assignment
template <typename Distance>
__host__ __device__ __forceinline__ self_type& operator-=(Distance n)
{
offset -= n;
return *this;
}
/// Distance
__host__ __device__ __forceinline__ difference_type operator-(self_type other) const
{
return offset - other.offset;
}
/// Array subscript
template <typename Distance>
__host__ __device__ __forceinline__ self_type& operator[](Distance)
{
// return self reference, which can be assigned to anything
return *this;
}
/// Structure dereference
__host__ __device__ __forceinline__ pointer operator->()
{
return;
}
/// Assignment to anything else (no-op)
template<typename T>
__host__ __device__ __forceinline__ void operator=(T const&)
{}
/// Cast to void* operator
__host__ __device__ __forceinline__ operator void*() const { return NULL; }
/// Equal to
__host__ __device__ __forceinline__ bool operator==(const self_type& rhs)
{
return (offset == rhs.offset);
}
/// Not equal to
__host__ __device__ __forceinline__ bool operator!=(const self_type& rhs)
{
return (offset != rhs.offset);
}
/// ostream operator
friend std::ostream& operator<<(std::ostream& os, const self_type& itr)
{
os << "[" << itr.offset << "]";
return os;
}
};
END_HIPCUB_NAMESPACE
#endif // HIPCUB_ROCPRIM_ITERATOR_DISCARD_OUTPUT_ITERATOR_HPP_
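// ---------------------------------------------------------------------------
// Usage sketch (illustrative annotation, not part of the committed sources):
// every write through the iterator is a no-op, so it can stand in for an
// output stream that an algorithm insists on producing but the caller does
// not need.
//
//   hipcub::DiscardOutputIterator<> discard;
//   *discard = 42;     // discarded
//   discard[100] = 7;  // discarded
//   // e.g. pass `discard` as the unwanted values-output of a device routine
// ---------------------------------------------------------------------------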
/******************************************************************************
* Copyright (c) 2010-2011, Duane Merrill. All rights reserved.
* Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved.
* Modifications Copyright (c) 2017-2021, Advanced Micro Devices, Inc. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of the NVIDIA CORPORATION nor the
* names of its contributors may be used to endorse or promote products
* derived from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
* DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
******************************************************************************/
#ifndef HIPCUB_ROCPRIM_ITERATOR_TEX_OBJ_INPUT_ITERATOR_HPP_
#define HIPCUB_ROCPRIM_ITERATOR_TEX_OBJ_INPUT_ITERATOR_HPP_
#include <iterator>
#include <iostream>
#include "../config.hpp"
#if (THRUST_VERSION >= 100700)
// This iterator is compatible with Thrust API 1.7 and newer
#include <thrust/iterator/iterator_facade.h>
#include <thrust/iterator/iterator_traits.h>
#endif // THRUST_VERSION
#include <cub/rocprim/iterator/texture_cache_iterator.hpp>
BEGIN_HIPCUB_NAMESPACE
template<
typename T,
typename OffsetT = std::ptrdiff_t
>
class TexObjInputIterator : public ::rocprim::texture_cache_iterator<T, OffsetT>
{
public:
template<class Qualified>
inline
cudaError_t BindTexture(Qualified* ptr,
size_t bytes = size_t(-1),
size_t texture_offset = 0)
{
return (cudaError_t)::rocprim::texture_cache_iterator<T, OffsetT>::bind_texture(ptr, bytes, texture_offset);
}
inline cudaError_t UnbindTexture()
{
return (cudaError_t)::rocprim::texture_cache_iterator<T, OffsetT>::unbind_texture();
}
HIPCUB_HOST_DEVICE inline
~TexObjInputIterator() = default;
HIPCUB_HOST_DEVICE inline
TexObjInputIterator() : ::rocprim::texture_cache_iterator<T, OffsetT>()
{
}
HIPCUB_HOST_DEVICE inline
    TexObjInputIterator(const ::rocprim::texture_cache_iterator<T, OffsetT>& other)
: ::rocprim::texture_cache_iterator<T, OffsetT>(other)
{
}
};
END_HIPCUB_NAMESPACE
#endif // HIPCUB_ROCPRIM_ITERATOR_TEX_OBJ_INPUT_ITERATOR_HPP_
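// A minimal usage sketch of the BindTexture()/UnbindTexture() pair above; the pointer d_in and
// the element count num_items are hypothetical, and error checking is omitted.
//
// hipcub::TexObjInputIterator<float> itr;
// itr.BindTexture(d_in, num_items * sizeof(float)); // bind before launching the kernel
// // ... inside a kernel, itr[i] reads d_in[i] through the texture cache ...
// itr.UnbindTexture();                              // unbind once the kernel has completed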
/******************************************************************************
* Copyright (c) 2011, Duane Merrill. All rights reserved.
* Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved.
* Modifications Copyright (c) 2017-2021, Advanced Micro Devices, Inc. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of the NVIDIA CORPORATION nor the
* names of its contributors may be used to endorse or promote products
* derived from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
* DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
******************************************************************************/
#ifndef HIPCUB_ROCPRIM_ITERATOR_TEX_REF_INPUT_ITERATOR_HPP_
#define HIPCUB_ROCPRIM_ITERATOR_TEX_REF_INPUT_ITERATOR_HPP_
#include <iterator>
#include <iostream>
#include "../config.hpp"
#if (THRUST_VERSION >= 100700) // This iterator is compatible with Thrust API 1.7 and newer
#include <thrust/iterator/iterator_facade.h>
#include <thrust/iterator/iterator_traits.h>
#endif // THRUST_VERSION
#include <rocprim/iterator/texture_cache_iterator.hpp>
BEGIN_HIPCUB_NAMESPACE
template<
typename T,
int UNIQUE_ID, // Unused parameter for compatibility with original definition in cub
typename OffsetT = std::ptrdiff_t
>
class TexRefInputIterator : public ::rocprim::texture_cache_iterator<T, OffsetT>
{
public:
template<class Qualified>
inline
cudaError_t BindTexture(Qualified* ptr,
size_t bytes = size_t(-1),
size_t texture_offset = 0)
{
return (cudaError_t)::rocprim::texture_cache_iterator<T, OffsetT>::bind_texture(ptr, bytes, texture_offset);
}
inline cudaError_t UnbindTexture()
{
return (cudaError_t)::rocprim::texture_cache_iterator<T, OffsetT>::unbind_texture();
}
HIPCUB_HOST_DEVICE inline
~TexRefInputIterator() = default;
HIPCUB_HOST_DEVICE inline
TexRefInputIterator() : ::rocprim::texture_cache_iterator<T, OffsetT>()
{
}
HIPCUB_HOST_DEVICE inline
    TexRefInputIterator(const ::rocprim::texture_cache_iterator<T, OffsetT>& other)
: ::rocprim::texture_cache_iterator<T, OffsetT>(other)
{
}
};
END_HIPCUB_NAMESPACE
#endif // HIPCUB_ROCPRIM_ITERATOR_TEX_REF_INPUT_ITERATOR_HPP_
/******************************************************************************
* Copyright (c) 2010-2011, Duane Merrill. All rights reserved.
* Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved.
* Modifications Copyright (c) 2017-2020, Advanced Micro Devices, Inc. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of the NVIDIA CORPORATION nor the
* names of its contributors may be used to endorse or promote products
* derived from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
* DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
******************************************************************************/
#ifndef HIPCUB_ROCPRIM_ITERATOR_TRANSFORM_INPUT_ITERATOR_HPP_
#define HIPCUB_ROCPRIM_ITERATOR_TRANSFORM_INPUT_ITERATOR_HPP_
#include <iterator>
#include <iostream>
#include "../config.hpp"
#include <rocprim/iterator/transform_iterator.hpp>
#if (THRUST_VERSION >= 100700)
// This iterator is compatible with Thrust API 1.7 and newer
#include <thrust/iterator/iterator_facade.h>
#include <thrust/iterator/iterator_traits.h>
#endif // THRUST_VERSION
BEGIN_HIPCUB_NAMESPACE
#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document
template<
typename ValueType,
typename ConversionOp,
typename InputIteratorT,
typename OffsetT = std::ptrdiff_t // ignored
>
using TransformInputIterator = ::rocprim::transform_iterator<InputIteratorT, ConversionOp, ValueType>;
#endif
END_HIPCUB_NAMESPACE
#endif // HIPCUB_ROCPRIM_ITERATOR_TRANSFORM_INPUT_ITERATOR_HPP_
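// A minimal usage sketch of the alias above; the functor square and the device pointer d_in
// are hypothetical.
//
// struct square
// {
//     __host__ __device__ float operator()(float x) const { return x * x; }
// };
// ...
// hipcub::TransformInputIterator<float, square, float*> itr(d_in, square());
// // itr can be passed to device algorithms in place of d_in; dereferencing
// // itr[i] yields d_in[i] * d_in[i], computed on the fly.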
/******************************************************************************
* Copyright (c) 2011, Duane Merrill. All rights reserved.
* Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved.
* Modifications Copyright (c) 2022, Advanced Micro Devices, Inc. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of the NVIDIA CORPORATION nor the
* names of its contributors may be used to endorse or promote products
* derived from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
* DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
******************************************************************************/
#ifndef ROCPRIM_BLOCK_BLOCK_ADJACENT_DIFFERENCE_HPP_
#define ROCPRIM_BLOCK_BLOCK_ADJACENT_DIFFERENCE_HPP_
#include "detail/block_adjacent_difference_impl.hpp"
#include "../config.hpp"
#include "../detail/various.hpp"
/// \addtogroup blockmodule
/// @{
BEGIN_ROCPRIM_NAMESPACE
/// \brief The \p block_adjacent_difference class is a block level parallel primitive which provides
/// methods for applying binary functions to pairs of consecutive items partitioned across a thread
/// block.
///
/// \tparam T - the input type.
/// \tparam BlockSize - the number of threads in a block.
///
/// \par Overview
/// * There are two types of flags:
/// * Head flags.
/// * Tail flags.
/// * The above flags are used to differentiate items from their predecessors or successors.
/// * E.g. Head flags are convenient for differentiating disjoint data segments as part of a
/// segmented reduction/scan.
///
/// \par Examples
/// \parblock
/// In the examples, discontinuity operations are performed on a block of 128 threads, using type
/// \p int.
///
/// \code{.cpp}
/// __global__ void example_kernel(...)
/// {
/// // specialize discontinuity for int and a block of 128 threads
/// using block_adjacent_difference_int = rocprim::block_adjacent_difference<int, 128>;
/// // allocate storage in shared memory
/// __shared__ block_adjacent_difference_int::storage_type storage;
///
/// // segment of consecutive items to be used
/// int input[8];
/// ...
/// int head_flags[8];
/// block_adjacent_difference_int b_discontinuity;
/// using flag_op_type = typename rocprim::greater<int>;
/// b_discontinuity.flag_heads(head_flags, input, flag_op_type(), storage);
/// ...
/// }
/// \endcode
/// \endparblock
template<
class T,
unsigned int BlockSizeX,
unsigned int BlockSizeY = 1,
unsigned int BlockSizeZ = 1
>
class block_adjacent_difference
#ifndef DOXYGEN_SHOULD_SKIP_THIS // hide implementation detail from documentation
: private detail::block_adjacent_difference_impl<T, BlockSizeX, BlockSizeY, BlockSizeZ>
#endif // DOXYGEN_SHOULD_SKIP_THIS
{
using base_type = detail::block_adjacent_difference_impl<T, BlockSizeX, BlockSizeY, BlockSizeZ>;
static constexpr unsigned BlockSize = base_type::BlockSize;
// Struct used for creating a raw_storage object for this primitive's temporary storage.
struct storage_type_
{
typename base_type::storage_type left;
typename base_type::storage_type right;
};
public:
/// \brief Struct used to allocate a temporary memory that is required for thread
/// communication during operations provided by related parallel primitive.
///
    /// Depending on the implementation, the operations exposed by the parallel primitive may
    /// require temporary storage for thread communication. The storage should be allocated
    /// using the <tt>__shared__</tt> keyword. It can be aliased to externally allocated
    /// memory, or be a part of a union type with other storage types to increase shared
    /// memory reusability.
#ifndef DOXYGEN_SHOULD_SKIP_THIS // hides storage_type implementation for Doxygen
using storage_type = detail::raw_storage<storage_type_>;
#else
using storage_type = storage_type_;
#endif
/// \brief Tags \p head_flags that indicate discontinuities between items partitioned
/// across the thread block, where the first item has no reference and is always
/// flagged.
/// \deprecated The flags API of block_adjacent_difference is deprecated,
/// use subtract_left() or block_discontinuity::flag_heads() instead.
/// \tparam ItemsPerThread - [inferred] the number of items to be processed by
/// each thread.
/// \tparam Flag - [inferred] the flag type.
/// \tparam FlagOp - [inferred] type of binary function used for flagging.
///
/// \param [out] head_flags - array that contains the head flags.
/// \param [in] input - array that data is loaded from.
/// \param [in] flag_op - binary operation function object that will be used for flagging.
/// The signature of the function should be equivalent to the following:
    /// <tt>bool f(const T &a, const T &b);</tt> or <tt>bool f(const T& a, const T& b, unsigned int b_index);</tt>.
/// The signature does not need to have <tt>const &</tt>, but function object
/// must not modify the objects passed to it.
/// \param [in] storage - reference to a temporary storage object of type storage_type.
///
/// \par Storage reuse
/// Synchronization barrier should be placed before \p storage is reused
/// or repurposed: \p __syncthreads() or \p rocprim::syncthreads().
///
/// \par Example.
/// \code{.cpp}
/// __global__ void example_kernel(...)
/// {
/// // specialize discontinuity for int and a block of 128 threads
/// using block_adjacent_difference_int = rocprim::block_adjacent_difference<int, 128>;
/// // allocate storage in shared memory
/// __shared__ block_adjacent_difference_int::storage_type storage;
///
/// // segment of consecutive items to be used
/// int input[8];
/// ...
/// int head_flags[8];
/// block_adjacent_difference_int b_discontinuity;
/// using flag_op_type = typename rocprim::greater<int>;
/// b_discontinuity.flag_heads(head_flags, input, flag_op_type(), storage);
/// ...
/// }
/// \endcode
template<unsigned int ItemsPerThread, class Flag, class FlagOp>
    [[deprecated("The flags API of block_adjacent_difference is deprecated. "
"Use subtract_left or block_discontinuity.flag_heads instead.")]]
ROCPRIM_DEVICE ROCPRIM_INLINE
void flag_heads(Flag (&head_flags)[ItemsPerThread],
const T (&input)[ItemsPerThread],
FlagOp flag_op,
storage_type& storage)
{
static constexpr auto as_flags = true;
static constexpr auto reversed = true;
static constexpr auto with_predecessor = false;
base_type::template apply_left<as_flags, reversed, with_predecessor>(
input, head_flags, flag_op, input[0] /* predecessor */, storage.get().left);
}
/// \overload
/// \deprecated The flags API of block_adjacent_difference is deprecated,
/// use subtract_left() or block_discontinuity::flag_heads() instead.
/// This overload does not take a reference to temporary storage, instead it is declared as
/// part of the function itself. Note that this does NOT decrease the shared memory requirements
/// of a kernel using this function.
template<unsigned int ItemsPerThread, class Flag, class FlagOp>
    [[deprecated("The flags API of block_adjacent_difference is deprecated. "
"Use subtract_left or block_discontinuity.flag_heads instead.")]]
ROCPRIM_DEVICE ROCPRIM_FORCE_INLINE
void flag_heads(Flag (&head_flags)[ItemsPerThread],
const T (&input)[ItemsPerThread],
FlagOp flag_op)
{
ROCPRIM_SHARED_MEMORY storage_type storage;
flag_heads(head_flags, input, flag_op, storage);
}
/// \brief Tags \p head_flags that indicate discontinuities between items partitioned
/// across the thread block, where the first item of the first thread is compared against
/// a \p tile_predecessor_item.
/// \deprecated The flags API of block_adjacent_difference is deprecated,
/// use subtract_left() or block_discontinuity::flag_heads() instead.
///
/// \tparam ItemsPerThread - [inferred] the number of items to be processed by
/// each thread.
/// \tparam Flag - [inferred] the flag type.
/// \tparam FlagOp - [inferred] type of binary function used for flagging.
///
/// \param [out] head_flags - array that contains the head flags.
/// \param [in] tile_predecessor_item - first tile item from thread to be compared
/// against.
/// \param [in] input - array that data is loaded from.
/// \param [in] flag_op - binary operation function object that will be used for flagging.
/// The signature of the function should be equivalent to the following:
    /// <tt>bool f(const T &a, const T &b);</tt> or <tt>bool f(const T& a, const T& b, unsigned int b_index);</tt>.
/// The signature does not need to have <tt>const &</tt>, but function object
/// must not modify the objects passed to it.
/// \param [in] storage - reference to a temporary storage object of type storage_type.
///
/// \par Storage reuse
/// Synchronization barrier should be placed before \p storage is reused
/// or repurposed: \p __syncthreads() or \p rocprim::syncthreads().
///
/// \par Example.
/// \code{.cpp}
/// __global__ void example_kernel(...)
/// {
/// // specialize discontinuity for int and a block of 128 threads
/// using block_adjacent_difference_int = rocprim::block_adjacent_difference<int, 128>;
/// // allocate storage in shared memory
/// __shared__ block_adjacent_difference_int::storage_type storage;
///
/// // segment of consecutive items to be used
/// int input[8];
/// int tile_item = 0;
/// if (threadIdx.x == 0)
/// {
/// tile_item = ...
/// }
/// ...
/// int head_flags[8];
/// block_adjacent_difference_int b_discontinuity;
/// using flag_op_type = typename rocprim::greater<int>;
/// b_discontinuity.flag_heads(head_flags, tile_item, input, flag_op_type(),
/// storage);
/// ...
/// }
/// \endcode
template<unsigned int ItemsPerThread, class Flag, class FlagOp>
    [[deprecated("The flags API of block_adjacent_difference is deprecated. "
"Use subtract_left or block_discontinuity.flag_heads instead.")]]
ROCPRIM_DEVICE ROCPRIM_INLINE
void flag_heads(Flag (&head_flags)[ItemsPerThread],
T tile_predecessor_item,
const T (&input)[ItemsPerThread],
FlagOp flag_op,
storage_type& storage)
{
static constexpr auto as_flags = true;
static constexpr auto reversed = true;
static constexpr auto with_predecessor = true;
base_type::template apply_left<as_flags, reversed, with_predecessor>(
input, head_flags, flag_op, tile_predecessor_item, storage.get().left);
}
/// \overload
/// \deprecated The flags API of block_adjacent_difference is deprecated,
/// use subtract_left() or block_discontinuity::flag_heads() instead.
///
/// This overload does not accept a reference to temporary storage, instead it is declared as
/// part of the function itself. Note that this does NOT decrease the shared memory requirements
/// of a kernel using this function.
template<unsigned int ItemsPerThread, class Flag, class FlagOp>
    [[deprecated("The flags API of block_adjacent_difference is deprecated. "
"Use subtract_left or block_discontinuity.flag_heads instead.")]]
ROCPRIM_DEVICE ROCPRIM_FORCE_INLINE
void flag_heads(Flag (&head_flags)[ItemsPerThread],
T tile_predecessor_item,
const T (&input)[ItemsPerThread],
FlagOp flag_op)
{
ROCPRIM_SHARED_MEMORY storage_type storage;
flag_heads(head_flags, tile_predecessor_item, input, flag_op, storage);
}
/// \brief Tags \p tail_flags that indicate discontinuities between items partitioned
/// across the thread block, where the last item has no reference and is always
/// flagged.
/// \deprecated The flags API of block_adjacent_difference is deprecated,
/// use subtract_right() or block_discontinuity::flag_tails() instead.
///
/// \tparam ItemsPerThread - [inferred] the number of items to be processed by
/// each thread.
/// \tparam Flag - [inferred] the flag type.
/// \tparam FlagOp - [inferred] type of binary function used for flagging.
///
/// \param [out] tail_flags - array that contains the tail flags.
/// \param [in] input - array that data is loaded from.
/// \param [in] flag_op - binary operation function object that will be used for flagging.
/// The signature of the function should be equivalent to the following:
    /// <tt>bool f(const T &a, const T &b);</tt> or <tt>bool f(const T& a, const T& b, unsigned int b_index);</tt>.
/// The signature does not need to have <tt>const &</tt>, but function object
/// must not modify the objects passed to it.
/// \param [in] storage - reference to a temporary storage object of type storage_type.
///
/// \par Storage reuse
/// Synchronization barrier should be placed before \p storage is reused
/// or repurposed: \p __syncthreads() or \p rocprim::syncthreads().
///
/// \par Example.
/// \code{.cpp}
/// __global__ void example_kernel(...)
/// {
/// // specialize discontinuity for int and a block of 128 threads
/// using block_adjacent_difference_int = rocprim::block_adjacent_difference<int, 128>;
/// // allocate storage in shared memory
/// __shared__ block_adjacent_difference_int::storage_type storage;
///
/// // segment of consecutive items to be used
/// int input[8];
/// ...
/// int tail_flags[8];
/// block_adjacent_difference_int b_discontinuity;
/// using flag_op_type = typename rocprim::greater<int>;
/// b_discontinuity.flag_tails(tail_flags, input, flag_op_type(), storage);
/// ...
/// }
/// \endcode
template<unsigned int ItemsPerThread, class Flag, class FlagOp>
    [[deprecated("The flags API of block_adjacent_difference is deprecated. "
"Use subtract_right or block_discontinuity.flag_tails instead.")]]
ROCPRIM_DEVICE ROCPRIM_INLINE
void flag_tails(Flag (&tail_flags)[ItemsPerThread],
const T (&input)[ItemsPerThread],
FlagOp flag_op,
storage_type& storage)
{
static constexpr auto as_flags = true;
static constexpr auto reversed = true;
static constexpr auto with_successor = false;
base_type::template apply_right<as_flags, reversed, with_successor>(
input, tail_flags, flag_op, input[0] /* successor */, storage.get().right);
}
/// \overload
/// \deprecated The flags API of block_adjacent_difference is deprecated,
/// use subtract_right() or block_discontinuity::flag_tails() instead.
///
/// This overload does not accept a reference to temporary storage, instead it is declared as
/// part of the function itself. Note that this does NOT decrease the shared memory requirements
/// of a kernel using this function.
template<unsigned int ItemsPerThread, class Flag, class FlagOp>
    [[deprecated("The flags API of block_adjacent_difference is deprecated. "
"Use subtract_right or block_discontinuity.flag_tails instead.")]]
ROCPRIM_DEVICE ROCPRIM_FORCE_INLINE
void flag_tails(Flag (&tail_flags)[ItemsPerThread],
const T (&input)[ItemsPerThread],
FlagOp flag_op)
{
ROCPRIM_SHARED_MEMORY storage_type storage;
flag_tails(tail_flags, input, flag_op, storage);
}
/// \brief Tags \p tail_flags that indicate discontinuities between items partitioned
/// across the thread block, where the last item of the last thread is compared against
/// a \p tile_successor_item.
/// \deprecated The flags API of block_adjacent_difference is deprecated,
/// use subtract_right() or block_discontinuity::flag_tails() instead.
///
/// \tparam ItemsPerThread - [inferred] the number of items to be processed by
/// each thread.
/// \tparam Flag - [inferred] the flag type.
/// \tparam FlagOp - [inferred] type of binary function used for flagging.
///
/// \param [out] tail_flags - array that contains the tail flags.
/// \param [in] tile_successor_item - last tile item from thread to be compared
/// against.
/// \param [in] input - array that data is loaded from.
/// \param [in] flag_op - binary operation function object that will be used for flagging.
/// The signature of the function should be equivalent to the following:
    /// <tt>bool f(const T &a, const T &b);</tt> or <tt>bool f(const T& a, const T& b, unsigned int b_index);</tt>.
/// The signature does not need to have <tt>const &</tt>, but function object
/// must not modify the objects passed to it.
/// \param [in] storage - reference to a temporary storage object of type storage_type.
///
/// \par Storage reuse
/// Synchronization barrier should be placed before \p storage is reused
/// or repurposed: \p __syncthreads() or \p rocprim::syncthreads().
///
/// \par Example.
/// \code{.cpp}
/// __global__ void example_kernel(...)
/// {
/// // specialize discontinuity for int and a block of 128 threads
/// using block_adjacent_difference_int = rocprim::block_adjacent_difference<int, 128>;
/// // allocate storage in shared memory
/// __shared__ block_adjacent_difference_int::storage_type storage;
///
/// // segment of consecutive items to be used
/// int input[8];
/// int tile_item = 0;
/// if (threadIdx.x == 0)
/// {
/// tile_item = ...
/// }
/// ...
/// int tail_flags[8];
/// block_adjacent_difference_int b_discontinuity;
/// using flag_op_type = typename rocprim::greater<int>;
/// b_discontinuity.flag_tails(tail_flags, tile_item, input, flag_op_type(),
/// storage);
/// ...
/// }
/// \endcode
template<unsigned int ItemsPerThread, class Flag, class FlagOp>
    [[deprecated("The flags API of block_adjacent_difference is deprecated. "
"Use subtract_right or block_discontinuity.flag_tails instead.")]]
ROCPRIM_DEVICE ROCPRIM_INLINE
void flag_tails(Flag (&tail_flags)[ItemsPerThread],
T tile_successor_item,
const T (&input)[ItemsPerThread],
FlagOp flag_op,
storage_type& storage)
{
static constexpr auto as_flags = true;
static constexpr auto reversed = true;
static constexpr auto with_successor = true;
base_type::template apply_right<as_flags, reversed, with_successor>(
input, tail_flags, flag_op, tile_successor_item, storage.get().right);
}
/// \overload
/// \deprecated The flags API of block_adjacent_difference is deprecated,
/// use subtract_right() or block_discontinuity::flag_tails() instead.
///
/// This overload does not accept a reference to temporary storage, instead it is declared as
/// part of the function itself. Note that this does NOT decrease the shared memory requirements
/// of a kernel using this function.
template<unsigned int ItemsPerThread, class Flag, class FlagOp>
    [[deprecated("The flags API of block_adjacent_difference is deprecated. "
"Use subtract_right or block_discontinuity.flag_tails instead.")]]
ROCPRIM_DEVICE ROCPRIM_FORCE_INLINE
void flag_tails(Flag (&tail_flags)[ItemsPerThread],
T tile_successor_item,
const T (&input)[ItemsPerThread],
FlagOp flag_op)
{
ROCPRIM_SHARED_MEMORY storage_type storage;
flag_tails(tail_flags, tile_successor_item, input, flag_op, storage);
}
    /// \brief Tags both \p head_flags and \p tail_flags that indicate discontinuities
    /// between items partitioned across the thread block.
    /// \deprecated The flags API of block_adjacent_difference is deprecated,
    /// use block_discontinuity::flag_heads_and_tails() instead.
///
/// \tparam ItemsPerThread - [inferred] the number of items to be processed by
/// each thread.
/// \tparam Flag - [inferred] the flag type.
/// \tparam FlagOp - [inferred] type of binary function used for flagging.
///
/// \param [out] head_flags - array that contains the head flags.
/// \param [out] tail_flags - array that contains the tail flags.
/// \param [in] input - array that data is loaded from.
/// \param [in] flag_op - binary operation function object that will be used for flagging.
/// The signature of the function should be equivalent to the following:
    /// <tt>bool f(const T &a, const T &b);</tt> or <tt>bool f(const T& a, const T& b, unsigned int b_index);</tt>.
/// The signature does not need to have <tt>const &</tt>, but function object
/// must not modify the objects passed to it.
/// \param [in] storage - reference to a temporary storage object of type storage_type.
///
/// \par Storage reuse
/// Synchronization barrier should be placed before \p storage is reused
/// or repurposed: \p __syncthreads() or \p rocprim::syncthreads().
///
/// \par Example.
/// \code{.cpp}
/// __global__ void example_kernel(...)
/// {
/// // specialize discontinuity for int and a block of 128 threads
/// using block_adjacent_difference_int = rocprim::block_adjacent_difference<int, 128>;
/// // allocate storage in shared memory
/// __shared__ block_adjacent_difference_int::storage_type storage;
///
/// // segment of consecutive items to be used
/// int input[8];
/// ...
/// int head_flags[8];
/// int tail_flags[8];
/// block_adjacent_difference_int b_discontinuity;
/// using flag_op_type = typename rocprim::greater<int>;
/// b_discontinuity.flag_heads_and_tails(head_flags, tail_flags, input,
/// flag_op_type(), storage);
/// ...
/// }
/// \endcode
template<unsigned int ItemsPerThread, class Flag, class FlagOp>
    [[deprecated("The flags API of block_adjacent_difference is deprecated. "
"Use block_discontinuity.flag_heads_and_tails instead.")]]
ROCPRIM_DEVICE ROCPRIM_INLINE
void flag_heads_and_tails(Flag (&head_flags)[ItemsPerThread],
Flag (&tail_flags)[ItemsPerThread],
const T (&input)[ItemsPerThread],
FlagOp flag_op,
storage_type& storage)
{
static constexpr auto as_flags = true;
static constexpr auto reversed = true;
static constexpr auto with_predecessor = false;
static constexpr auto with_successor = false;
// Copy items in case head_flags is aliased with input
T items[ItemsPerThread];
ROCPRIM_UNROLL
for(unsigned int i = 0; i < ItemsPerThread; ++i) {
items[i] = input[i];
}
base_type::template apply_left<as_flags, reversed, with_predecessor>(
items, head_flags, flag_op, items[0] /*predecessor*/, storage.get().left);
base_type::template apply_right<as_flags, reversed, with_successor>(
items, tail_flags, flag_op, items[0] /*successor*/, storage.get().right);
}
/// \overload
/// \deprecated The flags API of block_adjacent_difference is deprecated,
/// use block_discontinuity::flag_heads_and_tails() instead.
///
/// This overload does not accept a reference to temporary storage, instead it is declared as
/// part of the function itself. Note that this does NOT decrease the shared memory requirements
/// of a kernel using this function.
template<unsigned int ItemsPerThread, class Flag, class FlagOp>
    [[deprecated("The flags API of block_adjacent_difference is deprecated. "
"Use block_discontinuity.flag_heads_and_tails instead.")]]
ROCPRIM_DEVICE ROCPRIM_FORCE_INLINE
void flag_heads_and_tails(Flag (&head_flags)[ItemsPerThread],
Flag (&tail_flags)[ItemsPerThread],
const T (&input)[ItemsPerThread],
FlagOp flag_op)
{
ROCPRIM_SHARED_MEMORY storage_type storage;
flag_heads_and_tails(head_flags, tail_flags, input, flag_op, storage);
}
    /// \brief Tags both \p head_flags and \p tail_flags that indicate discontinuities
/// between items partitioned across the thread block, where the last item of the
/// last thread is compared against a \p tile_successor_item.
/// \deprecated The flags API of block_adjacent_difference is deprecated,
/// use block_discontinuity::flag_heads_and_tails() instead.
///
/// \tparam ItemsPerThread - [inferred] the number of items to be processed by
/// each thread.
/// \tparam Flag - [inferred] the flag type.
/// \tparam FlagOp - [inferred] type of binary function used for flagging.
///
/// \param [out] head_flags - array that contains the head flags.
/// \param [out] tail_flags - array that contains the tail flags.
/// \param [in] tile_successor_item - last tile item from thread to be compared
/// against.
/// \param [in] input - array that data is loaded from.
/// \param [in] flag_op - binary operation function object that will be used for flagging.
/// The signature of the function should be equivalent to the following:
    /// <tt>bool f(const T &a, const T &b);</tt> or <tt>bool f(const T& a, const T& b, unsigned int b_index);</tt>.
/// The signature does not need to have <tt>const &</tt>, but function object
/// must not modify the objects passed to it.
/// \param [in] storage - reference to a temporary storage object of type storage_type.
///
/// \par Storage reuse
/// Synchronization barrier should be placed before \p storage is reused
/// or repurposed: \p __syncthreads() or \p rocprim::syncthreads().
///
/// \par Example.
/// \code{.cpp}
/// __global__ void example_kernel(...)
/// {
/// // specialize discontinuity for int and a block of 128 threads
/// using block_adjacent_difference_int = rocprim::block_adjacent_difference<int, 128>;
/// // allocate storage in shared memory
/// __shared__ block_adjacent_difference_int::storage_type storage;
///
/// // segment of consecutive items to be used
/// int input[8];
/// int tile_item = 0;
/// if (threadIdx.x == 0)
/// {
/// tile_item = ...
/// }
/// ...
/// int head_flags[8];
/// int tail_flags[8];
/// block_adjacent_difference_int b_discontinuity;
/// using flag_op_type = typename rocprim::greater<int>;
/// b_discontinuity.flag_heads_and_tails(head_flags, tail_flags, tile_item,
/// input, flag_op_type(),
/// storage);
/// ...
/// }
/// \endcode
template<unsigned int ItemsPerThread, class Flag, class FlagOp>
    [[deprecated("The flags API of block_adjacent_difference is deprecated. "
"Use block_discontinuity.flag_heads_and_tails instead.")]]
ROCPRIM_DEVICE ROCPRIM_INLINE
void flag_heads_and_tails(Flag (&head_flags)[ItemsPerThread],
Flag (&tail_flags)[ItemsPerThread],
T tile_successor_item,
const T (&input)[ItemsPerThread],
FlagOp flag_op,
storage_type& storage)
{
static constexpr auto as_flags = true;
static constexpr auto reversed = true;
static constexpr auto with_predecessor = false;
static constexpr auto with_successor = true;
// Copy items in case head_flags is aliased with input
T items[ItemsPerThread];
ROCPRIM_UNROLL
for(unsigned int i = 0; i < ItemsPerThread; ++i) {
items[i] = input[i];
}
base_type::template apply_left<as_flags, reversed, with_predecessor>(
items, head_flags, flag_op, items[0] /*predecessor*/, storage.get().left);
base_type::template apply_right<as_flags, reversed, with_successor>(
items, tail_flags, flag_op, tile_successor_item, storage.get().right);
}
/// \overload
/// \deprecated The flags API of block_adjacent_difference is deprecated,
/// use block_discontinuity::flag_heads_and_tails() instead.
///
/// This overload does not accept a reference to temporary storage, instead it is declared as
/// part of the function itself. Note that this does NOT decrease the shared memory requirements
/// of a kernel using this function.
template<unsigned int ItemsPerThread, class Flag, class FlagOp>
    [[deprecated("The flags API of block_adjacent_difference is deprecated. "
"Use block_discontinuity.flag_heads_and_tails instead.")]]
ROCPRIM_DEVICE ROCPRIM_FORCE_INLINE
void flag_heads_and_tails(Flag (&head_flags)[ItemsPerThread],
Flag (&tail_flags)[ItemsPerThread],
T tile_successor_item,
const T (&input)[ItemsPerThread],
FlagOp flag_op)
{
ROCPRIM_SHARED_MEMORY storage_type storage;
flag_heads_and_tails(head_flags, tail_flags, tile_successor_item, input, flag_op, storage);
}
    /// \brief Tags both \p head_flags and \p tail_flags that indicate discontinuities
/// between items partitioned across the thread block, where the first item of the
/// first thread is compared against a \p tile_predecessor_item.
/// \deprecated The flags API of block_adjacent_difference is deprecated,
/// use block_discontinuity::flag_heads_and_tails() instead.
///
/// \tparam ItemsPerThread - [inferred] the number of items to be processed by
/// each thread.
/// \tparam Flag - [inferred] the flag type.
/// \tparam FlagOp - [inferred] type of binary function used for flagging.
///
/// \param [out] head_flags - array that contains the head flags.
/// \param [in] tile_predecessor_item - first tile item from thread to be compared
/// against.
/// \param [out] tail_flags - array that contains the tail flags.
/// \param [in] input - array that data is loaded from.
/// \param [in] flag_op - binary operation function object that will be used for flagging.
/// The signature of the function should be equivalent to the following:
    /// <tt>bool f(const T &a, const T &b);</tt> or <tt>bool f(const T& a, const T& b, unsigned int b_index);</tt>.
/// The signature does not need to have <tt>const &</tt>, but function object
/// must not modify the objects passed to it.
/// \param [in] storage - reference to a temporary storage object of type storage_type.
///
/// \par Storage reuse
/// Synchronization barrier should be placed before \p storage is reused
/// or repurposed: \p __syncthreads() or \p rocprim::syncthreads().
///
/// \par Example.
/// \code{.cpp}
/// __global__ void example_kernel(...)
/// {
/// // specialize discontinuity for int and a block of 128 threads
/// using block_adjacent_difference_int = rocprim::block_adjacent_difference<int, 128>;
/// // allocate storage in shared memory
/// __shared__ block_adjacent_difference_int::storage_type storage;
///
/// // segment of consecutive items to be used
/// int input[8];
/// int tile_item = 0;
/// if (threadIdx.x == 0)
/// {
/// tile_item = ...
/// }
/// ...
/// int head_flags[8];
/// int tail_flags[8];
/// block_adjacent_difference_int b_discontinuity;
/// using flag_op_type = typename rocprim::greater<int>;
/// b_discontinuity.flag_heads_and_tails(head_flags, tile_item, tail_flags,
/// input, flag_op_type(),
/// storage);
/// ...
/// }
/// \endcode
template<unsigned int ItemsPerThread, class Flag, class FlagOp>
    [[deprecated("The flags API of block_adjacent_difference is deprecated. "
"Use block_discontinuity.flag_heads_and_tails instead.")]]
ROCPRIM_DEVICE ROCPRIM_INLINE
void flag_heads_and_tails(Flag (&head_flags)[ItemsPerThread],
T tile_predecessor_item,
Flag (&tail_flags)[ItemsPerThread],
const T (&input)[ItemsPerThread],
FlagOp flag_op,
storage_type& storage)
{
static constexpr auto as_flags = true;
static constexpr auto reversed = true;
static constexpr auto with_predecessor = true;
static constexpr auto with_successor = false;
// Copy items in case head_flags is aliased with input
T items[ItemsPerThread];
ROCPRIM_UNROLL
for(unsigned int i = 0; i < ItemsPerThread; ++i) {
items[i] = input[i];
}
base_type::template apply_left<as_flags, reversed, with_predecessor>(
items, head_flags, flag_op, tile_predecessor_item, storage.get().left);
base_type::template apply_right<as_flags, reversed, with_successor>(
items, tail_flags, flag_op, items[0] /*successor*/, storage.get().right);
}
/// \overload
/// \deprecated The flags API of block_adjacent_difference is deprecated,
/// use block_discontinuity::flag_heads_and_tails() instead.
///
/// This overload does not accept a reference to temporary storage, instead it is declared as
/// part of the function itself. Note that this does NOT decrease the shared memory requirements
/// of a kernel using this function.
template<unsigned int ItemsPerThread, class Flag, class FlagOp>
    [[deprecated("The flags API of block_adjacent_difference is deprecated. "
"Use block_discontinuity.flag_heads_and_tails instead.")]]
ROCPRIM_DEVICE ROCPRIM_FORCE_INLINE
void flag_heads_and_tails(Flag (&head_flags)[ItemsPerThread],
T tile_predecessor_item,
Flag (&tail_flags)[ItemsPerThread],
const T (&input)[ItemsPerThread],
FlagOp flag_op)
{
ROCPRIM_SHARED_MEMORY storage_type storage;
flag_heads_and_tails(head_flags, tile_predecessor_item, tail_flags, input, flag_op, storage);
}
    /// \brief Tags both \p head_flags and \p tail_flags that indicate discontinuities
    /// between items partitioned across the thread block, where the first and last items of
    /// the first and last threads are compared against a \p tile_predecessor_item and
    /// a \p tile_successor_item, respectively.
/// \deprecated The flags API of block_adjacent_difference is deprecated,
/// use block_discontinuity::flag_heads_and_tails() instead.
///
/// \tparam ItemsPerThread - [inferred] the number of items to be processed by
/// each thread.
/// \tparam Flag - [inferred] the flag type.
/// \tparam FlagOp - [inferred] type of binary function used for flagging.
///
/// \param [out] head_flags - array that contains the head flags.
/// \param [in] tile_predecessor_item - first tile item from thread to be compared
/// against.
/// \param [out] tail_flags - array that contains the tail flags.
/// \param [in] tile_successor_item - last tile item from thread to be compared
/// against.
/// \param [in] input - array that data is loaded from.
/// \param [in] flag_op - binary operation function object that will be used for flagging.
/// The signature of the function should be equivalent to the following:
    /// <tt>bool f(const T &a, const T &b);</tt> or <tt>bool f(const T& a, const T& b, unsigned int b_index);</tt>.
/// The signature does not need to have <tt>const &</tt>, but function object
/// must not modify the objects passed to it.
/// \param [in] storage - reference to a temporary storage object of type storage_type.
///
/// \par Storage reuse
/// Synchronization barrier should be placed before \p storage is reused
/// or repurposed: \p __syncthreads() or \p rocprim::syncthreads().
///
/// \par Example.
/// \code{.cpp}
/// __global__ void example_kernel(...)
/// {
/// // specialize discontinuity for int and a block of 128 threads
/// using block_adjacent_difference_int = rocprim::block_adjacent_difference<int, 128>;
/// // allocate storage in shared memory
/// __shared__ block_adjacent_difference_int::storage_type storage;
///
/// // segment of consecutive items to be used
/// int input[8];
/// int tile_predecessor_item = 0;
/// int tile_successor_item = 0;
/// if (threadIdx.x == 0)
/// {
/// tile_predecessor_item = ...
/// tile_successor_item = ...
/// }
/// ...
/// int head_flags[8];
/// int tail_flags[8];
/// block_adjacent_difference_int b_discontinuity;
/// using flag_op_type = typename rocprim::greater<int>;
/// b_discontinuity.flag_heads_and_tails(head_flags, tile_predecessor_item,
/// tail_flags, tile_successor_item,
/// input, flag_op_type(),
/// storage);
/// ...
/// }
/// \endcode
template<unsigned int ItemsPerThread, class Flag, class FlagOp>
    [[deprecated("The flags API of block_adjacent_difference is deprecated. "
"Use block_discontinuity.flag_heads_and_tails instead.")]]
ROCPRIM_DEVICE ROCPRIM_INLINE
void flag_heads_and_tails(Flag (&head_flags)[ItemsPerThread],
T tile_predecessor_item,
Flag (&tail_flags)[ItemsPerThread],
T tile_successor_item,
const T (&input)[ItemsPerThread],
FlagOp flag_op,
storage_type& storage)
{
static constexpr auto as_flags = true;
static constexpr auto reversed = true;
static constexpr auto with_predecessor = true;
static constexpr auto with_successor = true;
// Copy items in case head_flags is aliased with input
T items[ItemsPerThread];
ROCPRIM_UNROLL
for(unsigned int i = 0; i < ItemsPerThread; ++i) {
items[i] = input[i];
}
base_type::template apply_left<as_flags, reversed, with_predecessor>(
items, head_flags, flag_op, tile_predecessor_item, storage.get().left);
base_type::template apply_right<as_flags, reversed, with_successor>(
items, tail_flags, flag_op, tile_successor_item, storage.get().right);
}
/// \overload
/// \deprecated The flags API of block_adjacent_difference is deprecated,
/// use block_discontinuity::flag_heads_and_tails() instead.
///
/// This overload does not accept a reference to temporary storage, instead it is declared as
/// part of the function itself. Note that this does NOT decrease the shared memory requirements
/// of a kernel using this function.
template<unsigned int ItemsPerThread, class Flag, class FlagOp>
    [[deprecated("The flags API of block_adjacent_difference is deprecated. "
"Use block_discontinuity.flag_heads_and_tails instead.")]]
ROCPRIM_DEVICE ROCPRIM_FORCE_INLINE
void flag_heads_and_tails(Flag (&head_flags)[ItemsPerThread],
T tile_predecessor_item,
Flag (&tail_flags)[ItemsPerThread],
T tile_successor_item,
const T (&input)[ItemsPerThread],
FlagOp flag_op)
{
ROCPRIM_SHARED_MEMORY storage_type storage;
flag_heads_and_tails(
head_flags, tile_predecessor_item, tail_flags, tile_successor_item,
input, flag_op, storage
);
}
/// \brief Apply a function to each consecutive pair of elements partitioned across threads in
/// the block and write the output to the position of the left item.
///
    /// The first item in the first thread is copied from the input; for the remaining items
    /// the following code applies.
/// \code
/// // For each i in [1, block_size * ItemsPerThread) across threads in a block
/// output[i] = op(input[i], input[i-1]);
/// \endcode
///
/// \tparam Output - [inferred] the type of output, must be assignable from the result of `op`
/// \tparam ItemsPerThread - [inferred] the number of items processed by each thread
/// \tparam BinaryFunction - [inferred] the type of the function to apply
/// \param [in] input - array that data is loaded from partitioned across the threads in the block
/// \param [out] output - array where the result of function application will be written to
/// \param [in] op - binary function applied to the items.
/// The signature of the function should be equivalent to the following:
    /// `T f(const T &a, const T &b)`. The signature does not need to have
    /// `const &`, but the function object must not modify the objects passed to it.
/// \param storage reference to a temporary storage object of type #storage_type
/// \par Storage reuse
/// Synchronization barrier should be placed before `storage` is reused
/// or repurposed: `__syncthreads()` or \link syncthreads() rocprim::syncthreads() \endlink.
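    ///
    /// \par Example.
    /// A minimal sketch; the block size of 128, 8 items per thread and
    /// rocprim::minus<int> as the difference operator are illustrative choices, not
    /// requirements of this method.
    /// \code{.cpp}
    /// __global__ void example_kernel(...)
    /// {
    ///     // specialize the primitive for int and a block of 128 threads
    ///     using block_adjacent_difference_int = rocprim::block_adjacent_difference<int, 128>;
    ///     // allocate storage in shared memory
    ///     __shared__ block_adjacent_difference_int::storage_type storage;
    ///
    ///     int input[8];
    ///     ...
    ///     int output[8];
    ///     block_adjacent_difference_int adjacent_difference;
    ///     // output[i] = input[i] - input[i-1]; the very first item is copied through
    ///     adjacent_difference.subtract_left(input, output, rocprim::minus<int>(), storage);
    ///     ...
    /// }
    /// \endcode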
template <typename Output, unsigned int ItemsPerThread, typename BinaryFunction>
ROCPRIM_DEVICE ROCPRIM_INLINE void subtract_left(const T (&input)[ItemsPerThread],
Output (&output)[ItemsPerThread],
const BinaryFunction op,
storage_type& storage)
{
static constexpr auto as_flags = false;
static constexpr auto reversed = true;
static constexpr auto with_predecessor = false;
base_type::template apply_left<as_flags, reversed, with_predecessor>(
input, output, op, input[0] /* predecessor */, storage.get().left);
}
/// \brief Apply a function to each consecutive pair of elements partitioned across threads in
/// the block and write the output to the position of the left item, with an explicit item before
/// the tile.
///
/// \code
/// // For the first item on the first thread use the tile predecessor
/// output[0] = op(input[0], tile_predecessor)
/// // For other items, i in [1, block_size * ItemsPerThread) across threads in a block
/// output[i] = op(input[i], input[i-1]);
/// \endcode
///
/// \tparam Output - [inferred] the type of output, must be assignable from the result of `op`
/// \tparam ItemsPerThread - [inferred] the number of items processed by each thread
/// \tparam BinaryFunction - [inferred] the type of the function to apply
/// \param [in] input - array that data is loaded from partitioned across the threads in the block
/// \param [out] output - array where the result of function application will be written to
/// \param [in] op - binary function applied to the items.
/// The signature of the function should be equivalent to the following:
    /// `T f(const T &a, const T &b)`. The signature does not need to have
    /// `const &`, but the function object must not modify the objects passed to it.
    /// \param [in] tile_predecessor - the item before the tile; it will be used as the second
    /// argument of the first application of `op`
/// \param storage - reference to a temporary storage object of type #storage_type
/// \par Storage reuse
/// Synchronization barrier should be placed before `storage` is reused
/// or repurposed: `__syncthreads()` or \link syncthreads() rocprim::syncthreads() \endlink.
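    ///
    /// \par Example.
    /// A minimal sketch with an explicit predecessor; the block size, items per thread,
    /// rocprim::minus<int> and the way \p tile_predecessor is obtained are illustrative
    /// choices.
    /// \code{.cpp}
    /// __global__ void example_kernel(...)
    /// {
    ///     using block_adjacent_difference_int = rocprim::block_adjacent_difference<int, 128>;
    ///     __shared__ block_adjacent_difference_int::storage_type storage;
    ///
    ///     int input[8];
    ///     int tile_predecessor = 0;
    ///     if (threadIdx.x == 0)
    ///     {
    ///         tile_predecessor = ... // e.g. the last item of the preceding tile
    ///     }
    ///     ...
    ///     int output[8];
    ///     block_adjacent_difference_int adjacent_difference;
    ///     // on the first thread: output[0] = op(input[0], tile_predecessor)
    ///     adjacent_difference.subtract_left(input, output, rocprim::minus<int>(),
    ///                                       tile_predecessor, storage);
    ///     ...
    /// }
    /// \endcode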
template <typename Output, unsigned int ItemsPerThread, typename BinaryFunction>
ROCPRIM_DEVICE ROCPRIM_INLINE void subtract_left(const T (&input)[ItemsPerThread],
Output (&output)[ItemsPerThread],
const BinaryFunction op,
const T tile_predecessor,
storage_type& storage)
{
static constexpr auto as_flags = false;
static constexpr auto reversed = true;
static constexpr auto with_predecessor = true;
base_type::template apply_left<as_flags, reversed, with_predecessor>(
input, output, op, tile_predecessor, storage.get().left);
}
/// \brief Apply a function to each consecutive pair of elements partitioned across threads in
/// the block and write the output to the position of the left item, in a partial tile.
///
/// \code
/// output[0] = input[0]
/// // For each item i in [1, valid_items) across threads in a block
/// output[i] = op(input[i], input[i-1]);
/// // Just copy "invalid" items in [valid_items, block_size * ItemsPerThread)
/// output[i] = input[i]
/// \endcode
///
/// \tparam Output - [inferred] the type of output, must be assignable from the result of `op`
/// \tparam ItemsPerThread - [inferred] the number of items processed by each thread
/// \tparam BinaryFunction - [inferred] the type of the function to apply
/// \param [in] input - array that data is loaded from partitioned across the threads in the block
/// \param [out] output - array where the result of function application will be written to
/// \param [in] op - binary function applied to the items.
/// The signature of the function should be equivalent to the following:
    /// `T f(const T &a, const T &b)`. The signature does not need to have
    /// `const &`, but the function object must not modify the objects passed to it.
/// \param [in] valid_items - number of items in the block which are considered "valid" and will
    /// be used. Must be less than or equal to `BlockSize` * `ItemsPerThread`
/// \param storage - reference to a temporary storage object of type #storage_type
/// \par Storage reuse
/// Synchronization barrier should be placed before `storage` is reused
/// or repurposed: `__syncthreads()` or \link syncthreads() rocprim::syncthreads() \endlink.
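    ///
    /// \par Example.
    /// A minimal sketch of a partial tile; the block size, items per thread and the source of
    /// \p valid_items are illustrative choices.
    /// \code{.cpp}
    /// __global__ void example_kernel(...)
    /// {
    ///     using block_adjacent_difference_int = rocprim::block_adjacent_difference<int, 128>;
    ///     __shared__ block_adjacent_difference_int::storage_type storage;
    ///
    ///     int input[8];
    ///     unsigned int valid_items = ... // e.g. the size of the last, incomplete tile
    ///     ...
    ///     int output[8];
    ///     block_adjacent_difference_int adjacent_difference;
    ///     // items at positions >= valid_items are copied through unchanged
    ///     adjacent_difference.subtract_left_partial(input, output, rocprim::minus<int>(),
    ///                                               valid_items, storage);
    ///     ...
    /// }
    /// \endcode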
template <typename Output, unsigned int ItemsPerThread, typename BinaryFunction>
ROCPRIM_DEVICE ROCPRIM_INLINE void subtract_left_partial(const T (&input)[ItemsPerThread],
Output (&output)[ItemsPerThread],
const BinaryFunction op,
const unsigned int valid_items,
storage_type& storage)
{
static constexpr auto as_flags = false;
static constexpr auto reversed = true;
static constexpr auto with_predecessor = false;
base_type::template apply_left_partial<as_flags, reversed, with_predecessor>(
input, output, op, input[0] /* predecessor */, valid_items, storage.get().left);
}
/// \brief Apply a function to each consecutive pair of elements partitioned across threads in
/// the block and write the output to the position of the left item, in a partial tile with a
/// predecessor.
///
/// This combines subtract_left_partial() with a tile predecessor.
/// \tparam Output - [inferred] the type of output, must be assignable from the result of `op`
/// \tparam ItemsPerThread - [inferred] the number of items processed by each thread
/// \tparam BinaryFunction - [inferred] the type of the function to apply
/// \param [in] input - array that data is loaded from partitioned across the threads in the block
/// \param [out] output - array where the result of function application will be written to
/// \param [in] op - binary function applied to the items.
/// The signature of the function should be equivalent to the following:
    /// `T f(const T &a, const T &b)`. The signature does not need to have
    /// `const &`, but the function object must not modify the objects passed to it.
    /// \param [in] tile_predecessor - the item before the tile; it will be used as the second
    /// argument of the first application of `op`
/// \param [in] valid_items - number of items in the block which are considered "valid" and will
    /// be used. Must be less than or equal to `BlockSize` * `ItemsPerThread`
/// \param storage - reference to a temporary storage object of type #storage_type
/// \par Storage reuse
/// Synchronization barrier should be placed before `storage` is reused
/// or repurposed: `__syncthreads()` or \link syncthreads() rocprim::syncthreads() \endlink.
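    ///
    /// \par Example.
    /// A minimal sketch combining a partial tile with an explicit predecessor; block size,
    /// items per thread and the sources of \p tile_predecessor and \p valid_items are
    /// illustrative choices.
    /// \code{.cpp}
    /// __global__ void example_kernel(...)
    /// {
    ///     using block_adjacent_difference_int = rocprim::block_adjacent_difference<int, 128>;
    ///     __shared__ block_adjacent_difference_int::storage_type storage;
    ///
    ///     int input[8];
    ///     int tile_predecessor = 0;
    ///     if (threadIdx.x == 0)
    ///     {
    ///         tile_predecessor = ...
    ///     }
    ///     unsigned int valid_items = ...
    ///     ...
    ///     int output[8];
    ///     block_adjacent_difference_int adjacent_difference;
    ///     adjacent_difference.subtract_left_partial(input, output, rocprim::minus<int>(),
    ///                                               tile_predecessor, valid_items, storage);
    ///     ...
    /// }
    /// \endcode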
template <typename Output, unsigned int ItemsPerThread, typename BinaryFunction>
ROCPRIM_DEVICE ROCPRIM_INLINE void subtract_left_partial(const T (&input)[ItemsPerThread],
Output (&output)[ItemsPerThread],
const BinaryFunction op,
const T tile_predecessor,
const unsigned int valid_items,
storage_type& storage)
{
static constexpr auto as_flags = false;
static constexpr auto reversed = true;
static constexpr auto with_predecessor = true;
base_type::template apply_left_partial<as_flags, reversed, with_predecessor>(
input, output, op, tile_predecessor, valid_items, storage.get().left);
}
/// \brief Apply a function to each consecutive pair of elements partitioned across threads in
/// the block and write the output to the position of the right item.
///
    /// The last item in the last thread is copied from the input; for the remaining items
    /// the following code applies.
/// \code
/// // For each i in [0, block_size * ItemsPerThread - 1) across threads in a block
/// output[i] = op(input[i], input[i+1]);
/// \endcode
///
/// \tparam Output - [inferred] the type of output, must be assignable from the result of `op`
/// \tparam ItemsPerThread - [inferred] the number of items processed by each thread
/// \tparam BinaryFunction - [inferred] the type of the function to apply
/// \param [in] input - array that data is loaded from partitioned across the threads in the block
/// \param [out] output - array where the result of function application will be written to
/// \param [in] op - binary function applied to the items.
/// The signature of the function should be equivalent to the following:
    /// `T f(const T &a, const T &b)`. The signature does not need to have
    /// `const &`, but the function object must not modify the objects passed to it.
/// \param storage - reference to a temporary storage object of type #storage_type
/// \par Storage reuse
/// Synchronization barrier should be placed before `storage` is reused
/// or repurposed: `__syncthreads()` or \link syncthreads() rocprim::syncthreads() \endlink.
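    ///
    /// \par Example.
    /// A minimal sketch; block size, items per thread and rocprim::minus<int> are
    /// illustrative choices.
    /// \code{.cpp}
    /// __global__ void example_kernel(...)
    /// {
    ///     using block_adjacent_difference_int = rocprim::block_adjacent_difference<int, 128>;
    ///     __shared__ block_adjacent_difference_int::storage_type storage;
    ///
    ///     int input[8];
    ///     ...
    ///     int output[8];
    ///     block_adjacent_difference_int adjacent_difference;
    ///     // output[i] = input[i] - input[i+1]; the very last item is copied through
    ///     adjacent_difference.subtract_right(input, output, rocprim::minus<int>(), storage);
    ///     ...
    /// }
    /// \endcode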
template <typename Output, unsigned int ItemsPerThread, typename BinaryFunction>
ROCPRIM_DEVICE ROCPRIM_INLINE void subtract_right(const T (&input)[ItemsPerThread],
Output (&output)[ItemsPerThread],
const BinaryFunction op,
storage_type& storage)
{
static constexpr auto as_flags = false;
static constexpr auto reversed = false;
static constexpr auto with_successor = false;
base_type::template apply_right<as_flags, reversed, with_successor>(
input, output, op, input[0] /* successor */, storage.get().right);
}
/// \brief Apply a function to each consecutive pair of elements partitioned across threads in
/// the block and write the output to the position of the right item, with an explicit item after
/// the tile.
///
/// \code
    /// // For each item i in [0, block_size * ItemsPerThread - 1) across threads in a block
/// output[i] = op(input[i], input[i+1]);
/// // For the last item on the last thread use the tile successor
/// output[block_size * ItemsPerThread - 1] =
/// op(input[block_size * ItemsPerThread - 1], tile_successor)
/// \endcode
///
/// \tparam Output - [inferred] the type of output, must be assignable from the result of `op`
/// \tparam ItemsPerThread - [inferred] the number of items processed by each thread
/// \tparam BinaryFunction - [inferred] the type of the function to apply
/// \param [in] input - array that data is loaded from partitioned across the threads in the block
/// \param [out] output - array where the result of function application will be written to
/// \param [in] op - binary function applied to the items.
/// The signature of the function should be equivalent to the following:
    /// `T f(const T &a, const T &b)`. The signature does not need to have
    /// `const &`, but the function object must not modify the objects passed to it.
    /// \param [in] tile_successor - the item after the tile; it will be used as the second
    /// argument of the last application of `op`
/// \param storage - reference to a temporary storage object of type #storage_type
/// \par Storage reuse
/// Synchronization barrier should be placed before `storage` is reused
/// or repurposed: `__syncthreads()` or \link syncthreads() rocprim::syncthreads() \endlink.
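    ///
    /// \par Example.
    /// A minimal sketch with an explicit successor; block size, items per thread and the way
    /// \p tile_successor is obtained are illustrative choices.
    /// \code{.cpp}
    /// __global__ void example_kernel(...)
    /// {
    ///     using block_adjacent_difference_int = rocprim::block_adjacent_difference<int, 128>;
    ///     __shared__ block_adjacent_difference_int::storage_type storage;
    ///
    ///     int input[8];
    ///     int tile_successor = 0;
    ///     if (threadIdx.x == 127) // last thread of the 128-thread block in this sketch
    ///     {
    ///         tile_successor = ... // e.g. the first item of the following tile
    ///     }
    ///     ...
    ///     int output[8];
    ///     block_adjacent_difference_int adjacent_difference;
    ///     // on the last thread: output[7] = op(input[7], tile_successor)
    ///     adjacent_difference.subtract_right(input, output, rocprim::minus<int>(),
    ///                                        tile_successor, storage);
    ///     ...
    /// }
    /// \endcode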
template <typename Output, unsigned int ItemsPerThread, typename BinaryFunction>
ROCPRIM_DEVICE ROCPRIM_INLINE void subtract_right(const T (&input)[ItemsPerThread],
Output (&output)[ItemsPerThread],
const BinaryFunction op,
const T tile_successor,
storage_type& storage)
{
static constexpr auto as_flags = false;
static constexpr auto reversed = false;
static constexpr auto with_successor = true;
base_type::template apply_right<as_flags, reversed, with_successor>(
input, output, op, tile_successor, storage.get().right);
}
/// \brief Apply a function to each consecutive pair of elements partitioned across threads in
/// the block and write the output to the position of the right item, in a partial tile.
///
/// \code
/// // For each item i in [0, valid_items) across threads in a block
/// output[i] = op(input[i], input[i + 1]);
/// // Just copy "invalid" items in [valid_items, block_size * ItemsPerThread)
/// output[i] = input[i]
/// \endcode
///
/// \tparam Output - [inferred] the type of output, must be assignable from the result of `op`
/// \tparam ItemsPerThread - [inferred] the number of items processed by each thread
/// \tparam BinaryFunction - [inferred] the type of the function to apply
/// \param [in] input - array that data is loaded from partitioned across the threads in the block
/// \param [out] output - array where the result of function application will be written to
/// \param [in] op - binary function applied to the items.
/// The signature of the function should be equivalent to the following:
    /// `T f(const T &a, const T &b)`. The signature does not need to have
    /// `const &`, but the function object must not modify the objects passed to it.
/// \param [in] valid_items - number of items in the block which are considered "valid" and will
/// be used. Must be less or equal to `BlockSize` * `ItemsPerThread`
/// \param storage - reference to a temporary storage object of type #storage_type
/// \par Storage reuse
/// Synchronization barrier should be placed before `storage` is reused
/// or repurposed: `__syncthreads()` or \link syncthreads() rocprim::syncthreads() \endlink.
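///
/// \par Example.
/// A minimal sketch of a partial-tile difference (the kernel body and the number of
/// valid items are illustrative only):
/// \code{.cpp}
/// __global__ void example_kernel(...)
/// {
/// // specialize block_adjacent_difference for int and a block of 128 threads
/// using block_adjacent_difference_int = rocprim::block_adjacent_difference<int, 128>;
/// // allocate storage in shared memory
/// __shared__ block_adjacent_difference_int::storage_type storage;
///
/// int input[8];
/// int output[8];
/// ...
/// block_adjacent_difference_int adjacent_difference;
/// // e.g. only the first 100 of the 128 * 8 items hold valid data
/// adjacent_difference.subtract_right_partial(input, output, rocprim::minus<int>(),
/// 100, storage);
/// ...
/// }
/// \endcode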
template <typename Output, unsigned int ItemsPerThread, typename BinaryFunction>
ROCPRIM_DEVICE ROCPRIM_INLINE void subtract_right_partial(const T (&input)[ItemsPerThread],
Output (&output)[ItemsPerThread],
const BinaryFunction op,
const unsigned int valid_items,
storage_type& storage)
{
static constexpr auto as_flags = false;
static constexpr auto reversed = false;
base_type::template apply_right_partial<as_flags, reversed>(
input, output, op, valid_items, storage.get().right);
}
};
END_ROCPRIM_NAMESPACE
/// @}
// end of group blockmodule
#endif // ROCPRIM_BLOCK_BLOCK_ADJACENT_DIFFERENCE_HPP_
// Copyright (c) 2017-2022 Advanced Micro Devices, Inc. All rights reserved.
//
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to deal
// in the Software without restriction, including without limitation the rights
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
// copies of the Software, and to permit persons to whom the Software is
// furnished to do so, subject to the following conditions:
//
// The above copyright notice and this permission notice shall be included in
// all copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
// THE SOFTWARE.
#ifndef ROCPRIM_BLOCK_BLOCK_DISCONTINUITY_HPP_
#define ROCPRIM_BLOCK_BLOCK_DISCONTINUITY_HPP_
#include "detail/block_adjacent_difference_impl.hpp"
#include "../config.hpp"
#include "../detail/various.hpp"
/// \addtogroup blockmodule
/// @{
BEGIN_ROCPRIM_NAMESPACE
/// \brief The \p block_discontinuity class is a block level parallel primitive which provides
/// methods for flagging items that mark discontinuities within an ordered set of items across
/// threads in a block.
///
/// \tparam T - the input type.
/// \tparam BlockSizeX - the number of threads in a block's x dimension.
/// \tparam BlockSizeY - the number of threads in a block's y dimension, defaults to 1.
/// \tparam BlockSizeZ - the number of threads in a block's z dimension, defaults to 1.
///
/// \par Overview
/// * There are two types of flags:
/// * Head flags.
/// * Tail flags.
/// * The above flags are used to differentiate items from their predecessors or successors.
/// * E.g. Head flags are convenient for differentiating disjoint data segments as part of a
/// segmented reduction/scan.
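///
/// \par
/// A sketch of the flag semantics (with \p flag_op applied to each neighbouring pair;
/// the first head flag and the last tail flag are always set):
/// \code
/// head_flags[i] = (i == 0) ? 1 : flag_op(input[i - 1], input[i]);
/// tail_flags[i] = (i == last) ? 1 : flag_op(input[i], input[i + 1]);
/// \endcode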
///
/// \par Examples
/// \parblock
/// In the examples, the discontinuity operation is performed on a block of 128 threads,
/// using type \p int.
///
/// \code{.cpp}
/// __global__ void example_kernel(...)
/// {
/// // specialize discontinuity for int and a block of 128 threads
/// using block_discontinuity_int = rocprim::block_discontinuity<int, 128>;
/// // allocate storage in shared memory
/// __shared__ block_discontinuity_int::storage_type storage;
///
/// // segment of consecutive items to be used
/// int input[8];
/// ...
/// int head_flags[8];
/// block_discontinuity_int b_discontinuity;
/// using flag_op_type = typename rocprim::greater<int>;
/// b_discontinuity.flag_heads(head_flags, input, flag_op_type(), storage);
/// ...
/// }
/// \endcode
/// \endparblock
template<
class T,
unsigned int BlockSizeX,
unsigned int BlockSizeY = 1,
unsigned int BlockSizeZ = 1
>
class block_discontinuity
#ifndef DOXYGEN_SHOULD_SKIP_THIS // hide implementation detail from documentation
: private detail::block_adjacent_difference_impl<T, BlockSizeX, BlockSizeY, BlockSizeZ>
#endif // DOXYGEN_SHOULD_SKIP_THIS
{
using base_type = detail::block_adjacent_difference_impl<T, BlockSizeX, BlockSizeY, BlockSizeZ>;
static constexpr unsigned BlockSize = base_type::BlockSize;
// Struct used for creating a raw_storage object for this primitive's temporary storage.
struct storage_type_
{
typename base_type::storage_type left;
typename base_type::storage_type right;
};
public:
/// \brief Struct used to allocate a temporary memory that is required for thread
/// communication during operations provided by related parallel primitive.
///
/// Depending on the implementation the operations exposed by the parallel primitive may
/// require temporary storage for thread communication. The storage should be allocated
/// using the <tt>__shared__</tt> keyword. It can be aliased to
/// an externally allocated memory, or be a part of a union type with other storage types
/// to increase shared memory reusability.
#ifndef DOXYGEN_SHOULD_SKIP_THIS // hides storage_type implementation for Doxygen
using storage_type = detail::raw_storage<storage_type_>;
#else
using storage_type = storage_type_;
#endif
/// \brief Tags \p head_flags that indicate discontinuities between items partitioned
/// across the thread block, where the first item has no reference and is always
/// flagged.
///
/// \tparam ItemsPerThread - [inferred] the number of items to be processed by
/// each thread.
/// \tparam Flag - [inferred] the flag type.
/// \tparam FlagOp - [inferred] type of binary function used for flagging.
///
/// \param [out] head_flags - array that contains the head flags.
/// \param [in] input - array that data is loaded from.
/// \param [in] flag_op - binary operation function object that will be used for flagging.
/// The signature of the function should be equivalent to the following:
/// <tt>bool f(const T &a, const T &b);</tt> or <tt>bool f(const T &a, const T &b, unsigned int b_index);</tt>.
/// The signature does not need to have <tt>const &</tt>, but the function object
/// must not modify the objects passed to it.
/// \param [in] storage - reference to a temporary storage object of type storage_type.
///
/// \par Storage reuse
/// A synchronization barrier should be placed before \p storage is reused
/// or repurposed: \p __syncthreads() or \p rocprim::syncthreads().
///
/// \par Example.
/// \code{.cpp}
/// __global__ void example_kernel(...)
/// {
/// // specialize discontinuity for int and a block of 128 threads
/// using block_discontinuity_int = rocprim::block_discontinuity<int, 128>;
/// // allocate storage in shared memory
/// __shared__ block_discontinuity_int::storage_type storage;
///
/// // segment of consecutive items to be used
/// int input[8];
/// ...
/// int head_flags[8];
/// block_discontinuity_int b_discontinuity;
/// using flag_op_type = typename rocprim::greater<int>;
/// b_discontinuity.flag_heads(head_flags, input, flag_op_type(), storage);
/// ...
/// }
/// \endcode
template<unsigned int ItemsPerThread, class Flag, class FlagOp>
ROCPRIM_DEVICE ROCPRIM_INLINE
void flag_heads(Flag (&head_flags)[ItemsPerThread],
const T (&input)[ItemsPerThread],
FlagOp flag_op,
storage_type& storage)
{
static constexpr auto as_flags = true;
static constexpr auto reversed = false;
static constexpr auto with_predecessor = false;
base_type::template apply_left<as_flags, reversed, with_predecessor>(
input, head_flags, flag_op, input[0] /* predecessor */, storage.get().left);
}
/// \overload
/// This overload does not take a reference to temporary storage, instead it is declared as
/// part of the function itself. Note that this does NOT decrease the shared memory requirements
/// of a kernel using this function.
template<unsigned int ItemsPerThread, class Flag, class FlagOp>
ROCPRIM_DEVICE ROCPRIM_FORCE_INLINE
void flag_heads(Flag (&head_flags)[ItemsPerThread],
const T (&input)[ItemsPerThread],
FlagOp flag_op)
{
ROCPRIM_SHARED_MEMORY storage_type storage;
flag_heads(head_flags, input, flag_op, storage);
}
/// \brief Tags \p head_flags that indicate discontinuities between items partitioned
/// across the thread block, where the first item of the first thread is compared against
/// a \p tile_predecessor_item.
///
/// \tparam ItemsPerThread - [inferred] the number of items to be processed by
/// each thread.
/// \tparam Flag - [inferred] the flag type.
/// \tparam FlagOp - [inferred] type of binary function used for flagging.
///
/// \param [out] head_flags - array that contains the head flags.
/// \param [in] tile_predecessor_item - the item preceding the tile, compared against the
/// first item of the first thread.
/// \param [in] input - array that data is loaded from.
/// \param [in] flag_op - binary operation function object that will be used for flagging.
/// The signature of the function should be equivalent to the following:
/// <tt>bool f(const T &a, const T &b);</tt> or <tt>bool f(const T &a, const T &b, unsigned int b_index);</tt>.
/// The signature does not need to have <tt>const &</tt>, but the function object
/// must not modify the objects passed to it.
/// \param [in] storage - reference to a temporary storage object of type storage_type.
///
/// \par Storage reuse
/// A synchronization barrier should be placed before \p storage is reused
/// or repurposed: \p __syncthreads() or \p rocprim::syncthreads().
///
/// \par Example.
/// \code{.cpp}
/// __global__ void example_kernel(...)
/// {
/// // specialize discontinuity for int and a block of 128 threads
/// using block_discontinuity_int = rocprim::block_discontinuity<int, 128>;
/// // allocate storage in shared memory
/// __shared__ block_discontinuity_int::storage_type storage;
///
/// // segment of consecutive items to be used
/// int input[8];
/// int tile_item = 0;
/// if (threadIdx.x == 0)
/// {
/// tile_item = ...
/// }
/// ...
/// int head_flags[8];
/// block_discontinuity_int b_discontinuity;
/// using flag_op_type = typename rocprim::greater<int>;
/// b_discontinuity.flag_heads(head_flags, tile_item, input, flag_op_type(),
/// storage);
/// ...
/// }
/// \endcode
template<unsigned int ItemsPerThread, class Flag, class FlagOp>
ROCPRIM_DEVICE ROCPRIM_INLINE
void flag_heads(Flag (&head_flags)[ItemsPerThread],
T tile_predecessor_item,
const T (&input)[ItemsPerThread],
FlagOp flag_op,
storage_type& storage)
{
static constexpr auto as_flags = true;
static constexpr auto reversed = false;
static constexpr auto with_predecessor = true;
base_type::template apply_left<as_flags, reversed, with_predecessor>(
input, head_flags, flag_op, tile_predecessor_item, storage.get().left);
}
/// \overload
/// This overload does not accept a reference to temporary storage, instead it is declared as
/// part of the function itself. Note that this does NOT decrease the shared memory requirements
/// of a kernel using this function.
template<unsigned int ItemsPerThread, class Flag, class FlagOp>
ROCPRIM_DEVICE ROCPRIM_FORCE_INLINE
void flag_heads(Flag (&head_flags)[ItemsPerThread],
T tile_predecessor_item,
const T (&input)[ItemsPerThread],
FlagOp flag_op)
{
ROCPRIM_SHARED_MEMORY storage_type storage;
flag_heads(head_flags, tile_predecessor_item, input, flag_op, storage);
}
/// \brief Tags \p tail_flags that indicate discontinuities between items partitioned
/// across the thread block, where the last item has no reference and is always
/// flagged.
///
/// \tparam ItemsPerThread - [inferred] the number of items to be processed by
/// each thread.
/// \tparam Flag - [inferred] the flag type.
/// \tparam FlagOp - [inferred] type of binary function used for flagging.
///
/// \param [out] tail_flags - array that contains the tail flags.
/// \param [in] input - array that data is loaded from.
/// \param [in] flag_op - binary operation function object that will be used for flagging.
/// The signature of the function should be equivalent to the following:
/// <tt>bool f(const T &a, const T &b);</tt> or <tt>bool f(const T &a, const T &b, unsigned int b_index);</tt>.
/// The signature does not need to have <tt>const &</tt>, but the function object
/// must not modify the objects passed to it.
/// \param [in] storage - reference to a temporary storage object of type storage_type.
///
/// \par Storage reuse
/// A synchronization barrier should be placed before \p storage is reused
/// or repurposed: \p __syncthreads() or \p rocprim::syncthreads().
///
/// \par Example.
/// \code{.cpp}
/// __global__ void example_kernel(...)
/// {
/// // specialize discontinuity for int and a block of 128 threads
/// using block_discontinuity_int = rocprim::block_discontinuity<int, 128>;
/// // allocate storage in shared memory
/// __shared__ block_discontinuity_int::storage_type storage;
///
/// // segment of consecutive items to be used
/// int input[8];
/// ...
/// int tail_flags[8];
/// block_discontinuity_int b_discontinuity;
/// using flag_op_type = typename rocprim::greater<int>;
/// b_discontinuity.flag_tails(tail_flags, input, flag_op_type(), storage);
/// ...
/// }
/// \endcode
template<unsigned int ItemsPerThread, class Flag, class FlagOp>
ROCPRIM_DEVICE ROCPRIM_INLINE
void flag_tails(Flag (&tail_flags)[ItemsPerThread],
const T (&input)[ItemsPerThread],
FlagOp flag_op,
storage_type& storage)
{
static constexpr auto as_flags = true;
static constexpr auto reversed = false;
static constexpr auto with_successor = false;
base_type::template apply_right<as_flags, reversed, with_successor>(
input, tail_flags, flag_op, input[0] /* successor */, storage.get().right);
}
/// \overload
/// This overload does not accept a reference to temporary storage, instead it is declared as
/// part of the function itself. Note that this does NOT decrease the shared memory requirements
/// of a kernel using this function.
template<unsigned int ItemsPerThread, class Flag, class FlagOp>
ROCPRIM_DEVICE ROCPRIM_FORCE_INLINE
void flag_tails(Flag (&tail_flags)[ItemsPerThread],
const T (&input)[ItemsPerThread],
FlagOp flag_op)
{
ROCPRIM_SHARED_MEMORY storage_type storage;
flag_tails(tail_flags, input, flag_op, storage);
}
/// \brief Tags \p tail_flags that indicate discontinuities between items partitioned
/// across the thread block, where the last item of the last thread is compared against
/// a \p tile_successor_item.
///
/// \tparam ItemsPerThread - [inferred] the number of items to be processed by
/// each thread.
/// \tparam Flag - [inferred] the flag type.
/// \tparam FlagOp - [inferred] type of binary function used for flagging.
///
/// \param [out] tail_flags - array that contains the tail flags.
/// \param [in] tile_successor_item - the item following the tile, compared against the
/// last item of the last thread.
/// \param [in] input - array that data is loaded from.
/// \param [in] flag_op - binary operation function object that will be used for flagging.
/// The signature of the function should be equivalent to the following:
/// <tt>bool f(const T &a, const T &b);</tt> or <tt>bool f(const T &a, const T &b, unsigned int b_index);</tt>.
/// The signature does not need to have <tt>const &</tt>, but the function object
/// must not modify the objects passed to it.
/// \param [in] storage - reference to a temporary storage object of type storage_type.
///
/// \par Storage reuse
/// A synchronization barrier should be placed before \p storage is reused
/// or repurposed: \p __syncthreads() or \p rocprim::syncthreads().
///
/// \par Example.
/// \code{.cpp}
/// __global__ void example_kernel(...)
/// {
/// // specialize discontinuity for int and a block of 128 threads
/// using block_discontinuity_int = rocprim::block_discontinuity<int, 128>;
/// // allocate storage in shared memory
/// __shared__ block_discontinuity_int::storage_type storage;
///
/// // segment of consecutive items to be used
/// int input[8];
/// int tile_item = 0;
/// if (threadIdx.x == 0)
/// {
/// tile_item = ...
/// }
/// ...
/// int tail_flags[8];
/// block_discontinuity_int b_discontinuity;
/// using flag_op_type = typename rocprim::greater<int>;
/// b_discontinuity.flag_tails(tail_flags, tile_item, input, flag_op_type(),
/// storage);
/// ...
/// }
/// \endcode
template<unsigned int ItemsPerThread, class Flag, class FlagOp>
ROCPRIM_DEVICE ROCPRIM_INLINE
void flag_tails(Flag (&tail_flags)[ItemsPerThread],
T tile_successor_item,
const T (&input)[ItemsPerThread],
FlagOp flag_op,
storage_type& storage)
{
static constexpr auto as_flags = true;
static constexpr auto reversed = false;
static constexpr auto with_successor = true;
base_type::template apply_right<as_flags, reversed, with_successor>(
input, tail_flags, flag_op, tile_successor_item, storage.get().right);
}
/// \overload
/// This overload does not accept a reference to temporary storage, instead it is declared as
/// part of the function itself. Note that this does NOT decrease the shared memory requirements
/// of a kernel using this function.
template<unsigned int ItemsPerThread, class Flag, class FlagOp>
ROCPRIM_DEVICE ROCPRIM_FORCE_INLINE
void flag_tails(Flag (&tail_flags)[ItemsPerThread],
T tile_successor_item,
const T (&input)[ItemsPerThread],
FlagOp flag_op)
{
ROCPRIM_SHARED_MEMORY storage_type storage;
flag_tails(tail_flags, tile_successor_item, input, flag_op, storage);
}
/// \brief Tags both \p head_flags and \p tail_flags that indicate discontinuities
/// between items partitioned across the thread block.
///
/// \tparam ItemsPerThread - [inferred] the number of items to be processed by
/// each thread.
/// \tparam Flag - [inferred] the flag type.
/// \tparam FlagOp - [inferred] type of binary function used for flagging.
///
/// \param [out] head_flags - array that contains the head flags.
/// \param [out] tail_flags - array that contains the tail flags.
/// \param [in] input - array that data is loaded from.
/// \param [in] flag_op - binary operation function object that will be used for flagging.
/// The signature of the function should be equivalent to the following:
/// <tt>bool f(const T &a, const T &b);</tt> or <tt>bool f(const T &a, const T &b, unsigned int b_index);</tt>.
/// The signature does not need to have <tt>const &</tt>, but the function object
/// must not modify the objects passed to it.
/// \param [in] storage - reference to a temporary storage object of type storage_type.
///
/// \par Storage reuse
/// A synchronization barrier should be placed before \p storage is reused
/// or repurposed: \p __syncthreads() or \p rocprim::syncthreads().
///
/// \par Example.
/// \code{.cpp}
/// __global__ void example_kernel(...)
/// {
/// // specialize discontinuity for int and a block of 128 threads
/// using block_discontinuity_int = rocprim::block_discontinuity<int, 128>;
/// // allocate storage in shared memory
/// __shared__ block_discontinuity_int::storage_type storage;
///
/// // segment of consecutive items to be used
/// int input[8];
/// ...
/// int head_flags[8];
/// int tail_flags[8];
/// block_discontinuity_int b_discontinuity;
/// using flag_op_type = typename rocprim::greater<int>;
/// b_discontinuity.flag_heads_and_tails(head_flags, tail_flags, input,
/// flag_op_type(), storage);
/// ...
/// }
/// \endcode
template<unsigned int ItemsPerThread, class Flag, class FlagOp>
ROCPRIM_DEVICE ROCPRIM_INLINE
void flag_heads_and_tails(Flag (&head_flags)[ItemsPerThread],
Flag (&tail_flags)[ItemsPerThread],
const T (&input)[ItemsPerThread],
FlagOp flag_op,
storage_type& storage)
{
static constexpr auto as_flags = true;
static constexpr auto reversed = false;
static constexpr auto with_predecessor = false;
static constexpr auto with_successor = false;
// Copy items in case head_flags is aliased with input
T items[ItemsPerThread];
ROCPRIM_UNROLL
for(unsigned int i = 0; i < ItemsPerThread; ++i) {
items[i] = input[i];
}
base_type::template apply_left<as_flags, reversed, with_predecessor>(
items, head_flags, flag_op, items[0] /*predecessor*/, storage.get().left);
base_type::template apply_right<as_flags, reversed, with_successor>(
items, tail_flags, flag_op, items[0] /*successor*/, storage.get().right);
}
/// \overload
/// This overload does not accept a reference to temporary storage, instead it is declared as
/// part of the function itself. Note that this does NOT decrease the shared memory requirements
/// of a kernel using this function.
template<unsigned int ItemsPerThread, class Flag, class FlagOp>
ROCPRIM_DEVICE ROCPRIM_FORCE_INLINE
void flag_heads_and_tails(Flag (&head_flags)[ItemsPerThread],
Flag (&tail_flags)[ItemsPerThread],
const T (&input)[ItemsPerThread],
FlagOp flag_op)
{
ROCPRIM_SHARED_MEMORY storage_type storage;
flag_heads_and_tails(head_flags, tail_flags, input, flag_op, storage);
}
/// \brief Tags both \p head_flags and \p tail_flags that indicate discontinuities
/// between items partitioned across the thread block, where the last item of the
/// last thread is compared against a \p tile_successor_item.
///
/// \tparam ItemsPerThread - [inferred] the number of items to be processed by
/// each thread.
/// \tparam Flag - [inferred] the flag type.
/// \tparam FlagOp - [inferred] type of binary function used for flagging.
///
/// \param [out] head_flags - array that contains the head flags.
/// \param [out] tail_flags - array that contains the tail flags.
/// \param [in] tile_successor_item - the item following the tile, compared against the
/// last item of the last thread.
/// \param [in] input - array that data is loaded from.
/// \param [in] flag_op - binary operation function object that will be used for flagging.
/// The signature of the function should be equivalent to the following:
/// <tt>bool f(const T &a, const T &b);</tt> or <tt>bool f(const T &a, const T &b, unsigned int b_index);</tt>.
/// The signature does not need to have <tt>const &</tt>, but the function object
/// must not modify the objects passed to it.
/// \param [in] storage - reference to a temporary storage object of type storage_type.
///
/// \par Storage reuse
/// A synchronization barrier should be placed before \p storage is reused
/// or repurposed: \p __syncthreads() or \p rocprim::syncthreads().
///
/// \par Example.
/// \code{.cpp}
/// __global__ void example_kernel(...)
/// {
/// // specialize discontinuity for int and a block of 128 threads
/// using block_discontinuity_int = rocprim::block_discontinuity<int, 128>;
/// // allocate storage in shared memory
/// __shared__ block_discontinuity_int::storage_type storage;
///
/// // segment of consecutive items to be used
/// int input[8];
/// int tile_item = 0;
/// if (threadIdx.x == 0)
/// {
/// tile_item = ...
/// }
/// ...
/// int head_flags[8];
/// int tail_flags[8];
/// block_discontinuity_int b_discontinuity;
/// using flag_op_type = typename rocprim::greater<int>;
/// b_discontinuity.flag_heads_and_tails(head_flags, tail_flags, tile_item,
/// input, flag_op_type(),
/// storage);
/// ...
/// }
/// \endcode
template<unsigned int ItemsPerThread, class Flag, class FlagOp>
ROCPRIM_DEVICE ROCPRIM_INLINE
void flag_heads_and_tails(Flag (&head_flags)[ItemsPerThread],
Flag (&tail_flags)[ItemsPerThread],
T tile_successor_item,
const T (&input)[ItemsPerThread],
FlagOp flag_op,
storage_type& storage)
{
static constexpr auto as_flags = true;
static constexpr auto reversed = false;
static constexpr auto with_predecessor = false;
static constexpr auto with_successor = true;
// Copy items in case head_flags is aliased with input
T items[ItemsPerThread];
ROCPRIM_UNROLL
for(unsigned int i = 0; i < ItemsPerThread; ++i) {
items[i] = input[i];
}
base_type::template apply_left<as_flags, reversed, with_predecessor>(
items, head_flags, flag_op, items[0] /*predecessor*/, storage.get().left);
base_type::template apply_right<as_flags, reversed, with_successor>(
items, tail_flags, flag_op, tile_successor_item, storage.get().right);
}
/// \overload
/// This overload does not accept a reference to temporary storage, instead it is declared as
/// part of the function itself. Note that this does NOT decrease the shared memory requirements
/// of a kernel using this function.
template<unsigned int ItemsPerThread, class Flag, class FlagOp>
ROCPRIM_DEVICE ROCPRIM_FORCE_INLINE
void flag_heads_and_tails(Flag (&head_flags)[ItemsPerThread],
Flag (&tail_flags)[ItemsPerThread],
T tile_successor_item,
const T (&input)[ItemsPerThread],
FlagOp flag_op)
{
ROCPRIM_SHARED_MEMORY storage_type storage;
flag_heads_and_tails(head_flags, tail_flags, tile_successor_item, input, flag_op, storage);
}
/// \brief Tags both \p head_flags and \p tail_flags that indicate discontinuities
/// between items partitioned across the thread block, where the first item of the
/// first thread is compared against a \p tile_predecessor_item.
///
/// \tparam ItemsPerThread - [inferred] the number of items to be processed by
/// each thread.
/// \tparam Flag - [inferred] the flag type.
/// \tparam FlagOp - [inferred] type of binary function used for flagging.
///
/// \param [out] head_flags - array that contains the head flags.
/// \param [in] tile_predecessor_item - the item preceding the tile, compared against the
/// first item of the first thread.
/// \param [out] tail_flags - array that contains the tail flags.
/// \param [in] input - array that data is loaded from.
/// \param [in] flag_op - binary operation function object that will be used for flagging.
/// The signature of the function should be equivalent to the following:
/// <tt>bool f(const T &a, const T &b);</tt> or <tt>bool f(const T &a, const T &b, unsigned int b_index);</tt>.
/// The signature does not need to have <tt>const &</tt>, but the function object
/// must not modify the objects passed to it.
/// \param [in] storage - reference to a temporary storage object of type storage_type.
///
/// \par Storage reuse
/// A synchronization barrier should be placed before \p storage is reused
/// or repurposed: \p __syncthreads() or \p rocprim::syncthreads().
///
/// \par Example.
/// \code{.cpp}
/// __global__ void example_kernel(...)
/// {
/// // specialize discontinuity for int and a block of 128 threads
/// using block_discontinuity_int = rocprim::block_discontinuity<int, 128>;
/// // allocate storage in shared memory
/// __shared__ block_discontinuity_int::storage_type storage;
///
/// // segment of consecutive items to be used
/// int input[8];
/// int tile_item = 0;
/// if (threadIdx.x == 0)
/// {
/// tile_item = ...
/// }
/// ...
/// int head_flags[8];
/// int tail_flags[8];
/// block_discontinuity_int b_discontinuity;
/// using flag_op_type = typename rocprim::greater<int>;
/// b_discontinuity.flag_heads_and_tails(head_flags, tile_item, tail_flags,
/// input, flag_op_type(),
/// storage);
/// ...
/// }
/// \endcode
template<unsigned int ItemsPerThread, class Flag, class FlagOp>
ROCPRIM_DEVICE ROCPRIM_INLINE
void flag_heads_and_tails(Flag (&head_flags)[ItemsPerThread],
T tile_predecessor_item,
Flag (&tail_flags)[ItemsPerThread],
const T (&input)[ItemsPerThread],
FlagOp flag_op,
storage_type& storage)
{
static constexpr auto as_flags = true;
static constexpr auto reversed = false;
static constexpr auto with_predecessor = true;
static constexpr auto with_successor = false;
// Copy items in case head_flags is aliased with input
T items[ItemsPerThread];
ROCPRIM_UNROLL
for(unsigned int i = 0; i < ItemsPerThread; ++i) {
items[i] = input[i];
}
base_type::template apply_left<as_flags, reversed, with_predecessor>(
items, head_flags, flag_op, tile_predecessor_item, storage.get().left);
base_type::template apply_right<as_flags, reversed, with_successor>(
items, tail_flags, flag_op, items[0] /*successor*/, storage.get().right);
}
/// \overload
/// This overload does not accept a reference to temporary storage, instead it is declared as
/// part of the function itself. Note that this does NOT decrease the shared memory requirements
/// of a kernel using this function.
template<unsigned int ItemsPerThread, class Flag, class FlagOp>
ROCPRIM_DEVICE ROCPRIM_FORCE_INLINE
void flag_heads_and_tails(Flag (&head_flags)[ItemsPerThread],
T tile_predecessor_item,
Flag (&tail_flags)[ItemsPerThread],
const T (&input)[ItemsPerThread],
FlagOp flag_op)
{
ROCPRIM_SHARED_MEMORY storage_type storage;
flag_heads_and_tails(head_flags, tile_predecessor_item, tail_flags, input, flag_op, storage);
}
/// \brief Tags both \p head_flags and \p tail_flags that indicate discontinuities
/// between items partitioned across the thread block, where the first and last items of
/// the first and last threads are compared against a \p tile_predecessor_item and
/// a \p tile_successor_item.
///
/// \tparam ItemsPerThread - [inferred] the number of items to be processed by
/// each thread.
/// \tparam Flag - [inferred] the flag type.
/// \tparam FlagOp - [inferred] type of binary function used for flagging.
///
/// \param [out] head_flags - array that contains the head flags.
/// \param [in] tile_predecessor_item - the item preceding the tile, compared against the
/// first item of the first thread.
/// \param [out] tail_flags - array that contains the tail flags.
/// \param [in] tile_successor_item - the item following the tile, compared against the
/// last item of the last thread.
/// \param [in] input - array that data is loaded from.
/// \param [in] flag_op - binary operation function object that will be used for flagging.
/// The signature of the function should be equivalent to the following:
/// <tt>bool f(const T &a, const T &b);</tt> or <tt>bool f(const T &a, const T &b, unsigned int b_index);</tt>.
/// The signature does not need to have <tt>const &</tt>, but the function object
/// must not modify the objects passed to it.
/// \param [in] storage - reference to a temporary storage object of type storage_type.
///
/// \par Storage reuse
/// A synchronization barrier should be placed before \p storage is reused
/// or repurposed: \p __syncthreads() or \p rocprim::syncthreads().
///
/// \par Example.
/// \code{.cpp}
/// __global__ void example_kernel(...)
/// {
/// // specialize discontinuity for int and a block of 128 threads
/// using block_discontinuity_int = rocprim::block_discontinuity<int, 128>;
/// // allocate storage in shared memory
/// __shared__ block_discontinuity_int::storage_type storage;
///
/// // segment of consecutive items to be used
/// int input[8];
/// int tile_predecessor_item = 0;
/// int tile_successor_item = 0;
/// if (threadIdx.x == 0)
/// {
/// tile_predecessor_item = ...
/// tile_successor_item = ...
/// }
/// ...
/// int head_flags[8];
/// int tail_flags[8];
/// block_discontinuity_int b_discontinuity;
/// using flag_op_type = typename rocprim::greater<int>;
/// b_discontinuity.flag_heads_and_tails(head_flags, tile_predecessor_item,
/// tail_flags, tile_successor_item,
/// input, flag_op_type(),
/// storage);
/// ...
/// }
/// \endcode
template<unsigned int ItemsPerThread, class Flag, class FlagOp>
ROCPRIM_DEVICE ROCPRIM_INLINE
void flag_heads_and_tails(Flag (&head_flags)[ItemsPerThread],
T tile_predecessor_item,
Flag (&tail_flags)[ItemsPerThread],
T tile_successor_item,
const T (&input)[ItemsPerThread],
FlagOp flag_op,
storage_type& storage)
{
static constexpr auto as_flags = true;
static constexpr auto reversed = false;
static constexpr auto with_predecessor = true;
static constexpr auto with_successor = true;
// Copy items in case head_flags is aliased with input
T items[ItemsPerThread];
ROCPRIM_UNROLL
for(unsigned int i = 0; i < ItemsPerThread; ++i) {
items[i] = input[i];
}
base_type::template apply_left<as_flags, reversed, with_predecessor>(
items, head_flags, flag_op, tile_predecessor_item, storage.get().left);
base_type::template apply_right<as_flags, reversed, with_successor>(
items, tail_flags, flag_op, tile_successor_item, storage.get().right);
}
/// \overload
/// This overload does not accept a reference to temporary storage, instead it is declared as
/// part of the function itself. Note that this does NOT decrease the shared memory requirements
/// of a kernel using this function.
template<unsigned int ItemsPerThread, class Flag, class FlagOp>
ROCPRIM_DEVICE ROCPRIM_FORCE_INLINE
void flag_heads_and_tails(Flag (&head_flags)[ItemsPerThread],
T tile_predecessor_item,
Flag (&tail_flags)[ItemsPerThread],
T tile_successor_item,
const T (&input)[ItemsPerThread],
FlagOp flag_op)
{
ROCPRIM_SHARED_MEMORY storage_type storage;
flag_heads_and_tails(
head_flags, tile_predecessor_item, tail_flags, tile_successor_item,
input, flag_op, storage
);
}
};
END_ROCPRIM_NAMESPACE
/// @}
// end of group blockmodule
#endif // ROCPRIM_BLOCK_BLOCK_DISCONTINUITY_HPP_
// Copyright (c) 2017-2021 Advanced Micro Devices, Inc. All rights reserved.
//
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to deal
// in the Software without restriction, including without limitation the rights
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
// copies of the Software, and to permit persons to whom the Software is
// furnished to do so, subject to the following conditions:
//
// The above copyright notice and this permission notice shall be included in
// all copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
// THE SOFTWARE.
#ifndef ROCPRIM_BLOCK_BLOCK_EXCHANGE_HPP_
#define ROCPRIM_BLOCK_BLOCK_EXCHANGE_HPP_
#include "../config.hpp"
#include "../detail/various.hpp"
#include "../intrinsics.hpp"
#include "../functional.hpp"
#include "../types.hpp"
/// \addtogroup blockmodule
/// @{
BEGIN_ROCPRIM_NAMESPACE
/// \brief The \p block_exchange class is a block level parallel primitive which provides
/// methods for rearranging items partitioned across threads in a block.
///
/// \tparam T - the input type.
/// \tparam BlockSizeX - the number of threads in a block's x dimension.
/// \tparam ItemsPerThread - the number of items contributed by each thread.
/// \tparam BlockSizeY - the number of threads in a block's y dimension, defaults to 1.
/// \tparam BlockSizeZ - the number of threads in a block's z dimension, defaults to 1.
///
/// \par Overview
/// * The \p block_exchange class supports the following rearrangement methods:
/// * Transposing a blocked arrangement to a striped arrangement.
/// * Transposing a striped arrangement to a blocked arrangement.
/// * Transposing a blocked arrangement to a warp-striped arrangement.
/// * Transposing a warp-striped arrangement to a blocked arrangement.
/// * Scattering items to a blocked arrangement.
/// * Scattering items to a striped arrangement.
/// * Data is automatically padded to avoid LDS (shared memory) bank conflicts.
///
/// \par Examples
/// \parblock
/// In the examples, the exchange operation is performed on a block of 128 threads,
/// using type \p int with 8 items per thread.
///
/// \code{.cpp}
/// __global__ void example_kernel(...)
/// {
/// // specialize block_exchange for int, block of 128 threads and 8 items per thread
/// using block_exchange_int = rocprim::block_exchange<int, 128, 8>;
/// // allocate storage in shared memory
/// __shared__ block_exchange_int::storage_type storage;
///
/// int items[8];
/// ...
/// block_exchange_int b_exchange;
/// b_exchange.blocked_to_striped(items, items, storage);
/// ...
/// }
/// \endcode
/// \endparblock
template<
class T,
unsigned int BlockSizeX,
unsigned int ItemsPerThread,
unsigned int BlockSizeY = 1,
unsigned int BlockSizeZ = 1
>
class block_exchange
{
static constexpr unsigned int BlockSize = BlockSizeX * BlockSizeY * BlockSizeZ;
// Select warp size
static constexpr unsigned int warp_size =
detail::get_min_warp_size(BlockSize, ::rocprim::device_warp_size());
// Number of warps in block
static constexpr unsigned int warps_no = (BlockSize + warp_size - 1) / warp_size;
// Minimize LDS bank conflicts for power-of-two strides, i.e. when items are accessed
// using the `thread_id * ItemsPerThread` pattern and ItemsPerThread is a power of two
// (all exchanges from/to blocked).
static constexpr bool has_bank_conflicts =
ItemsPerThread >= 2 && ::rocprim::detail::is_power_of_two(ItemsPerThread);
static constexpr unsigned int banks_no = ::rocprim::detail::get_lds_banks_no();
static constexpr unsigned int bank_conflicts_padding =
has_bank_conflicts ? (BlockSize * ItemsPerThread / banks_no) : 0;
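// e.g. for BlockSize = 128, ItemsPerThread = 8 and 32 LDS banks, the buffer is padded
// by 128 * 8 / 32 = 32 extra items.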
// Struct used for creating a raw_storage object for this primitive's temporary storage.
struct storage_type_
{
T buffer[BlockSize * ItemsPerThread + bank_conflicts_padding];
};
public:
/// \brief Struct used to allocate a temporary memory that is required for thread
/// communication during operations provided by related parallel primitive.
///
/// Depending on the implementation the operations exposed by the parallel primitive may
/// require temporary storage for thread communication. The storage should be allocated
/// using the <tt>__shared__</tt> keyword. It can be aliased to
/// an externally allocated memory, or be a part of a union type with other storage types
/// to increase shared memory reusability.
#ifndef DOXYGEN_SHOULD_SKIP_THIS // hides storage_type implementation for Doxygen
using storage_type = detail::raw_storage<storage_type_>;
#else
using storage_type = storage_type_; // only for Doxygen
#endif
/// \brief Transposes a blocked arrangement of items to a striped arrangement
/// across the thread block.
///
/// \tparam U - [inferred] the output type.
///
/// \param [in] input - array that data is loaded from.
/// \param [out] output - array that data is loaded to.
template<class U>
ROCPRIM_DEVICE ROCPRIM_FORCE_INLINE
void blocked_to_striped(const T (&input)[ItemsPerThread],
U (&output)[ItemsPerThread])
{
ROCPRIM_SHARED_MEMORY storage_type storage;
blocked_to_striped(input, output, storage);
}
/// \brief Transposes a blocked arrangement of items to a striped arrangement
/// across the thread block, using temporary storage.
///
/// \tparam U - [inferred] the output type.
///
/// \param [in] input - array that data is loaded from.
/// \param [out] output - array that data is loaded to.
/// \param [in] storage - reference to a temporary storage object of type storage_type.
///
/// \par Storage reuse
/// A synchronization barrier should be placed before \p storage is reused
/// or repurposed: \p __syncthreads() or \p rocprim::syncthreads().
///
/// \par Example.
/// \code{.cpp}
/// __global__ void example_kernel(...)
/// {
/// // specialize block_exchange for int, block of 128 threads and 8 items per thread
/// using block_exchange_int = rocprim::block_exchange<int, 128, 8>;
/// // allocate storage in shared memory
/// __shared__ block_exchange_int::storage_type storage;
///
/// int items[8];
/// ...
/// block_exchange_int b_exchange;
/// b_exchange.blocked_to_striped(items, items, storage);
/// ...
/// }
/// \endcode
template<class U>
ROCPRIM_DEVICE ROCPRIM_INLINE
void blocked_to_striped(const T (&input)[ItemsPerThread],
U (&output)[ItemsPerThread],
storage_type& storage)
{
const unsigned int flat_id = ::rocprim::flat_block_thread_id<BlockSizeX, BlockSizeY, BlockSizeZ>();
storage_type_& storage_ = storage.get();
for(unsigned int i = 0; i < ItemsPerThread; i++)
{
storage_.buffer[index(flat_id * ItemsPerThread + i)] = input[i];
}
::rocprim::syncthreads();
for(unsigned int i = 0; i < ItemsPerThread; i++)
{
output[i] = storage_.buffer[index(i * BlockSize + flat_id)];
}
}
/// \brief Transposes a striped arrangement of items to a blocked arrangement
/// across the thread block.
///
/// \tparam U - [inferred] the output type.
///
/// \param [in] input - array that data is loaded from.
/// \param [out] output - array that data is loaded to.
template<class U>
ROCPRIM_DEVICE ROCPRIM_FORCE_INLINE
void striped_to_blocked(const T (&input)[ItemsPerThread],
U (&output)[ItemsPerThread])
{
ROCPRIM_SHARED_MEMORY storage_type storage;
striped_to_blocked(input, output, storage);
}
/// \brief Transposes a striped arrangement of items to a blocked arrangement
/// across the thread block, using temporary storage.
///
/// \tparam U - [inferred] the output type.
///
/// \param [in] input - array that data is loaded from.
/// \param [out] output - array that data is loaded to.
/// \param [in] storage - reference to a temporary storage object of type storage_type.
///
/// \par Storage reuse
/// A synchronization barrier should be placed before \p storage is reused
/// or repurposed: \p __syncthreads() or \p rocprim::syncthreads().
///
/// \par Example.
/// \code{.cpp}
/// __global__ void example_kernel(...)
/// {
/// // specialize block_exchange for int, block of 128 threads and 8 items per thread
/// using block_exchange_int = rocprim::block_exchange<int, 128, 8>;
/// // allocate storage in shared memory
/// __shared__ block_exchange_int::storage_type storage;
///
/// int items[8];
/// ...
/// block_exchange_int b_exchange;
/// b_exchange.striped_to_blocked(items, items, storage);
/// ...
/// }
/// \endcode
template<class U>
ROCPRIM_DEVICE ROCPRIM_INLINE
void striped_to_blocked(const T (&input)[ItemsPerThread],
U (&output)[ItemsPerThread],
storage_type& storage)
{
const unsigned int flat_id = ::rocprim::flat_block_thread_id<BlockSizeX, BlockSizeY, BlockSizeZ>();
storage_type_& storage_ = storage.get();
for(unsigned int i = 0; i < ItemsPerThread; i++)
{
storage_.buffer[index(i * BlockSize + flat_id)] = input[i];
}
::rocprim::syncthreads();
for(unsigned int i = 0; i < ItemsPerThread; i++)
{
output[i] = storage_.buffer[index(flat_id * ItemsPerThread + i)];
}
}
/// \brief Transposes a blocked arrangement of items to a warp-striped arrangement
/// across the thread block.
///
/// \tparam U - [inferred] the output type.
///
/// \param [in] input - array that data is loaded from.
/// \param [out] output - array that data is loaded to.
template<class U>
ROCPRIM_DEVICE ROCPRIM_FORCE_INLINE
void blocked_to_warp_striped(const T (&input)[ItemsPerThread],
U (&output)[ItemsPerThread])
{
ROCPRIM_SHARED_MEMORY storage_type storage;
blocked_to_warp_striped(input, output, storage);
}
/// \brief Transposes a blocked arrangement of items to a warp-striped arrangement
/// across the thread block, using temporary storage.
///
/// \tparam U - [inferred] the output type.
///
/// \param [in] input - array that data is loaded from.
/// \param [out] output - array that data is loaded to.
/// \param [in] storage - reference to a temporary storage object of type storage_type.
///
/// \par Storage reuse
/// A synchronization barrier should be placed before \p storage is reused
/// or repurposed: \p __syncthreads() or \p rocprim::syncthreads().
///
/// \par Example.
/// \code{.cpp}
/// __global__ void example_kernel(...)
/// {
/// // specialize block_exchange for int, block of 128 threads and 8 items per thread
/// using block_exchange_int = rocprim::block_exchange<int, 128, 8>;
/// // allocate storage in shared memory
/// __shared__ block_exchange_int::storage_type storage;
///
/// int items[8];
/// ...
/// block_exchange_int b_exchange;
/// b_exchange.blocked_to_warp_striped(items, items, storage);
/// ...
/// }
/// \endcode
template<class U>
ROCPRIM_DEVICE ROCPRIM_INLINE
void blocked_to_warp_striped(const T (&input)[ItemsPerThread],
U (&output)[ItemsPerThread],
storage_type& storage)
{
constexpr unsigned int items_per_warp = warp_size * ItemsPerThread;
const unsigned int lane_id = ::rocprim::lane_id();
const unsigned int warp_id = ::rocprim::warp_id<BlockSizeX, BlockSizeY, BlockSizeZ>();
const unsigned int current_warp_size = get_current_warp_size();
const unsigned int offset = warp_id * items_per_warp;
storage_type_& storage_ = storage.get();
for(unsigned int i = 0; i < ItemsPerThread; i++)
{
storage_.buffer[index(offset + lane_id * ItemsPerThread + i)] = input[i];
}
::rocprim::wave_barrier();
for(unsigned int i = 0; i < ItemsPerThread; i++)
{
output[i] = storage_.buffer[index(offset + i * current_warp_size + lane_id)];
}
}
/// \brief Transposes a warp-striped arrangement of items to a blocked arrangement
/// across the thread block.
///
/// \tparam U - [inferred] the output type.
///
/// \param [in] input - array that data is loaded from.
/// \param [out] output - array that data is loaded to.
template<class U>
ROCPRIM_DEVICE ROCPRIM_FORCE_INLINE
void warp_striped_to_blocked(const T (&input)[ItemsPerThread],
U (&output)[ItemsPerThread])
{
ROCPRIM_SHARED_MEMORY storage_type storage;
warp_striped_to_blocked(input, output, storage);
}
/// \brief Transposes a warp-striped arrangement of items to a blocked arrangement
/// across the thread block, using temporary storage.
///
/// \tparam U - [inferred] the output type.
///
/// \param [in] input - array that data is loaded from.
/// \param [out] output - array that data is loaded to.
/// \param [in] storage - reference to a temporary storage object of type storage_type.
///
/// \par Storage reuse
/// A synchronization barrier should be placed before \p storage is reused
/// or repurposed: \p __syncthreads() or \p rocprim::syncthreads().
///
/// \par Example.
/// \code{.cpp}
/// __global__ void example_kernel(...)
/// {
/// // specialize block_exchange for int, block of 128 threads and 8 items per thread
/// using block_exchange_int = rocprim::block_exchange<int, 128, 8>;
/// // allocate storage in shared memory
/// __shared__ block_exchange_int::storage_type storage;
///
/// int items[8];
/// ...
/// block_exchange_int b_exchange;
/// b_exchange.warp_striped_to_blocked(items, items, storage);
/// ...
/// }
/// \endcode
template<class U>
ROCPRIM_DEVICE ROCPRIM_INLINE
void warp_striped_to_blocked(const T (&input)[ItemsPerThread],
U (&output)[ItemsPerThread],
storage_type& storage)
{
constexpr unsigned int items_per_warp = warp_size * ItemsPerThread;
const unsigned int lane_id = ::rocprim::lane_id();
const unsigned int warp_id = ::rocprim::warp_id<BlockSizeX, BlockSizeY, BlockSizeZ>();
const unsigned int current_warp_size = get_current_warp_size();
const unsigned int offset = warp_id * items_per_warp;
storage_type_& storage_ = storage.get();
for(unsigned int i = 0; i < ItemsPerThread; i++)
{
storage_.buffer[index(offset + i * current_warp_size + lane_id)] = input[i];
}
::rocprim::wave_barrier();
for(unsigned int i = 0; i < ItemsPerThread; i++)
{
output[i] = storage_.buffer[index(offset + lane_id * ItemsPerThread + i)];
}
}
/// \brief Scatters items to a blocked arrangement based on their ranks
/// across the thread block.
///
/// \tparam U - [inferred] the output type.
/// \tparam Offset - [inferred] the rank type.
///
/// \param [in] input - array that data is loaded from.
/// \param [out] output - array that data is loaded to.
/// \param [in] ranks - array that has rank of data.
template<class U, class Offset>
ROCPRIM_DEVICE ROCPRIM_FORCE_INLINE
void scatter_to_blocked(const T (&input)[ItemsPerThread],
U (&output)[ItemsPerThread],
const Offset (&ranks)[ItemsPerThread])
{
ROCPRIM_SHARED_MEMORY storage_type storage;
scatter_to_blocked(input, output, ranks, storage);
}
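/// \brief Gathers items from a striped arrangement based on their ranks
/// across the thread block.
///
/// \tparam U - [inferred] the output type.
/// \tparam Offset - [inferred] the rank type.
///
/// \param [in] input - array that data is loaded from.
/// \param [out] output - array that data is loaded to.
/// \param [in] ranks - array that has rank of data.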
template<class U, class Offset>
ROCPRIM_DEVICE ROCPRIM_FORCE_INLINE
void gather_from_striped(const T (&input)[ItemsPerThread],
U (&output)[ItemsPerThread],
const Offset (&ranks)[ItemsPerThread])
{
ROCPRIM_SHARED_MEMORY storage_type storage;
gather_from_striped(input, output, ranks, storage);
}
/// \brief Scatters items to a blocked arrangement based on their ranks
/// across the thread block, using temporary storage.
///
/// \tparam U - [inferred] the output type.
/// \tparam Offset - [inferred] the rank type.
///
/// \param [in] input - array that data is loaded from.
/// \param [out] output - array that data is loaded to.
/// \param [in] ranks - array that has rank of data.
/// \param [in] storage - reference to a temporary storage object of type storage_type.
///
/// \par Storage reuse
/// A synchronization barrier should be placed before \p storage is reused
/// or repurposed: \p __syncthreads() or \p rocprim::syncthreads().
///
/// \par Example.
/// \code{.cpp}
/// __global__ void example_kernel(...)
/// {
/// // specialize block_exchange for int, block of 128 threads and 8 items per thread
/// using block_exchange_int = rocprim::block_exchange<int, 128, 8>;
/// // allocate storage in shared memory
/// __shared__ block_exchange_int::storage_type storage;
///
/// int items[8];
/// int ranks[8];
/// ...
/// block_exchange_int b_exchange;
/// b_exchange.scatter_to_blocked(items, items, ranks, storage);
/// ...
/// }
/// \endcode
template<class U, class Offset>
ROCPRIM_DEVICE ROCPRIM_INLINE
void scatter_to_blocked(const T (&input)[ItemsPerThread],
U (&output)[ItemsPerThread],
const Offset (&ranks)[ItemsPerThread],
storage_type& storage)
{
const unsigned int flat_id = ::rocprim::flat_block_thread_id<BlockSizeX, BlockSizeY, BlockSizeZ>();
storage_type_& storage_ = storage.get();
for(unsigned int i = 0; i < ItemsPerThread; i++)
{
const Offset rank = ranks[i];
storage_.buffer[index(rank)] = input[i];
}
::rocprim::syncthreads();
for(unsigned int i = 0; i < ItemsPerThread; i++)
{
output[i] = storage_.buffer[index(flat_id * ItemsPerThread + i)];
}
}
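/// \brief Gathers items from a striped arrangement based on their ranks
/// across the thread block, using temporary storage.
///
/// \tparam U - [inferred] the output type.
/// \tparam Offset - [inferred] the rank type.
///
/// \param [in] input - array that data is loaded from.
/// \param [out] output - array that data is loaded to.
/// \param [in] ranks - array that has rank of data.
/// \param [in] storage - reference to a temporary storage object of type storage_type.
///
/// \par Storage reuse
/// A synchronization barrier should be placed before \p storage is reused
/// or repurposed: \p __syncthreads() or \p rocprim::syncthreads().
///
/// \par Example.
/// A minimal sketch, following the pattern of the other exchange methods:
/// \code{.cpp}
/// __global__ void example_kernel(...)
/// {
/// // specialize block_exchange for int, block of 128 threads and 8 items per thread
/// using block_exchange_int = rocprim::block_exchange<int, 128, 8>;
/// // allocate storage in shared memory
/// __shared__ block_exchange_int::storage_type storage;
///
/// int items[8];
/// int ranks[8];
/// ...
/// block_exchange_int b_exchange;
/// b_exchange.gather_from_striped(items, items, ranks, storage);
/// ...
/// }
/// \endcode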
template <class U, class Offset>
ROCPRIM_DEVICE ROCPRIM_INLINE
void gather_from_striped(const T (&input)[ItemsPerThread],
U (&output)[ItemsPerThread],
const Offset (&ranks)[ItemsPerThread],
storage_type& storage)
{
const unsigned int flat_id = ::rocprim::flat_block_thread_id<BlockSizeX, BlockSizeY, BlockSizeZ>();
storage_type_& storage_ = storage.get();
for(unsigned int i = 0; i < ItemsPerThread; i++)
{
storage_.buffer[index(i * BlockSize + flat_id)] = input[i];
}
::rocprim::syncthreads();
for(unsigned int i = 0; i < ItemsPerThread; i++)
{
const Offset rank = ranks[i];
output[i] = storage_.buffer[index(rank)];
}
}
/// \brief Scatters items to a striped arrangement based on their ranks
/// across the thread block.
///
/// \tparam U - [inferred] the output type.
/// \tparam Offset - [inferred] the rank type.
///
/// \param [in] input - array that data is loaded from.
/// \param [out] output - array that data is loaded to.
/// \param [in] ranks - array that has rank of data.
template<class U, class Offset>
ROCPRIM_DEVICE ROCPRIM_FORCE_INLINE
void scatter_to_striped(const T (&input)[ItemsPerThread],
U (&output)[ItemsPerThread],
const Offset (&ranks)[ItemsPerThread])
{
ROCPRIM_SHARED_MEMORY storage_type storage;
scatter_to_striped(input, output, ranks, storage);
}
/// \brief Scatters items to a striped arrangement based on their ranks
/// across the thread block, using temporary storage.
///
/// \tparam U - [inferred] the output type.
/// \tparam Offset - [inferred] the rank type.
///
/// \param [in] input - array that data is loaded from.
/// \param [out] output - array that data is loaded to.
/// \param [in] ranks - array that has rank of data.
/// \param [in] storage - reference to a temporary storage object of type storage_type.
///
/// \par Storage reuse
/// A synchronization barrier should be placed before \p storage is reused
/// or repurposed: \p __syncthreads() or \p rocprim::syncthreads().
///
/// \par Example.
/// \code{.cpp}
/// __global__ void example_kernel(...)
/// {
/// // specialize block_exchange for int, block of 128 threads and 8 items per thread
/// using block_exchange_int = rocprim::block_exchange<int, 128, 8>;
/// // allocate storage in shared memory
/// __shared__ block_exchange_int::storage_type storage;
///
/// int items[8];
/// int ranks[8];
/// ...
/// block_exchange_int b_exchange;
/// b_exchange.scatter_to_striped(items, items, ranks, storage);
/// ...
/// }
/// \endcode
template<class U, class Offset>
ROCPRIM_DEVICE ROCPRIM_INLINE
void scatter_to_striped(const T (&input)[ItemsPerThread],
U (&output)[ItemsPerThread],
const Offset (&ranks)[ItemsPerThread],
storage_type& storage)
{
const unsigned int flat_id = ::rocprim::flat_block_thread_id<BlockSizeX, BlockSizeY, BlockSizeZ>();
storage_type_& storage_ = storage.get();
for(unsigned int i = 0; i < ItemsPerThread; i++)
{
const Offset rank = ranks[i];
storage_.buffer[rank] = input[i];
}
::rocprim::syncthreads();
for(unsigned int i = 0; i < ItemsPerThread; i++)
{
output[i] = storage_.buffer[i * BlockSize + flat_id];
}
}
/// \brief Scatters items to a striped arrangement based on their ranks
/// across the thread block, guarded by rank.
///
/// \par Overview
/// * Items with rank -1 are not scattered.
///
/// \tparam U - [inferred] the output type.
/// \tparam Offset - [inferred] the rank type.
///
/// \param [in] input - array that data is loaded from.
/// \param [out] output - array that data is loaded to.
/// \param [in] ranks - array that has rank of data.
template<class U, class Offset>
ROCPRIM_DEVICE ROCPRIM_FORCE_INLINE
void scatter_to_striped_guarded(const T (&input)[ItemsPerThread],
U (&output)[ItemsPerThread],
const Offset (&ranks)[ItemsPerThread])
{
ROCPRIM_SHARED_MEMORY storage_type storage;
scatter_to_striped_guarded(input, output, ranks, storage);
}
/// \brief Scatters items to a striped arrangement based on their ranks
/// across the thread block, guarded by rank, using temporary storage.
///
/// \par Overview
/// * Items with rank -1 are not scattered.
///
/// \tparam U - [inferred] the output type.
/// \tparam Offset - [inferred] the rank type.
///
/// \param [in] input - array that data is loaded from.
/// \param [out] output - array that data is loaded to.
/// \param [in] ranks - array that has rank of data.
/// \param [in] storage - reference to a temporary storage object of type storage_type.
///
/// \par Storage reuse
/// A synchronization barrier should be placed before \p storage is reused
/// or repurposed: \p __syncthreads() or \p rocprim::syncthreads().
///
/// \par Example.
/// \code{.cpp}
/// __global__ void example_kernel(...)
/// {
/// // specialize block_exchange for int, block of 128 threads and 8 items per thread
/// using block_exchange_int = rocprim::block_exchange<int, 128, 8>;
/// // allocate storage in shared memory
/// __shared__ block_exchange_int::storage_type storage;
///
/// int items[8];
/// int ranks[8];
/// ...
/// block_exchange_int b_exchange;
/// b_exchange.scatter_to_striped_guarded(items, items, ranks, storage);
/// ...
/// }
/// \endcode
template<class U, class Offset>
ROCPRIM_DEVICE ROCPRIM_INLINE
void scatter_to_striped_guarded(const T (&input)[ItemsPerThread],
U (&output)[ItemsPerThread],
const Offset (&ranks)[ItemsPerThread],
storage_type& storage)
{
const unsigned int flat_id = ::rocprim::flat_block_thread_id<BlockSizeX, BlockSizeY, BlockSizeZ>();
storage_type_& storage_ = storage.get();
for(unsigned int i = 0; i < ItemsPerThread; i++)
{
const Offset rank = ranks[i];
if(rank >= 0)
{
storage_.buffer[rank] = input[i];
}
}
::rocprim::syncthreads();
for(unsigned int i = 0; i < ItemsPerThread; i++)
{
output[i] = storage_.buffer[i * BlockSize + flat_id];
}
}
/// \brief Scatters items to a striped arrangement based on their ranks
/// across the thread block, with a flag to denote validity.
///
/// \tparam U - [inferred] the output type.
/// \tparam Offset - [inferred] the rank type.
/// \tparam ValidFlag - [inferred] the validity flag type.
///
/// \param [in] input - array that data is loaded from.
/// \param [out] output - array that data is loaded to.
/// \param [in] ranks - array that has rank of data.
/// \param [in] is_valid - array that has flags to denote validity.
template<class U, class Offset, class ValidFlag>
ROCPRIM_DEVICE ROCPRIM_FORCE_INLINE
void scatter_to_striped_flagged(const T (&input)[ItemsPerThread],
U (&output)[ItemsPerThread],
const Offset (&ranks)[ItemsPerThread],
const ValidFlag (&is_valid)[ItemsPerThread])
{
ROCPRIM_SHARED_MEMORY storage_type storage;
scatter_to_striped_flagged(input, output, ranks, is_valid, storage);
}
/// \brief Scatters items to a striped arrangement based on their ranks
/// across the thread block, with a flag to denote validity, using temporary
/// storage.
///
/// \tparam U - [inferred] the output type.
/// \tparam Offset - [inferred] the rank type.
/// \tparam ValidFlag - [inferred] the validity flag type.
///
/// \param [in] input - array that data is loaded from.
/// \param [out] output - array that data is loaded to.
/// \param [in] ranks - array that has rank of data.
/// \param [in] is_valid - array that has flags to denote validity.
/// \param [in] storage - reference to a temporary storage object of type storage_type.
///
/// \par Storage reuse
/// A synchronization barrier should be placed before \p storage is reused
/// or repurposed: \p __syncthreads() or \p rocprim::syncthreads().
///
/// \par Example.
/// \code{.cpp}
/// __global__ void example_kernel(...)
/// {
/// // specialize block_exchange for int, block of 128 threads and 8 items per thread
/// using block_exchange_int = rocprim::block_exchange<int, 128, 8>;
/// // allocate storage in shared memory
/// __shared__ block_exchange_int::storage_type storage;
///
/// int items[8];
/// int ranks[8];
/// int flags[8];
/// ...
/// block_exchange_int b_exchange;
/// b_exchange.scatter_to_striped_flagged(items, items, ranks, flags, storage);
/// ...
/// }
/// \endcode
template<class U, class Offset, class ValidFlag>
ROCPRIM_DEVICE ROCPRIM_INLINE
void scatter_to_striped_flagged(const T (&input)[ItemsPerThread],
U (&output)[ItemsPerThread],
const Offset (&ranks)[ItemsPerThread],
const ValidFlag (&is_valid)[ItemsPerThread],
storage_type& storage)
{
const unsigned int flat_id = ::rocprim::flat_block_thread_id<BlockSizeX, BlockSizeY, BlockSizeZ>();
storage_type_& storage_ = storage.get();
for(unsigned int i = 0; i < ItemsPerThread; i++)
{
const Offset rank = ranks[i];
if(is_valid[i])
{
storage_.buffer[rank] = input[i];
}
}
::rocprim::syncthreads();
for(unsigned int i = 0; i < ItemsPerThread; i++)
{
output[i] = storage_.buffer[i * BlockSize + flat_id];
}
}
private:
ROCPRIM_DEVICE ROCPRIM_INLINE
unsigned int get_current_warp_size() const
{
const unsigned int warp_id = ::rocprim::warp_id<BlockSizeX, BlockSizeY, BlockSizeZ>();
return (warp_id == warps_no - 1)
? (BlockSize % warp_size > 0 ? BlockSize % warp_size : warp_size)
: warp_size;
}
// Change index to minimize LDS bank conflicts if necessary
ROCPRIM_DEVICE ROCPRIM_INLINE
unsigned int index(unsigned int n)
{
// Move every 32-bank wide "row" (32 banks * 4 bytes) by one item
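// e.g. with 32 banks: n = 31 -> 31, n = 32 -> 33, n = 64 -> 66, so items that
// would otherwise map to the same bank land in different banks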
return has_bank_conflicts ? (n + n / banks_no) : n;
}
};
END_ROCPRIM_NAMESPACE
/// @}
// end of group blockmodule
#endif // ROCPRIM_BLOCK_BLOCK_EXCHANGE_HPP_
// Copyright (c) 2017-2021 Advanced Micro Devices, Inc. All rights reserved.
//
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to deal
// in the Software without restriction, including without limitation the rights
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
// copies of the Software, and to permit persons to whom the Software is
// furnished to do so, subject to the following conditions:
//
// The above copyright notice and this permission notice shall be included in
// all copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
// THE SOFTWARE.
#ifndef ROCPRIM_BLOCK_BLOCK_HISTOGRAM_HPP_
#define ROCPRIM_BLOCK_BLOCK_HISTOGRAM_HPP_
#include <type_traits>
#include "../config.hpp"
#include "../detail/various.hpp"
#include "../intrinsics.hpp"
#include "../functional.hpp"
#include "detail/block_histogram_atomic.hpp"
#include "detail/block_histogram_sort.hpp"
BEGIN_ROCPRIM_NAMESPACE
/// \addtogroup blockmodule
/// @{
/// \brief Available algorithms for block_histogram primitive.
enum class block_histogram_algorithm
{
/// Atomic addition is used to update bin count directly.
/// \par Performance Notes:
/// * Performance is dependent on hardware implementation of atomic addition.
/// * Performance may decrease for non-uniform random input distributions
/// where many concurrent updates may be made to the same bin counter.
using_atomic,
/// A two-phase operation is used:
/// * Data is sorted using radix-sort.
/// * "Runs" of same-valued keys are detected using discontinuity; run-lengths
/// are bin counts.
/// \par Performance Notes:
/// * Performance is consistent regardless of sample bin distribution.
using_sort,
/// \brief Default block_histogram algorithm.
default_algorithm = using_atomic,
};
namespace detail
{
// Selects the block histogram implementation type based on the passed
// block_histogram_algorithm enum
template<block_histogram_algorithm Algorithm>
struct select_block_histogram_impl;
template<>
struct select_block_histogram_impl<block_histogram_algorithm::using_atomic>
{
template<class T, unsigned int BlockSizeX, unsigned int BlockSizeY, unsigned int BlockSizeZ, unsigned int ItemsPerThread, unsigned int Bins>
using type = block_histogram_atomic<T, BlockSizeX, BlockSizeY, BlockSizeZ, ItemsPerThread, Bins>;
};
template<>
struct select_block_histogram_impl<block_histogram_algorithm::using_sort>
{
template<class T, unsigned int BlockSizeX, unsigned int BlockSizeY, unsigned int BlockSizeZ, unsigned int ItemsPerThread, unsigned int Bins>
using type = block_histogram_sort<T, BlockSizeX, BlockSizeY, BlockSizeZ, ItemsPerThread, Bins>;
};
} // end namespace detail
/// \brief The block_histogram class is a block level parallel primitive which provides methods
/// for constructing block-wide histograms from items partitioned across threads in a block.
///
/// \tparam T - the input/output type.
/// \tparam BlockSize - the number of threads in a block.
/// \tparam ItemsPerThread - the number of items to be processed by each thread.
/// \tparam Bins - the number of bins within the histogram.
/// \tparam Algorithm - selected histogram algorithm, block_histogram_algorithm::default_algorithm by default.
///
/// \par Overview
/// * block_histogram has two alternative implementations: \p block_histogram_algorithm::using_atomic
/// and \p block_histogram_algorithm::using_sort.
///
/// \par Examples
/// \parblock
/// In the examples a histogram operation is performed on a block of 192 threads; each thread
/// provides two \p int values, and the result is returned using the same variable as the input.
///
/// \code{.cpp}
/// __global__ void example_kernel(...)
/// {
/// // specialize block_histogram for int, logical block of 192 threads,
/// // 2 items per thread and a bin size of 192.
/// using block_histogram_int = rocprim::block_histogram<int, 192, 2, 192>;
/// // allocate storage in shared memory
/// __shared__ block_histogram_int::storage_type storage;
/// __shared__ int hist[192];
///
/// int value[2];
/// ...
/// // execute histogram
/// block_histogram_int().histogram(
/// value, // input
/// hist, // output
/// storage
/// );
/// ...
/// }
/// \endcode
/// \endparblock
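/// \par
/// The sort-based implementation can be selected explicitly through the \p Algorithm
/// template parameter (an illustrative sketch):
/// \code{.cpp}
/// using block_histogram_int =
/// rocprim::block_histogram<int, 192, 2, 192, rocprim::block_histogram_algorithm::using_sort>;
/// \endcode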
template<
class T,
unsigned int BlockSizeX,
unsigned int ItemsPerThread,
unsigned int Bins,
block_histogram_algorithm Algorithm = block_histogram_algorithm::default_algorithm,
unsigned int BlockSizeY = 1,
unsigned int BlockSizeZ = 1
>
class block_histogram
#ifndef DOXYGEN_SHOULD_SKIP_THIS
: private detail::select_block_histogram_impl<Algorithm>::template type<T, BlockSizeX, BlockSizeY, BlockSizeZ, ItemsPerThread, Bins>
#endif
{
using base_type = typename detail::select_block_histogram_impl<Algorithm>::template type<T, BlockSizeX, BlockSizeY, BlockSizeZ, ItemsPerThread, Bins>;
static constexpr unsigned int BlockSize = BlockSizeX * BlockSizeY * BlockSizeZ;
public:
/// \brief Struct used to allocate temporary memory that is required for thread
/// communication during operations provided by the related parallel primitive.
///
/// Depending on the implementation, the operations exposed by the parallel primitive may
/// require temporary storage for thread communication. The storage should be allocated
/// using the keyword <tt>__shared__</tt>. It can be aliased to
/// externally allocated memory, or be a part of a union type with other storage types
/// to increase shared memory reusability.
using storage_type = typename base_type::storage_type;
/// \brief Initialize histogram counters to zero.
///
/// \tparam Counter - [inferred] counter type of histogram.
///
/// \param [out] hist - histogram bin count.
template<class Counter>
ROCPRIM_DEVICE ROCPRIM_INLINE
void init_histogram(Counter hist[Bins])
{
const auto flat_tid = ::rocprim::flat_block_thread_id<BlockSizeX, BlockSizeY, BlockSizeZ>();
ROCPRIM_UNROLL
for(unsigned int offset = 0; offset < Bins; offset += BlockSize)
{
const unsigned int offset_tid = offset + flat_tid;
if(offset_tid < Bins)
{
hist[offset_tid] = Counter();
}
}
}
/// \brief Update an existing block-wide histogram. Each thread composites an array of
/// input elements.
///
/// \tparam Counter - [inferred] counter type of histogram.
///
/// \param [in] input - reference to an array containing thread input values.
/// \param [out] hist - histogram bin count.
/// \param [in] storage - reference to a temporary storage object of type storage_type.
///
/// \par Storage reuse
/// A synchronization barrier should be placed before \p storage is reused
/// or repurposed: \p __syncthreads() or \p rocprim::syncthreads().
///
/// \par Examples
/// \parblock
/// In the examples a histogram operation is performed on a block of 192 threads; each thread
/// provides two \p int values, and the result is returned using the same variable as the input.
///
/// \code{.cpp}
/// __global__ void example_kernel(...)
/// {
/// // specialize block_histogram for int, logical block of 192 threads,
/// // 2 items per thread and a bin size of 192.
/// using block_histogram_int = rocprim::block_histogram<int, 192, 2, 192>;
/// // allocate storage in shared memory
/// __shared__ block_histogram_int::storage_type storage;
/// __shared__ int hist[192];
///
/// int value[2];
/// ...
/// // initialize histogram
/// block_histogram_int().init_histogram(
/// hist // output
/// );
///
/// rocprim::syncthreads();
///
/// // update histogram
/// block_histogram_int().composite(
/// value, // input
/// hist, // output
/// storage
/// );
/// ...
/// }
/// \endcode
/// \endparblock
template<class Counter>
ROCPRIM_DEVICE ROCPRIM_INLINE
void composite(T (&input)[ItemsPerThread],
Counter hist[Bins],
storage_type& storage)
{
base_type::composite(input, hist, storage);
}
/// \overload
/// \brief Update an existing block-wide histogram. Each thread composites an array of
/// input elements.
///
/// * This overload does not accept a storage argument. Required shared memory is
/// allocated by the method itself.
///
/// \tparam Counter - [inferred] counter type of histogram.
///
/// \param [in] input - reference to an array containing thread input values.
/// \param [out] hist - histogram bin count.
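/// \par Example.
/// A minimal illustrative sketch:
/// \code{.cpp}
/// __global__ void example_kernel(...)
/// {
/// using block_histogram_int = rocprim::block_histogram<int, 192, 2, 192>;
/// __shared__ int hist[192];
///
/// int value[2];
/// ...
/// block_histogram_int().init_histogram(hist);
/// rocprim::syncthreads();
/// // shared memory is allocated internally by this overload
/// block_histogram_int().composite(value, hist);
/// ...
/// }
/// \endcode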
template<class Counter>
ROCPRIM_DEVICE ROCPRIM_FORCE_INLINE
void composite(T (&input)[ItemsPerThread],
Counter hist[Bins])
{
base_type::composite(input, hist);
}
/// \brief Construct a new block-wide histogram. Each thread contributes an array of
/// input elements.
///
/// \tparam Counter - [inferred] counter type of histogram.
///
/// \param [in] input - reference to an array containing thread input values.
/// \param [out] hist - histogram bin count.
/// \param [in] storage - reference to a temporary storage object of type storage_type.
///
/// \par Storage reuse
/// A synchronization barrier should be placed before \p storage is reused
/// or repurposed: \p __syncthreads() or \p rocprim::syncthreads().
///
/// \par Examples
/// \parblock
/// In the examples a histogram operation is performed on a block of 192 threads; each thread
/// provides two \p int values, and the result is returned using the same variable as the input.
///
/// \code{.cpp}
/// __global__ void example_kernel(...)
/// {
/// // specialize block_histogram for int, logical block of 192 threads,
/// // 2 items per thread and a bin size of 192.
/// using block_histogram_int = rocprim::block_histogram<int, 192, 2, 192>;
/// // allocate storage in shared memory
/// __shared__ block_histogram_int::storage_type storage;
/// __shared__ int hist[192];
///
/// int value[2];
/// ...
/// // execute histogram
/// block_histogram_int().histogram(
/// value, // input
/// hist, // output
/// storage
/// );
/// ...
/// }
/// \endcode
/// \endparblock
template<class Counter>
ROCPRIM_DEVICE ROCPRIM_INLINE
void histogram(T (&input)[ItemsPerThread],
Counter hist[Bins],
storage_type& storage)
{
init_histogram(hist);
::rocprim::syncthreads();
composite(input, hist, storage);
}
/// \overload
/// \brief Construct a new block-wide histogram. Each thread contributes an array of
/// input elements.
///
/// * This overload does not accept a storage argument. Required shared memory is
/// allocated by the method itself.
///
/// \tparam Counter - [inferred] counter type of histogram.
///
/// \param [in] input - reference to an array containing thread input values.
/// \param [out] hist - histogram bin count.
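/// \par Example.
/// A minimal illustrative sketch:
/// \code{.cpp}
/// __global__ void example_kernel(...)
/// {
/// using block_histogram_int = rocprim::block_histogram<int, 192, 2, 192>;
/// __shared__ int hist[192];
///
/// int value[2];
/// ...
/// // shared memory is allocated internally by this overload
/// block_histogram_int().histogram(value, hist);
/// ...
/// }
/// \endcode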
template<class Counter>
ROCPRIM_DEVICE ROCPRIM_FORCE_INLINE
void histogram(T (&input)[ItemsPerThread],
Counter hist[Bins])
{
init_histogram(hist);
::rocprim::syncthreads();
composite(input, hist);
}
};
END_ROCPRIM_NAMESPACE
/// @}
// end of group blockmodule
#endif // ROCPRIM_BLOCK_BLOCK_HISTOGRAM_HPP_
// Copyright (c) 2017-2021 Advanced Micro Devices, Inc. All rights reserved.
//
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to deal
// in the Software without restriction, including without limitation the rights
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
// copies of the Software, and to permit persons to whom the Software is
// furnished to do so, subject to the following conditions:
//
// The above copyright notice and this permission notice shall be included in
// all copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
// THE SOFTWARE.
#ifndef ROCPRIM_BLOCK_BLOCK_LOAD_HPP_
#define ROCPRIM_BLOCK_BLOCK_LOAD_HPP_
#include "../config.hpp"
#include "../detail/various.hpp"
#include "../intrinsics.hpp"
#include "../functional.hpp"
#include "../types.hpp"
#include "block_load_func.hpp"
#include "block_exchange.hpp"
BEGIN_ROCPRIM_NAMESPACE
/// \addtogroup blockmodule
/// @{
/// \brief \p block_load_method enumerates the methods available to load data
/// from continuous memory into a blocked arrangement of items across the thread block
enum class block_load_method
{
/// Data from continuous memory is loaded into a blocked arrangement of items.
/// \par Performance Notes:
/// * Performance decreases with increasing number of items per thread (stride
/// between reads), because of reduced memory coalescing.
block_load_direct,
/// A striped arrangement of data is read directly from memory.
block_load_striped,
/// Data from continuous memory is loaded into a blocked arrangement of items
/// using vectorization as an optimization.
/// \par Performance Notes:
/// * Performance remains high due to increased memory coalescing, provided that
/// vectorization requirements are fulfilled. Otherwise, performance will default
/// to \p block_load_direct.
/// \par Requirements:
/// * The input offset (\p block_input) must be quad-item aligned.
/// * The following conditions will prevent vectorization and switch to default
/// \p block_load_direct:
/// * \p ItemsPerThread is odd.
/// * The datatype \p T is not a primitive or a HIP vector type (e.g. \p int2,
/// \p int4, etc.).
block_load_vectorize,
/// A striped arrangement of data from continuous memory is locally transposed
/// into a blocked arrangement of items.
/// \par Performance Notes:
/// * Performance remains high due to increased memory coalescing, regardless of the
/// number of items per thread.
/// * Performance may be better compared to \p block_load_direct and
/// \p block_load_vectorize due to reordering on local memory.
block_load_transpose,
/// A warp-striped arrangement of data from continuous memory is locally transposed
/// into a blocked arrangement of items.
/// \par Requirements:
/// * The number of threads in the block must be a multiple of the size of hardware warp.
/// \par Performance Notes:
/// * Performance remains high due to increased memory coalescing, regardless of the
/// number of items per thread.
/// * Performance may be better compared to \p block_load_direct and
/// \p block_load_vectorize due to reordering on local memory.
block_load_warp_transpose,
/// Defaults to \p block_load_direct.
default_method = block_load_direct
};
/// \brief The \p block_load class is a block level parallel primitive which provides methods
/// for loading data from continuous memory into a blocked arrangement of items across the thread
/// block.
///
/// \tparam T - the input/output type.
/// \tparam BlockSize - the number of threads in a block.
/// \tparam ItemsPerThread - the number of items to be processed by
/// each thread.
/// \tparam Method - the method to load data.
///
/// \par Overview
/// * The \p block_load class has a number of different methods to load data:
/// * [block_load_direct](\ref ::block_load_method::block_load_direct)
/// * [block_load_striped](\ref ::block_load_method::block_load_striped)
/// * [block_load_vectorize](\ref ::block_load_method::block_load_vectorize)
/// * [block_load_transpose](\ref ::block_load_method::block_load_transpose)
/// * [block_load_warp_transpose](\ref ::block_load_method::block_load_warp_transpose)
///
/// \par Example:
/// \parblock
/// In the examples a load operation is performed on a block of 128 threads, using type
/// \p int and 8 items per thread.
///
/// \code{.cpp}
/// __global__ void example_kernel(int * input, ...)
/// {
/// const int offset = blockIdx.x * 128 * 8;
/// int items[8];
/// rocprim::block_load<int, 128, 8, load_method> blockload;
/// blockload.load(input + offset, items);
/// ...
/// }
/// \endcode
/// \endparblock
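/// \par
/// In the example above \p load_method is a placeholder; an explicit choice could be
/// (an illustrative sketch):
/// \code{.cpp}
/// using block_load_int =
/// rocprim::block_load<int, 128, 8, rocprim::block_load_method::block_load_transpose>;
/// \endcode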
template<
class T,
unsigned int BlockSizeX,
unsigned int ItemsPerThread,
block_load_method Method = block_load_method::block_load_direct,
unsigned int BlockSizeY = 1,
unsigned int BlockSizeZ = 1
>
class block_load
{
private:
using storage_type_ = typename ::rocprim::detail::empty_storage_type;
public:
/// \brief Struct used to allocate temporary memory that is required for thread
/// communication during operations provided by the related parallel primitive.
///
/// Depending on the implementation, the operations exposed by the parallel primitive may
/// require temporary storage for thread communication. The storage should be allocated
/// using the keyword \p __shared__. It can be aliased to
/// externally allocated memory, or be a part of a union with other storage types
/// to increase shared memory reusability.
#ifndef DOXYGEN_SHOULD_SKIP_THIS // hides storage_type implementation for Doxygen
using storage_type = typename ::rocprim::detail::empty_storage_type;
#else
using storage_type = storage_type_; // only for Doxygen
#endif
/// \brief Loads data from continuous memory into an arrangement of items across the
/// thread block.
///
/// \tparam InputIterator - [inferred] an iterator type for input (can be a simple
/// pointer).
///
/// \param [in] block_input - the input iterator from the thread block to load from.
/// \param [out] items - array that data is loaded to.
///
/// \par Overview
/// * The type \p T must be such that an object of type \p InputIterator
/// can be dereferenced and then implicitly converted to \p T.
template<class InputIterator>
ROCPRIM_DEVICE ROCPRIM_INLINE
void load(InputIterator block_input,
T (&items)[ItemsPerThread])
{
using value_type = typename std::iterator_traits<InputIterator>::value_type;
static_assert(std::is_convertible<value_type, T>::value,
"The type T must be such that an object of type InputIterator "
"can be dereferenced and then implicitly converted to T.");
const unsigned int flat_id = ::rocprim::flat_block_thread_id<BlockSizeX, BlockSizeY, BlockSizeZ>();
block_load_direct_blocked(flat_id, block_input, items);
}
/// \brief Loads data from continuous memory into an arrangement of items across the
/// thread block, which is guarded by range \p valid.
///
/// \tparam InputIterator - [inferred] an iterator type for input (can be a simple
/// pointer).
///
/// \param [in] block_input - the input iterator from the thread block to load from.
/// \param [out] items - array that data is loaded to.
/// \param [in] valid - maximum range of valid numbers to load.
///
/// \par Overview
/// * The type \p T must be such that an object of type \p InputIterator
/// can be dereferenced and then implicitly converted to \p T.
template<class InputIterator>
ROCPRIM_DEVICE ROCPRIM_INLINE
void load(InputIterator block_input,
T (&items)[ItemsPerThread],
unsigned int valid)
{
using value_type = typename std::iterator_traits<InputIterator>::value_type;
static_assert(std::is_convertible<value_type, T>::value,
"The type T must be such that an object of type InputIterator "
"can be dereferenced and then implicitly converted to T.");
const unsigned int flat_id = ::rocprim::flat_block_thread_id<BlockSizeX, BlockSizeY, BlockSizeZ>();
block_load_direct_blocked(flat_id, block_input, items, valid);
}
/// \brief Loads data from continuous memory into an arrangement of items across the
/// thread block, which is guarded by range with a fall-back value for out-of-bound
/// elements.
///
/// \tparam InputIterator - [inferred] an iterator type for input (can be a simple
/// pointer).
/// \tparam Default - [inferred] The data type of the default value.
///
/// \param [in] block_input - the input iterator from the thread block to load from.
/// \param [out] items - array that data is loaded to.
/// \param [in] valid - maximum range of valid numbers to load.
/// \param [in] out_of_bounds - default value assigned to out-of-bound items.
///
/// \par Overview
/// * The type \p T must be such that an object of type \p InputIterator
/// can be dereferenced and then implicitly converted to \p T.
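/// \par Example.
/// A minimal illustrative sketch (\p input and \p valid are placeholders):
/// \code{.cpp}
/// __global__ void example_kernel(int * input, ...)
/// {
/// const int offset = blockIdx.x * 128 * 8;
/// const unsigned int valid = ...; // number of in-range items for this block
/// int items[8];
/// rocprim::block_load<int, 128, 8> bload;
/// // out-of-range items are filled with 0
/// bload.load(input + offset, items, valid, 0);
/// ...
/// }
/// \endcode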
template<
class InputIterator,
class Default
>
ROCPRIM_DEVICE ROCPRIM_INLINE
void load(InputIterator block_input,
T (&items)[ItemsPerThread],
unsigned int valid,
Default out_of_bounds)
{
using value_type = typename std::iterator_traits<InputIterator>::value_type;
static_assert(std::is_convertible<value_type, T>::value,
"The type T must be such that an object of type InputIterator "
"can be dereferenced and then implicitly converted to T.");
const unsigned int flat_id = ::rocprim::flat_block_thread_id<BlockSizeX, BlockSizeY, BlockSizeZ>();
block_load_direct_blocked(flat_id, block_input, items, valid,
out_of_bounds);
}
/// \brief Loads data from continuous memory into an arrangement of items across the
/// thread block, using temporary storage.
///
/// \tparam InputIterator - [inferred] an iterator type for input (can be a simple
/// pointer).
///
/// \param [in] block_input - the input iterator from the thread block to load from.
/// \param [out] items - array that data is loaded to.
/// \param [in] storage - temporary storage for inputs.
///
/// \par Overview
/// * The type \p T must be such that an object of type \p InputIterator
/// can be dereferenced and then implicitly converted to \p T.
///
/// \par Storage reuse
/// A synchronization barrier should be placed before \p storage is reused
/// or repurposed: \p __syncthreads() or \p rocprim::syncthreads().
///
/// \par Example.
/// \code{.cpp}
/// __global__ void example_kernel(...)
/// {
/// int items[8];
/// using block_load_int = rocprim::block_load<int, 128, 8>;
/// block_load_int bload;
/// __shared__ typename block_load_int::storage_type storage;
/// bload.load(..., items, storage);
/// ...
/// }
/// \endcode
template<class InputIterator>
ROCPRIM_DEVICE ROCPRIM_INLINE
void load(InputIterator block_input,
T (&items)[ItemsPerThread],
storage_type& storage)
{
using value_type = typename std::iterator_traits<InputIterator>::value_type;
static_assert(std::is_convertible<value_type, T>::value,
"The type T must be such that an object of type InputIterator "
"can be dereferenced and then implicitly converted to T.");
(void) storage;
load(block_input, items);
}
/// \brief Loads data from continuous memory into an arrangement of items across the
/// thread block, which is guarded by range \p valid, using temporary storage.
///
/// \tparam InputIterator - [inferred] an iterator type for input (can be a simple
/// pointer).
///
/// \param [in] block_input - the input iterator from the thread block to load from.
/// \param [out] items - array that data is loaded to.
/// \param [in] valid - maximum range of valid numbers to load.
/// \param [in] storage - temporary storage for inputs.
///
/// \par Overview
/// * The type \p T must be such that an object of type \p InputIterator
/// can be dereferenced and then implicitly converted to \p T.
///
/// \par Storage reuse
/// A synchronization barrier should be placed before \p storage is reused
/// or repurposed: \p __syncthreads() or \p rocprim::syncthreads().
///
/// \par Example.
/// \code{.cpp}
/// __global__ void example_kernel(...)
/// {
/// int items[8];
/// using block_load_int = rocprim::block_load<int, 128, 8>;
/// block_load_int bload;
/// tile_static typename block_load_int::storage_type storage;
/// bload.load(..., items, valid, storage);
/// ...
/// }
/// \endcode
template<class InputIterator>
ROCPRIM_DEVICE ROCPRIM_INLINE
void load(InputIterator block_input,
T (&items)[ItemsPerThread],
unsigned int valid,
storage_type& storage)
{
using value_type = typename std::iterator_traits<InputIterator>::value_type;
static_assert(std::is_convertible<value_type, T>::value,
"The type T must be such that an object of type InputIterator "
"can be dereferenced and then implicitly converted to T.");
(void) storage;
load(block_input, items, valid);
}
/// \brief Loads data from continuous memory into an arrangement of items across the
/// thread block, which is guarded by range with a fall-back value for out-of-bound
/// elements, using temporary storage.
///
/// \tparam InputIterator - [inferred] an iterator type for input (can be a simple
/// pointer).
/// \tparam Default - [inferred] The data type of the default value.
///
/// \param [in] block_input - the input iterator from the thread block to load from.
/// \param [out] items - array that data is loaded to.
/// \param [in] valid - maximum range of valid numbers to load.
/// \param [in] out_of_bounds - default value assigned to out-of-bound items.
/// \param [in] storage - temporary storage for inputs.
///
/// \par Overview
/// * The type \p T must be such that an object of type \p InputIterator
/// can be dereferenced and then implicitly converted to \p T.
///
/// \par Storage reuse
/// A synchronization barrier should be placed before \p storage is reused
/// or repurposed: \p __syncthreads() or \p rocprim::syncthreads().
///
/// \par Example.
/// \code{.cpp}
/// __global__ void example_kernel(...)
/// {
/// int items[8];
/// using block_load_int = rocprim::block_load<int, 128, 8>;
/// block_load_int bload;
/// __shared__ typename block_load_int::storage_type storage;
/// bload.load(..., items, valid, out_of_bounds, storage);
/// ...
/// }
/// \endcode
template<
class InputIterator,
class Default
>
ROCPRIM_DEVICE ROCPRIM_INLINE
void load(InputIterator block_input,
T (&items)[ItemsPerThread],
unsigned int valid,
Default out_of_bounds,
storage_type& storage)
{
using value_type = typename std::iterator_traits<InputIterator>::value_type;
static_assert(std::is_convertible<value_type, T>::value,
"The type T must be such that an object of type InputIterator "
"can be dereferenced and then implicitly converted to T.");
(void) storage;
load(block_input, items, valid, out_of_bounds);
}
};
/// @}
// end of group blockmodule
#ifndef DOXYGEN_SHOULD_SKIP_THIS
template<
class T,
unsigned int BlockSizeX,
unsigned int ItemsPerThread,
unsigned int BlockSizeY,
unsigned int BlockSizeZ
>
class block_load<T, BlockSizeX, ItemsPerThread, block_load_method::block_load_striped, BlockSizeY, BlockSizeZ>
{
static constexpr unsigned int BlockSize = BlockSizeX * BlockSizeY * BlockSizeZ;
private:
using storage_type_ = typename ::rocprim::detail::empty_storage_type;
public:
#ifndef DOXYGEN_SHOULD_SKIP_THIS // hides storage_type implementation for Doxygen
using storage_type = typename ::rocprim::detail::empty_storage_type;
#else
using storage_type = storage_type_; // only for Doxygen
#endif
template<class InputIterator>
ROCPRIM_DEVICE inline
void load(InputIterator block_input,
T (&items)[ItemsPerThread])
{
using value_type = typename std::iterator_traits<InputIterator>::value_type;
static_assert(std::is_convertible<value_type, T>::value,
"The type T must be such that an object of type InputIterator "
"can be dereferenced and then implicitly converted to T.");
const unsigned int flat_id = ::rocprim::flat_block_thread_id<BlockSizeX, BlockSizeY, BlockSizeZ>();
block_load_direct_striped<BlockSize>(flat_id, block_input, items);
}
template<class InputIterator>
ROCPRIM_DEVICE inline
void load(InputIterator block_input,
T (&items)[ItemsPerThread],
unsigned int valid)
{
using value_type = typename std::iterator_traits<InputIterator>::value_type;
static_assert(std::is_convertible<value_type, T>::value,
"The type T must be such that an object of type InputIterator "
"can be dereferenced and then implicitly converted to T.");
const unsigned int flat_id = ::rocprim::flat_block_thread_id<BlockSizeX, BlockSizeY, BlockSizeZ>();
block_load_direct_striped<BlockSize>(flat_id, block_input, items, valid);
}
template<
class InputIterator,
class Default
>
ROCPRIM_DEVICE inline
void load(InputIterator block_input,
T (&items)[ItemsPerThread],
unsigned int valid,
Default out_of_bounds)
{
using value_type = typename std::iterator_traits<InputIterator>::value_type;
static_assert(std::is_convertible<value_type, T>::value,
"The type T must be such that an object of type InputIterator "
"can be dereferenced and then implicitly converted to T.");
const unsigned int flat_id = ::rocprim::flat_block_thread_id<BlockSizeX, BlockSizeY, BlockSizeZ>();
block_load_direct_striped<BlockSize>(flat_id, block_input, items, valid,
out_of_bounds);
}
template<class InputIterator>
ROCPRIM_DEVICE inline
void load(InputIterator block_input,
T (&items)[ItemsPerThread],
storage_type& storage)
{
using value_type = typename std::iterator_traits<InputIterator>::value_type;
static_assert(std::is_convertible<value_type, T>::value,
"The type T must be such that an object of type InputIterator "
"can be dereferenced and then implicitly converted to T.");
(void) storage;
const unsigned int flat_id = ::rocprim::flat_block_thread_id<BlockSizeX, BlockSizeY, BlockSizeZ>();
block_load_direct_striped<BlockSize>(flat_id, block_input, items);
}
template<class InputIterator>
ROCPRIM_DEVICE inline
void load(InputIterator block_input,
T (&items)[ItemsPerThread],
unsigned int valid,
storage_type& storage)
{
using value_type = typename std::iterator_traits<InputIterator>::value_type;
static_assert(std::is_convertible<value_type, T>::value,
"The type T must be such that an object of type InputIterator "
"can be dereferenced and then implicitly converted to T.");
(void) storage;
const unsigned int flat_id = ::rocprim::flat_block_thread_id<BlockSizeX, BlockSizeY, BlockSizeZ>();
block_load_direct_striped<BlockSize>(flat_id, block_input, items, valid);
}
template<
class InputIterator,
class Default
>
ROCPRIM_DEVICE inline
void load(InputIterator block_input,
T (&items)[ItemsPerThread],
unsigned int valid,
Default out_of_bounds,
storage_type& storage)
{
using value_type = typename std::iterator_traits<InputIterator>::value_type;
static_assert(std::is_convertible<value_type, T>::value,
"The type T must be such that an object of type InputIterator "
"can be dereferenced and then implicitly converted to T.");
(void) storage;
const unsigned int flat_id = ::rocprim::flat_block_thread_id<BlockSizeX, BlockSizeY, BlockSizeZ>();
block_load_direct_striped<BlockSize>(flat_id, block_input, items, valid,
out_of_bounds);
}
};
template<
class T,
unsigned int BlockSizeX,
unsigned int ItemsPerThread,
unsigned int BlockSizeY,
unsigned int BlockSizeZ
>
class block_load<T, BlockSizeX, ItemsPerThread, block_load_method::block_load_vectorize, BlockSizeY, BlockSizeZ>
{
private:
using storage_type_ = typename ::rocprim::detail::empty_storage_type;
public:
#ifndef DOXYGEN_SHOULD_SKIP_THIS // hides storage_type implementation for Doxygen
using storage_type = typename ::rocprim::detail::empty_storage_type;
#else
using storage_type = storage_type_; // only for Doxygen
#endif
ROCPRIM_DEVICE ROCPRIM_INLINE
void load(T* block_input,
T (&_items)[ItemsPerThread])
{
const unsigned int flat_id = ::rocprim::flat_block_thread_id<BlockSizeX, BlockSizeY, BlockSizeZ>();
block_load_direct_blocked_vectorized(flat_id, block_input, _items);
}
template<class InputIterator, class U>
ROCPRIM_DEVICE ROCPRIM_INLINE
void load(InputIterator block_input,
U (&items)[ItemsPerThread])
{
using value_type = typename std::iterator_traits<InputIterator>::value_type;
static_assert(std::is_convertible<value_type, T>::value,
"The type T must be such that an object of type InputIterator "
"can be dereferenced and then implicitly converted to T.");
const unsigned int flat_id = ::rocprim::flat_block_thread_id<BlockSizeX, BlockSizeY, BlockSizeZ>();
block_load_direct_blocked(flat_id, block_input, items);
}
template<class InputIterator>
ROCPRIM_DEVICE ROCPRIM_INLINE
void load(InputIterator block_input,
T (&items)[ItemsPerThread],
unsigned int valid)
{
using value_type = typename std::iterator_traits<InputIterator>::value_type;
static_assert(std::is_convertible<value_type, T>::value,
"The type T must be such that an object of type InputIterator "
"can be dereferenced and then implicitly converted to T.");
const unsigned int flat_id = ::rocprim::flat_block_thread_id<BlockSizeX, BlockSizeY, BlockSizeZ>();
block_load_direct_blocked(flat_id, block_input, items, valid);
}
template<
class InputIterator,
class Default
>
ROCPRIM_DEVICE ROCPRIM_INLINE
void load(InputIterator block_input,
T (&items)[ItemsPerThread],
unsigned int valid,
Default out_of_bounds)
{
using value_type = typename std::iterator_traits<InputIterator>::value_type;
static_assert(std::is_convertible<value_type, T>::value,
"The type T must be such that an object of type InputIterator "
"can be dereferenced and then implicitly converted to T.");
const unsigned int flat_id = ::rocprim::flat_block_thread_id<BlockSizeX, BlockSizeY, BlockSizeZ>();
block_load_direct_blocked(flat_id, block_input, items, valid,
out_of_bounds);
}
ROCPRIM_DEVICE ROCPRIM_INLINE
void load(T* block_input,
T (&items)[ItemsPerThread],
storage_type& storage)
{
(void) storage;
load(block_input, items);
}
template<class InputIterator, class U>
ROCPRIM_DEVICE ROCPRIM_INLINE
void load(InputIterator block_input,
U (&items)[ItemsPerThread],
storage_type& storage)
{
using value_type = typename std::iterator_traits<InputIterator>::value_type;
static_assert(std::is_convertible<value_type, T>::value,
"The type T must be such that an object of type InputIterator "
"can be dereferenced and then implicitly converted to T.");
(void) storage;
load(block_input, items);
}
template<class InputIterator>
ROCPRIM_DEVICE ROCPRIM_INLINE
void load(InputIterator block_input,
T (&items)[ItemsPerThread],
unsigned int valid,
storage_type& storage)
{
using value_type = typename std::iterator_traits<InputIterator>::value_type;
static_assert(std::is_convertible<value_type, T>::value,
"The type T must be such that an object of type InputIterator "
"can be dereferenced and then implicitly converted to T.");
(void) storage;
load(block_input, items, valid);
}
template<
class InputIterator,
class Default
>
ROCPRIM_DEVICE ROCPRIM_INLINE
void load(InputIterator block_input,
T (&items)[ItemsPerThread],
unsigned int valid,
Default out_of_bounds,
storage_type& storage)
{
using value_type = typename std::iterator_traits<InputIterator>::value_type;
static_assert(std::is_convertible<value_type, T>::value,
"The type T must be such that an object of type InputIterator "
"can be dereferenced and then implicitly converted to T.");
(void) storage;
load(block_input, items, valid, out_of_bounds);
}
};
template<
class T,
unsigned int BlockSizeX,
unsigned int ItemsPerThread,
unsigned int BlockSizeY,
unsigned int BlockSizeZ
>
class block_load<T, BlockSizeX, ItemsPerThread, block_load_method::block_load_transpose, BlockSizeY, BlockSizeZ>
{
static constexpr unsigned int BlockSize = BlockSizeX * BlockSizeY * BlockSizeZ;
private:
using block_exchange_type = block_exchange<T, BlockSize, ItemsPerThread>;
public:
using storage_type = typename block_exchange_type::storage_type;
template<class InputIterator>
ROCPRIM_DEVICE ROCPRIM_FORCE_INLINE
void load(InputIterator block_input,
T (&items)[ItemsPerThread])
{
using value_type = typename std::iterator_traits<InputIterator>::value_type;
static_assert(std::is_convertible<value_type, T>::value,
"The type T must be such that an object of type InputIterator "
"can be dereferenced and then implicitly converted to T.");
ROCPRIM_SHARED_MEMORY storage_type storage;
const unsigned int flat_id = ::rocprim::flat_block_thread_id<BlockSizeX, BlockSizeY, BlockSizeZ>();
block_load_direct_striped<BlockSize>(flat_id, block_input, items);
block_exchange_type().striped_to_blocked(items, items, storage);
}
template<class InputIterator>
ROCPRIM_DEVICE ROCPRIM_FORCE_INLINE
void load(InputIterator block_input,
T (&items)[ItemsPerThread],
unsigned int valid)
{
using value_type = typename std::iterator_traits<InputIterator>::value_type;
static_assert(std::is_convertible<value_type, T>::value,
"The type T must be such that an object of type InputIterator "
"can be dereferenced and then implicitly converted to T.");
ROCPRIM_SHARED_MEMORY storage_type storage;
const unsigned int flat_id = ::rocprim::flat_block_thread_id<BlockSizeX, BlockSizeY, BlockSizeZ>();
block_load_direct_striped<BlockSize>(flat_id, block_input, items, valid);
block_exchange_type().striped_to_blocked(items, items, storage);
}
template<
class InputIterator,
class Default
>
ROCPRIM_DEVICE ROCPRIM_FORCE_INLINE
void load(InputIterator block_input,
T (&items)[ItemsPerThread],
unsigned int valid,
Default out_of_bounds)
{
using value_type = typename std::iterator_traits<InputIterator>::value_type;
static_assert(std::is_convertible<value_type, T>::value,
"The type T must be such that an object of type InputIterator "
"can be dereferenced and then implicitly converted to T.");
ROCPRIM_SHARED_MEMORY storage_type storage;
const unsigned int flat_id = ::rocprim::flat_block_thread_id<BlockSizeX, BlockSizeY, BlockSizeZ>();
block_load_direct_striped<BlockSize>(flat_id, block_input, items, valid,
out_of_bounds);
block_exchange_type().striped_to_blocked(items, items, storage);
}
template<class InputIterator>
ROCPRIM_DEVICE ROCPRIM_INLINE
void load(InputIterator block_input,
T (&items)[ItemsPerThread],
storage_type& storage)
{
using value_type = typename std::iterator_traits<InputIterator>::value_type;
static_assert(std::is_convertible<value_type, T>::value,
"The type T must be such that an object of type InputIterator "
"can be dereferenced and then implicitly converted to T.");
const unsigned int flat_id = ::rocprim::flat_block_thread_id<BlockSizeX, BlockSizeY, BlockSizeZ>();
block_load_direct_striped<BlockSize>(flat_id, block_input, items);
block_exchange_type().striped_to_blocked(items, items, storage);
}
template<class InputIterator>
ROCPRIM_DEVICE ROCPRIM_INLINE
void load(InputIterator block_input,
T (&items)[ItemsPerThread],
unsigned int valid,
storage_type& storage)
{
using value_type = typename std::iterator_traits<InputIterator>::value_type;
static_assert(std::is_convertible<value_type, T>::value,
"The type T must be such that an object of type InputIterator "
"can be dereferenced and then implicitly converted to T.");
const unsigned int flat_id = ::rocprim::flat_block_thread_id<BlockSizeX, BlockSizeY, BlockSizeZ>();
block_load_direct_striped<BlockSize>(flat_id, block_input, items, valid);
block_exchange_type().striped_to_blocked(items, items, storage);
}
template<
class InputIterator,
class Default
>
ROCPRIM_DEVICE ROCPRIM_INLINE
void load(InputIterator block_input,
T (&items)[ItemsPerThread],
unsigned int valid,
Default out_of_bounds,
storage_type& storage)
{
using value_type = typename std::iterator_traits<InputIterator>::value_type;
static_assert(std::is_convertible<value_type, T>::value,
"The type T must be such that an object of type InputIterator "
"can be dereferenced and then implicitly converted to T.");
const unsigned int flat_id = ::rocprim::flat_block_thread_id<BlockSizeX, BlockSizeY, BlockSizeZ>();
block_load_direct_striped<BlockSize>(flat_id, block_input, items, valid,
out_of_bounds);
block_exchange_type().striped_to_blocked(items, items, storage);
}
};
template<
class T,
unsigned int BlockSizeX,
unsigned int ItemsPerThread,
unsigned int BlockSizeY,
unsigned int BlockSizeZ
>
class block_load<T, BlockSizeX, ItemsPerThread, block_load_method::block_load_warp_transpose, BlockSizeY, BlockSizeZ>
{
static constexpr unsigned int BlockSize = BlockSizeX * BlockSizeY * BlockSizeZ;
private:
using block_exchange_type = block_exchange<T, BlockSizeX, ItemsPerThread, BlockSizeY, BlockSizeZ>;
public:
static_assert(BlockSize % ::rocprim::device_warp_size() == 0,
"BlockSize must be a multiple of hardware warpsize");
using storage_type = typename block_exchange_type::storage_type;
template<class InputIterator>
ROCPRIM_DEVICE ROCPRIM_FORCE_INLINE
void load(InputIterator block_input,
T (&items)[ItemsPerThread])
{
using value_type = typename std::iterator_traits<InputIterator>::value_type;
static_assert(std::is_convertible<value_type, T>::value,
"The type T must be such that an object of type InputIterator "
"can be dereferenced and then implicitly converted to T.");
ROCPRIM_SHARED_MEMORY storage_type storage;
const unsigned int flat_id = ::rocprim::flat_block_thread_id<BlockSizeX, BlockSizeY, BlockSizeZ>();
block_load_direct_warp_striped(flat_id, block_input, items);
block_exchange_type().warp_striped_to_blocked(items, items, storage);
}
template<class InputIterator>
ROCPRIM_DEVICE ROCPRIM_FORCE_INLINE
void load(InputIterator block_input,
T (&items)[ItemsPerThread],
unsigned int valid)
{
using value_type = typename std::iterator_traits<InputIterator>::value_type;
static_assert(std::is_convertible<value_type, T>::value,
"The type T must be such that an object of type InputIterator "
"can be dereferenced and then implicitly converted to T.");
ROCPRIM_SHARED_MEMORY storage_type storage;
const unsigned int flat_id = ::rocprim::flat_block_thread_id<BlockSizeX, BlockSizeY, BlockSizeZ>();
block_load_direct_warp_striped(flat_id, block_input, items, valid);
block_exchange_type().warp_striped_to_blocked(items, items, storage);
}
template<
class InputIterator,
class Default
>
ROCPRIM_DEVICE ROCPRIM_FORCE_INLINE
void load(InputIterator block_input,
T (&items)[ItemsPerThread],
unsigned int valid,
Default out_of_bounds)
{
using value_type = typename std::iterator_traits<InputIterator>::value_type;
static_assert(std::is_convertible<value_type, T>::value,
"The type T must be such that an object of type InputIterator "
"can be dereferenced and then implicitly converted to T.");
ROCPRIM_SHARED_MEMORY storage_type storage;
const unsigned int flat_id = ::rocprim::flat_block_thread_id<BlockSizeX, BlockSizeY, BlockSizeZ>();
block_load_direct_warp_striped(flat_id, block_input, items, valid,
out_of_bounds);
block_exchange_type().warp_striped_to_blocked(items, items, storage);
}
template<class InputIterator>
ROCPRIM_DEVICE ROCPRIM_INLINE
void load(InputIterator block_input,
T (&items)[ItemsPerThread],
storage_type& storage)
{
using value_type = typename std::iterator_traits<InputIterator>::value_type;
static_assert(std::is_convertible<value_type, T>::value,
"The type T must be such that an object of type InputIterator "
"can be dereferenced and then implicitly converted to T.");
const unsigned int flat_id = ::rocprim::flat_block_thread_id<BlockSizeX, BlockSizeY, BlockSizeZ>();
block_load_direct_warp_striped(flat_id, block_input, items);
block_exchange_type().warp_striped_to_blocked(items, items, storage);
}
template<class InputIterator>
ROCPRIM_DEVICE ROCPRIM_INLINE
void load(InputIterator block_input,
T (&items)[ItemsPerThread],
unsigned int valid,
storage_type& storage)
{
using value_type = typename std::iterator_traits<InputIterator>::value_type;
static_assert(std::is_convertible<value_type, T>::value,
"The type T must be such that an object of type InputIterator "
"can be dereferenced and then implicitly converted to T.");
const unsigned int flat_id = ::rocprim::flat_block_thread_id<BlockSizeX, BlockSizeY, BlockSizeZ>();
block_load_direct_warp_striped(flat_id, block_input, items, valid);
block_exchange_type().warp_striped_to_blocked(items, items, storage);
}
template<
class InputIterator,
class Default
>
ROCPRIM_DEVICE ROCPRIM_INLINE
void load(InputIterator block_input,
T (&items)[ItemsPerThread],
unsigned int valid,
Default out_of_bounds,
storage_type& storage)
{
using value_type = typename std::iterator_traits<InputIterator>::value_type;
static_assert(std::is_convertible<value_type, T>::value,
"The type T must be such that an object of type InputIterator "
"can be dereferenced and then implicitly converted to T.");
const unsigned int flat_id = ::rocprim::flat_block_thread_id<BlockSizeX, BlockSizeY, BlockSizeZ>();
block_load_direct_warp_striped(flat_id, block_input, items, valid,
out_of_bounds);
block_exchange_type().warp_striped_to_blocked(items, items, storage);
}
};
#endif // DOXYGEN_SHOULD_SKIP_THIS
END_ROCPRIM_NAMESPACE
#endif // ROCPRIM_BLOCK_BLOCK_LOAD_HPP_
// Copyright (c) 2017-2021 Advanced Micro Devices, Inc. All rights reserved.
//
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to deal
// in the Software without restriction, including without limitation the rights
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
// copies of the Software, and to permit persons to whom the Software is
// furnished to do so, subject to the following conditions:
//
// The above copyright notice and this permission notice shall be included in
// all copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
// THE SOFTWARE.
#ifndef ROCPRIM_BLOCK_BLOCK_LOAD_FUNC_HPP_
#define ROCPRIM_BLOCK_BLOCK_LOAD_FUNC_HPP_
#include "../config.hpp"
#include "../detail/various.hpp"
#include "../intrinsics.hpp"
#include "../functional.hpp"
#include "../types.hpp"
BEGIN_ROCPRIM_NAMESPACE
/// \addtogroup blockmodule
/// @{
/// \brief Loads data from continuous memory into a blocked arrangement of items
/// across the thread block.
///
/// The block arrangement is assumed to be (block-threads * \p ItemsPerThread) items
/// across a thread block. Each thread uses a \p flat_id to load a range of
/// \p ItemsPerThread elements into \p items.
///
/// \tparam InputIterator - [inferred] an iterator type for input (can be a simple
/// pointer).
/// \tparam T - [inferred] the data type
/// \tparam ItemsPerThread - [inferred] the number of items to be processed by
/// each thread
///
/// \param flat_id - a local flat 1D thread id in a block (tile) for the calling thread
/// \param block_input - the input iterator from the thread block to load from
/// \param items - array that data is loaded to
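/// \par Example.
/// An illustrative sketch (the kernel signature is a placeholder):
/// \code{.cpp}
/// __global__ void example_kernel(int * input, ...)
/// {
/// const unsigned int flat_id = rocprim::flat_block_thread_id<128, 1, 1>();
/// int items[4];
/// // thread i loads input[block_offset + i*4] .. input[block_offset + i*4 + 3]
/// rocprim::block_load_direct_blocked(flat_id, input + blockIdx.x * 128 * 4, items);
/// ...
/// }
/// \endcode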
template<
class InputIterator,
class T,
unsigned int ItemsPerThread
>
ROCPRIM_DEVICE ROCPRIM_INLINE
void block_load_direct_blocked(unsigned int flat_id,
InputIterator block_input,
T (&items)[ItemsPerThread])
{
unsigned int offset = flat_id * ItemsPerThread;
InputIterator thread_iter = block_input + offset;
ROCPRIM_UNROLL
for (unsigned int item = 0; item < ItemsPerThread; item++)
{
items[item] = thread_iter[item];
}
}
/// \brief Loads data from continuous memory into a blocked arrangement of items
/// across the thread block, which is guarded by range \p valid.
///
/// The block arrangement is assumed to be (block-threads * \p ItemsPerThread) items
/// across a thread block. Each thread uses a \p flat_id to load a range of
/// \p ItemsPerThread elements into \p items.
///
/// \tparam InputIterator - [inferred] an iterator type for input (can be a simple
/// pointer).
/// \tparam T - [inferred] the data type
/// \tparam ItemsPerThread - [inferred] the number of items to be processed by
/// each thread
///
/// \param flat_id - a local flat 1D thread id in a block (tile) for the calling thread
/// \param block_input - the input iterator from the thread block to load from
/// \param items - array that data is loaded to
/// \param valid - maximum range of valid numbers to load
template<
class InputIterator,
class T,
unsigned int ItemsPerThread
>
ROCPRIM_DEVICE ROCPRIM_INLINE
void block_load_direct_blocked(unsigned int flat_id,
InputIterator block_input,
T (&items)[ItemsPerThread],
unsigned int valid)
{
unsigned int offset = flat_id * ItemsPerThread;
InputIterator thread_iter = block_input + offset;
ROCPRIM_UNROLL
for (unsigned int item = 0; item < ItemsPerThread; item++)
{
if (item + offset < valid)
{
items[item] = thread_iter[item];
}
}
}
/// \brief Loads data from continuous memory into a blocked arrangement of items
/// across the thread block, which is guarded by range with a fall-back value
/// for out-of-bound elements.
///
/// The block arrangement is assumed to be (block-threads * \p ItemsPerThread) items
/// across a thread block. Each thread uses a \p flat_id to load a range of
/// \p ItemsPerThread elements into \p items.
///
/// \tparam InputIterator - [inferred] an iterator type for input (can be a simple
/// pointer).
/// \tparam T - [inferred] the data type
/// \tparam ItemsPerThread - [inferred] the number of items to be processed by
/// each thread
/// \tparam Default - [inferred] The data type of the default value
///
/// \param flat_id - a local flat 1D thread id in a block (tile) for the calling thread
/// \param block_input - the input iterator from the thread block to load from
/// \param items - array that data is loaded to
/// \param valid - maximum range of valid numbers to load
/// \param out_of_bounds - default value assigned to out-of-bound items
template<
class InputIterator,
class T,
unsigned int ItemsPerThread,
class Default
>
ROCPRIM_DEVICE ROCPRIM_INLINE
void block_load_direct_blocked(unsigned int flat_id,
InputIterator block_input,
T (&items)[ItemsPerThread],
unsigned int valid,
Default out_of_bounds)
{
ROCPRIM_UNROLL
for (unsigned int item = 0; item < ItemsPerThread; item++)
{
items[item] = static_cast<T>(out_of_bounds);
}
// TODO: Consider using std::fill for HIP-CPU, as it uses memset() where appropriate
block_load_direct_blocked(flat_id, block_input, items, valid);
}
/// \brief Loads data from continuous memory into a blocked arrangement of items
/// across the thread block.
///
/// The block arrangement is assumed to be (block-threads * \p ItemsPerThread) items
/// across a thread block. Each thread uses a \p flat_id to load a range of
/// \p ItemsPerThread elements into \p items.
///
/// The input offset (\p block_input + offset) must be quad-item aligned.
///
/// The following conditions will prevent vectorization and switch to default
/// block_load_direct_blocked:
/// * \p ItemsPerThread is odd.
/// * The datatype \p T is not a primitive or a HIP vector type (e.g. \p int2,
/// \p int4, etc.).
///
/// \tparam T - [inferred] the input data type
/// \tparam U - [inferred] the output data type
/// \tparam ItemsPerThread - [inferred] the number of items to be processed by
/// each thread
///
/// The type \p T must be such that it can be implicitly converted to \p U.
///
/// \param flat_id - a local flat 1D thread id in a block (tile) for the calling thread
/// \param block_input - the input iterator from the thread block to load from
/// \param items - array that data is loaded to
template<
class T,
class U,
unsigned int ItemsPerThread
>
ROCPRIM_DEVICE ROCPRIM_INLINE
auto
block_load_direct_blocked_vectorized(unsigned int flat_id,
T* block_input,
U (&items)[ItemsPerThread]) -> typename std::enable_if<detail::is_vectorizable<T, ItemsPerThread>::value>::type
{
typedef typename detail::match_vector_type<T, ItemsPerThread>::type vector_type;
constexpr unsigned int vectors_per_thread = (sizeof(T) * ItemsPerThread) / sizeof(vector_type);
vector_type vector_items[vectors_per_thread];
const vector_type* vector_ptr = reinterpret_cast<const vector_type*>(block_input) +
(flat_id * vectors_per_thread);
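// load wide, aligned vector chunks first; the second loop unpacks them into
// the scalar items array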
ROCPRIM_UNROLL
for (unsigned int item = 0; item < vectors_per_thread; item++)
{
vector_items[item] = *(vector_ptr + item);
}
ROCPRIM_UNROLL
for (unsigned int item = 0; item < ItemsPerThread; item++)
{
items[item] = *(reinterpret_cast<T*>(vector_items) + item);
}
}
template<
class T,
class U,
unsigned int ItemsPerThread
>
ROCPRIM_DEVICE ROCPRIM_INLINE
auto
block_load_direct_blocked_vectorized(unsigned int flat_id,
T* block_input,
U (&items)[ItemsPerThread]) -> typename std::enable_if<!detail::is_vectorizable<T, ItemsPerThread>::value>::type
{
block_load_direct_blocked(flat_id, block_input, items);
}
/// \brief Loads data from continuous memory into a striped arrangement of items
/// across the thread block.
///
/// The striped arrangement is assumed to be (\p BlockSize * \p ItemsPerThread) items
/// across a thread block. Each thread uses a \p flat_id to load a range of
/// \p ItemsPerThread elements into \p items.
///
/// \tparam BlockSize - the number of threads in a block
/// \tparam InputIterator - [inferred] an iterator type for input (can be a simple
/// pointer).
/// \tparam T - [inferred] the data type
/// \tparam ItemsPerThread - [inferred] the number of items to be processed by
/// each thread
///
/// \param flat_id - a local flat 1D thread id in a block (tile) for the calling thread
/// \param block_input - the input iterator from the thread block to load from
/// \param items - array that data is loaded to
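/// \par Example.
/// An illustrative sketch: with \p BlockSize = 128, thread \p t loads elements
/// \p t, \p t+128, \p t+256, ... of the block's range.
/// \code{.cpp}
/// __global__ void example_kernel(int * input, ...)
/// {
/// const unsigned int flat_id = rocprim::flat_block_thread_id<128, 1, 1>();
/// int items[4];
/// rocprim::block_load_direct_striped<128>(flat_id, input + blockIdx.x * 128 * 4, items);
/// ...
/// }
/// \endcode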
template<
unsigned int BlockSize,
class InputIterator,
class T,
unsigned int ItemsPerThread
>
ROCPRIM_DEVICE ROCPRIM_INLINE
void block_load_direct_striped(unsigned int flat_id,
InputIterator block_input,
T (&items)[ItemsPerThread])
{
InputIterator thread_iter = block_input + flat_id;
ROCPRIM_UNROLL
for (unsigned int item = 0; item < ItemsPerThread; item++)
{
items[item] = thread_iter[item * BlockSize];
}
}
/// \brief Loads data from continuous memory into a striped arrangement of items
/// across the thread block, which is guarded by range \p valid.
///
/// The striped arrangement is assumed to be (\p BlockSize * \p ItemsPerThread) items
/// across a thread block. Each thread uses a \p flat_id to load a range of
/// \p ItemsPerThread elements into \p items.
///
/// \tparam BlockSize - the number of threads in a block
/// \tparam InputIterator - [inferred] an iterator type for input (can be a simple
/// pointer).
/// \tparam T - [inferred] the data type
/// \tparam ItemsPerThread - [inferred] the number of items to be processed by
/// each thread
///
/// \param flat_id - a local flat 1D thread id in a block (tile) for the calling thread
/// \param block_input - the input iterator from the thread block to load from
/// \param items - array that data is loaded to
/// \param valid - maximum range of valid numbers to load
template<
unsigned int BlockSize,
class InputIterator,
class T,
unsigned int ItemsPerThread
>
ROCPRIM_DEVICE ROCPRIM_INLINE
void block_load_direct_striped(unsigned int flat_id,
InputIterator block_input,
T (&items)[ItemsPerThread],
unsigned int valid)
{
InputIterator thread_iter = block_input + flat_id;
ROCPRIM_UNROLL
for (unsigned int item = 0; item < ItemsPerThread; item++)
{
unsigned int offset = item * BlockSize;
if (flat_id + offset < valid)
{
items[item] = thread_iter[offset];
}
}
}
/// \brief Loads data from continuous memory into a striped arrangement of items
/// across the thread block, which is guarded by range with a fall-back value
/// for out-of-bound elements.
///
/// The striped arrangement is assumed to be (\p BlockSize * \p ItemsPerThread) items
/// across a thread block. Each thread uses a \p flat_id to load a range of
/// \p ItemsPerThread elements into \p items.
///
/// \tparam BlockSize - the number of threads in a block
/// \tparam InputIterator - [inferred] an iterator type for input (can be a simple
/// pointer).
/// \tparam T - [inferred] the data type
/// \tparam ItemsPerThread - [inferred] the number of items to be processed by
/// each thread
/// \tparam Default - [inferred] The data type of the default value
///
/// \param flat_id - a local flat 1D thread id in a block (tile) for the calling thread
/// \param block_input - the input iterator from the thread block to load from
/// \param items - array that data is loaded to
/// \param valid - maximum range of valid numbers to load
/// \param out_of_bounds - default value assigned to out-of-bound items
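///
/// \par Example
/// A minimal sketch (hypothetical kernel); every out-of-range item is filled with
/// the given default before the guarded load runs:
/// \code{.cpp}
/// __global__ void example_kernel(int* block_input, unsigned int valid_count)
/// {
///     int items[4];
///     // Items past valid_count are set to -1 instead of being left unmodified.
///     rocprim::block_load_direct_striped<128>(hipThreadIdx_x, block_input,
///                                             items, valid_count, -1);
/// }
/// \endcode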
template<
unsigned int BlockSize,
class InputIterator,
class T,
unsigned int ItemsPerThread,
class Default
>
ROCPRIM_DEVICE ROCPRIM_INLINE
void block_load_direct_striped(unsigned int flat_id,
InputIterator block_input,
T (&items)[ItemsPerThread],
unsigned int valid,
Default out_of_bounds)
{
ROCPRIM_UNROLL
for (unsigned int item = 0; item < ItemsPerThread; item++)
{
items[item] = out_of_bounds;
}
block_load_direct_striped<BlockSize>(flat_id, block_input, items, valid);
}

/// \brief Loads data from contiguous memory into a warp-striped arrangement of items
/// across the thread block.
///
/// The warp-striped arrangement is assumed to be (\p WarpSize * \p ItemsPerThread) items
/// across a thread block. Each thread uses a \p flat_id to load a range of
/// \p ItemsPerThread into \p items.
///
/// * The number of threads in the block must be a multiple of \p WarpSize.
/// * The default \p WarpSize is the hardware warp size, which is the optimal value.
/// * \p WarpSize must be a power of two and less than or equal to the hardware
/// warp size.
/// * Using a \p WarpSize smaller than the hardware warp size may result in lower
/// performance.
///
/// \tparam WarpSize - [optional] the number of threads in a warp
/// \tparam InputIterator - [inferred] an iterator type for input (can be a simple
/// pointer)
/// \tparam T - [inferred] the data type
/// \tparam ItemsPerThread - [inferred] the number of items to be processed by
/// each thread
///
/// \param flat_id - a local flat 1D thread id in a block (tile) for the calling thread
/// \param block_input - the input iterator from the thread block to load from
/// \param items - array that data is loaded to
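///
/// \par Example
/// A minimal usage sketch (hypothetical kernel; \p WarpSize is left at its
/// hardware default):
/// \code{.cpp}
/// __global__ void example_kernel(int* block_input)
/// {
///     int items[4];
///     // Lane l of warp w loads block_input[w * WarpSize * 4 + l + k * WarpSize]
///     // for k = 0, 1, 2, 3.
///     rocprim::block_load_direct_warp_striped(hipThreadIdx_x, block_input, items);
/// }
/// \endcode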
template<
unsigned int WarpSize = device_warp_size(),
class InputIterator,
class T,
unsigned int ItemsPerThread
>
ROCPRIM_DEVICE ROCPRIM_INLINE
void block_load_direct_warp_striped(unsigned int flat_id,
InputIterator block_input,
T (&items)[ItemsPerThread])
{
static_assert(detail::is_power_of_two(WarpSize) && WarpSize <= device_warp_size(),
"WarpSize must be a power of two and less than or equal "
"to the hardware warp size.");
unsigned int thread_id = detail::logical_lane_id<WarpSize>();
unsigned int warp_id = flat_id / WarpSize;
unsigned int warp_offset = warp_id * WarpSize * ItemsPerThread;
InputIterator thread_iter = block_input + thread_id + warp_offset;
ROCPRIM_UNROLL
for (unsigned int item = 0; item < ItemsPerThread; item++)
{
items[item] = thread_iter[item * WarpSize];
}
}

/// \brief Loads data from contiguous memory into a warp-striped arrangement of items
/// across the thread block, guarded by the range \p valid.
///
/// The warp-striped arrangement is assumed to be (\p WarpSize * \p ItemsPerThread) items
/// across a thread block. Each thread uses a \p flat_id to load a range of
/// \p ItemsPerThread into \p items.
///
/// * The number of threads in the block must be a multiple of \p WarpSize.
/// * The default \p WarpSize is the hardware warp size, which is the optimal value.
/// * \p WarpSize must be a power of two and less than or equal to the hardware
/// warp size.
/// * Using a \p WarpSize smaller than the hardware warp size may result in lower
/// performance.
///
/// \tparam WarpSize - [optional] the number of threads in a warp
/// \tparam InputIterator - [inferred] an iterator type for input (can be a simple
/// pointer)
/// \tparam T - [inferred] the data type
/// \tparam ItemsPerThread - [inferred] the number of items to be processed by
/// each thread
///
/// \param flat_id - a local flat 1D thread id in a block (tile) for the calling thread
/// \param block_input - the input iterator from the thread block to load from
/// \param items - array that data is loaded to
/// \param valid - the number of valid items to load; items past this range are left unmodified
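///
/// \par Example
/// A minimal sketch of a range-guarded warp-striped load (hypothetical kernel):
/// \code{.cpp}
/// __global__ void example_kernel(int* block_input, unsigned int valid_count)
/// {
///     int items[4];
///     // Only offsets below valid_count are read; other items stay unmodified.
///     rocprim::block_load_direct_warp_striped(hipThreadIdx_x, block_input,
///                                             items, valid_count);
/// }
/// \endcode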
template<
unsigned int WarpSize = device_warp_size(),
class InputIterator,
class T,
unsigned int ItemsPerThread
>
ROCPRIM_DEVICE ROCPRIM_INLINE
void block_load_direct_warp_striped(unsigned int flat_id,
InputIterator block_input,
T (&items)[ItemsPerThread],
unsigned int valid)
{
static_assert(detail::is_power_of_two(WarpSize) && WarpSize <= device_warp_size(),
"WarpSize must be a power of two and less than or equal "
"to the hardware warp size.");
unsigned int thread_id = detail::logical_lane_id<WarpSize>();
unsigned int warp_id = flat_id / WarpSize;
unsigned int warp_offset = warp_id * WarpSize * ItemsPerThread;
InputIterator thread_iter = block_input + thread_id + warp_offset;
ROCPRIM_UNROLL
for (unsigned int item = 0; item < ItemsPerThread; item++)
{
unsigned int offset = item * WarpSize;
if (warp_offset + thread_id + offset < valid)
{
items[item] = thread_iter[offset];
}
}
}

/// \brief Loads data from contiguous memory into a warp-striped arrangement of items
/// across the thread block, guarded by a range, with a fall-back value
/// for out-of-bound elements.
///
/// The warp-striped arrangement is assumed to be (\p WarpSize * \p ItemsPerThread) items
/// across a thread block. Each thread uses a \p flat_id to load a range of
/// \p ItemsPerThread into \p items.
///
/// * The number of threads in the block must be a multiple of \p WarpSize.
/// * The default \p WarpSize is the hardware warp size, which is the optimal value.
/// * \p WarpSize must be a power of two and less than or equal to the hardware
/// warp size.
/// * Using a \p WarpSize smaller than the hardware warp size may result in lower
/// performance.
///
/// \tparam WarpSize - [optional] the number of threads in a warp
/// \tparam InputIterator - [inferred] an iterator type for input (can be a simple
/// pointer)
/// \tparam T - [inferred] the data type
/// \tparam ItemsPerThread - [inferred] the number of items to be processed by
/// each thread
/// \tparam Default - [inferred] the data type of the default value
///
/// \param flat_id - a local flat 1D thread id in a block (tile) for the calling thread
/// \param block_input - the input iterator from the thread block to load from
/// \param items - array that data is loaded to
/// \param valid - the number of valid items to load; items past this range receive \p out_of_bounds
/// \param out_of_bounds - default value assigned to out-of-bound items
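///
/// \par Example
/// A minimal sketch (hypothetical kernel) combining the guard with a fill value:
/// \code{.cpp}
/// __global__ void example_kernel(int* block_input, unsigned int valid_count)
/// {
///     int items[4];
///     // Items past valid_count are initialized to 0 before the guarded load.
///     rocprim::block_load_direct_warp_striped(hipThreadIdx_x, block_input,
///                                             items, valid_count, 0);
/// }
/// \endcode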
template<
unsigned int WarpSize = device_warp_size(),
class InputIterator,
class T,
unsigned int ItemsPerThread,
class Default
>
ROCPRIM_DEVICE ROCPRIM_INLINE
void block_load_direct_warp_striped(unsigned int flat_id,
InputIterator block_input,
T (&items)[ItemsPerThread],
unsigned int valid,
Default out_of_bounds)
{
static_assert(detail::is_power_of_two(WarpSize) && WarpSize <= device_warp_size(),
"WarpSize must be a power of two and less than or equal "
"to the hardware warp size.");
ROCPRIM_UNROLL
for (unsigned int item = 0; item < ItemsPerThread; item++)
{
items[item] = out_of_bounds;
}
block_load_direct_warp_striped<WarpSize>(flat_id, block_input, items, valid);
}
END_ROCPRIM_NAMESPACE
/// @}
// end of group blockmodule
#endif // ROCPRIM_BLOCK_BLOCK_LOAD_FUNC_HPP_