Commit a117adf8 authored by lishen's avatar lishen
Browse files
parents b33659dd b705eeca
[submodule "third-party/rocshmem"]
path = third-party/rocshmem
url = http://112.11.119.99:10068/dcutoolkit/deeplearing/rocshmem.git
/******************************************************************************
* Copyright (c) Advanced Micro Devices, Inc. All rights reserved.
*
* SPDX-License-Identifier: MIT
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to
* deal in the Software without restriction, including without limitation the
* rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
* sell copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
* IN THE SOFTWARE.
*****************************************************************************/
#ifndef LIBRARY_INCLUDE_ROCSHMEM_HPP
#define LIBRARY_INCLUDE_ROCSHMEM_HPP
#include <hip/hip_runtime.h>
#include "rocshmem_config.h"
#include "rocshmem_common.hpp"
#include "rocshmem_RMA.hpp"
#include "rocshmem_AMO.hpp"
#include "rocshmem_SIG_OP.hpp"
#include "rocshmem_COLL.hpp"
#include "rocshmem_P2P_SYNC.hpp"
#include "rocshmem_RMA_X.hpp"
#if defined(HAVE_EXTERNAL_MPI)
#include <mpi.h>
#endif
/**
* @file rocshmem.hpp
* @brief Public header for rocSHMEM device and host libraries.
*
* This file contains all the callable functions and data structures for both
* the device-side runtime and host-side runtime.
*
* The comments on these functions are sparse, but the semantics are the same
* as those implemented in OpenSHMEM unless otherwise documented. Please see
* the OpenSHMEM 1.4 standards documentation for more details:
*
* http://openshmem.org/site/sites/default/site_files/OpenSHMEM-1.4.pdf
*/
namespace rocshmem {
constexpr char VERSION[] = "3.0.0";
/******************************************************************************
**************************** HOST INTERFACE **********************************
*****************************************************************************/
#if defined(HAVE_EXTERNAL_MPI)
/**
* @brief Initialize the rocSHMEM runtime and underlying transport layer.
*
* @param[in] comm MPI Communicator that rocSHMEM will be using
* If MPI_COMM_NULL, rocSHMEM will be using MPI_COMM_WORLD
*/
[[deprecated]] __host__ void rocshmem_init(MPI_Comm comm);
#endif
/**
* @brief Initialize the rocSHMEM runtime and underlying transport layer.
* This is equivalent to the rocshmem_init(MPI_Comm) overload, implicitly
* using MPI_COMM_WORLD for initialization.
*/
__host__ void rocshmem_init(void);
/**
* @brief Query rocSHMEM context from host API
*
* @return ROCSHMEM_CTX_DEFAULT device pointer that users can query from one
* instance of the rocshmem host library and use later for dynamic
* module initialization in the kernel bitcode device library in the
* same application.
*/
__host__ void * rocshmem_get_device_ctx();
/**
* @brief Query rocSHMEM remote symmetric heap pointer
*
* @param[in] dest local symmetric heap allocation pointer for current pe/device
*
* @param[in] pe remote PE
*
* @return Remote symmetric heap device pointer. When obtained from the
* host-side API, this can be used to issue load/store from custom
* kernels instead of using rocshmem device side get/put APIs for RMA
* operations.
*/
__host__ void* rocshmem_ptr(const void *dest, int pe);
// Device-side overload with the same signature.
// NOTE(review): presumably shares the semantics documented above -- confirm
// against the implementation.
__device__ ATTR_NO_INLINE void* rocshmem_ptr(const void *dest, int pe);
#if defined(HAVE_EXTERNAL_MPI)
/**
* @brief Initialize the rocSHMEM runtime and underlying transport layer
* with an attempt to enable the requested thread support.
*
* @param[in] requested Requested thread mode (from rocshmem_thread_ops)
* for host-facing functions.
* @param[out] provided Thread mode selected by the runtime. May not be equal
* to requested thread mode.
* @param[in] comm (Optional) MPI Communicator that rocSHMEM will be using
* If MPI_COMM_NULL, rocSHMEM will be using MPI_COMM_WORLD
*
* @return int returns 0 upon success; otherwise, it returns a nonzero
* value
*/
[[deprecated]] __host__ int rocshmem_init_thread(int requested, int *provided,
MPI_Comm comm);
#endif
/**
* @brief Initialize the rocSHMEM runtime and underlying transport layer
* using the provided mode and attributes
*
* @param[in] flags initialization method to be used.
* Valid values are ROCSHMEM_INIT_WITH_UNIQUEID and
* ROCSHMEM_INIT_WITH_MPI_COMM
* @param[in] attr attribute structure specifying input characteristics
*
* @return int returns 0 upon success; otherwise, it returns a nonzero
* value
*/
__host__ int rocshmem_init_attr(unsigned int flags, rocshmem_init_attr_t *attr);
/**
* @brief Return a uniqueID
*
* @param[out] uid Pointer to a rocshmem_uniqueid_t. NOTE(review): presumably
* receives the generated unique ID -- confirm against the
* implementation.
*
* @return int returns 0 upon success; otherwise, it returns a nonzero
* value
*/
__host__ int rocshmem_get_uniqueid(rocshmem_uniqueid_t *uid);
/**
* @brief Initializes the rocshmem_init_attr_t struct
*
* @param[in] rank rank of the calling process
* @param[in] nranks number of pes
* @param[in] uid unique ID used to identify the group of processes.
* NOTE(review): the original description ended with the
* unfinished sentence "All processes that"; presumably all
* processes in the group must pass the same uid -- confirm.
* @param[out] attr attribute structure to be passed to rocshmem_init_attr
*
* @return int returns 0 upon success; otherwise, it returns a nonzero
* value
*/
__host__ int rocshmem_set_attr_uniqueid_args(int rank, int nranks,
rocshmem_uniqueid_t *uid,
rocshmem_init_attr_t *attr);
/**
* @brief Query the thread mode used by the runtime.
*
* @param[out] provided Thread mode the runtime is operating in.
*
* @return void.
*/
__host__ void rocshmem_query_thread(int *provided);
/**
* @brief Function that dumps internal stats to stdout.
*/
__host__ void rocshmem_dump_stats();
/**
* @brief Reset all internal stats.
*/
__host__ void rocshmem_reset_stats();
/**
* @brief Finalize the rocSHMEM runtime.
*/
__host__ void rocshmem_finalize();
/**
* @brief Allocate memory of \p size bytes from the symmetric heap.
* This is a collective operation and must be called by all PEs.
*
* @param[in] size Memory allocation size in bytes.
*
* @return A pointer to the allocated memory on the symmetric heap.
*
* @todo Return error code instead of ptr.
*/
__host__ void *rocshmem_malloc(size_t size);
/**
* @brief Free a memory allocation from the symmetric heap.
* This is a collective operation and must be called by all PEs.
*
* @param[in] ptr Pointer to previously allocated memory on the symmetric heap.
*/
__host__ void rocshmem_free(void *ptr);
/**
* @brief Query for the number of PEs.
*
* @return Number of PEs.
*/
__host__ int rocshmem_n_pes();
/**
* @brief Query the PE ID of the caller.
*
* @return PE ID of the caller.
*/
__host__ int rocshmem_my_pe();
/**
* @brief Creates an OpenSHMEM context.
*
* @param[in] options Options for context creation. Ignored in current design.
* @param[out] ctx Context handle.
*
* @return Zero on success and nonzero otherwise.
*/
__host__ int rocshmem_ctx_create(int64_t options, rocshmem_ctx_t *ctx);
/**
* @brief Destroys an OpenSHMEM context.
*
* @param[in] ctx Context handle to destroy (passed by value).
*
* @return void.
*/
__host__ void rocshmem_ctx_destroy(rocshmem_ctx_t ctx);
/**
* @brief Translate the PE in src_team to that in dest_team.
*
* @param[in] src_team Handle of the team from which to translate
* @param[in] src_pe PE-of-interest's index in src_team
* @param[in] dest_team Handle of the team to which to translate
*
* @return PE of src_pe in dest_team. If any input is invalid
* or if src_pe is not in both source and destination
* teams, a value of -1 is returned.
*/
__host__ int rocshmem_team_translate_pe(rocshmem_team_t src_team, int src_pe,
rocshmem_team_t dest_team);
/**
* @brief Query the number of PEs in a team.
*
* @param[in] team The team whose number of PEs is queried.
*
* @return Number of PEs in the provided team.
*/
__host__ int rocshmem_team_n_pes(rocshmem_team_t team);
/**
* @brief Query the PE ID of the caller in a team.
*
* @param[in] team The team to query PE ID in.
*
* @return PE ID of the caller in the provided team.
*/
__host__ int rocshmem_team_my_pe(rocshmem_team_t team);
/**
* @brief Create a new team of PEs. Must be called by all PEs
* in the parent team.
*
* @param[in] parent_team The team to split from.
* @param[in] start The lowest PE number of the subset of the PEs
* from the parent team that will form the new
* team.
* @param[in] stride The stride between team PE members in the
* parent team that comprise the subset of PEs
* that will form the new team.
* @param[in] size The number of PEs in the new team.
* @param[in] config Pointer to the config parameters for the new
* team.
* @param[in] config_mask Bitwise mask representing parameters to use
* from config
* @param[out] new_team Pointer to the newly created team. If an error
* occurs during team creation, or if the PE in
* the parent team is not in the new team, the
* value will be ROCSHMEM_TEAM_INVALID.
*
* @return Zero upon successful team creation; non-zero if erroneous.
*/
__host__ int rocshmem_team_split_strided(rocshmem_team_t parent_team,
int start, int stride, int size,
const rocshmem_team_config_t *config,
long config_mask,
rocshmem_team_t *new_team);
/**
* @brief Destroy a team. Must be called by all PEs in the team.
* The user must destroy all private contexts created in the
* team before destroying this team. Otherwise, the behavior
* is undefined. This call will destroy only the shareable contexts
* created from the referenced team.
*
* @param[in] team The team to destroy. The behavior is undefined if
* the input team is ROCSHMEM_TEAM_WORLD or any other
* invalid team. If the input is ROCSHMEM_TEAM_INVALID,
* this function will not perform any operation.
*
* @return None.
*/
__host__ void rocshmem_team_destroy(rocshmem_team_t team);
/**
* @brief Guarantees order between messages in this context in accordance with
* OpenSHMEM semantics.
*
* @param[in] ctx Context with which to perform this operation.
*
* @return void.
*/
__host__ void rocshmem_ctx_fence(rocshmem_ctx_t ctx);
__host__ void rocshmem_fence();
/**
* @brief Completes all previous operations posted on the host.
*
* @param[in] ctx Context with which to perform this operation.
*
* @return void.
*/
__host__ void rocshmem_ctx_quiet(rocshmem_ctx_t ctx);
__host__ void rocshmem_quiet();
/**
* @brief perform a collective barrier between all PEs in the system.
* The caller is blocked until the barrier is resolved.
*
* @return void
*/
__host__ void rocshmem_barrier_all();
/**
* @brief enqueues a collective barrier on given stream.
*
* @return void
*/
__host__ void rocshmem_barrier_all_on_stream(hipStream_t stream);
/**
* @brief registers the arrival of a PE at a barrier.
* The caller is blocked until the synchronization is resolved.
*
* In contrast with the shmem_barrier_all routine, shmem_sync_all only ensures
* completion and visibility of previously issued memory stores and does not
* ensure completion of remote memory updates issued via OpenSHMEM routines.
*
* @return void
*/
__host__ void rocshmem_sync_all();
/**
* @brief allows any PE to force the termination of an entire program.
*
* @param[in] status The exit status from the main program.
*
* @return void
*/
__host__ void rocshmem_global_exit(int status);
/******************************************************************************
**************************** DEVICE INTERFACE ********************************
*****************************************************************************/
/**
* @brief Initializes device-side rocSHMEM resources. Must be called before
* any threads in this work-group invoke other rocSHMEM functions.
*
* Must be called collectively by all threads in the work-group.
*
* @return void.
*/
[[deprecated]] __device__ void rocshmem_wg_init();
/**
* @brief Finalizes device-side rocSHMEM resources. Must be called before
* work-group completion if the work-group also called rocshmem_wg_init().
*
* Must be called collectively by all threads in the work-group.
*
* @return void.
*/
[[deprecated]] __device__ void rocshmem_wg_finalize();
/**
* @brief Initializes device-side rocSHMEM resources. Must be called before
* any threads in this work-group invoke other rocSHMEM functions. This is
* a variant of rocshmem_wg_init that allows the caller to request a
* threading mode.
*
* @param[in] requested Requested thread mode from rocshmem_thread_ops.
* @param[out] provided Thread mode selected by the runtime. May not be equal
* to requested thread mode.
*
* Must be called collectively by all threads in the work-group.
*
* @return void.
*/
[[deprecated]] __device__ void rocshmem_wg_init_thread(int requested, int *provided);
/**
* @brief Query the thread mode used by the runtime.
*
* @param[out] provided Thread mode the runtime is operating in.
*
* @return void.
*/
__device__ void rocshmem_query_thread(int *provided);
/**
* @brief Creates an OpenSHMEM context. By design, the context is private
* to the calling work-group.
*
* Must be called collectively by all threads in the work-group.
*
* @param[in] options Options for context creation. Ignored in current design.
* @param[out] ctx Context handle.
*
* @return All threads return 0 if the context was created successfully. If any
* thread returns a non-zero value, the operation failed and a larger value of
* `ROCSHMEM_MAX_NUM_CONTEXTS` is required.
*/
__device__ ATTR_NO_INLINE int rocshmem_wg_ctx_create(int64_t options,
rocshmem_ctx_t *ctx);
// Variant of context creation that additionally takes a team handle.
// NOTE(review): presumably follows the same collective-call and return-value
// contract as rocshmem_wg_ctx_create -- confirm against the implementation.
__device__ ATTR_NO_INLINE int rocshmem_wg_team_create_ctx(
rocshmem_team_t team, long options, rocshmem_ctx_t *ctx);
/**
* @brief Destroys an OpenSHMEM context.
*
* Must be called collectively by all threads in the work-group.
*
* @param[in] ctx Pointer to the context to destroy.
*
* @return void.
*/
__device__ ATTR_NO_INLINE void rocshmem_wg_ctx_destroy(rocshmem_ctx_t *ctx);
/**
* @brief Guarantees order between messages in this context in accordance with
* OpenSHMEM semantics.
*
* This function can be called from divergent control paths at per-thread
* granularity. However, performance may be improved if the caller can
* coalesce contiguous messages and elect a leader thread to call into the
* rocSHMEM function.
*
* @param[in] ctx Context with which to perform this operation.
*
* @return void.
*/
__device__ ATTR_NO_INLINE void rocshmem_ctx_fence(rocshmem_ctx_t ctx);
__device__ ATTR_NO_INLINE void rocshmem_fence();
/**
* @brief Guarantees order between messages in this context in accordance with
* OpenSHMEM semantics.
*
* This function is an extension: it has the same semantics as the default
* fence API, but it applies per destination PE.
*
* @param[in] ctx Context with which to perform this operation.
* @param[in] pe destination pe.
*
* @return void.
*/
__device__ ATTR_NO_INLINE void rocshmem_ctx_fence(rocshmem_ctx_t ctx, int pe);
__device__ ATTR_NO_INLINE void rocshmem_fence(int pe);
/**
* @brief Completes all previous operations posted to this context.
*
* This function can be called from divergent control paths at per-thread
* granularity. However, performance may be improved if the caller can
* coalesce contiguous messages and elect a leader thread to call into the
* rocSHMEM function.
*
* @param[in] ctx Context with which to perform this operation.
*
* @return void.
*/
__device__ ATTR_NO_INLINE void rocshmem_ctx_quiet(rocshmem_ctx_t ctx);
__device__ ATTR_NO_INLINE void rocshmem_quiet();
/**
* @brief Completes all previous operations posted to this context for PEs in the
* `target_pes` array.
*
* @param[in] ctx Context with which to perform this operation.
*
* @param[in] target_pes Address of target PE array where the operations need to be completed.
*
* @param[in] npes The number of PEs in the target PE array.
*
* @return void.
*/
__device__ ATTR_NO_INLINE void rocshmem_ctx_pe_quiet(rocshmem_ctx_t ctx, const int *target_pes, size_t npes);
__device__ ATTR_NO_INLINE void rocshmem_pe_quiet(const int *target_pes, size_t npes);
/**
* @brief Query the total number of PEs.
*
* Can be called per thread with no performance penalty.
*
* @param[in] ctx GPU side handle.
*
* @return Total number of PEs.
*/
__device__ int rocshmem_ctx_n_pes(rocshmem_ctx_t ctx);
__device__ int rocshmem_n_pes();
/**
* @brief Query the PE ID of the caller.
*
* Can be called per thread with no performance penalty.
*
* @param[in] ctx GPU side handle
*
* @return PE ID of the caller.
*/
__device__ int rocshmem_ctx_my_pe(rocshmem_ctx_t ctx);
__device__ int rocshmem_my_pe();
/**
* @brief Translate the PE in src_team to that in dest_team.
*
* @param[in] src_team Handle of the team from which to translate
* @param[in] src_pe PE-of-interest's index in src_team
* @param[in] dest_team Handle of the team to which to translate
*
* @return PE of src_pe in dest_team. If any input is invalid
* or if src_pe is not in both source and destination
* teams, a value of -1 is returned.
*/
__device__ int rocshmem_team_translate_pe(rocshmem_team_t src_team,
int src_pe,
rocshmem_team_t dest_team);
// Device-side fence entry points, declared with and without an explicit
// context argument.
// NOTE(review): these are undocumented in this header; the names suggest a
// system-scope thread fence -- confirm the exact ordering guarantees against
// the implementation before relying on them.
__device__ ATTR_NO_INLINE void rocshmem_ctx_threadfence_system(
rocshmem_ctx_t ctx);
__device__ ATTR_NO_INLINE void rocshmem_threadfence_system();
} // namespace rocshmem
#endif // LIBRARY_INCLUDE_ROCSHMEM_HPP
/******************************************************************************
* Copyright (c) Advanced Micro Devices, Inc. All rights reserved.
*
* SPDX-License-Identifier: MIT
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to
* deal in the Software without restriction, including without limitation the
* rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
* sell copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
* IN THE SOFTWARE.
*****************************************************************************/
#ifndef LIBRARY_INCLUDE_ROCSHMEM_AMO_HPP
#define LIBRARY_INCLUDE_ROCSHMEM_AMO_HPP
namespace rocshmem {
/**
* @name SHMEM_ATOMIC_FETCH
* @brief Atomically return the value of \p source to the calling PE.
*
* The operation is blocking.
*
* This function can be called from divergent control paths at per-thread
* granularity.
*
* @param[in] ctx Context with which to perform this operation.
* @param[in] source Source address. Must be an address on the symmetric
* heap.
* @param[in] pe PE of the remote process.
*
* @return The value of \p source.
*/
__device__ ATTR_NO_INLINE float rocshmem_ctx_float_atomic_fetch(
rocshmem_ctx_t ctx, float *source, int pe);
__device__ ATTR_NO_INLINE float rocshmem_float_atomic_fetch(
float *source, int pe);
__host__ float rocshmem_ctx_float_atomic_fetch(
rocshmem_ctx_t ctx, float *source, int pe);
__host__ float rocshmem_float_atomic_fetch(
float *source, int pe);
__device__ ATTR_NO_INLINE double rocshmem_ctx_double_atomic_fetch(
rocshmem_ctx_t ctx, double *source, int pe);
__device__ ATTR_NO_INLINE double rocshmem_double_atomic_fetch(
double *source, int pe);
__host__ double rocshmem_ctx_double_atomic_fetch(
rocshmem_ctx_t ctx, double *source, int pe);
__host__ double rocshmem_double_atomic_fetch(
double *source, int pe);
__device__ ATTR_NO_INLINE int rocshmem_ctx_int_atomic_fetch(
rocshmem_ctx_t ctx, int *source, int pe);
__device__ ATTR_NO_INLINE int rocshmem_int_atomic_fetch(
int *source, int pe);
__host__ int rocshmem_ctx_int_atomic_fetch(
rocshmem_ctx_t ctx, int *source, int pe);
__host__ int rocshmem_int_atomic_fetch(
int *source, int pe);
__device__ ATTR_NO_INLINE long rocshmem_ctx_long_atomic_fetch(
rocshmem_ctx_t ctx, long *source, int pe);
__device__ ATTR_NO_INLINE long rocshmem_long_atomic_fetch(
long *source, int pe);
__host__ long rocshmem_ctx_long_atomic_fetch(
rocshmem_ctx_t ctx, long *source, int pe);
__host__ long rocshmem_long_atomic_fetch(
long *source, int pe);
__device__ ATTR_NO_INLINE long long rocshmem_ctx_longlong_atomic_fetch(
rocshmem_ctx_t ctx, long long *source, int pe);
__device__ ATTR_NO_INLINE long long rocshmem_longlong_atomic_fetch(
long long *source, int pe);
__host__ long long rocshmem_ctx_longlong_atomic_fetch(
rocshmem_ctx_t ctx, long long *source, int pe);
__host__ long long rocshmem_longlong_atomic_fetch(
long long *source, int pe);
__device__ ATTR_NO_INLINE unsigned int rocshmem_ctx_uint_atomic_fetch(
rocshmem_ctx_t ctx, unsigned int *source, int pe);
__device__ ATTR_NO_INLINE unsigned int rocshmem_uint_atomic_fetch(
unsigned int *source, int pe);
__host__ unsigned int rocshmem_ctx_uint_atomic_fetch(
rocshmem_ctx_t ctx, unsigned int *source, int pe);
__host__ unsigned int rocshmem_uint_atomic_fetch(
unsigned int *source, int pe);
__device__ ATTR_NO_INLINE unsigned long rocshmem_ctx_ulong_atomic_fetch(
rocshmem_ctx_t ctx, unsigned long *source, int pe);
__device__ ATTR_NO_INLINE unsigned long rocshmem_ulong_atomic_fetch(
unsigned long *source, int pe);
__host__ unsigned long rocshmem_ctx_ulong_atomic_fetch(
rocshmem_ctx_t ctx, unsigned long *source, int pe);
__host__ unsigned long rocshmem_ulong_atomic_fetch(
unsigned long *source, int pe);
__device__ ATTR_NO_INLINE unsigned long long rocshmem_ctx_ulonglong_atomic_fetch(
rocshmem_ctx_t ctx, unsigned long long *source, int pe);
__device__ ATTR_NO_INLINE unsigned long long rocshmem_ulonglong_atomic_fetch(
unsigned long long *source, int pe);
__host__ unsigned long long rocshmem_ctx_ulonglong_atomic_fetch(
rocshmem_ctx_t ctx, unsigned long long *source, int pe);
__host__ unsigned long long rocshmem_ulonglong_atomic_fetch(
unsigned long long *source, int pe);
__device__ ATTR_NO_INLINE int32_t rocshmem_ctx_int32_atomic_fetch(
rocshmem_ctx_t ctx, int32_t *source, int pe);
__device__ ATTR_NO_INLINE int32_t rocshmem_int32_atomic_fetch(
int32_t *source, int pe);
__host__ int32_t rocshmem_ctx_int32_atomic_fetch(
rocshmem_ctx_t ctx, int32_t *source, int pe);
__host__ int32_t rocshmem_int32_atomic_fetch(
int32_t *source, int pe);
__device__ ATTR_NO_INLINE int64_t rocshmem_ctx_int64_atomic_fetch(
rocshmem_ctx_t ctx, int64_t *source, int pe);
__device__ ATTR_NO_INLINE int64_t rocshmem_int64_atomic_fetch(
int64_t *source, int pe);
__host__ int64_t rocshmem_ctx_int64_atomic_fetch(
rocshmem_ctx_t ctx, int64_t *source, int pe);
__host__ int64_t rocshmem_int64_atomic_fetch(
int64_t *source, int pe);
__device__ ATTR_NO_INLINE uint32_t rocshmem_ctx_uint32_atomic_fetch(
rocshmem_ctx_t ctx, uint32_t *source, int pe);
__device__ ATTR_NO_INLINE uint32_t rocshmem_uint32_atomic_fetch(
uint32_t *source, int pe);
__host__ uint32_t rocshmem_ctx_uint32_atomic_fetch(
rocshmem_ctx_t ctx, uint32_t *source, int pe);
__host__ uint32_t rocshmem_uint32_atomic_fetch(
uint32_t *source, int pe);
__device__ ATTR_NO_INLINE uint64_t rocshmem_ctx_uint64_atomic_fetch(
rocshmem_ctx_t ctx, uint64_t *source, int pe);
__device__ ATTR_NO_INLINE uint64_t rocshmem_uint64_atomic_fetch(
uint64_t *source, int pe);
__host__ uint64_t rocshmem_ctx_uint64_atomic_fetch(
rocshmem_ctx_t ctx, uint64_t *source, int pe);
__host__ uint64_t rocshmem_uint64_atomic_fetch(
uint64_t *source, int pe);
__device__ ATTR_NO_INLINE size_t rocshmem_ctx_size_atomic_fetch(
rocshmem_ctx_t ctx, size_t *source, int pe);
__device__ ATTR_NO_INLINE size_t rocshmem_size_atomic_fetch(
size_t *source, int pe);
__host__ size_t rocshmem_ctx_size_atomic_fetch(
rocshmem_ctx_t ctx, size_t *source, int pe);
__host__ size_t rocshmem_size_atomic_fetch(
size_t *source, int pe);
__device__ ATTR_NO_INLINE ptrdiff_t rocshmem_ctx_ptrdiff_atomic_fetch(
rocshmem_ctx_t ctx, ptrdiff_t *source, int pe);
__device__ ATTR_NO_INLINE ptrdiff_t rocshmem_ptrdiff_atomic_fetch(
ptrdiff_t *source, int pe);
__host__ ptrdiff_t rocshmem_ctx_ptrdiff_atomic_fetch(
rocshmem_ctx_t ctx, ptrdiff_t *source, int pe);
__host__ ptrdiff_t rocshmem_ptrdiff_atomic_fetch(
ptrdiff_t *source, int pe);
/**
* @name SHMEM_ATOMIC_SET
* @brief Atomically set the value \p value to \p dest on \p pe.
*
* The operation is blocking.
*
* This function can be called from divergent control paths at per-thread
* granularity.
*
* @param[in] ctx Context with which to perform this operation.
* @param[in] dest Destination address. Must be an address on the symmetric
* heap.
* @param[in] value The value to be atomically set.
* @param[in] pe PE of the remote process.
*
* @return void
*/
__device__ ATTR_NO_INLINE void rocshmem_ctx_float_atomic_set(
rocshmem_ctx_t ctx, float *dest, float value, int pe);
__device__ ATTR_NO_INLINE void rocshmem_float_atomic_set(
float *dest, float value, int pe);
__host__ void rocshmem_ctx_float_atomic_set(
rocshmem_ctx_t ctx, float *dest, float value, int pe);
__host__ void rocshmem_float_atomic_set(
float *dest, float value, int pe);
__device__ ATTR_NO_INLINE void rocshmem_ctx_double_atomic_set(
rocshmem_ctx_t ctx, double *dest, double value, int pe);
__device__ ATTR_NO_INLINE void rocshmem_double_atomic_set(
double *dest, double value, int pe);
__host__ void rocshmem_ctx_double_atomic_set(
rocshmem_ctx_t ctx, double *dest, double value, int pe);
__host__ void rocshmem_double_atomic_set(
double *dest, double value, int pe);
__device__ ATTR_NO_INLINE void rocshmem_ctx_int_atomic_set(
rocshmem_ctx_t ctx, int *dest, int value, int pe);
__device__ ATTR_NO_INLINE void rocshmem_int_atomic_set(
int *dest, int value, int pe);
__host__ void rocshmem_ctx_int_atomic_set(
rocshmem_ctx_t ctx, int *dest, int value, int pe);
__host__ void rocshmem_int_atomic_set(
int *dest, int value, int pe);
__device__ ATTR_NO_INLINE void rocshmem_ctx_long_atomic_set(
rocshmem_ctx_t ctx, long *dest, long value, int pe);
__device__ ATTR_NO_INLINE void rocshmem_long_atomic_set(
long *dest, long value, int pe);
__host__ void rocshmem_ctx_long_atomic_set(
rocshmem_ctx_t ctx, long *dest, long value, int pe);
__host__ void rocshmem_long_atomic_set(
long *dest, long value, int pe);
__device__ ATTR_NO_INLINE void rocshmem_ctx_longlong_atomic_set(
rocshmem_ctx_t ctx, long long *dest, long long value, int pe);
__device__ ATTR_NO_INLINE void rocshmem_longlong_atomic_set(
long long *dest, long long value, int pe);
__host__ void rocshmem_ctx_longlong_atomic_set(
rocshmem_ctx_t ctx, long long *dest, long long value, int pe);
__host__ void rocshmem_longlong_atomic_set(
long long *dest, long long value, int pe);
__device__ ATTR_NO_INLINE void rocshmem_ctx_uint_atomic_set(
rocshmem_ctx_t ctx, unsigned int *dest, unsigned int value, int pe);
__device__ ATTR_NO_INLINE void rocshmem_uint_atomic_set(
unsigned int *dest, unsigned int value, int pe);
__host__ void rocshmem_ctx_uint_atomic_set(
rocshmem_ctx_t ctx, unsigned int *dest, unsigned int value, int pe);
__host__ void rocshmem_uint_atomic_set(
unsigned int *dest, unsigned int value, int pe);
__device__ ATTR_NO_INLINE void rocshmem_ctx_ulong_atomic_set(
rocshmem_ctx_t ctx, unsigned long *dest, unsigned long value, int pe);
__device__ ATTR_NO_INLINE void rocshmem_ulong_atomic_set(
unsigned long *dest, unsigned long value, int pe);
__host__ void rocshmem_ctx_ulong_atomic_set(
rocshmem_ctx_t ctx, unsigned long *dest, unsigned long value, int pe);
__host__ void rocshmem_ulong_atomic_set(
unsigned long *dest, unsigned long value, int pe);
__device__ ATTR_NO_INLINE void rocshmem_ctx_ulonglong_atomic_set(
rocshmem_ctx_t ctx, unsigned long long *dest, unsigned long long value, int pe);
__device__ ATTR_NO_INLINE void rocshmem_ulonglong_atomic_set(
unsigned long long *dest, unsigned long long value, int pe);
__host__ void rocshmem_ctx_ulonglong_atomic_set(
rocshmem_ctx_t ctx, unsigned long long *dest, unsigned long long value, int pe);
__host__ void rocshmem_ulonglong_atomic_set(
unsigned long long *dest, unsigned long long value, int pe);
__device__ ATTR_NO_INLINE void rocshmem_ctx_int32_atomic_set(
rocshmem_ctx_t ctx, int32_t *dest, int32_t value, int pe);
__device__ ATTR_NO_INLINE void rocshmem_int32_atomic_set(
int32_t *dest, int32_t value, int pe);
__host__ void rocshmem_ctx_int32_atomic_set(
rocshmem_ctx_t ctx, int32_t *dest, int32_t value, int pe);
__host__ void rocshmem_int32_atomic_set(
int32_t *dest, int32_t value, int pe);
__device__ ATTR_NO_INLINE void rocshmem_ctx_int64_atomic_set(
rocshmem_ctx_t ctx, int64_t *dest, int64_t value, int pe);
__device__ ATTR_NO_INLINE void rocshmem_int64_atomic_set(
int64_t *dest, int64_t value, int pe);
__host__ void rocshmem_ctx_int64_atomic_set(
rocshmem_ctx_t ctx, int64_t *dest, int64_t value, int pe);
__host__ void rocshmem_int64_atomic_set(
int64_t *dest, int64_t value, int pe);
__device__ ATTR_NO_INLINE void rocshmem_ctx_uint32_atomic_set(
rocshmem_ctx_t ctx, uint32_t *dest, uint32_t value, int pe);
__device__ ATTR_NO_INLINE void rocshmem_uint32_atomic_set(
uint32_t *dest, uint32_t value, int pe);
__host__ void rocshmem_ctx_uint32_atomic_set(
rocshmem_ctx_t ctx, uint32_t *dest, uint32_t value, int pe);
__host__ void rocshmem_uint32_atomic_set(
uint32_t *dest, uint32_t value, int pe);
__device__ ATTR_NO_INLINE void rocshmem_ctx_uint64_atomic_set(
rocshmem_ctx_t ctx, uint64_t *dest, uint64_t value, int pe);
__device__ ATTR_NO_INLINE void rocshmem_uint64_atomic_set(
uint64_t *dest, uint64_t value, int pe);
__host__ void rocshmem_ctx_uint64_atomic_set(
rocshmem_ctx_t ctx, uint64_t *dest, uint64_t value, int pe);
__host__ void rocshmem_uint64_atomic_set(
uint64_t *dest, uint64_t value, int pe);
__device__ ATTR_NO_INLINE void rocshmem_ctx_size_atomic_set(
rocshmem_ctx_t ctx, size_t *dest, size_t value, int pe);
__device__ ATTR_NO_INLINE void rocshmem_size_atomic_set(
size_t *dest, size_t value, int pe);
__host__ void rocshmem_ctx_size_atomic_set(
rocshmem_ctx_t ctx, size_t *dest, size_t value, int pe);
__host__ void rocshmem_size_atomic_set(
size_t *dest, size_t value, int pe);
__device__ ATTR_NO_INLINE void rocshmem_ctx_ptrdiff_atomic_set(
rocshmem_ctx_t ctx, ptrdiff_t *dest, ptrdiff_t value, int pe);
__device__ ATTR_NO_INLINE void rocshmem_ptrdiff_atomic_set(
ptrdiff_t *dest, ptrdiff_t value, int pe);
__host__ void rocshmem_ctx_ptrdiff_atomic_set(
rocshmem_ctx_t ctx, ptrdiff_t *dest, ptrdiff_t value, int pe);
__host__ void rocshmem_ptrdiff_atomic_set(
ptrdiff_t *dest, ptrdiff_t value, int pe);
/**
* @name SHMEM_ATOMIC_COMPARE_SWAP
* @brief Atomically compares the value in \p dest with \p cond and, if they
* are equal, puts \p value in \p dest. The operation returns the old value of
* \p dest to the calling PE.
*
* The operation is blocking.
*
* This function can be called from divergent control paths at per-thread
* granularity.
*
* @param[in] ctx Context with which to perform this operation.
* @param[in] dest Destination address. Must be an address on the symmetric
* heap.
* @param[in] cond The value to compare against.
* @param[in] value The value to be atomically swapped in.
* @param[in] pe PE of the remote process.
*
* @return The old value of \p dest.
*/
__device__ ATTR_NO_INLINE int rocshmem_ctx_int_atomic_compare_swap(
rocshmem_ctx_t ctx, int *dest, int cond, int value, int pe);
__device__ ATTR_NO_INLINE int rocshmem_int_atomic_compare_swap(
int *dest, int cond, int value, int pe);
__host__ int rocshmem_ctx_int_atomic_compare_swap(
rocshmem_ctx_t ctx, int *dest, int cond, int value, int pe);
__host__ int rocshmem_int_atomic_compare_swap(
int *dest, int cond, int value, int pe);
__device__ ATTR_NO_INLINE long rocshmem_ctx_long_atomic_compare_swap(
rocshmem_ctx_t ctx, long *dest, long cond, long value, int pe);
__device__ ATTR_NO_INLINE long rocshmem_long_atomic_compare_swap(
long *dest, long cond, long value, int pe);
__host__ long rocshmem_ctx_long_atomic_compare_swap(
rocshmem_ctx_t ctx, long *dest, long cond, long value, int pe);
__host__ long rocshmem_long_atomic_compare_swap(
long *dest, long cond, long value, int pe);
__device__ ATTR_NO_INLINE long long rocshmem_ctx_longlong_atomic_compare_swap(
rocshmem_ctx_t ctx, long long *dest, long long cond, long long value, int pe);
__device__ ATTR_NO_INLINE long long rocshmem_longlong_atomic_compare_swap(
long long *dest, long long cond, long long value, int pe);
__host__ long long rocshmem_ctx_longlong_atomic_compare_swap(
rocshmem_ctx_t ctx, long long *dest, long long cond, long long value, int pe);
__host__ long long rocshmem_longlong_atomic_compare_swap(
long long *dest, long long cond, long long value, int pe);
__device__ ATTR_NO_INLINE unsigned int rocshmem_ctx_uint_atomic_compare_swap(
rocshmem_ctx_t ctx, unsigned int *dest, unsigned int cond, unsigned int value, int pe);
__device__ ATTR_NO_INLINE unsigned int rocshmem_uint_atomic_compare_swap(
unsigned int *dest, unsigned int cond, unsigned int value, int pe);
__host__ unsigned int rocshmem_ctx_uint_atomic_compare_swap(
rocshmem_ctx_t ctx, unsigned int *dest, unsigned int cond, unsigned int value, int pe);
__host__ unsigned int rocshmem_uint_atomic_compare_swap(
unsigned int *dest, unsigned int cond, unsigned int value, int pe);
__device__ ATTR_NO_INLINE unsigned long rocshmem_ctx_ulong_atomic_compare_swap(
rocshmem_ctx_t ctx, unsigned long *dest, unsigned long cond, unsigned long value, int pe);
__device__ ATTR_NO_INLINE unsigned long rocshmem_ulong_atomic_compare_swap(
unsigned long *dest, unsigned long cond, unsigned long value, int pe);
__host__ unsigned long rocshmem_ctx_ulong_atomic_compare_swap(
rocshmem_ctx_t ctx, unsigned long *dest, unsigned long cond, unsigned long value, int pe);
__host__ unsigned long rocshmem_ulong_atomic_compare_swap(
unsigned long *dest, unsigned long cond, unsigned long value, int pe);
__device__ ATTR_NO_INLINE unsigned long long rocshmem_ctx_ulonglong_atomic_compare_swap(
rocshmem_ctx_t ctx, unsigned long long *dest, unsigned long long cond, unsigned long long value, int pe);
__device__ ATTR_NO_INLINE unsigned long long rocshmem_ulonglong_atomic_compare_swap(
unsigned long long *dest, unsigned long long cond, unsigned long long value, int pe);
__host__ unsigned long long rocshmem_ctx_ulonglong_atomic_compare_swap(
rocshmem_ctx_t ctx, unsigned long long *dest, unsigned long long cond, unsigned long long value, int pe);
__host__ unsigned long long rocshmem_ulonglong_atomic_compare_swap(
unsigned long long *dest, unsigned long long cond, unsigned long long value, int pe);
__device__ ATTR_NO_INLINE int32_t rocshmem_ctx_int32_atomic_compare_swap(
rocshmem_ctx_t ctx, int32_t *dest, int32_t cond, int32_t value, int pe);
__device__ ATTR_NO_INLINE int32_t rocshmem_int32_atomic_compare_swap(
int32_t *dest, int32_t cond, int32_t value, int pe);
__host__ int32_t rocshmem_ctx_int32_atomic_compare_swap(
rocshmem_ctx_t ctx, int32_t *dest, int32_t cond, int32_t value, int pe);
__host__ int32_t rocshmem_int32_atomic_compare_swap(
int32_t *dest, int32_t cond, int32_t value, int pe);
__device__ ATTR_NO_INLINE int64_t rocshmem_ctx_int64_atomic_compare_swap(
rocshmem_ctx_t ctx, int64_t *dest, int64_t cond, int64_t value, int pe);
__device__ ATTR_NO_INLINE int64_t rocshmem_int64_atomic_compare_swap(
int64_t *dest, int64_t cond, int64_t value, int pe);
__host__ int64_t rocshmem_ctx_int64_atomic_compare_swap(
rocshmem_ctx_t ctx, int64_t *dest, int64_t cond, int64_t value, int pe);
__host__ int64_t rocshmem_int64_atomic_compare_swap(
int64_t *dest, int64_t cond, int64_t value, int pe);
__device__ ATTR_NO_INLINE uint32_t rocshmem_ctx_uint32_atomic_compare_swap(
rocshmem_ctx_t ctx, uint32_t *dest, uint32_t cond, uint32_t value, int pe);
__device__ ATTR_NO_INLINE uint32_t rocshmem_uint32_atomic_compare_swap(
uint32_t *dest, uint32_t cond, uint32_t value, int pe);
__host__ uint32_t rocshmem_ctx_uint32_atomic_compare_swap(
rocshmem_ctx_t ctx, uint32_t *dest, uint32_t cond, uint32_t value, int pe);
__host__ uint32_t rocshmem_uint32_atomic_compare_swap(
uint32_t *dest, uint32_t cond, uint32_t value, int pe);
__device__ ATTR_NO_INLINE uint64_t rocshmem_ctx_uint64_atomic_compare_swap(
rocshmem_ctx_t ctx, uint64_t *dest, uint64_t cond, uint64_t value, int pe);
__device__ ATTR_NO_INLINE uint64_t rocshmem_uint64_atomic_compare_swap(
uint64_t *dest, uint64_t cond, uint64_t value, int pe);
__host__ uint64_t rocshmem_ctx_uint64_atomic_compare_swap(
rocshmem_ctx_t ctx, uint64_t *dest, uint64_t cond, uint64_t value, int pe);
__host__ uint64_t rocshmem_uint64_atomic_compare_swap(
uint64_t *dest, uint64_t cond, uint64_t value, int pe);
__device__ ATTR_NO_INLINE size_t rocshmem_ctx_size_atomic_compare_swap(
rocshmem_ctx_t ctx, size_t *dest, size_t cond, size_t value, int pe);
__device__ ATTR_NO_INLINE size_t rocshmem_size_atomic_compare_swap(
size_t *dest, size_t cond, size_t value, int pe);
__host__ size_t rocshmem_ctx_size_atomic_compare_swap(
rocshmem_ctx_t ctx, size_t *dest, size_t cond, size_t value, int pe);
__host__ size_t rocshmem_size_atomic_compare_swap(
size_t *dest, size_t cond, size_t value, int pe);
__device__ ATTR_NO_INLINE ptrdiff_t rocshmem_ctx_ptrdiff_atomic_compare_swap(
rocshmem_ctx_t ctx, ptrdiff_t *dest, ptrdiff_t cond, ptrdiff_t value, int pe);
__device__ ATTR_NO_INLINE ptrdiff_t rocshmem_ptrdiff_atomic_compare_swap(
ptrdiff_t *dest, ptrdiff_t cond, ptrdiff_t value, int pe);
__host__ ptrdiff_t rocshmem_ctx_ptrdiff_atomic_compare_swap(
rocshmem_ctx_t ctx, ptrdiff_t *dest, ptrdiff_t cond, ptrdiff_t value, int pe);
__host__ ptrdiff_t rocshmem_ptrdiff_atomic_compare_swap(
ptrdiff_t *dest, ptrdiff_t cond, ptrdiff_t value, int pe);
/**
* @name SHMEM_ATOMIC_SWAP
* @brief Atomically swap the value \p value into \p dest on \p pe. The
* operation returns the original value of \p dest to the calling PE.
*
* The operation is blocking.
*
* This function can be called from divergent control paths at per-thread
* granularity.
*
* Every supported type (float and double included) is declared four times:
* as a __device__ and as a __host__ function, each with an explicit \p ctx
* argument (rocshmem_ctx_<type>_atomic_swap) and without one
* (rocshmem_<type>_atomic_swap).
*
* @param[in] ctx Context with which to perform this operation (only present
* on the rocshmem_ctx_* variants).
* @param[in] dest Destination address. Must be an address on the symmetric
* heap.
* @param[in] value The value to be atomically swapped into \p dest.
* @param[in] pe PE of the remote process.
*
* @return The original value of \p dest.
*/
__device__ ATTR_NO_INLINE float rocshmem_ctx_float_atomic_swap(
rocshmem_ctx_t ctx, float *dest, float value, int pe);
__device__ ATTR_NO_INLINE float rocshmem_float_atomic_swap(
float *dest, float value, int pe);
__host__ float rocshmem_ctx_float_atomic_swap(
rocshmem_ctx_t ctx, float *dest, float value, int pe);
__host__ float rocshmem_float_atomic_swap(
float *dest, float value, int pe);
__device__ ATTR_NO_INLINE double rocshmem_ctx_double_atomic_swap(
rocshmem_ctx_t ctx, double *dest, double value, int pe);
__device__ ATTR_NO_INLINE double rocshmem_double_atomic_swap(
double *dest, double value, int pe);
__host__ double rocshmem_ctx_double_atomic_swap(
rocshmem_ctx_t ctx, double *dest, double value, int pe);
__host__ double rocshmem_double_atomic_swap(
double *dest, double value, int pe);
__device__ ATTR_NO_INLINE int rocshmem_ctx_int_atomic_swap(
rocshmem_ctx_t ctx, int *dest, int value, int pe);
__device__ ATTR_NO_INLINE int rocshmem_int_atomic_swap(
int *dest, int value, int pe);
__host__ int rocshmem_ctx_int_atomic_swap(
rocshmem_ctx_t ctx, int *dest, int value, int pe);
__host__ int rocshmem_int_atomic_swap(
int *dest, int value, int pe);
__device__ ATTR_NO_INLINE long rocshmem_ctx_long_atomic_swap(
rocshmem_ctx_t ctx, long *dest, long value, int pe);
__device__ ATTR_NO_INLINE long rocshmem_long_atomic_swap(
long *dest, long value, int pe);
__host__ long rocshmem_ctx_long_atomic_swap(
rocshmem_ctx_t ctx, long *dest, long value, int pe);
__host__ long rocshmem_long_atomic_swap(
long *dest, long value, int pe);
__device__ ATTR_NO_INLINE long long rocshmem_ctx_longlong_atomic_swap(
rocshmem_ctx_t ctx, long long *dest, long long value, int pe);
__device__ ATTR_NO_INLINE long long rocshmem_longlong_atomic_swap(
long long *dest, long long value, int pe);
__host__ long long rocshmem_ctx_longlong_atomic_swap(
rocshmem_ctx_t ctx, long long *dest, long long value, int pe);
__host__ long long rocshmem_longlong_atomic_swap(
long long *dest, long long value, int pe);
__device__ ATTR_NO_INLINE unsigned int rocshmem_ctx_uint_atomic_swap(
rocshmem_ctx_t ctx, unsigned int *dest, unsigned int value, int pe);
__device__ ATTR_NO_INLINE unsigned int rocshmem_uint_atomic_swap(
unsigned int *dest, unsigned int value, int pe);
__host__ unsigned int rocshmem_ctx_uint_atomic_swap(
rocshmem_ctx_t ctx, unsigned int *dest, unsigned int value, int pe);
__host__ unsigned int rocshmem_uint_atomic_swap(
unsigned int *dest, unsigned int value, int pe);
__device__ ATTR_NO_INLINE unsigned long rocshmem_ctx_ulong_atomic_swap(
rocshmem_ctx_t ctx, unsigned long *dest, unsigned long value, int pe);
__device__ ATTR_NO_INLINE unsigned long rocshmem_ulong_atomic_swap(
unsigned long *dest, unsigned long value, int pe);
__host__ unsigned long rocshmem_ctx_ulong_atomic_swap(
rocshmem_ctx_t ctx, unsigned long *dest, unsigned long value, int pe);
__host__ unsigned long rocshmem_ulong_atomic_swap(
unsigned long *dest, unsigned long value, int pe);
__device__ ATTR_NO_INLINE unsigned long long rocshmem_ctx_ulonglong_atomic_swap(
rocshmem_ctx_t ctx, unsigned long long *dest, unsigned long long value, int pe);
__device__ ATTR_NO_INLINE unsigned long long rocshmem_ulonglong_atomic_swap(
unsigned long long *dest, unsigned long long value, int pe);
__host__ unsigned long long rocshmem_ctx_ulonglong_atomic_swap(
rocshmem_ctx_t ctx, unsigned long long *dest, unsigned long long value, int pe);
__host__ unsigned long long rocshmem_ulonglong_atomic_swap(
unsigned long long *dest, unsigned long long value, int pe);
__device__ ATTR_NO_INLINE int32_t rocshmem_ctx_int32_atomic_swap(
rocshmem_ctx_t ctx, int32_t *dest, int32_t value, int pe);
__device__ ATTR_NO_INLINE int32_t rocshmem_int32_atomic_swap(
int32_t *dest, int32_t value, int pe);
__host__ int32_t rocshmem_ctx_int32_atomic_swap(
rocshmem_ctx_t ctx, int32_t *dest, int32_t value, int pe);
__host__ int32_t rocshmem_int32_atomic_swap(
int32_t *dest, int32_t value, int pe);
__device__ ATTR_NO_INLINE int64_t rocshmem_ctx_int64_atomic_swap(
rocshmem_ctx_t ctx, int64_t *dest, int64_t value, int pe);
__device__ ATTR_NO_INLINE int64_t rocshmem_int64_atomic_swap(
int64_t *dest, int64_t value, int pe);
__host__ int64_t rocshmem_ctx_int64_atomic_swap(
rocshmem_ctx_t ctx, int64_t *dest, int64_t value, int pe);
__host__ int64_t rocshmem_int64_atomic_swap(
int64_t *dest, int64_t value, int pe);
__device__ ATTR_NO_INLINE uint32_t rocshmem_ctx_uint32_atomic_swap(
rocshmem_ctx_t ctx, uint32_t *dest, uint32_t value, int pe);
__device__ ATTR_NO_INLINE uint32_t rocshmem_uint32_atomic_swap(
uint32_t *dest, uint32_t value, int pe);
__host__ uint32_t rocshmem_ctx_uint32_atomic_swap(
rocshmem_ctx_t ctx, uint32_t *dest, uint32_t value, int pe);
__host__ uint32_t rocshmem_uint32_atomic_swap(
uint32_t *dest, uint32_t value, int pe);
__device__ ATTR_NO_INLINE uint64_t rocshmem_ctx_uint64_atomic_swap(
rocshmem_ctx_t ctx, uint64_t *dest, uint64_t value, int pe);
__device__ ATTR_NO_INLINE uint64_t rocshmem_uint64_atomic_swap(
uint64_t *dest, uint64_t value, int pe);
__host__ uint64_t rocshmem_ctx_uint64_atomic_swap(
rocshmem_ctx_t ctx, uint64_t *dest, uint64_t value, int pe);
__host__ uint64_t rocshmem_uint64_atomic_swap(
uint64_t *dest, uint64_t value, int pe);
__device__ ATTR_NO_INLINE size_t rocshmem_ctx_size_atomic_swap(
rocshmem_ctx_t ctx, size_t *dest, size_t value, int pe);
__device__ ATTR_NO_INLINE size_t rocshmem_size_atomic_swap(
size_t *dest, size_t value, int pe);
__host__ size_t rocshmem_ctx_size_atomic_swap(
rocshmem_ctx_t ctx, size_t *dest, size_t value, int pe);
__host__ size_t rocshmem_size_atomic_swap(
size_t *dest, size_t value, int pe);
__device__ ATTR_NO_INLINE ptrdiff_t rocshmem_ctx_ptrdiff_atomic_swap(
rocshmem_ctx_t ctx, ptrdiff_t *dest, ptrdiff_t value, int pe);
__device__ ATTR_NO_INLINE ptrdiff_t rocshmem_ptrdiff_atomic_swap(
ptrdiff_t *dest, ptrdiff_t value, int pe);
__host__ ptrdiff_t rocshmem_ctx_ptrdiff_atomic_swap(
rocshmem_ctx_t ctx, ptrdiff_t *dest, ptrdiff_t value, int pe);
__host__ ptrdiff_t rocshmem_ptrdiff_atomic_swap(
ptrdiff_t *dest, ptrdiff_t value, int pe);
/**
* @name SHMEM_ATOMIC_FETCH_INC
* @brief Atomically add 1 to \p dest on \p pe. The operation
* returns the old value of \p dest to the calling PE.
*
* The operation is blocking.
*
* This function can be called from divergent control paths at per-thread
* granularity.
*
* Every supported type is declared four times: as a __device__ and as a
* __host__ function, each with an explicit \p ctx argument
* (rocshmem_ctx_<type>_atomic_fetch_inc) and without one
* (rocshmem_<type>_atomic_fetch_inc).
*
* @param[in] ctx Context with which to perform this operation (only present
* on the rocshmem_ctx_* variants).
* @param[in] dest Destination address. Must be an address on the symmetric
* heap.
* @param[in] pe PE of the remote process.
*
* @return The old value of \p dest before it was incremented by 1.
*/
__device__ ATTR_NO_INLINE int rocshmem_ctx_int_atomic_fetch_inc(
rocshmem_ctx_t ctx, int *dest, int pe);
__device__ ATTR_NO_INLINE int rocshmem_int_atomic_fetch_inc(
int *dest, int pe);
__host__ int rocshmem_ctx_int_atomic_fetch_inc(
rocshmem_ctx_t ctx, int *dest, int pe);
__host__ int rocshmem_int_atomic_fetch_inc(
int *dest, int pe);
__device__ ATTR_NO_INLINE long rocshmem_ctx_long_atomic_fetch_inc(
rocshmem_ctx_t ctx, long *dest, int pe);
__device__ ATTR_NO_INLINE long rocshmem_long_atomic_fetch_inc(
long *dest, int pe);
__host__ long rocshmem_ctx_long_atomic_fetch_inc(
rocshmem_ctx_t ctx, long *dest, int pe);
__host__ long rocshmem_long_atomic_fetch_inc(
long *dest, int pe);
__device__ ATTR_NO_INLINE long long rocshmem_ctx_longlong_atomic_fetch_inc(
rocshmem_ctx_t ctx, long long *dest, int pe);
__device__ ATTR_NO_INLINE long long rocshmem_longlong_atomic_fetch_inc(
long long *dest, int pe);
__host__ long long rocshmem_ctx_longlong_atomic_fetch_inc(
rocshmem_ctx_t ctx, long long *dest, int pe);
__host__ long long rocshmem_longlong_atomic_fetch_inc(
long long *dest, int pe);
__device__ ATTR_NO_INLINE unsigned int rocshmem_ctx_uint_atomic_fetch_inc(
rocshmem_ctx_t ctx, unsigned int *dest, int pe);
__device__ ATTR_NO_INLINE unsigned int rocshmem_uint_atomic_fetch_inc(
unsigned int *dest, int pe);
__host__ unsigned int rocshmem_ctx_uint_atomic_fetch_inc(
rocshmem_ctx_t ctx, unsigned int *dest, int pe);
__host__ unsigned int rocshmem_uint_atomic_fetch_inc(
unsigned int *dest, int pe);
__device__ ATTR_NO_INLINE unsigned long rocshmem_ctx_ulong_atomic_fetch_inc(
rocshmem_ctx_t ctx, unsigned long *dest, int pe);
__device__ ATTR_NO_INLINE unsigned long rocshmem_ulong_atomic_fetch_inc(
unsigned long *dest, int pe);
__host__ unsigned long rocshmem_ctx_ulong_atomic_fetch_inc(
rocshmem_ctx_t ctx, unsigned long *dest, int pe);
__host__ unsigned long rocshmem_ulong_atomic_fetch_inc(
unsigned long *dest, int pe);
__device__ ATTR_NO_INLINE unsigned long long rocshmem_ctx_ulonglong_atomic_fetch_inc(
rocshmem_ctx_t ctx, unsigned long long *dest, int pe);
__device__ ATTR_NO_INLINE unsigned long long rocshmem_ulonglong_atomic_fetch_inc(
unsigned long long *dest, int pe);
__host__ unsigned long long rocshmem_ctx_ulonglong_atomic_fetch_inc(
rocshmem_ctx_t ctx, unsigned long long *dest, int pe);
__host__ unsigned long long rocshmem_ulonglong_atomic_fetch_inc(
unsigned long long *dest, int pe);
__device__ ATTR_NO_INLINE int32_t rocshmem_ctx_int32_atomic_fetch_inc(
rocshmem_ctx_t ctx, int32_t *dest, int pe);
__device__ ATTR_NO_INLINE int32_t rocshmem_int32_atomic_fetch_inc(
int32_t *dest, int pe);
__host__ int32_t rocshmem_ctx_int32_atomic_fetch_inc(
rocshmem_ctx_t ctx, int32_t *dest, int pe);
__host__ int32_t rocshmem_int32_atomic_fetch_inc(
int32_t *dest, int pe);
__device__ ATTR_NO_INLINE int64_t rocshmem_ctx_int64_atomic_fetch_inc(
rocshmem_ctx_t ctx, int64_t *dest, int pe);
__device__ ATTR_NO_INLINE int64_t rocshmem_int64_atomic_fetch_inc(
int64_t *dest, int pe);
__host__ int64_t rocshmem_ctx_int64_atomic_fetch_inc(
rocshmem_ctx_t ctx, int64_t *dest, int pe);
__host__ int64_t rocshmem_int64_atomic_fetch_inc(
int64_t *dest, int pe);
__device__ ATTR_NO_INLINE uint32_t rocshmem_ctx_uint32_atomic_fetch_inc(
rocshmem_ctx_t ctx, uint32_t *dest, int pe);
__device__ ATTR_NO_INLINE uint32_t rocshmem_uint32_atomic_fetch_inc(
uint32_t *dest, int pe);
__host__ uint32_t rocshmem_ctx_uint32_atomic_fetch_inc(
rocshmem_ctx_t ctx, uint32_t *dest, int pe);
__host__ uint32_t rocshmem_uint32_atomic_fetch_inc(
uint32_t *dest, int pe);
__device__ ATTR_NO_INLINE uint64_t rocshmem_ctx_uint64_atomic_fetch_inc(
rocshmem_ctx_t ctx, uint64_t *dest, int pe);
__device__ ATTR_NO_INLINE uint64_t rocshmem_uint64_atomic_fetch_inc(
uint64_t *dest, int pe);
__host__ uint64_t rocshmem_ctx_uint64_atomic_fetch_inc(
rocshmem_ctx_t ctx, uint64_t *dest, int pe);
__host__ uint64_t rocshmem_uint64_atomic_fetch_inc(
uint64_t *dest, int pe);
__device__ ATTR_NO_INLINE size_t rocshmem_ctx_size_atomic_fetch_inc(
rocshmem_ctx_t ctx, size_t *dest, int pe);
__device__ ATTR_NO_INLINE size_t rocshmem_size_atomic_fetch_inc(
size_t *dest, int pe);
__host__ size_t rocshmem_ctx_size_atomic_fetch_inc(
rocshmem_ctx_t ctx, size_t *dest, int pe);
__host__ size_t rocshmem_size_atomic_fetch_inc(
size_t *dest, int pe);
__device__ ATTR_NO_INLINE ptrdiff_t rocshmem_ctx_ptrdiff_atomic_fetch_inc(
rocshmem_ctx_t ctx, ptrdiff_t *dest, int pe);
__device__ ATTR_NO_INLINE ptrdiff_t rocshmem_ptrdiff_atomic_fetch_inc(
ptrdiff_t *dest, int pe);
__host__ ptrdiff_t rocshmem_ctx_ptrdiff_atomic_fetch_inc(
rocshmem_ctx_t ctx, ptrdiff_t *dest, int pe);
__host__ ptrdiff_t rocshmem_ptrdiff_atomic_fetch_inc(
ptrdiff_t *dest, int pe);
/**
* @name SHMEM_ATOMIC_INC
* @brief Atomically add 1 to \p dest on \p pe.
*
* The operation is blocking.
*
* This function can be called from divergent control paths at per-thread
* granularity.
*
* Every supported type is declared four times: as a __device__ and as a
* __host__ function, each with an explicit \p ctx argument
* (rocshmem_ctx_<type>_atomic_inc) and without one
* (rocshmem_<type>_atomic_inc).
*
* @param[in] ctx Context with which to perform this operation (only present
* on the rocshmem_ctx_* variants).
* @param[in] dest Destination address. Must be an address on the symmetric
* heap.
* @param[in] pe PE of the remote process.
*
* @return void
*/
__device__ ATTR_NO_INLINE void rocshmem_ctx_int_atomic_inc(
rocshmem_ctx_t ctx, int *dest, int pe);
__device__ ATTR_NO_INLINE void rocshmem_int_atomic_inc(
int *dest, int pe);
__host__ void rocshmem_ctx_int_atomic_inc(
rocshmem_ctx_t ctx, int *dest, int pe);
__host__ void rocshmem_int_atomic_inc(
int *dest, int pe);
__device__ ATTR_NO_INLINE void rocshmem_ctx_long_atomic_inc(
rocshmem_ctx_t ctx, long *dest, int pe);
__device__ ATTR_NO_INLINE void rocshmem_long_atomic_inc(
long *dest, int pe);
__host__ void rocshmem_ctx_long_atomic_inc(
rocshmem_ctx_t ctx, long *dest, int pe);
__host__ void rocshmem_long_atomic_inc(
long *dest, int pe);
__device__ ATTR_NO_INLINE void rocshmem_ctx_longlong_atomic_inc(
rocshmem_ctx_t ctx, long long *dest, int pe);
__device__ ATTR_NO_INLINE void rocshmem_longlong_atomic_inc(
long long *dest, int pe);
__host__ void rocshmem_ctx_longlong_atomic_inc(
rocshmem_ctx_t ctx, long long *dest, int pe);
__host__ void rocshmem_longlong_atomic_inc(
long long *dest, int pe);
__device__ ATTR_NO_INLINE void rocshmem_ctx_uint_atomic_inc(
rocshmem_ctx_t ctx, unsigned int *dest, int pe);
__device__ ATTR_NO_INLINE void rocshmem_uint_atomic_inc(
unsigned int *dest, int pe);
__host__ void rocshmem_ctx_uint_atomic_inc(
rocshmem_ctx_t ctx, unsigned int *dest, int pe);
__host__ void rocshmem_uint_atomic_inc(
unsigned int *dest, int pe);
__device__ ATTR_NO_INLINE void rocshmem_ctx_ulong_atomic_inc(
rocshmem_ctx_t ctx, unsigned long *dest, int pe);
__device__ ATTR_NO_INLINE void rocshmem_ulong_atomic_inc(
unsigned long *dest, int pe);
__host__ void rocshmem_ctx_ulong_atomic_inc(
rocshmem_ctx_t ctx, unsigned long *dest, int pe);
__host__ void rocshmem_ulong_atomic_inc(
unsigned long *dest, int pe);
__device__ ATTR_NO_INLINE void rocshmem_ctx_ulonglong_atomic_inc(
rocshmem_ctx_t ctx, unsigned long long *dest, int pe);
__device__ ATTR_NO_INLINE void rocshmem_ulonglong_atomic_inc(
unsigned long long *dest, int pe);
__host__ void rocshmem_ctx_ulonglong_atomic_inc(
rocshmem_ctx_t ctx, unsigned long long *dest, int pe);
__host__ void rocshmem_ulonglong_atomic_inc(
unsigned long long *dest, int pe);
__device__ ATTR_NO_INLINE void rocshmem_ctx_int32_atomic_inc(
rocshmem_ctx_t ctx, int32_t *dest, int pe);
__device__ ATTR_NO_INLINE void rocshmem_int32_atomic_inc(
int32_t *dest, int pe);
__host__ void rocshmem_ctx_int32_atomic_inc(
rocshmem_ctx_t ctx, int32_t *dest, int pe);
__host__ void rocshmem_int32_atomic_inc(
int32_t *dest, int pe);
__device__ ATTR_NO_INLINE void rocshmem_ctx_int64_atomic_inc(
rocshmem_ctx_t ctx, int64_t *dest, int pe);
__device__ ATTR_NO_INLINE void rocshmem_int64_atomic_inc(
int64_t *dest, int pe);
__host__ void rocshmem_ctx_int64_atomic_inc(
rocshmem_ctx_t ctx, int64_t *dest, int pe);
__host__ void rocshmem_int64_atomic_inc(
int64_t *dest, int pe);
__device__ ATTR_NO_INLINE void rocshmem_ctx_uint32_atomic_inc(
rocshmem_ctx_t ctx, uint32_t *dest, int pe);
__device__ ATTR_NO_INLINE void rocshmem_uint32_atomic_inc(
uint32_t *dest, int pe);
__host__ void rocshmem_ctx_uint32_atomic_inc(
rocshmem_ctx_t ctx, uint32_t *dest, int pe);
__host__ void rocshmem_uint32_atomic_inc(
uint32_t *dest, int pe);
__device__ ATTR_NO_INLINE void rocshmem_ctx_uint64_atomic_inc(
rocshmem_ctx_t ctx, uint64_t *dest, int pe);
__device__ ATTR_NO_INLINE void rocshmem_uint64_atomic_inc(
uint64_t *dest, int pe);
__host__ void rocshmem_ctx_uint64_atomic_inc(
rocshmem_ctx_t ctx, uint64_t *dest, int pe);
__host__ void rocshmem_uint64_atomic_inc(
uint64_t *dest, int pe);
__device__ ATTR_NO_INLINE void rocshmem_ctx_size_atomic_inc(
rocshmem_ctx_t ctx, size_t *dest, int pe);
__device__ ATTR_NO_INLINE void rocshmem_size_atomic_inc(
size_t *dest, int pe);
__host__ void rocshmem_ctx_size_atomic_inc(
rocshmem_ctx_t ctx, size_t *dest, int pe);
__host__ void rocshmem_size_atomic_inc(
size_t *dest, int pe);
__device__ ATTR_NO_INLINE void rocshmem_ctx_ptrdiff_atomic_inc(
rocshmem_ctx_t ctx, ptrdiff_t *dest, int pe);
__device__ ATTR_NO_INLINE void rocshmem_ptrdiff_atomic_inc(
ptrdiff_t *dest, int pe);
__host__ void rocshmem_ctx_ptrdiff_atomic_inc(
rocshmem_ctx_t ctx, ptrdiff_t *dest, int pe);
__host__ void rocshmem_ptrdiff_atomic_inc(
ptrdiff_t *dest, int pe);
/**
* @name SHMEM_ATOMIC_FETCH_ADD
* @brief Atomically add the value \p value to \p dest on \p pe. The operation
* returns the old value of \p dest to the calling PE.
*
* The operation is blocking.
*
* This function can be called from divergent control paths at per-thread
* granularity.
*
* Every supported type is declared four times: as a __device__ and as a
* __host__ function, each with an explicit \p ctx argument
* (rocshmem_ctx_<type>_atomic_fetch_add) and without one
* (rocshmem_<type>_atomic_fetch_add).
*
* @param[in] ctx Context with which to perform this operation (only present
* on the rocshmem_ctx_* variants).
* @param[in] dest Destination address. Must be an address on the symmetric
* heap.
* @param[in] value The value to be atomically added.
* @param[in] pe PE of the remote process.
*
* @return The old value of \p dest before \p value was added.
*/
__device__ ATTR_NO_INLINE int rocshmem_ctx_int_atomic_fetch_add(
rocshmem_ctx_t ctx, int *dest, int value, int pe);
__device__ ATTR_NO_INLINE int rocshmem_int_atomic_fetch_add(
int *dest, int value, int pe);
__host__ int rocshmem_ctx_int_atomic_fetch_add(
rocshmem_ctx_t ctx, int *dest, int value, int pe);
__host__ int rocshmem_int_atomic_fetch_add(
int *dest, int value, int pe);
__device__ ATTR_NO_INLINE long rocshmem_ctx_long_atomic_fetch_add(
rocshmem_ctx_t ctx, long *dest, long value, int pe);
__device__ ATTR_NO_INLINE long rocshmem_long_atomic_fetch_add(
long *dest, long value, int pe);
__host__ long rocshmem_ctx_long_atomic_fetch_add(
rocshmem_ctx_t ctx, long *dest, long value, int pe);
__host__ long rocshmem_long_atomic_fetch_add(
long *dest, long value, int pe);
__device__ ATTR_NO_INLINE long long rocshmem_ctx_longlong_atomic_fetch_add(
rocshmem_ctx_t ctx, long long *dest, long long value, int pe);
__device__ ATTR_NO_INLINE long long rocshmem_longlong_atomic_fetch_add(
long long *dest, long long value, int pe);
__host__ long long rocshmem_ctx_longlong_atomic_fetch_add(
rocshmem_ctx_t ctx, long long *dest, long long value, int pe);
__host__ long long rocshmem_longlong_atomic_fetch_add(
long long *dest, long long value, int pe);
__device__ ATTR_NO_INLINE unsigned int rocshmem_ctx_uint_atomic_fetch_add(
rocshmem_ctx_t ctx, unsigned int *dest, unsigned int value, int pe);
__device__ ATTR_NO_INLINE unsigned int rocshmem_uint_atomic_fetch_add(
unsigned int *dest, unsigned int value, int pe);
__host__ unsigned int rocshmem_ctx_uint_atomic_fetch_add(
rocshmem_ctx_t ctx, unsigned int *dest, unsigned int value, int pe);
__host__ unsigned int rocshmem_uint_atomic_fetch_add(
unsigned int *dest, unsigned int value, int pe);
__device__ ATTR_NO_INLINE unsigned long rocshmem_ctx_ulong_atomic_fetch_add(
rocshmem_ctx_t ctx, unsigned long *dest, unsigned long value, int pe);
__device__ ATTR_NO_INLINE unsigned long rocshmem_ulong_atomic_fetch_add(
unsigned long *dest, unsigned long value, int pe);
__host__ unsigned long rocshmem_ctx_ulong_atomic_fetch_add(
rocshmem_ctx_t ctx, unsigned long *dest, unsigned long value, int pe);
__host__ unsigned long rocshmem_ulong_atomic_fetch_add(
unsigned long *dest, unsigned long value, int pe);
__device__ ATTR_NO_INLINE unsigned long long rocshmem_ctx_ulonglong_atomic_fetch_add(
rocshmem_ctx_t ctx, unsigned long long *dest, unsigned long long value, int pe);
__device__ ATTR_NO_INLINE unsigned long long rocshmem_ulonglong_atomic_fetch_add(
unsigned long long *dest, unsigned long long value, int pe);
__host__ unsigned long long rocshmem_ctx_ulonglong_atomic_fetch_add(
rocshmem_ctx_t ctx, unsigned long long *dest, unsigned long long value, int pe);
__host__ unsigned long long rocshmem_ulonglong_atomic_fetch_add(
unsigned long long *dest, unsigned long long value, int pe);
__device__ ATTR_NO_INLINE int32_t rocshmem_ctx_int32_atomic_fetch_add(
rocshmem_ctx_t ctx, int32_t *dest, int32_t value, int pe);
__device__ ATTR_NO_INLINE int32_t rocshmem_int32_atomic_fetch_add(
int32_t *dest, int32_t value, int pe);
__host__ int32_t rocshmem_ctx_int32_atomic_fetch_add(
rocshmem_ctx_t ctx, int32_t *dest, int32_t value, int pe);
__host__ int32_t rocshmem_int32_atomic_fetch_add(
int32_t *dest, int32_t value, int pe);
__device__ ATTR_NO_INLINE int64_t rocshmem_ctx_int64_atomic_fetch_add(
rocshmem_ctx_t ctx, int64_t *dest, int64_t value, int pe);
__device__ ATTR_NO_INLINE int64_t rocshmem_int64_atomic_fetch_add(
int64_t *dest, int64_t value, int pe);
__host__ int64_t rocshmem_ctx_int64_atomic_fetch_add(
rocshmem_ctx_t ctx, int64_t *dest, int64_t value, int pe);
__host__ int64_t rocshmem_int64_atomic_fetch_add(
int64_t *dest, int64_t value, int pe);
__device__ ATTR_NO_INLINE uint32_t rocshmem_ctx_uint32_atomic_fetch_add(
rocshmem_ctx_t ctx, uint32_t *dest, uint32_t value, int pe);
__device__ ATTR_NO_INLINE uint32_t rocshmem_uint32_atomic_fetch_add(
uint32_t *dest, uint32_t value, int pe);
__host__ uint32_t rocshmem_ctx_uint32_atomic_fetch_add(
rocshmem_ctx_t ctx, uint32_t *dest, uint32_t value, int pe);
__host__ uint32_t rocshmem_uint32_atomic_fetch_add(
uint32_t *dest, uint32_t value, int pe);
__device__ ATTR_NO_INLINE uint64_t rocshmem_ctx_uint64_atomic_fetch_add(
rocshmem_ctx_t ctx, uint64_t *dest, uint64_t value, int pe);
__device__ ATTR_NO_INLINE uint64_t rocshmem_uint64_atomic_fetch_add(
uint64_t *dest, uint64_t value, int pe);
__host__ uint64_t rocshmem_ctx_uint64_atomic_fetch_add(
rocshmem_ctx_t ctx, uint64_t *dest, uint64_t value, int pe);
__host__ uint64_t rocshmem_uint64_atomic_fetch_add(
uint64_t *dest, uint64_t value, int pe);
__device__ ATTR_NO_INLINE size_t rocshmem_ctx_size_atomic_fetch_add(
rocshmem_ctx_t ctx, size_t *dest, size_t value, int pe);
__device__ ATTR_NO_INLINE size_t rocshmem_size_atomic_fetch_add(
size_t *dest, size_t value, int pe);
__host__ size_t rocshmem_ctx_size_atomic_fetch_add(
rocshmem_ctx_t ctx, size_t *dest, size_t value, int pe);
__host__ size_t rocshmem_size_atomic_fetch_add(
size_t *dest, size_t value, int pe);
__device__ ATTR_NO_INLINE ptrdiff_t rocshmem_ctx_ptrdiff_atomic_fetch_add(
rocshmem_ctx_t ctx, ptrdiff_t *dest, ptrdiff_t value, int pe);
__device__ ATTR_NO_INLINE ptrdiff_t rocshmem_ptrdiff_atomic_fetch_add(
ptrdiff_t *dest, ptrdiff_t value, int pe);
__host__ ptrdiff_t rocshmem_ctx_ptrdiff_atomic_fetch_add(
rocshmem_ctx_t ctx, ptrdiff_t *dest, ptrdiff_t value, int pe);
__host__ ptrdiff_t rocshmem_ptrdiff_atomic_fetch_add(
ptrdiff_t *dest, ptrdiff_t value, int pe);
/**
* @name SHMEM_ATOMIC_ADD
* @brief Atomically add the value \p value to \p dest on \p pe.
*
* The operation is blocking.
*
* This function can be called from divergent control paths at per-thread
* granularity.
*
* Each supported type is declared both as a __device__ and as a __host__
* function, with an explicit \p ctx argument
* (rocshmem_ctx_<type>_atomic_add) and without one
* (rocshmem_<type>_atomic_add).
*
* @param[in] ctx Context with which to perform this operation (only present
* on the rocshmem_ctx_* variants).
* @param[in] dest Destination address. Must be an address on the symmetric
* heap.
* @param[in] value The value to be atomically added.
* @param[in] pe PE of the remote process.
*
* @return void
*/
__device__ ATTR_NO_INLINE void rocshmem_ctx_int_atomic_add(
rocshmem_ctx_t ctx, int *dest, int value, int pe);
__device__ ATTR_NO_INLINE void rocshmem_int_atomic_add(
int *dest, int value, int pe);
__host__ void rocshmem_ctx_int_atomic_add(
rocshmem_ctx_t ctx, int *dest, int value, int pe);
__host__ void rocshmem_int_atomic_add(
int *dest, int value, int pe);
__device__ ATTR_NO_INLINE void rocshmem_ctx_long_atomic_add(
rocshmem_ctx_t ctx, long *dest, long value, int pe);
__device__ ATTR_NO_INLINE void rocshmem_long_atomic_add(
long *dest, long value, int pe);
__host__ void rocshmem_ctx_long_atomic_add(
rocshmem_ctx_t ctx, long *dest, long value, int pe);
__host__ void rocshmem_long_atomic_add(
long *dest, long value, int pe);
__device__ ATTR_NO_INLINE void rocshmem_ctx_longlong_atomic_add(
rocshmem_ctx_t ctx, long long *dest, long long value, int pe);
__device__ ATTR_NO_INLINE void rocshmem_longlong_atomic_add(
long long *dest, long long value, int pe);
__host__ void rocshmem_ctx_longlong_atomic_add(
rocshmem_ctx_t ctx, long long *dest, long long value, int pe);
__host__ void rocshmem_longlong_atomic_add(
long long *dest, long long value, int pe);
__device__ ATTR_NO_INLINE void rocshmem_ctx_uint_atomic_add(
rocshmem_ctx_t ctx, unsigned int *dest, unsigned int value, int pe);
__device__ ATTR_NO_INLINE void rocshmem_uint_atomic_add(
unsigned int *dest, unsigned int value, int pe);
__host__ void rocshmem_ctx_uint_atomic_add(
rocshmem_ctx_t ctx, unsigned int *dest, unsigned int value, int pe);
__host__ void rocshmem_uint_atomic_add(
unsigned int *dest, unsigned int value, int pe);
__device__ ATTR_NO_INLINE void rocshmem_ctx_ulong_atomic_add(
rocshmem_ctx_t ctx, unsigned long *dest, unsigned long value, int pe);
__device__ ATTR_NO_INLINE void rocshmem_ulong_atomic_add(
unsigned long *dest, unsigned long value, int pe);
__host__ void rocshmem_ctx_ulong_atomic_add(
rocshmem_ctx_t ctx, unsigned long *dest, unsigned long value, int pe);
__host__ void rocshmem_ulong_atomic_add(
unsigned long *dest, unsigned long value, int pe);
__device__ ATTR_NO_INLINE void rocshmem_ctx_ulonglong_atomic_add(
rocshmem_ctx_t ctx, unsigned long long *dest, unsigned long long value, int pe);
__device__ ATTR_NO_INLINE void rocshmem_ulonglong_atomic_add(
unsigned long long *dest, unsigned long long value, int pe);
__host__ void rocshmem_ctx_ulonglong_atomic_add(
rocshmem_ctx_t ctx, unsigned long long *dest, unsigned long long value, int pe);
__host__ void rocshmem_ulonglong_atomic_add(
unsigned long long *dest, unsigned long long value, int pe);
__device__ ATTR_NO_INLINE void rocshmem_ctx_int32_atomic_add(
rocshmem_ctx_t ctx, int32_t *dest, int32_t value, int pe);
__device__ ATTR_NO_INLINE void rocshmem_int32_atomic_add(
int32_t *dest, int32_t value, int pe);
__host__ void rocshmem_ctx_int32_atomic_add(
rocshmem_ctx_t ctx, int32_t *dest, int32_t value, int pe);
__host__ void rocshmem_int32_atomic_add(
int32_t *dest, int32_t value, int pe);
__device__ ATTR_NO_INLINE void rocshmem_ctx_int64_atomic_add(
rocshmem_ctx_t ctx, int64_t *dest, int64_t value, int pe);
__device__ ATTR_NO_INLINE void rocshmem_int64_atomic_add(
int64_t *dest, int64_t value, int pe);
__host__ void rocshmem_ctx_int64_atomic_add(
rocshmem_ctx_t ctx, int64_t *dest, int64_t value, int pe);
__host__ void rocshmem_int64_atomic_add(
int64_t *dest, int64_t value, int pe);
__device__ ATTR_NO_INLINE void rocshmem_ctx_uint32_atomic_add(
rocshmem_ctx_t ctx, uint32_t *dest, uint32_t value, int pe);
__device__ ATTR_NO_INLINE void rocshmem_uint32_atomic_add(
uint32_t *dest, uint32_t value, int pe);
__host__ void rocshmem_ctx_uint32_atomic_add(
rocshmem_ctx_t ctx, uint32_t *dest, uint32_t value, int pe);
__host__ void rocshmem_uint32_atomic_add(
uint32_t *dest, uint32_t value, int pe);
__device__ ATTR_NO_INLINE void rocshmem_ctx_uint64_atomic_add(
rocshmem_ctx_t ctx, uint64_t *dest, uint64_t value, int pe);
__device__ ATTR_NO_INLINE void rocshmem_uint64_atomic_add(
uint64_t *dest, uint64_t value, int pe);
__host__ void rocshmem_ctx_uint64_atomic_add(
rocshmem_ctx_t ctx, uint64_t *dest, uint64_t value, int pe);
__host__ void rocshmem_uint64_atomic_add(
uint64_t *dest, uint64_t value, int pe);
__device__ ATTR_NO_INLINE void rocshmem_ctx_size_atomic_add(
rocshmem_ctx_t ctx, size_t *dest, size_t value, int pe);
__device__ ATTR_NO_INLINE void rocshmem_size_atomic_add(
size_t *dest, size_t value, int pe);
__host__ void rocshmem_ctx_size_atomic_add(
rocshmem_ctx_t ctx, size_t *dest, size_t value, int pe);
__host__ void rocshmem_size_atomic_add(
size_t *dest, size_t value, int pe);
__device__ ATTR_NO_INLINE void rocshmem_ctx_ptrdiff_atomic_add(
rocshmem_ctx_t ctx, ptrdiff_t *dest, ptrdiff_t value, int pe);
__device__ ATTR_NO_INLINE void rocshmem_ptrdiff_atomic_add(
ptrdiff_t *dest, ptrdiff_t value, int pe);
__host__ void rocshmem_ctx_ptrdiff_atomic_add(
rocshmem_ctx_t ctx, ptrdiff_t *dest, ptrdiff_t value, int pe);
__host__ void rocshmem_ptrdiff_atomic_add(
ptrdiff_t *dest, ptrdiff_t value, int pe);
/**
* @name SHMEM_ATOMIC_FETCH_AND
* @brief Atomically bitwise-and \p value into \p dest on \p pe.
*
* The operation is blocking.
*
* This function can be called from divergent control paths at per-thread
* granularity.
*
* Each supported type is declared four times: a __device__ and a __host__
* variant, each with and without a leading \p ctx argument.
*
* @param[in] ctx Context with which to perform this operation (present
* only on the rocshmem_ctx_* variants).
* @param[in] dest Destination address. Must be an address on the symmetric
* heap.
* @param[in] value The value to be atomically and-ed with \p dest.
* @param[in] pe PE of the remote process.
*
* @return original value
*/
__device__ ATTR_NO_INLINE unsigned int rocshmem_ctx_uint_atomic_fetch_and(
rocshmem_ctx_t ctx, unsigned int *dest, unsigned int value, int pe);
__device__ ATTR_NO_INLINE unsigned int rocshmem_uint_atomic_fetch_and(
unsigned int *dest, unsigned int value, int pe);
__host__ unsigned int rocshmem_ctx_uint_atomic_fetch_and(
rocshmem_ctx_t ctx, unsigned int *dest, unsigned int value, int pe);
__host__ unsigned int rocshmem_uint_atomic_fetch_and(
unsigned int *dest, unsigned int value, int pe);
__device__ ATTR_NO_INLINE unsigned long rocshmem_ctx_ulong_atomic_fetch_and(
rocshmem_ctx_t ctx, unsigned long *dest, unsigned long value, int pe);
__device__ ATTR_NO_INLINE unsigned long rocshmem_ulong_atomic_fetch_and(
unsigned long *dest, unsigned long value, int pe);
__host__ unsigned long rocshmem_ctx_ulong_atomic_fetch_and(
rocshmem_ctx_t ctx, unsigned long *dest, unsigned long value, int pe);
__host__ unsigned long rocshmem_ulong_atomic_fetch_and(
unsigned long *dest, unsigned long value, int pe);
__device__ ATTR_NO_INLINE unsigned long long rocshmem_ctx_ulonglong_atomic_fetch_and(
rocshmem_ctx_t ctx, unsigned long long *dest, unsigned long long value, int pe);
__device__ ATTR_NO_INLINE unsigned long long rocshmem_ulonglong_atomic_fetch_and(
unsigned long long *dest, unsigned long long value, int pe);
__host__ unsigned long long rocshmem_ctx_ulonglong_atomic_fetch_and(
rocshmem_ctx_t ctx, unsigned long long *dest, unsigned long long value, int pe);
__host__ unsigned long long rocshmem_ulonglong_atomic_fetch_and(
unsigned long long *dest, unsigned long long value, int pe);
__device__ ATTR_NO_INLINE int32_t rocshmem_ctx_int32_atomic_fetch_and(
rocshmem_ctx_t ctx, int32_t *dest, int32_t value, int pe);
__device__ ATTR_NO_INLINE int32_t rocshmem_int32_atomic_fetch_and(
int32_t *dest, int32_t value, int pe);
__host__ int32_t rocshmem_ctx_int32_atomic_fetch_and(
rocshmem_ctx_t ctx, int32_t *dest, int32_t value, int pe);
__host__ int32_t rocshmem_int32_atomic_fetch_and(
int32_t *dest, int32_t value, int pe);
__device__ ATTR_NO_INLINE int64_t rocshmem_ctx_int64_atomic_fetch_and(
rocshmem_ctx_t ctx, int64_t *dest, int64_t value, int pe);
__device__ ATTR_NO_INLINE int64_t rocshmem_int64_atomic_fetch_and(
int64_t *dest, int64_t value, int pe);
__host__ int64_t rocshmem_ctx_int64_atomic_fetch_and(
rocshmem_ctx_t ctx, int64_t *dest, int64_t value, int pe);
__host__ int64_t rocshmem_int64_atomic_fetch_and(
int64_t *dest, int64_t value, int pe);
__device__ ATTR_NO_INLINE uint32_t rocshmem_ctx_uint32_atomic_fetch_and(
rocshmem_ctx_t ctx, uint32_t *dest, uint32_t value, int pe);
__device__ ATTR_NO_INLINE uint32_t rocshmem_uint32_atomic_fetch_and(
uint32_t *dest, uint32_t value, int pe);
__host__ uint32_t rocshmem_ctx_uint32_atomic_fetch_and(
rocshmem_ctx_t ctx, uint32_t *dest, uint32_t value, int pe);
__host__ uint32_t rocshmem_uint32_atomic_fetch_and(
uint32_t *dest, uint32_t value, int pe);
__device__ ATTR_NO_INLINE uint64_t rocshmem_ctx_uint64_atomic_fetch_and(
rocshmem_ctx_t ctx, uint64_t *dest, uint64_t value, int pe);
__device__ ATTR_NO_INLINE uint64_t rocshmem_uint64_atomic_fetch_and(
uint64_t *dest, uint64_t value, int pe);
__host__ uint64_t rocshmem_ctx_uint64_atomic_fetch_and(
rocshmem_ctx_t ctx, uint64_t *dest, uint64_t value, int pe);
__host__ uint64_t rocshmem_uint64_atomic_fetch_and(
uint64_t *dest, uint64_t value, int pe);
/**
* @name SHMEM_ATOMIC_AND
* @brief Atomically bitwise-and \p value into \p dest on \p pe.
*
* The operation is blocking.
*
* This function can be called from divergent control paths at per-thread
* granularity.
*
* Each supported type is declared four times: a __device__ and a __host__
* variant, each with and without a leading \p ctx argument.
*
* @param[in] ctx Context with which to perform this operation (present
* only on the rocshmem_ctx_* variants).
* @param[in] dest Destination address. Must be an address on the symmetric
* heap.
* @param[in] value The value to be atomically and-ed with \p dest.
* @param[in] pe PE of the remote process.
*
* @return void
*/
__device__ ATTR_NO_INLINE void rocshmem_ctx_uint_atomic_and(
rocshmem_ctx_t ctx, unsigned int *dest, unsigned int value, int pe);
__device__ ATTR_NO_INLINE void rocshmem_uint_atomic_and(
unsigned int *dest, unsigned int value, int pe);
__host__ void rocshmem_ctx_uint_atomic_and(
rocshmem_ctx_t ctx, unsigned int *dest, unsigned int value, int pe);
__host__ void rocshmem_uint_atomic_and(
unsigned int *dest, unsigned int value, int pe);
__device__ ATTR_NO_INLINE void rocshmem_ctx_ulong_atomic_and(
rocshmem_ctx_t ctx, unsigned long *dest, unsigned long value, int pe);
__device__ ATTR_NO_INLINE void rocshmem_ulong_atomic_and(
unsigned long *dest, unsigned long value, int pe);
__host__ void rocshmem_ctx_ulong_atomic_and(
rocshmem_ctx_t ctx, unsigned long *dest, unsigned long value, int pe);
__host__ void rocshmem_ulong_atomic_and(
unsigned long *dest, unsigned long value, int pe);
__device__ ATTR_NO_INLINE void rocshmem_ctx_ulonglong_atomic_and(
rocshmem_ctx_t ctx, unsigned long long *dest, unsigned long long value, int pe);
__device__ ATTR_NO_INLINE void rocshmem_ulonglong_atomic_and(
unsigned long long *dest, unsigned long long value, int pe);
__host__ void rocshmem_ctx_ulonglong_atomic_and(
rocshmem_ctx_t ctx, unsigned long long *dest, unsigned long long value, int pe);
__host__ void rocshmem_ulonglong_atomic_and(
unsigned long long *dest, unsigned long long value, int pe);
__device__ ATTR_NO_INLINE void rocshmem_ctx_int32_atomic_and(
rocshmem_ctx_t ctx, int32_t *dest, int32_t value, int pe);
__device__ ATTR_NO_INLINE void rocshmem_int32_atomic_and(
int32_t *dest, int32_t value, int pe);
__host__ void rocshmem_ctx_int32_atomic_and(
rocshmem_ctx_t ctx, int32_t *dest, int32_t value, int pe);
__host__ void rocshmem_int32_atomic_and(
int32_t *dest, int32_t value, int pe);
__device__ ATTR_NO_INLINE void rocshmem_ctx_int64_atomic_and(
rocshmem_ctx_t ctx, int64_t *dest, int64_t value, int pe);
__device__ ATTR_NO_INLINE void rocshmem_int64_atomic_and(
int64_t *dest, int64_t value, int pe);
__host__ void rocshmem_ctx_int64_atomic_and(
rocshmem_ctx_t ctx, int64_t *dest, int64_t value, int pe);
__host__ void rocshmem_int64_atomic_and(
int64_t *dest, int64_t value, int pe);
__device__ ATTR_NO_INLINE void rocshmem_ctx_uint32_atomic_and(
rocshmem_ctx_t ctx, uint32_t *dest, uint32_t value, int pe);
__device__ ATTR_NO_INLINE void rocshmem_uint32_atomic_and(
uint32_t *dest, uint32_t value, int pe);
__host__ void rocshmem_ctx_uint32_atomic_and(
rocshmem_ctx_t ctx, uint32_t *dest, uint32_t value, int pe);
__host__ void rocshmem_uint32_atomic_and(
uint32_t *dest, uint32_t value, int pe);
__device__ ATTR_NO_INLINE void rocshmem_ctx_uint64_atomic_and(
rocshmem_ctx_t ctx, uint64_t *dest, uint64_t value, int pe);
__device__ ATTR_NO_INLINE void rocshmem_uint64_atomic_and(
uint64_t *dest, uint64_t value, int pe);
__host__ void rocshmem_ctx_uint64_atomic_and(
rocshmem_ctx_t ctx, uint64_t *dest, uint64_t value, int pe);
__host__ void rocshmem_uint64_atomic_and(
uint64_t *dest, uint64_t value, int pe);
/**
* @name SHMEM_ATOMIC_FETCH_OR
* @brief Atomically bitwise-or \p value into \p dest on \p pe.
*
* The operation is blocking.
*
* This function can be called from divergent control paths at per-thread
* granularity.
*
* Each supported type is declared four times: a __device__ and a __host__
* variant, each with and without a leading \p ctx argument.
*
* @param[in] ctx Context with which to perform this operation (present
* only on the rocshmem_ctx_* variants).
* @param[in] dest Destination address. Must be an address on the symmetric
* heap.
* @param[in] value The value to be atomically or-ed with \p dest.
* @param[in] pe PE of the remote process.
*
* @return original value
*/
__device__ ATTR_NO_INLINE unsigned int rocshmem_ctx_uint_atomic_fetch_or(
rocshmem_ctx_t ctx, unsigned int *dest, unsigned int value, int pe);
__device__ ATTR_NO_INLINE unsigned int rocshmem_uint_atomic_fetch_or(
unsigned int *dest, unsigned int value, int pe);
__host__ unsigned int rocshmem_ctx_uint_atomic_fetch_or(
rocshmem_ctx_t ctx, unsigned int *dest, unsigned int value, int pe);
__host__ unsigned int rocshmem_uint_atomic_fetch_or(
unsigned int *dest, unsigned int value, int pe);
__device__ ATTR_NO_INLINE unsigned long rocshmem_ctx_ulong_atomic_fetch_or(
rocshmem_ctx_t ctx, unsigned long *dest, unsigned long value, int pe);
__device__ ATTR_NO_INLINE unsigned long rocshmem_ulong_atomic_fetch_or(
unsigned long *dest, unsigned long value, int pe);
__host__ unsigned long rocshmem_ctx_ulong_atomic_fetch_or(
rocshmem_ctx_t ctx, unsigned long *dest, unsigned long value, int pe);
__host__ unsigned long rocshmem_ulong_atomic_fetch_or(
unsigned long *dest, unsigned long value, int pe);
__device__ ATTR_NO_INLINE unsigned long long rocshmem_ctx_ulonglong_atomic_fetch_or(
rocshmem_ctx_t ctx, unsigned long long *dest, unsigned long long value, int pe);
__device__ ATTR_NO_INLINE unsigned long long rocshmem_ulonglong_atomic_fetch_or(
unsigned long long *dest, unsigned long long value, int pe);
__host__ unsigned long long rocshmem_ctx_ulonglong_atomic_fetch_or(
rocshmem_ctx_t ctx, unsigned long long *dest, unsigned long long value, int pe);
__host__ unsigned long long rocshmem_ulonglong_atomic_fetch_or(
unsigned long long *dest, unsigned long long value, int pe);
__device__ ATTR_NO_INLINE int32_t rocshmem_ctx_int32_atomic_fetch_or(
rocshmem_ctx_t ctx, int32_t *dest, int32_t value, int pe);
__device__ ATTR_NO_INLINE int32_t rocshmem_int32_atomic_fetch_or(
int32_t *dest, int32_t value, int pe);
__host__ int32_t rocshmem_ctx_int32_atomic_fetch_or(
rocshmem_ctx_t ctx, int32_t *dest, int32_t value, int pe);
__host__ int32_t rocshmem_int32_atomic_fetch_or(
int32_t *dest, int32_t value, int pe);
__device__ ATTR_NO_INLINE int64_t rocshmem_ctx_int64_atomic_fetch_or(
rocshmem_ctx_t ctx, int64_t *dest, int64_t value, int pe);
__device__ ATTR_NO_INLINE int64_t rocshmem_int64_atomic_fetch_or(
int64_t *dest, int64_t value, int pe);
__host__ int64_t rocshmem_ctx_int64_atomic_fetch_or(
rocshmem_ctx_t ctx, int64_t *dest, int64_t value, int pe);
__host__ int64_t rocshmem_int64_atomic_fetch_or(
int64_t *dest, int64_t value, int pe);
__device__ ATTR_NO_INLINE uint32_t rocshmem_ctx_uint32_atomic_fetch_or(
rocshmem_ctx_t ctx, uint32_t *dest, uint32_t value, int pe);
__device__ ATTR_NO_INLINE uint32_t rocshmem_uint32_atomic_fetch_or(
uint32_t *dest, uint32_t value, int pe);
__host__ uint32_t rocshmem_ctx_uint32_atomic_fetch_or(
rocshmem_ctx_t ctx, uint32_t *dest, uint32_t value, int pe);
__host__ uint32_t rocshmem_uint32_atomic_fetch_or(
uint32_t *dest, uint32_t value, int pe);
__device__ ATTR_NO_INLINE uint64_t rocshmem_ctx_uint64_atomic_fetch_or(
rocshmem_ctx_t ctx, uint64_t *dest, uint64_t value, int pe);
__device__ ATTR_NO_INLINE uint64_t rocshmem_uint64_atomic_fetch_or(
uint64_t *dest, uint64_t value, int pe);
__host__ uint64_t rocshmem_ctx_uint64_atomic_fetch_or(
rocshmem_ctx_t ctx, uint64_t *dest, uint64_t value, int pe);
__host__ uint64_t rocshmem_uint64_atomic_fetch_or(
uint64_t *dest, uint64_t value, int pe);
/**
* @name SHMEM_ATOMIC_OR
* @brief Atomically bitwise-or \p value into \p dest on \p pe.
*
* The operation is blocking.
*
* This function can be called from divergent control paths at per-thread
* granularity.
*
* Each supported type is declared four times: a __device__ and a __host__
* variant, each with and without a leading \p ctx argument.
*
* @param[in] ctx Context with which to perform this operation (present
* only on the rocshmem_ctx_* variants).
* @param[in] dest Destination address. Must be an address on the symmetric
* heap.
* @param[in] value The value to be atomically or-ed with \p dest.
* @param[in] pe PE of the remote process.
*
* @return void
*/
__device__ ATTR_NO_INLINE void rocshmem_ctx_uint_atomic_or(
rocshmem_ctx_t ctx, unsigned int *dest, unsigned int value, int pe);
__device__ ATTR_NO_INLINE void rocshmem_uint_atomic_or(
unsigned int *dest, unsigned int value, int pe);
__host__ void rocshmem_ctx_uint_atomic_or(
rocshmem_ctx_t ctx, unsigned int *dest, unsigned int value, int pe);
__host__ void rocshmem_uint_atomic_or(
unsigned int *dest, unsigned int value, int pe);
__device__ ATTR_NO_INLINE void rocshmem_ctx_ulong_atomic_or(
rocshmem_ctx_t ctx, unsigned long *dest, unsigned long value, int pe);
__device__ ATTR_NO_INLINE void rocshmem_ulong_atomic_or(
unsigned long *dest, unsigned long value, int pe);
__host__ void rocshmem_ctx_ulong_atomic_or(
rocshmem_ctx_t ctx, unsigned long *dest, unsigned long value, int pe);
__host__ void rocshmem_ulong_atomic_or(
unsigned long *dest, unsigned long value, int pe);
__device__ ATTR_NO_INLINE void rocshmem_ctx_ulonglong_atomic_or(
rocshmem_ctx_t ctx, unsigned long long *dest, unsigned long long value, int pe);
__device__ ATTR_NO_INLINE void rocshmem_ulonglong_atomic_or(
unsigned long long *dest, unsigned long long value, int pe);
__host__ void rocshmem_ctx_ulonglong_atomic_or(
rocshmem_ctx_t ctx, unsigned long long *dest, unsigned long long value, int pe);
__host__ void rocshmem_ulonglong_atomic_or(
unsigned long long *dest, unsigned long long value, int pe);
__device__ ATTR_NO_INLINE void rocshmem_ctx_int32_atomic_or(
rocshmem_ctx_t ctx, int32_t *dest, int32_t value, int pe);
__device__ ATTR_NO_INLINE void rocshmem_int32_atomic_or(
int32_t *dest, int32_t value, int pe);
__host__ void rocshmem_ctx_int32_atomic_or(
rocshmem_ctx_t ctx, int32_t *dest, int32_t value, int pe);
__host__ void rocshmem_int32_atomic_or(
int32_t *dest, int32_t value, int pe);
__device__ ATTR_NO_INLINE void rocshmem_ctx_int64_atomic_or(
rocshmem_ctx_t ctx, int64_t *dest, int64_t value, int pe);
__device__ ATTR_NO_INLINE void rocshmem_int64_atomic_or(
int64_t *dest, int64_t value, int pe);
__host__ void rocshmem_ctx_int64_atomic_or(
rocshmem_ctx_t ctx, int64_t *dest, int64_t value, int pe);
__host__ void rocshmem_int64_atomic_or(
int64_t *dest, int64_t value, int pe);
__device__ ATTR_NO_INLINE void rocshmem_ctx_uint32_atomic_or(
rocshmem_ctx_t ctx, uint32_t *dest, uint32_t value, int pe);
__device__ ATTR_NO_INLINE void rocshmem_uint32_atomic_or(
uint32_t *dest, uint32_t value, int pe);
__host__ void rocshmem_ctx_uint32_atomic_or(
rocshmem_ctx_t ctx, uint32_t *dest, uint32_t value, int pe);
__host__ void rocshmem_uint32_atomic_or(
uint32_t *dest, uint32_t value, int pe);
__device__ ATTR_NO_INLINE void rocshmem_ctx_uint64_atomic_or(
rocshmem_ctx_t ctx, uint64_t *dest, uint64_t value, int pe);
__device__ ATTR_NO_INLINE void rocshmem_uint64_atomic_or(
uint64_t *dest, uint64_t value, int pe);
__host__ void rocshmem_ctx_uint64_atomic_or(
rocshmem_ctx_t ctx, uint64_t *dest, uint64_t value, int pe);
__host__ void rocshmem_uint64_atomic_or(
uint64_t *dest, uint64_t value, int pe);
/**
* @name SHMEM_ATOMIC_FETCH_XOR
* @brief Atomically bitwise-xor \p value into \p dest on \p pe.
*
* The operation is blocking.
*
* This function can be called from divergent control paths at per-thread
* granularity.
*
* Each supported type is declared four times: a __device__ and a __host__
* variant, each with and without a leading \p ctx argument.
*
* @param[in] ctx Context with which to perform this operation (present
* only on the rocshmem_ctx_* variants).
* @param[in] dest Destination address. Must be an address on the symmetric
* heap.
* @param[in] value The value to be atomically xor-ed with \p dest.
* @param[in] pe PE of the remote process.
*
* @return original value
*/
__device__ ATTR_NO_INLINE unsigned int rocshmem_ctx_uint_atomic_fetch_xor(
rocshmem_ctx_t ctx, unsigned int *dest, unsigned int value, int pe);
__device__ ATTR_NO_INLINE unsigned int rocshmem_uint_atomic_fetch_xor(
unsigned int *dest, unsigned int value, int pe);
__host__ unsigned int rocshmem_ctx_uint_atomic_fetch_xor(
rocshmem_ctx_t ctx, unsigned int *dest, unsigned int value, int pe);
__host__ unsigned int rocshmem_uint_atomic_fetch_xor(
unsigned int *dest, unsigned int value, int pe);
__device__ ATTR_NO_INLINE unsigned long rocshmem_ctx_ulong_atomic_fetch_xor(
rocshmem_ctx_t ctx, unsigned long *dest, unsigned long value, int pe);
__device__ ATTR_NO_INLINE unsigned long rocshmem_ulong_atomic_fetch_xor(
unsigned long *dest, unsigned long value, int pe);
__host__ unsigned long rocshmem_ctx_ulong_atomic_fetch_xor(
rocshmem_ctx_t ctx, unsigned long *dest, unsigned long value, int pe);
__host__ unsigned long rocshmem_ulong_atomic_fetch_xor(
unsigned long *dest, unsigned long value, int pe);
__device__ ATTR_NO_INLINE unsigned long long rocshmem_ctx_ulonglong_atomic_fetch_xor(
rocshmem_ctx_t ctx, unsigned long long *dest, unsigned long long value, int pe);
__device__ ATTR_NO_INLINE unsigned long long rocshmem_ulonglong_atomic_fetch_xor(
unsigned long long *dest, unsigned long long value, int pe);
__host__ unsigned long long rocshmem_ctx_ulonglong_atomic_fetch_xor(
rocshmem_ctx_t ctx, unsigned long long *dest, unsigned long long value, int pe);
__host__ unsigned long long rocshmem_ulonglong_atomic_fetch_xor(
unsigned long long *dest, unsigned long long value, int pe);
__device__ ATTR_NO_INLINE int32_t rocshmem_ctx_int32_atomic_fetch_xor(
rocshmem_ctx_t ctx, int32_t *dest, int32_t value, int pe);
__device__ ATTR_NO_INLINE int32_t rocshmem_int32_atomic_fetch_xor(
int32_t *dest, int32_t value, int pe);
__host__ int32_t rocshmem_ctx_int32_atomic_fetch_xor(
rocshmem_ctx_t ctx, int32_t *dest, int32_t value, int pe);
__host__ int32_t rocshmem_int32_atomic_fetch_xor(
int32_t *dest, int32_t value, int pe);
__device__ ATTR_NO_INLINE int64_t rocshmem_ctx_int64_atomic_fetch_xor(
rocshmem_ctx_t ctx, int64_t *dest, int64_t value, int pe);
__device__ ATTR_NO_INLINE int64_t rocshmem_int64_atomic_fetch_xor(
int64_t *dest, int64_t value, int pe);
__host__ int64_t rocshmem_ctx_int64_atomic_fetch_xor(
rocshmem_ctx_t ctx, int64_t *dest, int64_t value, int pe);
__host__ int64_t rocshmem_int64_atomic_fetch_xor(
int64_t *dest, int64_t value, int pe);
__device__ ATTR_NO_INLINE uint32_t rocshmem_ctx_uint32_atomic_fetch_xor(
rocshmem_ctx_t ctx, uint32_t *dest, uint32_t value, int pe);
__device__ ATTR_NO_INLINE uint32_t rocshmem_uint32_atomic_fetch_xor(
uint32_t *dest, uint32_t value, int pe);
__host__ uint32_t rocshmem_ctx_uint32_atomic_fetch_xor(
rocshmem_ctx_t ctx, uint32_t *dest, uint32_t value, int pe);
__host__ uint32_t rocshmem_uint32_atomic_fetch_xor(
uint32_t *dest, uint32_t value, int pe);
__device__ ATTR_NO_INLINE uint64_t rocshmem_ctx_uint64_atomic_fetch_xor(
rocshmem_ctx_t ctx, uint64_t *dest, uint64_t value, int pe);
__device__ ATTR_NO_INLINE uint64_t rocshmem_uint64_atomic_fetch_xor(
uint64_t *dest, uint64_t value, int pe);
__host__ uint64_t rocshmem_ctx_uint64_atomic_fetch_xor(
rocshmem_ctx_t ctx, uint64_t *dest, uint64_t value, int pe);
__host__ uint64_t rocshmem_uint64_atomic_fetch_xor(
uint64_t *dest, uint64_t value, int pe);
/**
* @name SHMEM_ATOMIC_XOR
* @brief Atomically bitwise-xor \p value into \p dest on \p pe.
*
* The operation is blocking.
*
* This function can be called from divergent control paths at per-thread
* granularity.
*
* Each supported type is declared four times: a __device__ and a __host__
* variant, each with and without a leading \p ctx argument.
*
* @param[in] ctx Context with which to perform this operation (present
* only on the rocshmem_ctx_* variants).
* @param[in] dest Destination address. Must be an address on the symmetric
* heap.
* @param[in] value The value to be atomically xor-ed with \p dest.
* @param[in] pe PE of the remote process.
*
* @return void
*/
__device__ ATTR_NO_INLINE void rocshmem_ctx_uint_atomic_xor(
rocshmem_ctx_t ctx, unsigned int *dest, unsigned int value, int pe);
__device__ ATTR_NO_INLINE void rocshmem_uint_atomic_xor(
unsigned int *dest, unsigned int value, int pe);
__host__ void rocshmem_ctx_uint_atomic_xor(
rocshmem_ctx_t ctx, unsigned int *dest, unsigned int value, int pe);
__host__ void rocshmem_uint_atomic_xor(
unsigned int *dest, unsigned int value, int pe);
__device__ ATTR_NO_INLINE void rocshmem_ctx_ulong_atomic_xor(
rocshmem_ctx_t ctx, unsigned long *dest, unsigned long value, int pe);
__device__ ATTR_NO_INLINE void rocshmem_ulong_atomic_xor(
unsigned long *dest, unsigned long value, int pe);
__host__ void rocshmem_ctx_ulong_atomic_xor(
rocshmem_ctx_t ctx, unsigned long *dest, unsigned long value, int pe);
__host__ void rocshmem_ulong_atomic_xor(
unsigned long *dest, unsigned long value, int pe);
__device__ ATTR_NO_INLINE void rocshmem_ctx_ulonglong_atomic_xor(
rocshmem_ctx_t ctx, unsigned long long *dest, unsigned long long value, int pe);
__device__ ATTR_NO_INLINE void rocshmem_ulonglong_atomic_xor(
unsigned long long *dest, unsigned long long value, int pe);
__host__ void rocshmem_ctx_ulonglong_atomic_xor(
rocshmem_ctx_t ctx, unsigned long long *dest, unsigned long long value, int pe);
__host__ void rocshmem_ulonglong_atomic_xor(
unsigned long long *dest, unsigned long long value, int pe);
__device__ ATTR_NO_INLINE void rocshmem_ctx_int32_atomic_xor(
rocshmem_ctx_t ctx, int32_t *dest, int32_t value, int pe);
__device__ ATTR_NO_INLINE void rocshmem_int32_atomic_xor(
int32_t *dest, int32_t value, int pe);
__host__ void rocshmem_ctx_int32_atomic_xor(
rocshmem_ctx_t ctx, int32_t *dest, int32_t value, int pe);
__host__ void rocshmem_int32_atomic_xor(
int32_t *dest, int32_t value, int pe);
__device__ ATTR_NO_INLINE void rocshmem_ctx_int64_atomic_xor(
rocshmem_ctx_t ctx, int64_t *dest, int64_t value, int pe);
__device__ ATTR_NO_INLINE void rocshmem_int64_atomic_xor(
int64_t *dest, int64_t value, int pe);
__host__ void rocshmem_ctx_int64_atomic_xor(
rocshmem_ctx_t ctx, int64_t *dest, int64_t value, int pe);
__host__ void rocshmem_int64_atomic_xor(
int64_t *dest, int64_t value, int pe);
__device__ ATTR_NO_INLINE void rocshmem_ctx_uint32_atomic_xor(
rocshmem_ctx_t ctx, uint32_t *dest, uint32_t value, int pe);
__device__ ATTR_NO_INLINE void rocshmem_uint32_atomic_xor(
uint32_t *dest, uint32_t value, int pe);
__host__ void rocshmem_ctx_uint32_atomic_xor(
rocshmem_ctx_t ctx, uint32_t *dest, uint32_t value, int pe);
__host__ void rocshmem_uint32_atomic_xor(
uint32_t *dest, uint32_t value, int pe);
__device__ ATTR_NO_INLINE void rocshmem_ctx_uint64_atomic_xor(
rocshmem_ctx_t ctx, uint64_t *dest, uint64_t value, int pe);
__device__ ATTR_NO_INLINE void rocshmem_uint64_atomic_xor(
uint64_t *dest, uint64_t value, int pe);
__host__ void rocshmem_ctx_uint64_atomic_xor(
rocshmem_ctx_t ctx, uint64_t *dest, uint64_t value, int pe);
__host__ void rocshmem_uint64_atomic_xor(
uint64_t *dest, uint64_t value, int pe);
} // namespace rocshmem
#endif // LIBRARY_INCLUDE_ROCSHMEM_AMO_HPP
/******************************************************************************
* Copyright (c) Advanced Micro Devices, Inc. All rights reserved.
*
* SPDX-License-Identifier: MIT
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to
* deal in the Software without restriction, including without limitation the
* rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
* sell copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
* IN THE SOFTWARE.
*****************************************************************************/
#ifndef LIBRARY_INCLUDE_ROCSHMEM_COLL_HPP
#define LIBRARY_INCLUDE_ROCSHMEM_COLL_HPP
namespace rocshmem {
/**
* @name SHMEM_ALLTOALL
* @brief Exchanges a fixed amount of contiguous data blocks between all pairs
* of PEs participating in the collective routine.
*
* This function must be called as a work-group collective.
*
* One __device__ `_wg` declaration is provided per supported element type.
*
* @param[in] ctx Context with which to perform this operation.
* @param[in] team The team participating in the collective.
* @param[in] dest Destination address. Must be an address on the
* symmetric heap.
* @param[in] source Source address. Must be an address on the symmetric
* heap.
* @param[in] nelems Number of data blocks transferred per pair of PEs.
*
* @return void
*/
__device__ ATTR_NO_INLINE void rocshmem_ctx_float_alltoall_wg(
rocshmem_ctx_t ctx, rocshmem_team_t team, float *dest,
const float *source, int nelems);
__device__ ATTR_NO_INLINE void rocshmem_ctx_double_alltoall_wg(
rocshmem_ctx_t ctx, rocshmem_team_t team, double *dest,
const double *source, int nelems);
__device__ ATTR_NO_INLINE void rocshmem_ctx_char_alltoall_wg(
rocshmem_ctx_t ctx, rocshmem_team_t team, char *dest,
const char *source, int nelems);
__device__ ATTR_NO_INLINE void rocshmem_ctx_schar_alltoall_wg(
rocshmem_ctx_t ctx, rocshmem_team_t team, signed char *dest,
const signed char *source, int nelems);
__device__ ATTR_NO_INLINE void rocshmem_ctx_short_alltoall_wg(
rocshmem_ctx_t ctx, rocshmem_team_t team, short *dest,
const short *source, int nelems);
__device__ ATTR_NO_INLINE void rocshmem_ctx_int_alltoall_wg(
rocshmem_ctx_t ctx, rocshmem_team_t team, int *dest,
const int *source, int nelems);
__device__ ATTR_NO_INLINE void rocshmem_ctx_long_alltoall_wg(
rocshmem_ctx_t ctx, rocshmem_team_t team, long *dest,
const long *source, int nelems);
__device__ ATTR_NO_INLINE void rocshmem_ctx_longlong_alltoall_wg(
rocshmem_ctx_t ctx, rocshmem_team_t team, long long *dest,
const long long *source, int nelems);
__device__ ATTR_NO_INLINE void rocshmem_ctx_uchar_alltoall_wg(
rocshmem_ctx_t ctx, rocshmem_team_t team, unsigned char *dest,
const unsigned char *source, int nelems);
__device__ ATTR_NO_INLINE void rocshmem_ctx_ushort_alltoall_wg(
rocshmem_ctx_t ctx, rocshmem_team_t team, unsigned short *dest,
const unsigned short *source, int nelems);
__device__ ATTR_NO_INLINE void rocshmem_ctx_uint_alltoall_wg(
rocshmem_ctx_t ctx, rocshmem_team_t team, unsigned int *dest,
const unsigned int *source, int nelems);
__device__ ATTR_NO_INLINE void rocshmem_ctx_ulong_alltoall_wg(
rocshmem_ctx_t ctx, rocshmem_team_t team, unsigned long *dest,
const unsigned long *source, int nelems);
__device__ ATTR_NO_INLINE void rocshmem_ctx_ulonglong_alltoall_wg(
rocshmem_ctx_t ctx, rocshmem_team_t team, unsigned long long *dest,
const unsigned long long *source, int nelems);
/**
* @name SHMEM_BROADCAST
* @brief Perform a broadcast between PEs in the active set. The caller
* is blocked until the broadcase completes.
*
* This function must be called as a work-group collective.
*
* @param[in] dest Destination address. Must be an address on the
* symmetric heap.
* @param[in] source Source address. Must be an address on the symmetric
heap.
* @param[in] nelement Size of the buffer to participate in the broadcast.
* @param[in] PE_root Zero-based ordinal of the PE, with respect to the
active set, from which the data is copied
* @param[in] PE_start PE to start the reduction.
* @param[in] logPE_stride Stride of PEs participating in the reduction.
* @param[in] PE_size Number PEs participating in the reduction.
* @param[in] pSync Temporary sync buffer provided to ROCSHMEM. Must
be of size at least ROCSHMEM_REDUCE_SYNC_SIZE.
*
* @return void
*/
__device__ ATTR_NO_INLINE void rocshmem_ctx_float_broadcast_wg(
rocshmem_ctx_t ctx, rocshmem_team_t team, float *dest,
const float *source, int nelems, int pe_root);
__host__ void rocshmem_ctx_float_broadcast(
rocshmem_ctx_t ctx, float *dest, const float *source,
int nelems, int pe_root, int pe_start, int log_pe_stride,
int pe_size, long *p_sync);
__host__ void rocshmem_ctx_float_broadcast(
rocshmem_ctx_t ctx, rocshmem_team_t team, float *dest,
const float *source, int nelems, int pe_root);
__device__ ATTR_NO_INLINE void rocshmem_ctx_double_broadcast_wg(
rocshmem_ctx_t ctx, rocshmem_team_t team, double *dest,
const double *source, int nelems, int pe_root);
__host__ void rocshmem_ctx_double_broadcast(
rocshmem_ctx_t ctx, double *dest, const double *source,
int nelems, int pe_root, int pe_start, int log_pe_stride,
int pe_size, long *p_sync);
__host__ void rocshmem_ctx_double_broadcast(
rocshmem_ctx_t ctx, rocshmem_team_t team, double *dest,
const double *source, int nelems, int pe_root);
__device__ ATTR_NO_INLINE void rocshmem_ctx_char_broadcast_wg(
rocshmem_ctx_t ctx, rocshmem_team_t team, char *dest,
const char *source, int nelems, int pe_root);
__host__ void rocshmem_ctx_char_broadcast(
rocshmem_ctx_t ctx, char *dest, const char *source,
int nelems, int pe_root, int pe_start, int log_pe_stride,
int pe_size, long *p_sync);
__host__ void rocshmem_ctx_char_broadcast(
rocshmem_ctx_t ctx, rocshmem_team_t team, char *dest,
const char *source, int nelems, int pe_root);
__device__ ATTR_NO_INLINE void rocshmem_ctx_schar_broadcast_wg(
rocshmem_ctx_t ctx, rocshmem_team_t team, signed char *dest,
const signed char *source, int nelems, int pe_root);
__host__ void rocshmem_ctx_schar_broadcast(
rocshmem_ctx_t ctx, signed char *dest, const signed char *source,
int nelems, int pe_root, int pe_start, int log_pe_stride,
int pe_size, long *p_sync);
__host__ void rocshmem_ctx_schar_broadcast(
rocshmem_ctx_t ctx, rocshmem_team_t team, signed char *dest,
const signed char *source, int nelems, int pe_root);
// `short` broadcast: device work-group (`_wg`, team-based) variant followed
// by two host overloads -- one taking an explicit PE range (pe_start,
// log_pe_stride, pe_size) plus a p_sync pointer, and one taking a team.
__device__ ATTR_NO_INLINE void rocshmem_ctx_short_broadcast_wg(
rocshmem_ctx_t ctx, rocshmem_team_t team, short *dest,
const short *source, int nelems, int pe_root);
__host__ void rocshmem_ctx_short_broadcast(
rocshmem_ctx_t ctx, short *dest, const short *source,
int nelems, int pe_root, int pe_start, int log_pe_stride,
int pe_size, long *p_sync);
__host__ void rocshmem_ctx_short_broadcast(
rocshmem_ctx_t ctx, rocshmem_team_t team, short *dest,
const short *source, int nelems, int pe_root);
// `int` broadcast: device work-group (`_wg`, team-based) variant followed
// by two host overloads -- one taking an explicit PE range (pe_start,
// log_pe_stride, pe_size) plus a p_sync pointer, and one taking a team.
__device__ ATTR_NO_INLINE void rocshmem_ctx_int_broadcast_wg(
rocshmem_ctx_t ctx, rocshmem_team_t team, int *dest,
const int *source, int nelems, int pe_root);
__host__ void rocshmem_ctx_int_broadcast(
rocshmem_ctx_t ctx, int *dest, const int *source,
int nelems, int pe_root, int pe_start, int log_pe_stride,
int pe_size, long *p_sync);
__host__ void rocshmem_ctx_int_broadcast(
rocshmem_ctx_t ctx, rocshmem_team_t team, int *dest,
const int *source, int nelems, int pe_root);
// `long` broadcast: device work-group (`_wg`, team-based) variant followed
// by two host overloads -- one taking an explicit PE range (pe_start,
// log_pe_stride, pe_size) plus a p_sync pointer, and one taking a team.
__device__ ATTR_NO_INLINE void rocshmem_ctx_long_broadcast_wg(
rocshmem_ctx_t ctx, rocshmem_team_t team, long *dest,
const long *source, int nelems, int pe_root);
__host__ void rocshmem_ctx_long_broadcast(
rocshmem_ctx_t ctx, long *dest, const long *source,
int nelems, int pe_root, int pe_start, int log_pe_stride,
int pe_size, long *p_sync);
__host__ void rocshmem_ctx_long_broadcast(
rocshmem_ctx_t ctx, rocshmem_team_t team, long *dest,
const long *source, int nelems, int pe_root);
// `long long` broadcast: device work-group (`_wg`, team-based) variant
// followed by two host overloads -- one taking an explicit PE range (pe_start,
// log_pe_stride, pe_size) plus a p_sync pointer, and one taking a team.
__device__ ATTR_NO_INLINE void rocshmem_ctx_longlong_broadcast_wg(
rocshmem_ctx_t ctx, rocshmem_team_t team, long long *dest,
const long long *source, int nelems, int pe_root);
__host__ void rocshmem_ctx_longlong_broadcast(
rocshmem_ctx_t ctx, long long *dest, const long long *source,
int nelems, int pe_root, int pe_start, int log_pe_stride,
int pe_size, long *p_sync);
__host__ void rocshmem_ctx_longlong_broadcast(
rocshmem_ctx_t ctx, rocshmem_team_t team, long long *dest,
const long long *source, int nelems, int pe_root);
// `unsigned char` broadcast: device work-group (`_wg`, team-based) variant
// followed by two host overloads -- one taking an explicit PE range (pe_start,
// log_pe_stride, pe_size) plus a p_sync pointer, and one taking a team.
__device__ ATTR_NO_INLINE void rocshmem_ctx_uchar_broadcast_wg(
rocshmem_ctx_t ctx, rocshmem_team_t team, unsigned char *dest,
const unsigned char *source, int nelems, int pe_root);
__host__ void rocshmem_ctx_uchar_broadcast(
rocshmem_ctx_t ctx, unsigned char *dest, const unsigned char *source,
int nelems, int pe_root, int pe_start, int log_pe_stride,
int pe_size, long *p_sync);
__host__ void rocshmem_ctx_uchar_broadcast(
rocshmem_ctx_t ctx, rocshmem_team_t team, unsigned char *dest,
const unsigned char *source, int nelems, int pe_root);
// `unsigned short` broadcast: device work-group (`_wg`, team-based) variant
// followed by two host overloads -- one taking an explicit PE range (pe_start,
// log_pe_stride, pe_size) plus a p_sync pointer, and one taking a team.
__device__ ATTR_NO_INLINE void rocshmem_ctx_ushort_broadcast_wg(
rocshmem_ctx_t ctx, rocshmem_team_t team, unsigned short *dest,
const unsigned short *source, int nelems, int pe_root);
__host__ void rocshmem_ctx_ushort_broadcast(
rocshmem_ctx_t ctx, unsigned short *dest, const unsigned short *source,
int nelems, int pe_root, int pe_start, int log_pe_stride,
int pe_size, long *p_sync);
__host__ void rocshmem_ctx_ushort_broadcast(
rocshmem_ctx_t ctx, rocshmem_team_t team, unsigned short *dest,
const unsigned short *source, int nelems, int pe_root);
// `unsigned int` broadcast: device work-group (`_wg`, team-based) variant
// followed by two host overloads -- one taking an explicit PE range (pe_start,
// log_pe_stride, pe_size) plus a p_sync pointer, and one taking a team.
__device__ ATTR_NO_INLINE void rocshmem_ctx_uint_broadcast_wg(
rocshmem_ctx_t ctx, rocshmem_team_t team, unsigned int *dest,
const unsigned int *source, int nelems, int pe_root);
__host__ void rocshmem_ctx_uint_broadcast(
rocshmem_ctx_t ctx, unsigned int *dest, const unsigned int *source,
int nelems, int pe_root, int pe_start, int log_pe_stride,
int pe_size, long *p_sync);
__host__ void rocshmem_ctx_uint_broadcast(
rocshmem_ctx_t ctx, rocshmem_team_t team, unsigned int *dest,
const unsigned int *source, int nelems, int pe_root);
// `unsigned long` broadcast: device work-group (`_wg`, team-based) variant
// followed by two host overloads -- one taking an explicit PE range (pe_start,
// log_pe_stride, pe_size) plus a p_sync pointer, and one taking a team.
__device__ ATTR_NO_INLINE void rocshmem_ctx_ulong_broadcast_wg(
rocshmem_ctx_t ctx, rocshmem_team_t team, unsigned long *dest,
const unsigned long *source, int nelems, int pe_root);
__host__ void rocshmem_ctx_ulong_broadcast(
rocshmem_ctx_t ctx, unsigned long *dest, const unsigned long *source,
int nelems, int pe_root, int pe_start, int log_pe_stride,
int pe_size, long *p_sync);
__host__ void rocshmem_ctx_ulong_broadcast(
rocshmem_ctx_t ctx, rocshmem_team_t team, unsigned long *dest,
const unsigned long *source, int nelems, int pe_root);
// `unsigned long long` broadcast: device work-group (`_wg`, team-based)
// variant followed by two host overloads -- one taking an explicit PE range
// (pe_start, log_pe_stride, pe_size) plus a p_sync pointer, and one taking a
// team.
__device__ ATTR_NO_INLINE void rocshmem_ctx_ulonglong_broadcast_wg(
rocshmem_ctx_t ctx, rocshmem_team_t team, unsigned long long *dest,
const unsigned long long *source, int nelems, int pe_root);
__host__ void rocshmem_ctx_ulonglong_broadcast(
rocshmem_ctx_t ctx, unsigned long long *dest, const unsigned long long *source,
int nelems, int pe_root, int pe_start, int log_pe_stride,
int pe_size, long *p_sync);
__host__ void rocshmem_ctx_ulonglong_broadcast(
rocshmem_ctx_t ctx, rocshmem_team_t team, unsigned long long *dest,
const unsigned long long *source, int nelems, int pe_root);
/**
* @name SHMEM_FCOLLECT
* @brief Concatenates blocks of data from multiple PEs to an array in every
* PE participating in the collective routine.
*
* This function must be called as a work-group collective.
*
* One declaration is provided per element type; they differ only in the
* pointee type of \p dest and \p source.
*
* @param[in] ctx Context handle passed to the operation.
* @param[in] team The team participating in the collective.
* @param[out] dest Destination address. Must be an address on the
* symmetric heap.
* @param[in] source Source address. Must be an address on the symmetric
* heap.
* @param[in] nelems Number of data blocks in source array.
*
* @return void
*/
__device__ ATTR_NO_INLINE void rocshmem_ctx_float_fcollect_wg(
rocshmem_ctx_t ctx, rocshmem_team_t team, float *dest,
const float *source, int nelems);
__device__ ATTR_NO_INLINE void rocshmem_ctx_double_fcollect_wg(
rocshmem_ctx_t ctx, rocshmem_team_t team, double *dest,
const double *source, int nelems);
__device__ ATTR_NO_INLINE void rocshmem_ctx_char_fcollect_wg(
rocshmem_ctx_t ctx, rocshmem_team_t team, char *dest,
const char *source, int nelems);
__device__ ATTR_NO_INLINE void rocshmem_ctx_schar_fcollect_wg(
rocshmem_ctx_t ctx, rocshmem_team_t team, signed char *dest,
const signed char *source, int nelems);
__device__ ATTR_NO_INLINE void rocshmem_ctx_short_fcollect_wg(
rocshmem_ctx_t ctx, rocshmem_team_t team, short *dest,
const short *source, int nelems);
__device__ ATTR_NO_INLINE void rocshmem_ctx_int_fcollect_wg(
rocshmem_ctx_t ctx, rocshmem_team_t team, int *dest,
const int *source, int nelems);
__device__ ATTR_NO_INLINE void rocshmem_ctx_long_fcollect_wg(
rocshmem_ctx_t ctx, rocshmem_team_t team, long *dest,
const long *source, int nelems);
__device__ ATTR_NO_INLINE void rocshmem_ctx_longlong_fcollect_wg(
rocshmem_ctx_t ctx, rocshmem_team_t team, long long *dest,
const long long *source, int nelems);
__device__ ATTR_NO_INLINE void rocshmem_ctx_uchar_fcollect_wg(
rocshmem_ctx_t ctx, rocshmem_team_t team, unsigned char *dest,
const unsigned char *source, int nelems);
__device__ ATTR_NO_INLINE void rocshmem_ctx_ushort_fcollect_wg(
rocshmem_ctx_t ctx, rocshmem_team_t team, unsigned short *dest,
const unsigned short *source, int nelems);
__device__ ATTR_NO_INLINE void rocshmem_ctx_uint_fcollect_wg(
rocshmem_ctx_t ctx, rocshmem_team_t team, unsigned int *dest,
const unsigned int *source, int nelems);
__device__ ATTR_NO_INLINE void rocshmem_ctx_ulong_fcollect_wg(
rocshmem_ctx_t ctx, rocshmem_team_t team, unsigned long *dest,
const unsigned long *source, int nelems);
__device__ ATTR_NO_INLINE void rocshmem_ctx_ulonglong_fcollect_wg(
rocshmem_ctx_t ctx, rocshmem_team_t team, unsigned long long *dest,
const unsigned long long *source, int nelems);
/**
* @name SHMEM_REDUCTIONS
* @brief Perform an allreduce between PEs in the team. The caller
* is blocked until the reduction completes.
*
* The device variants (suffix `_wg`) must be called as a work-group
* collective. Each operation is also declared as a `__host__` function
* without the `_wg` suffix and with the same parameter list.
*
* @param[in] ctx Context handle passed to the operation.
* @param[in] team The team participating in the collective.
* @param[out] dest Destination address. Must be an address on the
* symmetric heap.
* @param[in] source Source address. Must be an address on the symmetric
* heap.
* @param[in] nreduce Size of the buffer to participate in the reduction.
*
* @return int (Zero on successful local completion. Nonzero otherwise.)
*/
__device__ ATTR_NO_INLINE int rocshmem_ctx_short_sum_reduce_wg(
rocshmem_ctx_t ctx, rocshmem_team_t team, short *dest, const short *source,
int nreduce);
__host__ int rocshmem_ctx_short_sum_reduce(
rocshmem_ctx_t ctx, rocshmem_team_t team, short *dest, const short *source,
int nreduce);
__device__ ATTR_NO_INLINE int rocshmem_ctx_short_min_reduce_wg(
rocshmem_ctx_t ctx, rocshmem_team_t team, short *dest, const short *source,
int nreduce);
__host__ int rocshmem_ctx_short_min_reduce(
rocshmem_ctx_t ctx, rocshmem_team_t team, short *dest, const short *source,
int nreduce);
__device__ ATTR_NO_INLINE int rocshmem_ctx_short_max_reduce_wg(
rocshmem_ctx_t ctx, rocshmem_team_t team, short *dest, const short *source,
int nreduce);
__host__ int rocshmem_ctx_short_max_reduce(
rocshmem_ctx_t ctx, rocshmem_team_t team, short *dest, const short *source,
int nreduce);
__device__ ATTR_NO_INLINE int rocshmem_ctx_short_prod_reduce_wg(
rocshmem_ctx_t ctx, rocshmem_team_t team, short *dest, const short *source,
int nreduce);
__host__ int rocshmem_ctx_short_prod_reduce(
rocshmem_ctx_t ctx, rocshmem_team_t team, short *dest, const short *source,
int nreduce);
__device__ ATTR_NO_INLINE int rocshmem_ctx_short_or_reduce_wg(
rocshmem_ctx_t ctx, rocshmem_team_t team, short *dest, const short *source,
int nreduce);
__host__ int rocshmem_ctx_short_or_reduce(
rocshmem_ctx_t ctx, rocshmem_team_t team, short *dest, const short *source,
int nreduce);
__device__ ATTR_NO_INLINE int rocshmem_ctx_short_and_reduce_wg(
rocshmem_ctx_t ctx, rocshmem_team_t team, short *dest, const short *source,
int nreduce);
__host__ int rocshmem_ctx_short_and_reduce(
rocshmem_ctx_t ctx, rocshmem_team_t team, short *dest, const short *source,
int nreduce);
__device__ ATTR_NO_INLINE int rocshmem_ctx_short_xor_reduce_wg(
rocshmem_ctx_t ctx, rocshmem_team_t team, short *dest, const short *source,
int nreduce);
__host__ int rocshmem_ctx_short_xor_reduce(
rocshmem_ctx_t ctx, rocshmem_team_t team, short *dest, const short *source,
int nreduce);
// `int` reductions: sum, min, max, prod, or, and, xor. Each operation is
// declared twice with the same parameter list -- a device work-group (`_wg`)
// function and a host function -- and both return int.
__device__ ATTR_NO_INLINE int rocshmem_ctx_int_sum_reduce_wg(
rocshmem_ctx_t ctx, rocshmem_team_t team, int *dest, const int *source,
int nreduce);
__host__ int rocshmem_ctx_int_sum_reduce(
rocshmem_ctx_t ctx, rocshmem_team_t team, int *dest, const int *source,
int nreduce);
__device__ ATTR_NO_INLINE int rocshmem_ctx_int_min_reduce_wg(
rocshmem_ctx_t ctx, rocshmem_team_t team, int *dest, const int *source,
int nreduce);
__host__ int rocshmem_ctx_int_min_reduce(
rocshmem_ctx_t ctx, rocshmem_team_t team, int *dest, const int *source,
int nreduce);
__device__ ATTR_NO_INLINE int rocshmem_ctx_int_max_reduce_wg(
rocshmem_ctx_t ctx, rocshmem_team_t team, int *dest, const int *source,
int nreduce);
__host__ int rocshmem_ctx_int_max_reduce(
rocshmem_ctx_t ctx, rocshmem_team_t team, int *dest, const int *source,
int nreduce);
__device__ ATTR_NO_INLINE int rocshmem_ctx_int_prod_reduce_wg(
rocshmem_ctx_t ctx, rocshmem_team_t team, int *dest, const int *source,
int nreduce);
__host__ int rocshmem_ctx_int_prod_reduce(
rocshmem_ctx_t ctx, rocshmem_team_t team, int *dest, const int *source,
int nreduce);
__device__ ATTR_NO_INLINE int rocshmem_ctx_int_or_reduce_wg(
rocshmem_ctx_t ctx, rocshmem_team_t team, int *dest, const int *source,
int nreduce);
__host__ int rocshmem_ctx_int_or_reduce(
rocshmem_ctx_t ctx, rocshmem_team_t team, int *dest, const int *source,
int nreduce);
__device__ ATTR_NO_INLINE int rocshmem_ctx_int_and_reduce_wg(
rocshmem_ctx_t ctx, rocshmem_team_t team, int *dest, const int *source,
int nreduce);
__host__ int rocshmem_ctx_int_and_reduce(
rocshmem_ctx_t ctx, rocshmem_team_t team, int *dest, const int *source,
int nreduce);
__device__ ATTR_NO_INLINE int rocshmem_ctx_int_xor_reduce_wg(
rocshmem_ctx_t ctx, rocshmem_team_t team, int *dest, const int *source,
int nreduce);
__host__ int rocshmem_ctx_int_xor_reduce(
rocshmem_ctx_t ctx, rocshmem_team_t team, int *dest, const int *source,
int nreduce);
// `long` reductions: sum, min, max, prod, or, and, xor. Each operation is
// declared twice with the same parameter list -- a device work-group (`_wg`)
// function and a host function -- and both return int.
__device__ ATTR_NO_INLINE int rocshmem_ctx_long_sum_reduce_wg(
rocshmem_ctx_t ctx, rocshmem_team_t team, long *dest, const long *source,
int nreduce);
__host__ int rocshmem_ctx_long_sum_reduce(
rocshmem_ctx_t ctx, rocshmem_team_t team, long *dest, const long *source,
int nreduce);
__device__ ATTR_NO_INLINE int rocshmem_ctx_long_min_reduce_wg(
rocshmem_ctx_t ctx, rocshmem_team_t team, long *dest, const long *source,
int nreduce);
__host__ int rocshmem_ctx_long_min_reduce(
rocshmem_ctx_t ctx, rocshmem_team_t team, long *dest, const long *source,
int nreduce);
__device__ ATTR_NO_INLINE int rocshmem_ctx_long_max_reduce_wg(
rocshmem_ctx_t ctx, rocshmem_team_t team, long *dest, const long *source,
int nreduce);
__host__ int rocshmem_ctx_long_max_reduce(
rocshmem_ctx_t ctx, rocshmem_team_t team, long *dest, const long *source,
int nreduce);
__device__ ATTR_NO_INLINE int rocshmem_ctx_long_prod_reduce_wg(
rocshmem_ctx_t ctx, rocshmem_team_t team, long *dest, const long *source,
int nreduce);
__host__ int rocshmem_ctx_long_prod_reduce(
rocshmem_ctx_t ctx, rocshmem_team_t team, long *dest, const long *source,
int nreduce);
__device__ ATTR_NO_INLINE int rocshmem_ctx_long_or_reduce_wg(
rocshmem_ctx_t ctx, rocshmem_team_t team, long *dest, const long *source,
int nreduce);
__host__ int rocshmem_ctx_long_or_reduce(
rocshmem_ctx_t ctx, rocshmem_team_t team, long *dest, const long *source,
int nreduce);
__device__ ATTR_NO_INLINE int rocshmem_ctx_long_and_reduce_wg(
rocshmem_ctx_t ctx, rocshmem_team_t team, long *dest, const long *source,
int nreduce);
__host__ int rocshmem_ctx_long_and_reduce(
rocshmem_ctx_t ctx, rocshmem_team_t team, long *dest, const long *source,
int nreduce);
__device__ ATTR_NO_INLINE int rocshmem_ctx_long_xor_reduce_wg(
rocshmem_ctx_t ctx, rocshmem_team_t team, long *dest, const long *source,
int nreduce);
__host__ int rocshmem_ctx_long_xor_reduce(
rocshmem_ctx_t ctx, rocshmem_team_t team, long *dest, const long *source,
int nreduce);
// `long long` reductions: sum, min, max, prod, or, and, xor. Each operation
// is declared twice with the same parameter list -- a device work-group
// (`_wg`) function and a host function -- and both return int.
__device__ ATTR_NO_INLINE int rocshmem_ctx_longlong_sum_reduce_wg(
rocshmem_ctx_t ctx, rocshmem_team_t team, long long *dest, const long long *source,
int nreduce);
__host__ int rocshmem_ctx_longlong_sum_reduce(
rocshmem_ctx_t ctx, rocshmem_team_t team, long long *dest, const long long *source,
int nreduce);
__device__ ATTR_NO_INLINE int rocshmem_ctx_longlong_min_reduce_wg(
rocshmem_ctx_t ctx, rocshmem_team_t team, long long *dest, const long long *source,
int nreduce);
__host__ int rocshmem_ctx_longlong_min_reduce(
rocshmem_ctx_t ctx, rocshmem_team_t team, long long *dest, const long long *source,
int nreduce);
__device__ ATTR_NO_INLINE int rocshmem_ctx_longlong_max_reduce_wg(
rocshmem_ctx_t ctx, rocshmem_team_t team, long long *dest, const long long *source,
int nreduce);
__host__ int rocshmem_ctx_longlong_max_reduce(
rocshmem_ctx_t ctx, rocshmem_team_t team, long long *dest, const long long *source,
int nreduce);
__device__ ATTR_NO_INLINE int rocshmem_ctx_longlong_prod_reduce_wg(
rocshmem_ctx_t ctx, rocshmem_team_t team, long long *dest, const long long *source,
int nreduce);
__host__ int rocshmem_ctx_longlong_prod_reduce(
rocshmem_ctx_t ctx, rocshmem_team_t team, long long *dest, const long long *source,
int nreduce);
__device__ ATTR_NO_INLINE int rocshmem_ctx_longlong_or_reduce_wg(
rocshmem_ctx_t ctx, rocshmem_team_t team, long long *dest, const long long *source,
int nreduce);
__host__ int rocshmem_ctx_longlong_or_reduce(
rocshmem_ctx_t ctx, rocshmem_team_t team, long long *dest, const long long *source,
int nreduce);
__device__ ATTR_NO_INLINE int rocshmem_ctx_longlong_and_reduce_wg(
rocshmem_ctx_t ctx, rocshmem_team_t team, long long *dest, const long long *source,
int nreduce);
__host__ int rocshmem_ctx_longlong_and_reduce(
rocshmem_ctx_t ctx, rocshmem_team_t team, long long *dest, const long long *source,
int nreduce);
__device__ ATTR_NO_INLINE int rocshmem_ctx_longlong_xor_reduce_wg(
rocshmem_ctx_t ctx, rocshmem_team_t team, long long *dest, const long long *source,
int nreduce);
__host__ int rocshmem_ctx_longlong_xor_reduce(
rocshmem_ctx_t ctx, rocshmem_team_t team, long long *dest, const long long *source,
int nreduce);
// `float` reductions: only sum, min, max and prod are declared (no or/and/xor
// for this type). Each operation is declared twice with the same parameter
// list -- a device work-group (`_wg`) function and a host function -- and
// both return int.
__device__ ATTR_NO_INLINE int rocshmem_ctx_float_sum_reduce_wg(
rocshmem_ctx_t ctx, rocshmem_team_t team, float *dest, const float *source,
int nreduce);
__host__ int rocshmem_ctx_float_sum_reduce(
rocshmem_ctx_t ctx, rocshmem_team_t team, float *dest, const float *source,
int nreduce);
__device__ ATTR_NO_INLINE int rocshmem_ctx_float_min_reduce_wg(
rocshmem_ctx_t ctx, rocshmem_team_t team, float *dest, const float *source,
int nreduce);
__host__ int rocshmem_ctx_float_min_reduce(
rocshmem_ctx_t ctx, rocshmem_team_t team, float *dest, const float *source,
int nreduce);
__device__ ATTR_NO_INLINE int rocshmem_ctx_float_max_reduce_wg(
rocshmem_ctx_t ctx, rocshmem_team_t team, float *dest, const float *source,
int nreduce);
__host__ int rocshmem_ctx_float_max_reduce(
rocshmem_ctx_t ctx, rocshmem_team_t team, float *dest, const float *source,
int nreduce);
__device__ ATTR_NO_INLINE int rocshmem_ctx_float_prod_reduce_wg(
rocshmem_ctx_t ctx, rocshmem_team_t team, float *dest, const float *source,
int nreduce);
__host__ int rocshmem_ctx_float_prod_reduce(
rocshmem_ctx_t ctx, rocshmem_team_t team, float *dest, const float *source,
int nreduce);
// `double` reductions: only sum, min, max and prod are declared (no
// or/and/xor for this type). Each operation is declared twice with the same
// parameter list -- a device work-group (`_wg`) function and a host function
// -- and both return int.
__device__ ATTR_NO_INLINE int rocshmem_ctx_double_sum_reduce_wg(
rocshmem_ctx_t ctx, rocshmem_team_t team, double *dest, const double *source,
int nreduce);
__host__ int rocshmem_ctx_double_sum_reduce(
rocshmem_ctx_t ctx, rocshmem_team_t team, double *dest, const double *source,
int nreduce);
__device__ ATTR_NO_INLINE int rocshmem_ctx_double_min_reduce_wg(
rocshmem_ctx_t ctx, rocshmem_team_t team, double *dest, const double *source,
int nreduce);
__host__ int rocshmem_ctx_double_min_reduce(
rocshmem_ctx_t ctx, rocshmem_team_t team, double *dest, const double *source,
int nreduce);
__device__ ATTR_NO_INLINE int rocshmem_ctx_double_max_reduce_wg(
rocshmem_ctx_t ctx, rocshmem_team_t team, double *dest, const double *source,
int nreduce);
__host__ int rocshmem_ctx_double_max_reduce(
rocshmem_ctx_t ctx, rocshmem_team_t team, double *dest, const double *source,
int nreduce);
__device__ ATTR_NO_INLINE int rocshmem_ctx_double_prod_reduce_wg(
rocshmem_ctx_t ctx, rocshmem_team_t team, double *dest, const double *source,
int nreduce);
__host__ int rocshmem_ctx_double_prod_reduce(
rocshmem_ctx_t ctx, rocshmem_team_t team, double *dest, const double *source,
int nreduce);
/**
* @brief Kernel that performs a barrier synchronization.
* The caller enqueues this kernel on a stream of its choosing.
*
* Declared `__global__`, so it is launched as a kernel rather than called
* from device code. It takes no arguments.
*
* @return void
*/
__global__ ATTR_NO_INLINE void rocshmem_barrier_all_kernel();
/**
* @brief Perform a collective barrier between all PEs in the system.
* The caller is blocked until the barrier is resolved.
*
* This function must be invoked by a single thread within the PE.
* Wave-front and work-group collective forms are declared separately as
* rocshmem_barrier_all_wave() and rocshmem_barrier_all_wg().
*
* @return void
*/
__device__ ATTR_NO_INLINE void rocshmem_barrier_all();
/**
* @brief Perform a collective barrier between all PEs in the system.
* The caller is blocked until the barrier is resolved.
*
* This function must be called as a wave-front collective.
* Single-thread and work-group forms are declared separately as
* rocshmem_barrier_all() and rocshmem_barrier_all_wg().
*
* @return void
*/
__device__ ATTR_NO_INLINE void rocshmem_barrier_all_wave();
/**
* @brief Perform a collective barrier between all PEs in the system.
* The caller is blocked until the barrier is resolved.
*
* This function must be called as a work-group collective.
* Single-thread and wave-front forms are declared separately as
* rocshmem_barrier_all() and rocshmem_barrier_all_wave().
*
* @return void
*/
__device__ ATTR_NO_INLINE void rocshmem_barrier_all_wg();
/**
* @brief Perform a collective barrier between all PEs in the team.
* The caller is blocked until the barrier is resolved.
*
* This function must be invoked by a single thread within the PE.
*
* @param[in] ctx GPU side context handle.
*
* @param[in] team The team on which to perform barrier synchronization.
*
* @return void
*/
__device__ void rocshmem_ctx_barrier(rocshmem_ctx_t ctx, rocshmem_team_t team);
/**
* @brief Perform a collective barrier between all PEs in the team.
* The caller is blocked until the barrier is resolved.
*
* This function must be called as a wave-front collective.
*
* @param[in] ctx GPU side context handle.
*
* @param[in] team The team on which to perform barrier synchronization.
*
* @return void
*/
__device__ void rocshmem_ctx_barrier_wave(rocshmem_ctx_t ctx, rocshmem_team_t team);
/**
* @brief Perform a collective barrier between all PEs in the team.
* The caller is blocked until the barrier is resolved.
*
* This function must be called as a work-group collective.
*
* @param[in] ctx GPU side context handle.
*
* @param[in] team The team on which to perform barrier synchronization.
*
* @return void
*/
__device__ void rocshmem_ctx_barrier_wg(rocshmem_ctx_t ctx, rocshmem_team_t team);
/**
* @brief Registers the arrival of a PE at a barrier.
* The caller is blocked until the synchronization is resolved.
*
* In contrast with the shmem_barrier_all routine, shmem_sync_all only ensures
* completion and visibility of previously issued memory stores and does not
* ensure completion of remote memory updates issued via OpenSHMEM routines.
*
* This function must be invoked by a single thread within the PE.
* Wave-front and work-group collective forms are declared separately as
* rocshmem_sync_all_wave() and rocshmem_sync_all_wg().
*
* @return void
*/
__device__ ATTR_NO_INLINE void rocshmem_sync_all();
/**
* @brief Registers the arrival of a PE at a barrier.
* The caller is blocked until the synchronization is resolved.
*
* In contrast with the shmem_barrier_all routine, shmem_sync_all only ensures
* completion and visibility of previously issued memory stores and does not
* ensure completion of remote memory updates issued via OpenSHMEM routines.
*
* This function must be called as a wave-front collective.
* Single-thread and work-group forms are declared separately as
* rocshmem_sync_all() and rocshmem_sync_all_wg().
*
* @return void
*/
__device__ ATTR_NO_INLINE void rocshmem_sync_all_wave();
/**
* @brief Registers the arrival of a PE at a barrier.
* The caller is blocked until the synchronization is resolved.
*
* In contrast with the shmem_barrier_all routine, shmem_sync_all only ensures
* completion and visibility of previously issued memory stores and does not
* ensure completion of remote memory updates issued via OpenSHMEM routines.
*
* This function must be called as a work-group collective.
* Single-thread and wave-front forms are declared separately as
* rocshmem_sync_all() and rocshmem_sync_all_wave().
*
* @return void
*/
__device__ ATTR_NO_INLINE void rocshmem_sync_all_wg();
/**
* @brief Registers the arrival of a PE at a barrier.
* The caller is blocked until the synchronization is resolved.
*
* In contrast with the shmem_barrier_all routine, shmem_team_sync only ensures
* completion and visibility of previously issued memory stores and does not
* ensure completion of remote memory updates issued via OpenSHMEM routines.
*
* This function must be invoked by a single thread within the PE.
*
* @param[in] ctx GPU side context handle.
* @param[in] team Handle of the team being synchronized.
*
* @return void
*/
__device__ ATTR_NO_INLINE void rocshmem_ctx_sync(
rocshmem_ctx_t ctx, rocshmem_team_t team);
/**
* @brief Registers the arrival of a PE at a barrier.
* The caller is blocked until the synchronization is resolved.
*
* In contrast with the shmem_barrier_all routine, shmem_team_sync only ensures
* completion and visibility of previously issued memory stores and does not
* ensure completion of remote memory updates issued via OpenSHMEM routines.
*
* This function must be called as a wave-front collective.
*
* @param[in] ctx GPU side context handle.
* @param[in] team Handle of the team being synchronized.
*
* @return void
*/
__device__ ATTR_NO_INLINE void rocshmem_ctx_sync_wave(
rocshmem_ctx_t ctx, rocshmem_team_t team);
/**
* @brief Registers the arrival of a PE at a barrier.
* The caller is blocked until the synchronization is resolved.
*
* In contrast with the shmem_barrier_all routine, shmem_team_sync only ensures
* completion and visibility of previously issued memory stores and does not
* ensure completion of remote memory updates issued via OpenSHMEM routines.
*
* This function must be called as a work-group collective.
*
* @param[in] ctx GPU side context handle.
* @param[in] team Handle of the team being synchronized.
*
* @return void
*/
__device__ ATTR_NO_INLINE void rocshmem_ctx_sync_wg(
rocshmem_ctx_t ctx, rocshmem_team_t team);
} // namespace rocshmem
#endif // LIBRARY_INCLUDE_ROCSHMEM_COLL_HPP
/******************************************************************************
* Copyright (c) Advanced Micro Devices, Inc. All rights reserved.
*
* SPDX-License-Identifier: MIT
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to
* deal in the Software without restriction, including without limitation the
* rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
* sell copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
* IN THE SOFTWARE.
*****************************************************************************/
#ifndef LIBRARY_INCLUDE_ROCSHMEM_P2P_SYNC_HPP
#define LIBRARY_INCLUDE_ROCSHMEM_P2P_SYNC_HPP
namespace rocshmem {
/**
* @name SHMEM_WAIT_UNTIL
* @brief Block the caller until the condition (* \p ivars \p cmp \p val) is
* true.
*
* This function can be called from divergent control paths at per-thread
* granularity. However, performance may be improved if the caller can
* coalesce contiguous messages and elect a leader thread to call into the
* ROCSHMEM function.
*
* Each element type declares the same family twice, once as `__device__`
* and once as `__host__`: wait_until, wait_until_any, wait_until_all,
* wait_until_some, and a `_vector` form of the any/all/some variants.
*
* @param[in] ivars Pointer to memory on the symmetric heap to wait for.
* @param[in] nelems (any/all/some variants) presumably the number of
* elements addressed by \p ivars -- TODO confirm.
* @param[in] indices (some variants) non-const size_t pointer; presumably
* receives the indices that satisfied the condition --
* TODO confirm.
* @param[in] status (any/all/some variants) const int pointer; presumably
* a per-element mask -- TODO confirm.
* @param[in] cmp Operation for the comparison.
* @param[in] val Value to compare the memory at \p ivars to.
*
* @return void for wait_until and the `_all` variants; the `_any` and `_some`
* variants return a size_t.
*/
__device__ void rocshmem_float_wait_until(
float *ivars, int cmp, float val);
__device__ size_t rocshmem_float_wait_until_any(
float *ivars, size_t nelems, const int* status,
int cmp, float val);
__device__ void rocshmem_float_wait_until_all(
float *ivars, size_t nelems, const int* status,
int cmp, float val);
__device__ size_t rocshmem_float_wait_until_some(
float *ivars, size_t nelems, size_t* indices, const int* status,
int cmp, float val);
__device__ size_t rocshmem_float_wait_until_any_vector(
float *ivars, size_t nelems, const int* status,
int cmp, float val);
__device__ void rocshmem_float_wait_until_all_vector(
float *ivars, size_t nelems, const int* status,
int cmp, float val);
__device__ size_t rocshmem_float_wait_until_some_vector(
float *ivars, size_t nelems, size_t* indices, const int* status,
int cmp, float val);
__host__ void rocshmem_float_wait_until(
float *ivars, int cmp, float val);
__host__ size_t rocshmem_float_wait_until_any(
float *ivars, size_t nelems, const int* status,
int cmp, float val);
__host__ void rocshmem_float_wait_until_all(
float *ivars, size_t nelems, const int* status,
int cmp, float val);
__host__ size_t rocshmem_float_wait_until_some(
float *ivars, size_t nelems, size_t* indices, const int* status,
int cmp, float val);
__host__ size_t rocshmem_float_wait_until_any_vector(
float *ivars, size_t nelems, const int* status,
int cmp, float val);
__host__ void rocshmem_float_wait_until_all_vector(
float *ivars, size_t nelems, const int* status,
int cmp, float val);
__host__ size_t rocshmem_float_wait_until_some_vector(
float *ivars, size_t nelems, size_t* indices, const int* status,
int cmp, float val);
// `double` wait_until family: seven `__device__` declarations followed by the
// same seven as `__host__` declarations with identical parameter lists.
// Variants: wait_until, _any, _all, _some, and `_vector` forms of any/all/some.
// The `_any`/`_some` variants return size_t; the others return void.
__device__ void rocshmem_double_wait_until(
double *ivars, int cmp, double val);
__device__ size_t rocshmem_double_wait_until_any(
double *ivars, size_t nelems, const int* status,
int cmp, double val);
__device__ void rocshmem_double_wait_until_all(
double *ivars, size_t nelems, const int* status,
int cmp, double val);
__device__ size_t rocshmem_double_wait_until_some(
double *ivars, size_t nelems, size_t* indices, const int* status,
int cmp, double val);
__device__ size_t rocshmem_double_wait_until_any_vector(
double *ivars, size_t nelems, const int* status,
int cmp, double val);
__device__ void rocshmem_double_wait_until_all_vector(
double *ivars, size_t nelems, const int* status,
int cmp, double val);
__device__ size_t rocshmem_double_wait_until_some_vector(
double *ivars, size_t nelems, size_t* indices, const int* status,
int cmp, double val);
__host__ void rocshmem_double_wait_until(
double *ivars, int cmp, double val);
__host__ size_t rocshmem_double_wait_until_any(
double *ivars, size_t nelems, const int* status,
int cmp, double val);
__host__ void rocshmem_double_wait_until_all(
double *ivars, size_t nelems, const int* status,
int cmp, double val);
__host__ size_t rocshmem_double_wait_until_some(
double *ivars, size_t nelems, size_t* indices, const int* status,
int cmp, double val);
__host__ size_t rocshmem_double_wait_until_any_vector(
double *ivars, size_t nelems, const int* status,
int cmp, double val);
__host__ void rocshmem_double_wait_until_all_vector(
double *ivars, size_t nelems, const int* status,
int cmp, double val);
__host__ size_t rocshmem_double_wait_until_some_vector(
double *ivars, size_t nelems, size_t* indices, const int* status,
int cmp, double val);
// `char` wait_until family: seven `__device__` declarations followed by the
// same seven as `__host__` declarations with identical parameter lists.
// Variants: wait_until, _any, _all, _some, and `_vector` forms of any/all/some.
// The `_any`/`_some` variants return size_t; the others return void.
__device__ void rocshmem_char_wait_until(
char *ivars, int cmp, char val);
__device__ size_t rocshmem_char_wait_until_any(
char *ivars, size_t nelems, const int* status,
int cmp, char val);
__device__ void rocshmem_char_wait_until_all(
char *ivars, size_t nelems, const int* status,
int cmp, char val);
__device__ size_t rocshmem_char_wait_until_some(
char *ivars, size_t nelems, size_t* indices, const int* status,
int cmp, char val);
__device__ size_t rocshmem_char_wait_until_any_vector(
char *ivars, size_t nelems, const int* status,
int cmp, char val);
__device__ void rocshmem_char_wait_until_all_vector(
char *ivars, size_t nelems, const int* status,
int cmp, char val);
__device__ size_t rocshmem_char_wait_until_some_vector(
char *ivars, size_t nelems, size_t* indices, const int* status,
int cmp, char val);
__host__ void rocshmem_char_wait_until(
char *ivars, int cmp, char val);
__host__ size_t rocshmem_char_wait_until_any(
char *ivars, size_t nelems, const int* status,
int cmp, char val);
__host__ void rocshmem_char_wait_until_all(
char *ivars, size_t nelems, const int* status,
int cmp, char val);
__host__ size_t rocshmem_char_wait_until_some(
char *ivars, size_t nelems, size_t* indices, const int* status,
int cmp, char val);
__host__ size_t rocshmem_char_wait_until_any_vector(
char *ivars, size_t nelems, const int* status,
int cmp, char val);
__host__ void rocshmem_char_wait_until_all_vector(
char *ivars, size_t nelems, const int* status,
int cmp, char val);
__host__ size_t rocshmem_char_wait_until_some_vector(
char *ivars, size_t nelems, size_t* indices, const int* status,
int cmp, char val);
// `signed char` wait_until family: seven `__device__` declarations followed by
// the same seven as `__host__` declarations with identical parameter lists.
// Variants: wait_until, _any, _all, _some, and `_vector` forms of any/all/some.
// The `_any`/`_some` variants return size_t; the others return void.
__device__ void rocshmem_schar_wait_until(
signed char *ivars, int cmp, signed char val);
__device__ size_t rocshmem_schar_wait_until_any(
signed char *ivars, size_t nelems, const int* status,
int cmp, signed char val);
__device__ void rocshmem_schar_wait_until_all(
signed char *ivars, size_t nelems, const int* status,
int cmp, signed char val);
__device__ size_t rocshmem_schar_wait_until_some(
signed char *ivars, size_t nelems, size_t* indices, const int* status,
int cmp, signed char val);
__device__ size_t rocshmem_schar_wait_until_any_vector(
signed char *ivars, size_t nelems, const int* status,
int cmp, signed char val);
__device__ void rocshmem_schar_wait_until_all_vector(
signed char *ivars, size_t nelems, const int* status,
int cmp, signed char val);
__device__ size_t rocshmem_schar_wait_until_some_vector(
signed char *ivars, size_t nelems, size_t* indices, const int* status,
int cmp, signed char val);
__host__ void rocshmem_schar_wait_until(
signed char *ivars, int cmp, signed char val);
__host__ size_t rocshmem_schar_wait_until_any(
signed char *ivars, size_t nelems, const int* status,
int cmp, signed char val);
__host__ void rocshmem_schar_wait_until_all(
signed char *ivars, size_t nelems, const int* status,
int cmp, signed char val);
__host__ size_t rocshmem_schar_wait_until_some(
signed char *ivars, size_t nelems, size_t* indices, const int* status,
int cmp, signed char val);
__host__ size_t rocshmem_schar_wait_until_any_vector(
signed char *ivars, size_t nelems, const int* status,
int cmp, signed char val);
__host__ void rocshmem_schar_wait_until_all_vector(
signed char *ivars, size_t nelems, const int* status,
int cmp, signed char val);
__host__ size_t rocshmem_schar_wait_until_some_vector(
signed char *ivars, size_t nelems, size_t* indices, const int* status,
int cmp, signed char val);
__device__ void rocshmem_short_wait_until(
short *ivars, int cmp, short val);
__device__ size_t rocshmem_short_wait_until_any(
short *ivars, size_t nelems, const int* status,
int cmp, short val);
__device__ void rocshmem_short_wait_until_all(
short *ivars, size_t nelems, const int* status,
int cmp, short val);
__device__ size_t rocshmem_short_wait_until_some(
short *ivars, size_t nelems, size_t* indices, const int* status,
int cmp, short val);
__device__ size_t rocshmem_short_wait_until_any_vector(
short *ivars, size_t nelems, const int* status,
int cmp, short val);
__device__ void rocshmem_short_wait_until_all_vector(
short *ivars, size_t nelems, const int* status,
int cmp, short val);
__device__ size_t rocshmem_short_wait_until_some_vector(
short *ivars, size_t nelems, size_t* indices, const int* status,
int cmp, short val);
__host__ void rocshmem_short_wait_until(
short *ivars, int cmp, short val);
__host__ size_t rocshmem_short_wait_until_any(
short *ivars, size_t nelems, const int* status,
int cmp, short val);
__host__ void rocshmem_short_wait_until_all(
short *ivars, size_t nelems, const int* status,
int cmp, short val);
__host__ size_t rocshmem_short_wait_until_some(
short *ivars, size_t nelems, size_t* indices, const int* status,
int cmp, short val);
__host__ size_t rocshmem_short_wait_until_any_vector(
short *ivars, size_t nelems, const int* status,
int cmp, short val);
__host__ void rocshmem_short_wait_until_all_vector(
short *ivars, size_t nelems, const int* status,
int cmp, short val);
__host__ size_t rocshmem_short_wait_until_some_vector(
short *ivars, size_t nelems, size_t* indices, const int* status,
int cmp, short val);
__device__ void rocshmem_int_wait_until(
int *ivars, int cmp, int val);
__device__ size_t rocshmem_int_wait_until_any(
int *ivars, size_t nelems, const int* status,
int cmp, int val);
__device__ void rocshmem_int_wait_until_all(
int *ivars, size_t nelems, const int* status,
int cmp, int val);
__device__ size_t rocshmem_int_wait_until_some(
int *ivars, size_t nelems, size_t* indices, const int* status,
int cmp, int val);
__device__ size_t rocshmem_int_wait_until_any_vector(
int *ivars, size_t nelems, const int* status,
int cmp, int val);
__device__ void rocshmem_int_wait_until_all_vector(
int *ivars, size_t nelems, const int* status,
int cmp, int val);
__device__ size_t rocshmem_int_wait_until_some_vector(
int *ivars, size_t nelems, size_t* indices, const int* status,
int cmp, int val);
__host__ void rocshmem_int_wait_until(
int *ivars, int cmp, int val);
__host__ size_t rocshmem_int_wait_until_any(
int *ivars, size_t nelems, const int* status,
int cmp, int val);
__host__ void rocshmem_int_wait_until_all(
int *ivars, size_t nelems, const int* status,
int cmp, int val);
__host__ size_t rocshmem_int_wait_until_some(
int *ivars, size_t nelems, size_t* indices, const int* status,
int cmp, int val);
__host__ size_t rocshmem_int_wait_until_any_vector(
int *ivars, size_t nelems, const int* status,
int cmp, int val);
__host__ void rocshmem_int_wait_until_all_vector(
int *ivars, size_t nelems, const int* status,
int cmp, int val);
__host__ size_t rocshmem_int_wait_until_some_vector(
int *ivars, size_t nelems, size_t* indices, const int* status,
int cmp, int val);
__device__ void rocshmem_long_wait_until(
long *ivars, int cmp, long val);
__device__ size_t rocshmem_long_wait_until_any(
long *ivars, size_t nelems, const int* status,
int cmp, long val);
__device__ void rocshmem_long_wait_until_all(
long *ivars, size_t nelems, const int* status,
int cmp, long val);
__device__ size_t rocshmem_long_wait_until_some(
long *ivars, size_t nelems, size_t* indices, const int* status,
int cmp, long val);
__device__ size_t rocshmem_long_wait_until_any_vector(
long *ivars, size_t nelems, const int* status,
int cmp, long val);
__device__ void rocshmem_long_wait_until_all_vector(
long *ivars, size_t nelems, const int* status,
int cmp, long val);
__device__ size_t rocshmem_long_wait_until_some_vector(
long *ivars, size_t nelems, size_t* indices, const int* status,
int cmp, long val);
__host__ void rocshmem_long_wait_until(
long *ivars, int cmp, long val);
__host__ size_t rocshmem_long_wait_until_any(
long *ivars, size_t nelems, const int* status,
int cmp, long val);
__host__ void rocshmem_long_wait_until_all(
long *ivars, size_t nelems, const int* status,
int cmp, long val);
__host__ size_t rocshmem_long_wait_until_some(
long *ivars, size_t nelems, size_t* indices, const int* status,
int cmp, long val);
__host__ size_t rocshmem_long_wait_until_any_vector(
long *ivars, size_t nelems, const int* status,
int cmp, long val);
__host__ void rocshmem_long_wait_until_all_vector(
long *ivars, size_t nelems, const int* status,
int cmp, long val);
__host__ size_t rocshmem_long_wait_until_some_vector(
long *ivars, size_t nelems, size_t* indices, const int* status,
int cmp, long val);
__device__ void rocshmem_longlong_wait_until(
long long *ivars, int cmp, long long val);
__device__ size_t rocshmem_longlong_wait_until_any(
long long *ivars, size_t nelems, const int* status,
int cmp, long long val);
__device__ void rocshmem_longlong_wait_until_all(
long long *ivars, size_t nelems, const int* status,
int cmp, long long val);
__device__ size_t rocshmem_longlong_wait_until_some(
long long *ivars, size_t nelems, size_t* indices, const int* status,
int cmp, long long val);
__device__ size_t rocshmem_longlong_wait_until_any_vector(
long long *ivars, size_t nelems, const int* status,
int cmp, long long val);
__device__ void rocshmem_longlong_wait_until_all_vector(
long long *ivars, size_t nelems, const int* status,
int cmp, long long val);
__device__ size_t rocshmem_longlong_wait_until_some_vector(
long long *ivars, size_t nelems, size_t* indices, const int* status,
int cmp, long long val);
__host__ void rocshmem_longlong_wait_until(
long long *ivars, int cmp, long long val);
__host__ size_t rocshmem_longlong_wait_until_any(
long long *ivars, size_t nelems, const int* status,
int cmp, long long val);
__host__ void rocshmem_longlong_wait_until_all(
long long *ivars, size_t nelems, const int* status,
int cmp, long long val);
__host__ size_t rocshmem_longlong_wait_until_some(
long long *ivars, size_t nelems, size_t* indices, const int* status,
int cmp, long long val);
__host__ size_t rocshmem_longlong_wait_until_any_vector(
long long *ivars, size_t nelems, const int* status,
int cmp, long long val);
__host__ void rocshmem_longlong_wait_until_all_vector(
long long *ivars, size_t nelems, const int* status,
int cmp, long long val);
__host__ size_t rocshmem_longlong_wait_until_some_vector(
long long *ivars, size_t nelems, size_t* indices, const int* status,
int cmp, long long val);
__device__ void rocshmem_uchar_wait_until(
unsigned char *ivars, int cmp, unsigned char val);
__device__ size_t rocshmem_uchar_wait_until_any(
unsigned char *ivars, size_t nelems, const int* status,
int cmp, unsigned char val);
__device__ void rocshmem_uchar_wait_until_all(
unsigned char *ivars, size_t nelems, const int* status,
int cmp, unsigned char val);
__device__ size_t rocshmem_uchar_wait_until_some(
unsigned char *ivars, size_t nelems, size_t* indices, const int* status,
int cmp, unsigned char val);
__device__ size_t rocshmem_uchar_wait_until_any_vector(
unsigned char *ivars, size_t nelems, const int* status,
int cmp, unsigned char val);
__device__ void rocshmem_uchar_wait_until_all_vector(
unsigned char *ivars, size_t nelems, const int* status,
int cmp, unsigned char val);
__device__ size_t rocshmem_uchar_wait_until_some_vector(
unsigned char *ivars, size_t nelems, size_t* indices, const int* status,
int cmp, unsigned char val);
__host__ void rocshmem_uchar_wait_until(
unsigned char *ivars, int cmp, unsigned char val);
__host__ size_t rocshmem_uchar_wait_until_any(
unsigned char *ivars, size_t nelems, const int* status,
int cmp, unsigned char val);
__host__ void rocshmem_uchar_wait_until_all(
unsigned char *ivars, size_t nelems, const int* status,
int cmp, unsigned char val);
__host__ size_t rocshmem_uchar_wait_until_some(
unsigned char *ivars, size_t nelems, size_t* indices, const int* status,
int cmp, unsigned char val);
__host__ size_t rocshmem_uchar_wait_until_any_vector(
unsigned char *ivars, size_t nelems, const int* status,
int cmp, unsigned char val);
__host__ void rocshmem_uchar_wait_until_all_vector(
unsigned char *ivars, size_t nelems, const int* status,
int cmp, unsigned char val);
__host__ size_t rocshmem_uchar_wait_until_some_vector(
unsigned char *ivars, size_t nelems, size_t* indices, const int* status,
int cmp, unsigned char val);
__device__ void rocshmem_ushort_wait_until(
unsigned short *ivars, int cmp, unsigned short val);
__device__ size_t rocshmem_ushort_wait_until_any(
unsigned short *ivars, size_t nelems, const int* status,
int cmp, unsigned short val);
__device__ void rocshmem_ushort_wait_until_all(
unsigned short *ivars, size_t nelems, const int* status,
int cmp, unsigned short val);
__device__ size_t rocshmem_ushort_wait_until_some(
unsigned short *ivars, size_t nelems, size_t* indices, const int* status,
int cmp, unsigned short val);
__device__ size_t rocshmem_ushort_wait_until_any_vector(
unsigned short *ivars, size_t nelems, const int* status,
int cmp, unsigned short val);
__device__ void rocshmem_ushort_wait_until_all_vector(
unsigned short *ivars, size_t nelems, const int* status,
int cmp, unsigned short val);
__device__ size_t rocshmem_ushort_wait_until_some_vector(
unsigned short *ivars, size_t nelems, size_t* indices, const int* status,
int cmp, unsigned short val);
__host__ void rocshmem_ushort_wait_until(
unsigned short *ivars, int cmp, unsigned short val);
__host__ size_t rocshmem_ushort_wait_until_any(
unsigned short *ivars, size_t nelems, const int* status,
int cmp, unsigned short val);
__host__ void rocshmem_ushort_wait_until_all(
unsigned short *ivars, size_t nelems, const int* status,
int cmp, unsigned short val);
__host__ size_t rocshmem_ushort_wait_until_some(
unsigned short *ivars, size_t nelems, size_t* indices, const int* status,
int cmp, unsigned short val);
__host__ size_t rocshmem_ushort_wait_until_any_vector(
unsigned short *ivars, size_t nelems, const int* status,
int cmp, unsigned short val);
__host__ void rocshmem_ushort_wait_until_all_vector(
unsigned short *ivars, size_t nelems, const int* status,
int cmp, unsigned short val);
__host__ size_t rocshmem_ushort_wait_until_some_vector(
unsigned short *ivars, size_t nelems, size_t* indices, const int* status,
int cmp, unsigned short val);
__device__ void rocshmem_uint_wait_until(
unsigned int *ivars, int cmp, unsigned int val);
__device__ size_t rocshmem_uint_wait_until_any(
unsigned int *ivars, size_t nelems, const int* status,
int cmp, unsigned int val);
__device__ void rocshmem_uint_wait_until_all(
unsigned int *ivars, size_t nelems, const int* status,
int cmp, unsigned int val);
__device__ size_t rocshmem_uint_wait_until_some(
unsigned int *ivars, size_t nelems, size_t* indices, const int* status,
int cmp, unsigned int val);
__device__ size_t rocshmem_uint_wait_until_any_vector(
unsigned int *ivars, size_t nelems, const int* status,
int cmp, unsigned int val);
__device__ void rocshmem_uint_wait_until_all_vector(
unsigned int *ivars, size_t nelems, const int* status,
int cmp, unsigned int val);
__device__ size_t rocshmem_uint_wait_until_some_vector(
unsigned int *ivars, size_t nelems, size_t* indices, const int* status,
int cmp, unsigned int val);
__host__ void rocshmem_uint_wait_until(
unsigned int *ivars, int cmp, unsigned int val);
__host__ size_t rocshmem_uint_wait_until_any(
unsigned int *ivars, size_t nelems, const int* status,
int cmp, unsigned int val);
__host__ void rocshmem_uint_wait_until_all(
unsigned int *ivars, size_t nelems, const int* status,
int cmp, unsigned int val);
__host__ size_t rocshmem_uint_wait_until_some(
unsigned int *ivars, size_t nelems, size_t* indices, const int* status,
int cmp, unsigned int val);
__host__ size_t rocshmem_uint_wait_until_any_vector(
unsigned int *ivars, size_t nelems, const int* status,
int cmp, unsigned int val);
__host__ void rocshmem_uint_wait_until_all_vector(
unsigned int *ivars, size_t nelems, const int* status,
int cmp, unsigned int val);
__host__ size_t rocshmem_uint_wait_until_some_vector(
unsigned int *ivars, size_t nelems, size_t* indices, const int* status,
int cmp, unsigned int val);
__device__ void rocshmem_ulong_wait_until(
unsigned long *ivars, int cmp, unsigned long val);
__device__ size_t rocshmem_ulong_wait_until_any(
unsigned long *ivars, size_t nelems, const int* status,
int cmp, unsigned long val);
__device__ void rocshmem_ulong_wait_until_all(
unsigned long *ivars, size_t nelems, const int* status,
int cmp, unsigned long val);
__device__ size_t rocshmem_ulong_wait_until_some(
unsigned long *ivars, size_t nelems, size_t* indices, const int* status,
int cmp, unsigned long val);
__device__ size_t rocshmem_ulong_wait_until_any_vector(
unsigned long *ivars, size_t nelems, const int* status,
int cmp, unsigned long val);
__device__ void rocshmem_ulong_wait_until_all_vector(
unsigned long *ivars, size_t nelems, const int* status,
int cmp, unsigned long val);
__device__ size_t rocshmem_ulong_wait_until_some_vector(
unsigned long *ivars, size_t nelems, size_t* indices, const int* status,
int cmp, unsigned long val);
__host__ void rocshmem_ulong_wait_until(
unsigned long *ivars, int cmp, unsigned long val);
__host__ size_t rocshmem_ulong_wait_until_any(
unsigned long *ivars, size_t nelems, const int* status,
int cmp, unsigned long val);
__host__ void rocshmem_ulong_wait_until_all(
unsigned long *ivars, size_t nelems, const int* status,
int cmp, unsigned long val);
__host__ size_t rocshmem_ulong_wait_until_some(
unsigned long *ivars, size_t nelems, size_t* indices, const int* status,
int cmp, unsigned long val);
__host__ size_t rocshmem_ulong_wait_until_any_vector(
unsigned long *ivars, size_t nelems, const int* status,
int cmp, unsigned long val);
__host__ void rocshmem_ulong_wait_until_all_vector(
unsigned long *ivars, size_t nelems, const int* status,
int cmp, unsigned long val);
__host__ size_t rocshmem_ulong_wait_until_some_vector(
unsigned long *ivars, size_t nelems, size_t* indices, const int* status,
int cmp, unsigned long val);
__device__ void rocshmem_ulonglong_wait_until(
unsigned long long *ivars, int cmp, unsigned long long val);
__device__ size_t rocshmem_ulonglong_wait_until_any(
unsigned long long *ivars, size_t nelems, const int* status,
int cmp, unsigned long long val);
__device__ void rocshmem_ulonglong_wait_until_all(
unsigned long long *ivars, size_t nelems, const int* status,
int cmp, unsigned long long val);
__device__ size_t rocshmem_ulonglong_wait_until_some(
unsigned long long *ivars, size_t nelems, size_t* indices, const int* status,
int cmp, unsigned long long val);
__device__ size_t rocshmem_ulonglong_wait_until_any_vector(
unsigned long long *ivars, size_t nelems, const int* status,
int cmp, unsigned long long val);
__device__ void rocshmem_ulonglong_wait_until_all_vector(
unsigned long long *ivars, size_t nelems, const int* status,
int cmp, unsigned long long val);
__device__ size_t rocshmem_ulonglong_wait_until_some_vector(
unsigned long long *ivars, size_t nelems, size_t* indices, const int* status,
int cmp, unsigned long long val);
__host__ void rocshmem_ulonglong_wait_until(
unsigned long long *ivars, int cmp, unsigned long long val);
__host__ size_t rocshmem_ulonglong_wait_until_any(
unsigned long long *ivars, size_t nelems, const int* status,
int cmp, unsigned long long val);
__host__ void rocshmem_ulonglong_wait_until_all(
unsigned long long *ivars, size_t nelems, const int* status,
int cmp, unsigned long long val);
__host__ size_t rocshmem_ulonglong_wait_until_some(
unsigned long long *ivars, size_t nelems, size_t* indices, const int* status,
int cmp, unsigned long long val);
__host__ size_t rocshmem_ulonglong_wait_until_any_vector(
unsigned long long *ivars, size_t nelems, const int* status,
int cmp, unsigned long long val);
__host__ void rocshmem_ulonglong_wait_until_all_vector(
unsigned long long *ivars, size_t nelems, const int* status,
int cmp, unsigned long long val);
__host__ size_t rocshmem_ulonglong_wait_until_some_vector(
unsigned long long *ivars, size_t nelems, size_t* indices, const int* status,
int cmp, unsigned long long val);
/**
* @name SHMEM_TEST
* @brief Test whether the condition (* \p ivars \p cmp \p val) is true.
*
* This function can be called from divergent control paths at per-thread
* granularity. However, performance may be improved if the caller can
* coalesce contiguous messages and elect a leader thread to call into the
* ROCSHMEM function.
*
* Every element type is declared twice: once as a `__device__` function and
* once as a `__host__` function with the same parameter list.
*
* @param[in] ivars Pointer to memory on the symmetric heap to test.
* @param[in] cmp Operation for the comparison.
* @param[in] val Value to compare the memory at \p ivars to.
*
* @return 1 if the evaluation is true else 0
*/
__device__ int rocshmem_float_test(
float *ivars, int cmp, float val);
__host__ int rocshmem_float_test(
float *ivars, int cmp, float val);
__device__ int rocshmem_double_test(
double *ivars, int cmp, double val);
__host__ int rocshmem_double_test(
double *ivars, int cmp, double val);
__device__ int rocshmem_char_test(
char *ivars, int cmp, char val);
__host__ int rocshmem_char_test(
char *ivars, int cmp, char val);
__device__ int rocshmem_schar_test(
signed char *ivars, int cmp, signed char val);
__host__ int rocshmem_schar_test(
signed char *ivars, int cmp, signed char val);
__device__ int rocshmem_short_test(
short *ivars, int cmp, short val);
__host__ int rocshmem_short_test(
short *ivars, int cmp, short val);
__device__ int rocshmem_int_test(
int *ivars, int cmp, int val);
__host__ int rocshmem_int_test(
int *ivars, int cmp, int val);
__device__ int rocshmem_long_test(
long *ivars, int cmp, long val);
__host__ int rocshmem_long_test(
long *ivars, int cmp, long val);
__device__ int rocshmem_longlong_test(
long long *ivars, int cmp, long long val);
__host__ int rocshmem_longlong_test(
long long *ivars, int cmp, long long val);
__device__ int rocshmem_uchar_test(
unsigned char *ivars, int cmp, unsigned char val);
__host__ int rocshmem_uchar_test(
unsigned char *ivars, int cmp, unsigned char val);
__device__ int rocshmem_ushort_test(
unsigned short *ivars, int cmp, unsigned short val);
__host__ int rocshmem_ushort_test(
unsigned short *ivars, int cmp, unsigned short val);
__device__ int rocshmem_uint_test(
unsigned int *ivars, int cmp, unsigned int val);
__host__ int rocshmem_uint_test(
unsigned int *ivars, int cmp, unsigned int val);
__device__ int rocshmem_ulong_test(
unsigned long *ivars, int cmp, unsigned long val);
__host__ int rocshmem_ulong_test(
unsigned long *ivars, int cmp, unsigned long val);
__device__ int rocshmem_ulonglong_test(
unsigned long long *ivars, int cmp, unsigned long long val);
__host__ int rocshmem_ulonglong_test(
unsigned long long *ivars, int cmp, unsigned long long val);
} // namespace rocshmem
#endif // LIBRARY_INCLUDE_ROCSHMEM_P2P_SYNC_HPP
/******************************************************************************
* Copyright (c) Advanced Micro Devices, Inc. All rights reserved.
*
* SPDX-License-Identifier: MIT
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to
* deal in the Software without restriction, including without limitation the
* rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
* sell copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
* IN THE SOFTWARE.
*****************************************************************************/
#ifndef LIBRARY_INCLUDE_ROCSHMEM_RMA_HPP
#define LIBRARY_INCLUDE_ROCSHMEM_RMA_HPP
namespace rocshmem {
/**
* @name SHMEM_PUT
* @brief Writes contiguous data of \p nelems elements from \p source on the
* calling PE to \p dest at \p pe. The caller will block until the operation
* completes locally (it is safe to reuse \p source). The caller must
* call into rocshmem_quiet() if remote completion is required.
*
* This function can be called from divergent control paths at per-thread
* granularity. However, performance may be improved if the caller can
* coalesce contiguous messages and elect a leader thread to call into the
* ROCSHMEM function.
*
* Every element type is declared in four forms: as a `__device__` and as a
* `__host__` function, each with (rocshmem_ctx_<type>_put) and without
* (rocshmem_<type>_put) the leading \p ctx argument.
*
* @param[in] ctx Context with which to perform this operation (only present
* on the rocshmem_ctx_* variants).
* @param[in] dest Destination address. Must be an address on the symmetric
* heap.
* @param[in] source Source address. Must be an address on the symmetric heap.
* @param[in] nelems Size of the transfer in number of elements.
* @param[in] pe PE of the remote process.
*
* @return void.
*/
__device__ ATTR_NO_INLINE void rocshmem_ctx_float_put(
rocshmem_ctx_t ctx, float *dest, const float *source,
size_t nelems, int pe);
__device__ ATTR_NO_INLINE void rocshmem_float_put(
float *dest, const float *source, size_t nelems, int pe);
__host__ void rocshmem_ctx_float_put(
rocshmem_ctx_t ctx, float *dest, const float *source,
size_t nelems, int pe);
__host__ void rocshmem_float_put(float *dest,
const float *source, size_t nelems, int pe);
__device__ ATTR_NO_INLINE void rocshmem_ctx_double_put(
rocshmem_ctx_t ctx, double *dest, const double *source,
size_t nelems, int pe);
__device__ ATTR_NO_INLINE void rocshmem_double_put(
double *dest, const double *source, size_t nelems, int pe);
__host__ void rocshmem_ctx_double_put(
rocshmem_ctx_t ctx, double *dest, const double *source,
size_t nelems, int pe);
__host__ void rocshmem_double_put(double *dest,
const double *source, size_t nelems, int pe);
__device__ ATTR_NO_INLINE void rocshmem_ctx_char_put(
rocshmem_ctx_t ctx, char *dest, const char *source,
size_t nelems, int pe);
__device__ ATTR_NO_INLINE void rocshmem_char_put(
char *dest, const char *source, size_t nelems, int pe);
__host__ void rocshmem_ctx_char_put(
rocshmem_ctx_t ctx, char *dest, const char *source,
size_t nelems, int pe);
__host__ void rocshmem_char_put(char *dest,
const char *source, size_t nelems, int pe);
__device__ ATTR_NO_INLINE void rocshmem_ctx_schar_put(
rocshmem_ctx_t ctx, signed char *dest, const signed char *source,
size_t nelems, int pe);
__device__ ATTR_NO_INLINE void rocshmem_schar_put(
signed char *dest, const signed char *source, size_t nelems, int pe);
__host__ void rocshmem_ctx_schar_put(
rocshmem_ctx_t ctx, signed char *dest, const signed char *source,
size_t nelems, int pe);
__host__ void rocshmem_schar_put(signed char *dest,
const signed char *source, size_t nelems, int pe);
__device__ ATTR_NO_INLINE void rocshmem_ctx_short_put(
rocshmem_ctx_t ctx, short *dest, const short *source,
size_t nelems, int pe);
__device__ ATTR_NO_INLINE void rocshmem_short_put(
short *dest, const short *source, size_t nelems, int pe);
__host__ void rocshmem_ctx_short_put(
rocshmem_ctx_t ctx, short *dest, const short *source,
size_t nelems, int pe);
__host__ void rocshmem_short_put(short *dest,
const short *source, size_t nelems, int pe);
__device__ ATTR_NO_INLINE void rocshmem_ctx_int_put(
rocshmem_ctx_t ctx, int *dest, const int *source,
size_t nelems, int pe);
__device__ ATTR_NO_INLINE void rocshmem_int_put(
int *dest, const int *source, size_t nelems, int pe);
__host__ void rocshmem_ctx_int_put(
rocshmem_ctx_t ctx, int *dest, const int *source,
size_t nelems, int pe);
__host__ void rocshmem_int_put(int *dest,
const int *source, size_t nelems, int pe);
__device__ ATTR_NO_INLINE void rocshmem_ctx_long_put(
rocshmem_ctx_t ctx, long *dest, const long *source,
size_t nelems, int pe);
__device__ ATTR_NO_INLINE void rocshmem_long_put(
long *dest, const long *source, size_t nelems, int pe);
__host__ void rocshmem_ctx_long_put(
rocshmem_ctx_t ctx, long *dest, const long *source,
size_t nelems, int pe);
__host__ void rocshmem_long_put(long *dest,
const long *source, size_t nelems, int pe);
__device__ ATTR_NO_INLINE void rocshmem_ctx_longlong_put(
rocshmem_ctx_t ctx, long long *dest, const long long *source,
size_t nelems, int pe);
__device__ ATTR_NO_INLINE void rocshmem_longlong_put(
long long *dest, const long long *source, size_t nelems, int pe);
__host__ void rocshmem_ctx_longlong_put(
rocshmem_ctx_t ctx, long long *dest, const long long *source,
size_t nelems, int pe);
__host__ void rocshmem_longlong_put(long long *dest,
const long long *source, size_t nelems, int pe);
__device__ ATTR_NO_INLINE void rocshmem_ctx_uchar_put(
rocshmem_ctx_t ctx, unsigned char *dest, const unsigned char *source,
size_t nelems, int pe);
__device__ ATTR_NO_INLINE void rocshmem_uchar_put(
unsigned char *dest, const unsigned char *source, size_t nelems, int pe);
__host__ void rocshmem_ctx_uchar_put(
rocshmem_ctx_t ctx, unsigned char *dest, const unsigned char *source,
size_t nelems, int pe);
__host__ void rocshmem_uchar_put(unsigned char *dest,
const unsigned char *source, size_t nelems, int pe);
__device__ ATTR_NO_INLINE void rocshmem_ctx_ushort_put(
rocshmem_ctx_t ctx, unsigned short *dest, const unsigned short *source,
size_t nelems, int pe);
__device__ ATTR_NO_INLINE void rocshmem_ushort_put(
unsigned short *dest, const unsigned short *source, size_t nelems, int pe);
__host__ void rocshmem_ctx_ushort_put(
rocshmem_ctx_t ctx, unsigned short *dest, const unsigned short *source,
size_t nelems, int pe);
__host__ void rocshmem_ushort_put(unsigned short *dest,
const unsigned short *source, size_t nelems, int pe);
__device__ ATTR_NO_INLINE void rocshmem_ctx_uint_put(
rocshmem_ctx_t ctx, unsigned int *dest, const unsigned int *source,
size_t nelems, int pe);
__device__ ATTR_NO_INLINE void rocshmem_uint_put(
unsigned int *dest, const unsigned int *source, size_t nelems, int pe);
__host__ void rocshmem_ctx_uint_put(
rocshmem_ctx_t ctx, unsigned int *dest, const unsigned int *source,
size_t nelems, int pe);
__host__ void rocshmem_uint_put(unsigned int *dest,
const unsigned int *source, size_t nelems, int pe);
__device__ ATTR_NO_INLINE void rocshmem_ctx_ulong_put(
rocshmem_ctx_t ctx, unsigned long *dest, const unsigned long *source,
size_t nelems, int pe);
__device__ ATTR_NO_INLINE void rocshmem_ulong_put(
unsigned long *dest, const unsigned long *source, size_t nelems, int pe);
__host__ void rocshmem_ctx_ulong_put(
rocshmem_ctx_t ctx, unsigned long *dest, const unsigned long *source,
size_t nelems, int pe);
__host__ void rocshmem_ulong_put(unsigned long *dest,
const unsigned long *source, size_t nelems, int pe);
__device__ ATTR_NO_INLINE void rocshmem_ctx_ulonglong_put(
rocshmem_ctx_t ctx, unsigned long long *dest, const unsigned long long *source,
size_t nelems, int pe);
__device__ ATTR_NO_INLINE void rocshmem_ulonglong_put(
unsigned long long *dest, const unsigned long long *source, size_t nelems, int pe);
__host__ void rocshmem_ctx_ulonglong_put(
rocshmem_ctx_t ctx, unsigned long long *dest, const unsigned long long *source,
size_t nelems, int pe);
__host__ void rocshmem_ulonglong_put(unsigned long long *dest,
const unsigned long long *source, size_t nelems, int pe);
/**
* @brief Writes contiguous data of \p nelems bytes from \p source on the
* calling PE to \p dest at \p pe. The caller will block until the operation
* completes locally (it is safe to reuse \p source). The caller must
* call into rocshmem_quiet() if remote completion is required.
*
* This function can be called from divergent control paths at per-thread
* granularity. However, performance may be improved if the caller can
* coalesce contiguous messages and elect a leader thread to call into the
* ROCSHMEM function.
*
* @param[in] ctx Context with which to perform this operation (only present
* on rocshmem_ctx_putmem).
* @param[in] dest Destination address. Must be an address on the symmetric
* heap.
* @param[in] source Source address. Must be an address on the symmetric heap.
* @param[in] nelems Size of the transfer in bytes.
* @param[in] pe PE of the remote process.
*
* @return void.
*/
__device__ ATTR_NO_INLINE void rocshmem_ctx_putmem(rocshmem_ctx_t ctx,
void *dest,
const void *source,
size_t nelems, int pe);
__device__ ATTR_NO_INLINE void rocshmem_putmem(void *dest, const void *source,
size_t nelems, int pe);
/**
* @brief Writes contiguous data of \p nelems bytes from \p source on the
* calling PE to \p dest at \p pe. The caller will block until the operation
* completes locally (it is safe to reuse \p source). The caller must
* call into the __host__ rocshmem_quiet() if remote completion is required.
*
* @param[in] ctx Context with which to perform this operation (only present
* on rocshmem_ctx_putmem).
* @param[in] dest Destination address. Must be an address on the symmetric
* heap.
* @param[in] source Source address. Must be an address on the symmetric heap.
* @param[in] nelems Size of the transfer in bytes.
* @param[in] pe PE of the remote process.
*
* @return void.
*/
__host__ void rocshmem_ctx_putmem(rocshmem_ctx_t ctx, void *dest,
const void *source, size_t nelems, int pe);
__host__ void rocshmem_putmem(void *dest, const void *source, size_t nelems,
int pe);
/**
* @name SHMEM_P
* @brief Writes the single value \p value from the calling PE to \p dest at \p pe.
* The caller must call into rocshmem_quiet() if remote completion is
* required.
*
* This function can be called from divergent control paths at per-thread
* granularity. However, performance may be improved if the caller can
* coalesce contiguous messages and elect a leader thread to call into the
* ROCSHMEM function.
*
* @param[in] ctx Context with which to perform this operation.
* @param[in] dest Destination address. Must be an address on the symmetric
* heap.
* @param[in] value Value to write to dest at \p pe.
* @param[in] pe PE of the remote process.
*
* @return void.
*/
// Typed single-element put ("p") prototypes.
// Every element type below is declared four times: a __device__ and a
// __host__ overload, each in a form that takes an explicit rocshmem_ctx_t
// (rocshmem_ctx_<type>_p) and a form without a context argument
// (rocshmem_<type>_p). \p value is passed by value and \p pe selects the
// target PE.

// Element type: float.
__device__ ATTR_NO_INLINE void rocshmem_ctx_float_p(
rocshmem_ctx_t ctx, float *dest, float value,
int pe);
__device__ ATTR_NO_INLINE void rocshmem_float_p(
float *dest, float value, int pe);
__host__ void rocshmem_ctx_float_p(
rocshmem_ctx_t ctx, float *dest, float value,
int pe);
__host__ void rocshmem_float_p(
float *dest, float value, int pe);
// Element type: double.
__device__ ATTR_NO_INLINE void rocshmem_ctx_double_p(
rocshmem_ctx_t ctx, double *dest, double value,
int pe);
__device__ ATTR_NO_INLINE void rocshmem_double_p(
double *dest, double value, int pe);
__host__ void rocshmem_ctx_double_p(
rocshmem_ctx_t ctx, double *dest, double value,
int pe);
__host__ void rocshmem_double_p(
double *dest, double value, int pe);
// Element type: char.
__device__ ATTR_NO_INLINE void rocshmem_ctx_char_p(
rocshmem_ctx_t ctx, char *dest, char value,
int pe);
__device__ ATTR_NO_INLINE void rocshmem_char_p(
char *dest, char value, int pe);
__host__ void rocshmem_ctx_char_p(
rocshmem_ctx_t ctx, char *dest, char value,
int pe);
__host__ void rocshmem_char_p(
char *dest, char value, int pe);
// Element type: signed char ("schar").
__device__ ATTR_NO_INLINE void rocshmem_ctx_schar_p(
rocshmem_ctx_t ctx, signed char *dest, signed char value,
int pe);
__device__ ATTR_NO_INLINE void rocshmem_schar_p(
signed char *dest, signed char value, int pe);
__host__ void rocshmem_ctx_schar_p(
rocshmem_ctx_t ctx, signed char *dest, signed char value,
int pe);
__host__ void rocshmem_schar_p(
signed char *dest, signed char value, int pe);
// Element type: short.
__device__ ATTR_NO_INLINE void rocshmem_ctx_short_p(
rocshmem_ctx_t ctx, short *dest, short value,
int pe);
__device__ ATTR_NO_INLINE void rocshmem_short_p(
short *dest, short value, int pe);
__host__ void rocshmem_ctx_short_p(
rocshmem_ctx_t ctx, short *dest, short value,
int pe);
__host__ void rocshmem_short_p(
short *dest, short value, int pe);
// Element type: int.
__device__ ATTR_NO_INLINE void rocshmem_ctx_int_p(
rocshmem_ctx_t ctx, int *dest, int value,
int pe);
__device__ ATTR_NO_INLINE void rocshmem_int_p(
int *dest, int value, int pe);
__host__ void rocshmem_ctx_int_p(
rocshmem_ctx_t ctx, int *dest, int value,
int pe);
__host__ void rocshmem_int_p(
int *dest, int value, int pe);
// Element type: long.
__device__ ATTR_NO_INLINE void rocshmem_ctx_long_p(
rocshmem_ctx_t ctx, long *dest, long value,
int pe);
__device__ ATTR_NO_INLINE void rocshmem_long_p(
long *dest, long value, int pe);
__host__ void rocshmem_ctx_long_p(
rocshmem_ctx_t ctx, long *dest, long value,
int pe);
__host__ void rocshmem_long_p(
long *dest, long value, int pe);
// Element type: long long ("longlong").
__device__ ATTR_NO_INLINE void rocshmem_ctx_longlong_p(
rocshmem_ctx_t ctx, long long *dest, long long value,
int pe);
__device__ ATTR_NO_INLINE void rocshmem_longlong_p(
long long *dest, long long value, int pe);
__host__ void rocshmem_ctx_longlong_p(
rocshmem_ctx_t ctx, long long *dest, long long value,
int pe);
__host__ void rocshmem_longlong_p(
long long *dest, long long value, int pe);
// Element type: unsigned char ("uchar").
__device__ ATTR_NO_INLINE void rocshmem_ctx_uchar_p(
rocshmem_ctx_t ctx, unsigned char *dest, unsigned char value,
int pe);
__device__ ATTR_NO_INLINE void rocshmem_uchar_p(
unsigned char *dest, unsigned char value, int pe);
__host__ void rocshmem_ctx_uchar_p(
rocshmem_ctx_t ctx, unsigned char *dest, unsigned char value,
int pe);
__host__ void rocshmem_uchar_p(
unsigned char *dest, unsigned char value, int pe);
// Element type: unsigned short ("ushort").
__device__ ATTR_NO_INLINE void rocshmem_ctx_ushort_p(
rocshmem_ctx_t ctx, unsigned short *dest, unsigned short value,
int pe);
__device__ ATTR_NO_INLINE void rocshmem_ushort_p(
unsigned short *dest, unsigned short value, int pe);
__host__ void rocshmem_ctx_ushort_p(
rocshmem_ctx_t ctx, unsigned short *dest, unsigned short value,
int pe);
__host__ void rocshmem_ushort_p(
unsigned short *dest, unsigned short value, int pe);
// Element type: unsigned int ("uint").
__device__ ATTR_NO_INLINE void rocshmem_ctx_uint_p(
rocshmem_ctx_t ctx, unsigned int *dest, unsigned int value,
int pe);
__device__ ATTR_NO_INLINE void rocshmem_uint_p(
unsigned int *dest, unsigned int value, int pe);
__host__ void rocshmem_ctx_uint_p(
rocshmem_ctx_t ctx, unsigned int *dest, unsigned int value,
int pe);
__host__ void rocshmem_uint_p(
unsigned int *dest, unsigned int value, int pe);
// Element type: unsigned long ("ulong").
__device__ ATTR_NO_INLINE void rocshmem_ctx_ulong_p(
rocshmem_ctx_t ctx, unsigned long *dest, unsigned long value,
int pe);
__device__ ATTR_NO_INLINE void rocshmem_ulong_p(
unsigned long *dest, unsigned long value, int pe);
__host__ void rocshmem_ctx_ulong_p(
rocshmem_ctx_t ctx, unsigned long *dest, unsigned long value,
int pe);
__host__ void rocshmem_ulong_p(
unsigned long *dest, unsigned long value, int pe);
// Element type: unsigned long long ("ulonglong").
__device__ ATTR_NO_INLINE void rocshmem_ctx_ulonglong_p(
rocshmem_ctx_t ctx, unsigned long long *dest, unsigned long long value,
int pe);
__device__ ATTR_NO_INLINE void rocshmem_ulonglong_p(
unsigned long long *dest, unsigned long long value, int pe);
__host__ void rocshmem_ctx_ulonglong_p(
rocshmem_ctx_t ctx, unsigned long long *dest, unsigned long long value,
int pe);
__host__ void rocshmem_ulonglong_p(
unsigned long long *dest, unsigned long long value, int pe);
/**
* @name SHMEM_GET
* @brief Reads contiguous data of \p nelems elements from \p source on \p pe
* to \p dest on the calling PE. The calling work-group will block until the
* operation completes (data has been placed in \p dest).
*
* This function can be called from divergent control paths at per-thread
* granularity. However, performance may be improved if the caller can
* coalesce contiguous messages and elect a leader thread to call into the
* ROCSHMEM function.
*
* @param[in] ctx Context with which to perform this operation.
* @param[in] dest Destination address. Must be an address on the symmetric
* heap.
* @param[in] source Source address. Must be an address on the symmetric heap.
* @param[in] nelems Number of elements to transfer.
* @param[in] pe PE of the remote process.
*
* @return void.
*/
// Typed blocking get prototypes.
// Every element type below is declared four times: a __device__ and a
// __host__ overload, each in a form that takes an explicit rocshmem_ctx_t
// (rocshmem_ctx_<type>_get) and a form without a context argument
// (rocshmem_<type>_get). \p source is const-qualified; \p dest is written.

// Element type: float.
__device__ ATTR_NO_INLINE void rocshmem_ctx_float_get(
rocshmem_ctx_t ctx, float *dest, const float *source,
size_t nelems, int pe);
__device__ ATTR_NO_INLINE void rocshmem_float_get(
float *dest, const float *source, size_t nelems, int pe);
__host__ void rocshmem_ctx_float_get(
rocshmem_ctx_t ctx, float *dest, const float *source,
size_t nelems, int pe);
__host__ void rocshmem_float_get(float *dest,
const float *source, size_t nelems, int pe);
// Element type: double.
__device__ ATTR_NO_INLINE void rocshmem_ctx_double_get(
rocshmem_ctx_t ctx, double *dest, const double *source,
size_t nelems, int pe);
__device__ ATTR_NO_INLINE void rocshmem_double_get(
double *dest, const double *source, size_t nelems, int pe);
__host__ void rocshmem_ctx_double_get(
rocshmem_ctx_t ctx, double *dest, const double *source,
size_t nelems, int pe);
__host__ void rocshmem_double_get(double *dest,
const double *source, size_t nelems, int pe);
// Element type: char.
__device__ ATTR_NO_INLINE void rocshmem_ctx_char_get(
rocshmem_ctx_t ctx, char *dest, const char *source,
size_t nelems, int pe);
__device__ ATTR_NO_INLINE void rocshmem_char_get(
char *dest, const char *source, size_t nelems, int pe);
__host__ void rocshmem_ctx_char_get(
rocshmem_ctx_t ctx, char *dest, const char *source,
size_t nelems, int pe);
__host__ void rocshmem_char_get(char *dest,
const char *source, size_t nelems, int pe);
// Element type: signed char ("schar").
__device__ ATTR_NO_INLINE void rocshmem_ctx_schar_get(
rocshmem_ctx_t ctx, signed char *dest, const signed char *source,
size_t nelems, int pe);
__device__ ATTR_NO_INLINE void rocshmem_schar_get(
signed char *dest, const signed char *source, size_t nelems, int pe);
__host__ void rocshmem_ctx_schar_get(
rocshmem_ctx_t ctx, signed char *dest, const signed char *source,
size_t nelems, int pe);
__host__ void rocshmem_schar_get(signed char *dest,
const signed char *source, size_t nelems, int pe);
// Element type: short.
__device__ ATTR_NO_INLINE void rocshmem_ctx_short_get(
rocshmem_ctx_t ctx, short *dest, const short *source,
size_t nelems, int pe);
__device__ ATTR_NO_INLINE void rocshmem_short_get(
short *dest, const short *source, size_t nelems, int pe);
__host__ void rocshmem_ctx_short_get(
rocshmem_ctx_t ctx, short *dest, const short *source,
size_t nelems, int pe);
__host__ void rocshmem_short_get(short *dest,
const short *source, size_t nelems, int pe);
// Element type: int.
__device__ ATTR_NO_INLINE void rocshmem_ctx_int_get(
rocshmem_ctx_t ctx, int *dest, const int *source,
size_t nelems, int pe);
__device__ ATTR_NO_INLINE void rocshmem_int_get(
int *dest, const int *source, size_t nelems, int pe);
__host__ void rocshmem_ctx_int_get(
rocshmem_ctx_t ctx, int *dest, const int *source,
size_t nelems, int pe);
__host__ void rocshmem_int_get(int *dest,
const int *source, size_t nelems, int pe);
// Element type: long.
__device__ ATTR_NO_INLINE void rocshmem_ctx_long_get(
rocshmem_ctx_t ctx, long *dest, const long *source,
size_t nelems, int pe);
__device__ ATTR_NO_INLINE void rocshmem_long_get(
long *dest, const long *source, size_t nelems, int pe);
__host__ void rocshmem_ctx_long_get(
rocshmem_ctx_t ctx, long *dest, const long *source,
size_t nelems, int pe);
__host__ void rocshmem_long_get(long *dest,
const long *source, size_t nelems, int pe);
// Element type: long long ("longlong").
__device__ ATTR_NO_INLINE void rocshmem_ctx_longlong_get(
rocshmem_ctx_t ctx, long long *dest, const long long *source,
size_t nelems, int pe);
__device__ ATTR_NO_INLINE void rocshmem_longlong_get(
long long *dest, const long long *source, size_t nelems, int pe);
__host__ void rocshmem_ctx_longlong_get(
rocshmem_ctx_t ctx, long long *dest, const long long *source,
size_t nelems, int pe);
__host__ void rocshmem_longlong_get(long long *dest,
const long long *source, size_t nelems, int pe);
// Element type: unsigned char ("uchar").
__device__ ATTR_NO_INLINE void rocshmem_ctx_uchar_get(
rocshmem_ctx_t ctx, unsigned char *dest, const unsigned char *source,
size_t nelems, int pe);
__device__ ATTR_NO_INLINE void rocshmem_uchar_get(
unsigned char *dest, const unsigned char *source, size_t nelems, int pe);
__host__ void rocshmem_ctx_uchar_get(
rocshmem_ctx_t ctx, unsigned char *dest, const unsigned char *source,
size_t nelems, int pe);
__host__ void rocshmem_uchar_get(unsigned char *dest,
const unsigned char *source, size_t nelems, int pe);
// Element type: unsigned short ("ushort").
__device__ ATTR_NO_INLINE void rocshmem_ctx_ushort_get(
rocshmem_ctx_t ctx, unsigned short *dest, const unsigned short *source,
size_t nelems, int pe);
__device__ ATTR_NO_INLINE void rocshmem_ushort_get(
unsigned short *dest, const unsigned short *source, size_t nelems, int pe);
__host__ void rocshmem_ctx_ushort_get(
rocshmem_ctx_t ctx, unsigned short *dest, const unsigned short *source,
size_t nelems, int pe);
__host__ void rocshmem_ushort_get(unsigned short *dest,
const unsigned short *source, size_t nelems, int pe);
// Element type: unsigned int ("uint").
__device__ ATTR_NO_INLINE void rocshmem_ctx_uint_get(
rocshmem_ctx_t ctx, unsigned int *dest, const unsigned int *source,
size_t nelems, int pe);
__device__ ATTR_NO_INLINE void rocshmem_uint_get(
unsigned int *dest, const unsigned int *source, size_t nelems, int pe);
__host__ void rocshmem_ctx_uint_get(
rocshmem_ctx_t ctx, unsigned int *dest, const unsigned int *source,
size_t nelems, int pe);
__host__ void rocshmem_uint_get(unsigned int *dest,
const unsigned int *source, size_t nelems, int pe);
// Element type: unsigned long ("ulong").
__device__ ATTR_NO_INLINE void rocshmem_ctx_ulong_get(
rocshmem_ctx_t ctx, unsigned long *dest, const unsigned long *source,
size_t nelems, int pe);
__device__ ATTR_NO_INLINE void rocshmem_ulong_get(
unsigned long *dest, const unsigned long *source, size_t nelems, int pe);
__host__ void rocshmem_ctx_ulong_get(
rocshmem_ctx_t ctx, unsigned long *dest, const unsigned long *source,
size_t nelems, int pe);
__host__ void rocshmem_ulong_get(unsigned long *dest,
const unsigned long *source, size_t nelems, int pe);
// Element type: unsigned long long ("ulonglong").
__device__ ATTR_NO_INLINE void rocshmem_ctx_ulonglong_get(
rocshmem_ctx_t ctx, unsigned long long *dest, const unsigned long long *source,
size_t nelems, int pe);
__device__ ATTR_NO_INLINE void rocshmem_ulonglong_get(
unsigned long long *dest, const unsigned long long *source, size_t nelems, int pe);
__host__ void rocshmem_ctx_ulonglong_get(
rocshmem_ctx_t ctx, unsigned long long *dest, const unsigned long long *source,
size_t nelems, int pe);
__host__ void rocshmem_ulonglong_get(unsigned long long *dest,
const unsigned long long *source, size_t nelems, int pe);
/**
* @brief Reads contiguous data of \p nelems bytes from \p source on \p pe
* to \p dest on the calling PE. The calling work-group will block until the
* operation completes (data has been placed in \p dest).
*
* This function can be called from divergent control paths at per-thread
* granularity. However, performance may be improved if the caller can
* coalesce contiguous messages and elect a leader thread to call into the
* ROCSHMEM function.
*
* @param[in] ctx Context with which to perform this operation.
* @param[in] dest Destination address. Must be an address on the symmetric
* heap.
* @param[in] source Source address. Must be an address on the symmetric heap.
* @param[in] nelems Size of the transfer in bytes.
* @param[in] pe PE of the remote process.
*
* @return void.
*/
// Device-side untyped (void *) get; \p nelems is a byte count here.
// Overload taking an explicit context:
__device__ ATTR_NO_INLINE void rocshmem_ctx_getmem(rocshmem_ctx_t ctx,
void *dest,
const void *source,
size_t nelems, int pe);
// Overload without a context argument:
__device__ ATTR_NO_INLINE void rocshmem_getmem(void *dest, const void *source,
size_t nelems, int pe);
/**
* @brief Reads contiguous data of \p nelems bytes from \p source on \p pe
* to \p dest on the calling PE. The calling work-group will block until the
* operation completes (data has been placed in \p dest).
*
* This function can be called from divergent control paths at per-thread
* granularity. However, performance may be improved if the caller can
* coalesce contiguous messages and elect a leader thread to call into the
* ROCSHMEM function.
*
* @param[in] ctx Context with which to perform this operation.
* @param[in] dest Destination address. Must be an address on the symmetric
* heap.
* @param[in] source Source address. Must be an address on the symmetric heap.
* @param[in] nelems Size of the transfer in bytes.
* @param[in] pe PE of the remote process.
*
* @return void.
*/
// Host-side untyped (void *) get; mirrors the __device__ prototypes of the
// same names. Overload taking an explicit context:
__host__ void rocshmem_ctx_getmem(rocshmem_ctx_t ctx, void *dest,
const void *source, size_t nelems, int pe);
// Overload without a context argument:
__host__ void rocshmem_getmem(void *dest, const void *source, size_t nelems,
int pe);
/**
* @name SHMEM_G
* @brief Reads and returns a single value from \p source at \p pe.
* The calling work-group/thread will block until the operation completes.
*
* This function can be called from divergent control paths at per-thread
* granularity. However, performance may be improved if the caller can
* coalesce contiguous messages and elect a leader thread to call into the
* ROCSHMEM function.
*
* @param[in] ctx Context with which to perform this operation.
* @param[in] source Source address. Must be an address on the symmetric
* heap.
* @param[in] pe PE of the remote process.
*
* @return the value read from remote \p source at \p pe.
*/
// Typed single-element get ("g") prototypes.
// Every element type below is declared four times: a __device__ and a
// __host__ overload, each in a form that takes an explicit rocshmem_ctx_t
// (rocshmem_ctx_<type>_g) and a form without a context argument
// (rocshmem_<type>_g). The fetched value is returned by value.

// Element type: float.
__device__ ATTR_NO_INLINE float rocshmem_ctx_float_g(
rocshmem_ctx_t ctx, const float *source, int pe);
__device__ ATTR_NO_INLINE float rocshmem_float_g(
const float *source, int pe);
__host__ float rocshmem_ctx_float_g(
rocshmem_ctx_t ctx, const float *source, int pe);
__host__ float rocshmem_float_g(
const float *source, int pe);
// Element type: double.
__device__ ATTR_NO_INLINE double rocshmem_ctx_double_g(
rocshmem_ctx_t ctx, const double *source, int pe);
__device__ ATTR_NO_INLINE double rocshmem_double_g(
const double *source, int pe);
__host__ double rocshmem_ctx_double_g(
rocshmem_ctx_t ctx, const double *source, int pe);
__host__ double rocshmem_double_g(
const double *source, int pe);
// Element type: char.
__device__ ATTR_NO_INLINE char rocshmem_ctx_char_g(
rocshmem_ctx_t ctx, const char *source, int pe);
__device__ ATTR_NO_INLINE char rocshmem_char_g(
const char *source, int pe);
__host__ char rocshmem_ctx_char_g(
rocshmem_ctx_t ctx, const char *source, int pe);
__host__ char rocshmem_char_g(
const char *source, int pe);
// Element type: signed char ("schar").
__device__ ATTR_NO_INLINE signed char rocshmem_ctx_schar_g(
rocshmem_ctx_t ctx, const signed char *source, int pe);
__device__ ATTR_NO_INLINE signed char rocshmem_schar_g(
const signed char *source, int pe);
__host__ signed char rocshmem_ctx_schar_g(
rocshmem_ctx_t ctx, const signed char *source, int pe);
__host__ signed char rocshmem_schar_g(
const signed char *source, int pe);
// Element type: short.
__device__ ATTR_NO_INLINE short rocshmem_ctx_short_g(
rocshmem_ctx_t ctx, const short *source, int pe);
__device__ ATTR_NO_INLINE short rocshmem_short_g(
const short *source, int pe);
__host__ short rocshmem_ctx_short_g(
rocshmem_ctx_t ctx, const short *source, int pe);
__host__ short rocshmem_short_g(
const short *source, int pe);
// Element type: int.
__device__ ATTR_NO_INLINE int rocshmem_ctx_int_g(
rocshmem_ctx_t ctx, const int *source, int pe);
__device__ ATTR_NO_INLINE int rocshmem_int_g(
const int *source, int pe);
__host__ int rocshmem_ctx_int_g(
rocshmem_ctx_t ctx, const int *source, int pe);
__host__ int rocshmem_int_g(
const int *source, int pe);
// Element type: long.
__device__ ATTR_NO_INLINE long rocshmem_ctx_long_g(
rocshmem_ctx_t ctx, const long *source, int pe);
__device__ ATTR_NO_INLINE long rocshmem_long_g(
const long *source, int pe);
__host__ long rocshmem_ctx_long_g(
rocshmem_ctx_t ctx, const long *source, int pe);
__host__ long rocshmem_long_g(
const long *source, int pe);
// Element type: long long ("longlong").
__device__ ATTR_NO_INLINE long long rocshmem_ctx_longlong_g(
rocshmem_ctx_t ctx, const long long *source, int pe);
__device__ ATTR_NO_INLINE long long rocshmem_longlong_g(
const long long *source, int pe);
__host__ long long rocshmem_ctx_longlong_g(
rocshmem_ctx_t ctx, const long long *source, int pe);
__host__ long long rocshmem_longlong_g(
const long long *source, int pe);
// Element type: unsigned char ("uchar").
__device__ ATTR_NO_INLINE unsigned char rocshmem_ctx_uchar_g(
rocshmem_ctx_t ctx, const unsigned char *source, int pe);
__device__ ATTR_NO_INLINE unsigned char rocshmem_uchar_g(
const unsigned char *source, int pe);
__host__ unsigned char rocshmem_ctx_uchar_g(
rocshmem_ctx_t ctx, const unsigned char *source, int pe);
__host__ unsigned char rocshmem_uchar_g(
const unsigned char *source, int pe);
// Element type: unsigned short ("ushort").
__device__ ATTR_NO_INLINE unsigned short rocshmem_ctx_ushort_g(
rocshmem_ctx_t ctx, const unsigned short *source, int pe);
__device__ ATTR_NO_INLINE unsigned short rocshmem_ushort_g(
const unsigned short *source, int pe);
__host__ unsigned short rocshmem_ctx_ushort_g(
rocshmem_ctx_t ctx, const unsigned short *source, int pe);
__host__ unsigned short rocshmem_ushort_g(
const unsigned short *source, int pe);
// Element type: unsigned int ("uint").
__device__ ATTR_NO_INLINE unsigned int rocshmem_ctx_uint_g(
rocshmem_ctx_t ctx, const unsigned int *source, int pe);
__device__ ATTR_NO_INLINE unsigned int rocshmem_uint_g(
const unsigned int *source, int pe);
__host__ unsigned int rocshmem_ctx_uint_g(
rocshmem_ctx_t ctx, const unsigned int *source, int pe);
__host__ unsigned int rocshmem_uint_g(
const unsigned int *source, int pe);
// Element type: unsigned long ("ulong").
__device__ ATTR_NO_INLINE unsigned long rocshmem_ctx_ulong_g(
rocshmem_ctx_t ctx, const unsigned long *source, int pe);
__device__ ATTR_NO_INLINE unsigned long rocshmem_ulong_g(
const unsigned long *source, int pe);
__host__ unsigned long rocshmem_ctx_ulong_g(
rocshmem_ctx_t ctx, const unsigned long *source, int pe);
__host__ unsigned long rocshmem_ulong_g(
const unsigned long *source, int pe);
// Element type: unsigned long long ("ulonglong").
__device__ ATTR_NO_INLINE unsigned long long rocshmem_ctx_ulonglong_g(
rocshmem_ctx_t ctx, const unsigned long long *source, int pe);
__device__ ATTR_NO_INLINE unsigned long long rocshmem_ulonglong_g(
const unsigned long long *source, int pe);
__host__ unsigned long long rocshmem_ctx_ulonglong_g(
rocshmem_ctx_t ctx, const unsigned long long *source, int pe);
__host__ unsigned long long rocshmem_ulonglong_g(
const unsigned long long *source, int pe);
/**
* @name SHMEM_PUT_NBI
* @brief Writes contiguous data of \p nelems elements from \p source on the
* calling PE to \p dest on \p pe. The operation is not blocking. The caller
* will return as soon as the request is posted. The caller must call
* rocshmem_quiet() on the same context if completion notification is
* required.
*
* This function can be called from divergent control paths at per-thread
* granularity. However, performance may be improved if the caller can
* coalesce contiguous messages and elect a leader thread to call into the
* ROCSHMEM function.
*
* @param[in] ctx Context with which to perform this operation.
* @param[in] dest Destination address. Must be an address on the symmetric
* heap.
* @param[in] source Source address. Must be an address on the symmetric heap.
* @param[in] nelems Number of elements to transfer.
* @param[in] pe PE of the remote process.
*
* @return void.
*/
// Typed non-blocking put prototypes.
// Every element type below is declared four times: a __device__ and a
// __host__ overload, each in a form that takes an explicit rocshmem_ctx_t
// (rocshmem_ctx_<type>_put_nbi) and a form without a context argument
// (rocshmem_<type>_put_nbi). \p source is const-qualified; \p dest is the
// write target.

// Element type: float.
__device__ ATTR_NO_INLINE void rocshmem_ctx_float_put_nbi(
rocshmem_ctx_t ctx, float *dest, const float *source,
size_t nelems, int pe);
__device__ ATTR_NO_INLINE void rocshmem_float_put_nbi(
float *dest, const float *source, size_t nelems, int pe);
__host__ void rocshmem_ctx_float_put_nbi(
rocshmem_ctx_t ctx, float *dest, const float *source,
size_t nelems, int pe);
__host__ void rocshmem_float_put_nbi(
float *dest, const float *source, size_t nelems, int pe);
// Element type: double.
__device__ ATTR_NO_INLINE void rocshmem_ctx_double_put_nbi(
rocshmem_ctx_t ctx, double *dest, const double *source,
size_t nelems, int pe);
__device__ ATTR_NO_INLINE void rocshmem_double_put_nbi(
double *dest, const double *source, size_t nelems, int pe);
__host__ void rocshmem_ctx_double_put_nbi(
rocshmem_ctx_t ctx, double *dest, const double *source,
size_t nelems, int pe);
__host__ void rocshmem_double_put_nbi(
double *dest, const double *source, size_t nelems, int pe);
// Element type: char.
__device__ ATTR_NO_INLINE void rocshmem_ctx_char_put_nbi(
rocshmem_ctx_t ctx, char *dest, const char *source,
size_t nelems, int pe);
__device__ ATTR_NO_INLINE void rocshmem_char_put_nbi(
char *dest, const char *source, size_t nelems, int pe);
__host__ void rocshmem_ctx_char_put_nbi(
rocshmem_ctx_t ctx, char *dest, const char *source,
size_t nelems, int pe);
__host__ void rocshmem_char_put_nbi(
char *dest, const char *source, size_t nelems, int pe);
// Element type: signed char ("schar").
__device__ ATTR_NO_INLINE void rocshmem_ctx_schar_put_nbi(
rocshmem_ctx_t ctx, signed char *dest, const signed char *source,
size_t nelems, int pe);
__device__ ATTR_NO_INLINE void rocshmem_schar_put_nbi(
signed char *dest, const signed char *source, size_t nelems, int pe);
__host__ void rocshmem_ctx_schar_put_nbi(
rocshmem_ctx_t ctx, signed char *dest, const signed char *source,
size_t nelems, int pe);
__host__ void rocshmem_schar_put_nbi(
signed char *dest, const signed char *source, size_t nelems, int pe);
// Element type: short.
__device__ ATTR_NO_INLINE void rocshmem_ctx_short_put_nbi(
rocshmem_ctx_t ctx, short *dest, const short *source,
size_t nelems, int pe);
__device__ ATTR_NO_INLINE void rocshmem_short_put_nbi(
short *dest, const short *source, size_t nelems, int pe);
__host__ void rocshmem_ctx_short_put_nbi(
rocshmem_ctx_t ctx, short *dest, const short *source,
size_t nelems, int pe);
__host__ void rocshmem_short_put_nbi(
short *dest, const short *source, size_t nelems, int pe);
// Element type: int.
__device__ ATTR_NO_INLINE void rocshmem_ctx_int_put_nbi(
rocshmem_ctx_t ctx, int *dest, const int *source,
size_t nelems, int pe);
__device__ ATTR_NO_INLINE void rocshmem_int_put_nbi(
int *dest, const int *source, size_t nelems, int pe);
__host__ void rocshmem_ctx_int_put_nbi(
rocshmem_ctx_t ctx, int *dest, const int *source,
size_t nelems, int pe);
__host__ void rocshmem_int_put_nbi(
int *dest, const int *source, size_t nelems, int pe);
// Element type: long.
__device__ ATTR_NO_INLINE void rocshmem_ctx_long_put_nbi(
rocshmem_ctx_t ctx, long *dest, const long *source,
size_t nelems, int pe);
__device__ ATTR_NO_INLINE void rocshmem_long_put_nbi(
long *dest, const long *source, size_t nelems, int pe);
__host__ void rocshmem_ctx_long_put_nbi(
rocshmem_ctx_t ctx, long *dest, const long *source,
size_t nelems, int pe);
__host__ void rocshmem_long_put_nbi(
long *dest, const long *source, size_t nelems, int pe);
// Element type: long long ("longlong").
__device__ ATTR_NO_INLINE void rocshmem_ctx_longlong_put_nbi(
rocshmem_ctx_t ctx, long long *dest, const long long *source,
size_t nelems, int pe);
__device__ ATTR_NO_INLINE void rocshmem_longlong_put_nbi(
long long *dest, const long long *source, size_t nelems, int pe);
__host__ void rocshmem_ctx_longlong_put_nbi(
rocshmem_ctx_t ctx, long long *dest, const long long *source,
size_t nelems, int pe);
__host__ void rocshmem_longlong_put_nbi(
long long *dest, const long long *source, size_t nelems, int pe);
// Element type: unsigned char ("uchar").
__device__ ATTR_NO_INLINE void rocshmem_ctx_uchar_put_nbi(
rocshmem_ctx_t ctx, unsigned char *dest, const unsigned char *source,
size_t nelems, int pe);
__device__ ATTR_NO_INLINE void rocshmem_uchar_put_nbi(
unsigned char *dest, const unsigned char *source, size_t nelems, int pe);
__host__ void rocshmem_ctx_uchar_put_nbi(
rocshmem_ctx_t ctx, unsigned char *dest, const unsigned char *source,
size_t nelems, int pe);
__host__ void rocshmem_uchar_put_nbi(
unsigned char *dest, const unsigned char *source, size_t nelems, int pe);
// Element type: unsigned short ("ushort").
__device__ ATTR_NO_INLINE void rocshmem_ctx_ushort_put_nbi(
rocshmem_ctx_t ctx, unsigned short *dest, const unsigned short *source,
size_t nelems, int pe);
__device__ ATTR_NO_INLINE void rocshmem_ushort_put_nbi(
unsigned short *dest, const unsigned short *source, size_t nelems, int pe);
__host__ void rocshmem_ctx_ushort_put_nbi(
rocshmem_ctx_t ctx, unsigned short *dest, const unsigned short *source,
size_t nelems, int pe);
__host__ void rocshmem_ushort_put_nbi(
unsigned short *dest, const unsigned short *source, size_t nelems, int pe);
// Element type: unsigned int ("uint").
__device__ ATTR_NO_INLINE void rocshmem_ctx_uint_put_nbi(
rocshmem_ctx_t ctx, unsigned int *dest, const unsigned int *source,
size_t nelems, int pe);
__device__ ATTR_NO_INLINE void rocshmem_uint_put_nbi(
unsigned int *dest, const unsigned int *source, size_t nelems, int pe);
__host__ void rocshmem_ctx_uint_put_nbi(
rocshmem_ctx_t ctx, unsigned int *dest, const unsigned int *source,
size_t nelems, int pe);
__host__ void rocshmem_uint_put_nbi(
unsigned int *dest, const unsigned int *source, size_t nelems, int pe);
// Element type: unsigned long ("ulong").
__device__ ATTR_NO_INLINE void rocshmem_ctx_ulong_put_nbi(
rocshmem_ctx_t ctx, unsigned long *dest, const unsigned long *source,
size_t nelems, int pe);
__device__ ATTR_NO_INLINE void rocshmem_ulong_put_nbi(
unsigned long *dest, const unsigned long *source, size_t nelems, int pe);
__host__ void rocshmem_ctx_ulong_put_nbi(
rocshmem_ctx_t ctx, unsigned long *dest, const unsigned long *source,
size_t nelems, int pe);
__host__ void rocshmem_ulong_put_nbi(
unsigned long *dest, const unsigned long *source, size_t nelems, int pe);
// Element type: unsigned long long ("ulonglong").
__device__ ATTR_NO_INLINE void rocshmem_ctx_ulonglong_put_nbi(
rocshmem_ctx_t ctx, unsigned long long *dest, const unsigned long long *source,
size_t nelems, int pe);
__device__ ATTR_NO_INLINE void rocshmem_ulonglong_put_nbi(
unsigned long long *dest, const unsigned long long *source, size_t nelems, int pe);
__host__ void rocshmem_ctx_ulonglong_put_nbi(
rocshmem_ctx_t ctx, unsigned long long *dest, const unsigned long long *source,
size_t nelems, int pe);
__host__ void rocshmem_ulonglong_put_nbi(
unsigned long long *dest, const unsigned long long *source, size_t nelems, int pe);
/**
* @brief Writes contiguous data of \p nelems bytes from \p source on the
* calling PE to \p dest on \p pe. The operation is not blocking. The caller
* will return as soon as the request is posted. The caller must call
* rocshmem_quiet() on the same context if completion notification is
* required.
*
* This function can be called from divergent control paths at per-thread
* granularity. However, performance may be improved if the caller can
* coalesce contiguous messages and elect a leader thread to call into the
* ROCSHMEM function.
*
* @param[in] ctx Context with which to perform this operation.
* @param[in] dest Destination address. Must be an address on the symmetric
* heap.
* @param[in] source Source address. Must be an address on the symmetric heap.
* @param[in] nelems Size of the transfer in bytes.
* @param[in] pe PE of the remote process.
*
* @return void.
*/
// Device-side untyped (void *) non-blocking put; \p nelems is a byte count
// here. Overload taking an explicit context:
__device__ ATTR_NO_INLINE void rocshmem_ctx_putmem_nbi(rocshmem_ctx_t ctx,
void *dest,
const void *source,
size_t nelems, int pe);
// Overload without a context argument:
__device__ ATTR_NO_INLINE void rocshmem_putmem_nbi(void *dest,
const void *source,
size_t nelems, int pe);
/**
* @brief Writes contiguous data of \p nelems bytes from \p source on the
* calling PE to \p dest on \p pe. The operation is not blocking. The caller
* will return as soon as the request is posted. The caller must call
* the __host__ rocshmem_quiet() if completion notification is required.
*
* @param[in] ctx Context with which to perform this operation.
* @param[in] dest Destination address. Must be an address on the symmetric
* heap.
* @param[in] source Source address. Must be an address on the symmetric heap.
* @param[in] nelems Size of the transfer in bytes.
* @param[in] pe PE of the remote process.
*
* @return void.
*/
// Host-side untyped (void *) non-blocking put; mirrors the __device__
// prototypes of the same names. Overload taking an explicit context:
__host__ void rocshmem_ctx_putmem_nbi(rocshmem_ctx_t ctx, void *dest,
const void *source, size_t nelems,
int pe);
// Overload without a context argument:
__host__ void rocshmem_putmem_nbi(void *dest, const void *source,
size_t nelems, int pe);
/**
* @name SHMEM_GET_NBI
* @brief Reads contiguous data of \p nelems elements from \p source on \p pe
* to \p dest on the calling PE. The operation is not blocking. The caller will
* return as soon as the request is posted. The caller must call
* rocshmem_quiet() on the same context if completion notification is
* required.
*
* This function can be called from divergent control paths at per-thread
* granularity. However, performance may be improved if the caller can
* coalesce contiguous messages and elect a leader thread to call into the
* ROCSHMEM function.
*
* @param[in] ctx Context with which to perform this operation.
* @param[in] dest Destination address. Must be an address on the symmetric
* heap.
* @param[in] source Source address. Must be an address on the symmetric heap.
* @param[in] nelems Number of elements to transfer.
* @param[in] pe PE of the remote process.
*
* @return void.
*/
// Typed non-blocking get prototypes.
// Each element type is declared as __device__ and __host__ overloads, each
// in a form that takes an explicit rocshmem_ctx_t
// (rocshmem_ctx_<type>_get_nbi) and a form without a context argument
// (rocshmem_<type>_get_nbi). \p source is const-qualified; \p dest is
// written.

// Element type: float.
__device__ ATTR_NO_INLINE void rocshmem_ctx_float_get_nbi(
rocshmem_ctx_t ctx, float *dest, const float *source,
size_t nelems, int pe);
__device__ ATTR_NO_INLINE void rocshmem_float_get_nbi(
float *dest, const float *source, size_t nelems, int pe);
__host__ void rocshmem_ctx_float_get_nbi(
rocshmem_ctx_t ctx, float *dest, const float *source,
size_t nelems, int pe);
__host__ void rocshmem_float_get_nbi(float *dest,
const float *source, size_t nelems, int pe);
// Element type: double.
__device__ ATTR_NO_INLINE void rocshmem_ctx_double_get_nbi(
rocshmem_ctx_t ctx, double *dest, const double *source,
size_t nelems, int pe);
__device__ ATTR_NO_INLINE void rocshmem_double_get_nbi(
double *dest, const double *source, size_t nelems, int pe);
__host__ void rocshmem_ctx_double_get_nbi(
rocshmem_ctx_t ctx, double *dest, const double *source,
size_t nelems, int pe);
__host__ void rocshmem_double_get_nbi(double *dest,
const double *source, size_t nelems, int pe);
// Element type: char.
__device__ ATTR_NO_INLINE void rocshmem_ctx_char_get_nbi(
rocshmem_ctx_t ctx, char *dest, const char *source,
size_t nelems, int pe);
__device__ ATTR_NO_INLINE void rocshmem_char_get_nbi(
char *dest, const char *source, size_t nelems, int pe);
__host__ void rocshmem_ctx_char_get_nbi(
rocshmem_ctx_t ctx, char *dest, const char *source,
size_t nelems, int pe);
__host__ void rocshmem_char_get_nbi(char *dest,
const char *source, size_t nelems, int pe);
// Element type: signed char ("schar").
__device__ ATTR_NO_INLINE void rocshmem_ctx_schar_get_nbi(
rocshmem_ctx_t ctx, signed char *dest, const signed char *source,
size_t nelems, int pe);
__device__ ATTR_NO_INLINE void rocshmem_schar_get_nbi(
signed char *dest, const signed char *source, size_t nelems, int pe);
__host__ void rocshmem_ctx_schar_get_nbi(
rocshmem_ctx_t ctx, signed char *dest, const signed char *source,
size_t nelems, int pe);
__host__ void rocshmem_schar_get_nbi(signed char *dest,
const signed char *source, size_t nelems, int pe);
// Element type: short.
__device__ ATTR_NO_INLINE void rocshmem_ctx_short_get_nbi(
rocshmem_ctx_t ctx, short *dest, const short *source,
size_t nelems, int pe);
__device__ ATTR_NO_INLINE void rocshmem_short_get_nbi(
short *dest, const short *source, size_t nelems, int pe);
__host__ void rocshmem_ctx_short_get_nbi(
rocshmem_ctx_t ctx, short *dest, const short *source,
size_t nelems, int pe);
__host__ void rocshmem_short_get_nbi(short *dest,
const short *source, size_t nelems, int pe);
// Element type: int.
__device__ ATTR_NO_INLINE void rocshmem_ctx_int_get_nbi(
rocshmem_ctx_t ctx, int *dest, const int *source,
size_t nelems, int pe);
__device__ ATTR_NO_INLINE void rocshmem_int_get_nbi(
int *dest, const int *source, size_t nelems, int pe);
__host__ void rocshmem_ctx_int_get_nbi(
rocshmem_ctx_t ctx, int *dest, const int *source,
size_t nelems, int pe);
__host__ void rocshmem_int_get_nbi(int *dest,
const int *source, size_t nelems, int pe);
// Element type: long.
__device__ ATTR_NO_INLINE void rocshmem_ctx_long_get_nbi(
rocshmem_ctx_t ctx, long *dest, const long *source,
size_t nelems, int pe);
__device__ ATTR_NO_INLINE void rocshmem_long_get_nbi(
long *dest, const long *source, size_t nelems, int pe);
__host__ void rocshmem_ctx_long_get_nbi(
rocshmem_ctx_t ctx, long *dest, const long *source,
size_t nelems, int pe);
__host__ void rocshmem_long_get_nbi(long *dest,
const long *source, size_t nelems, int pe);
// Element type: long long ("longlong").
__device__ ATTR_NO_INLINE void rocshmem_ctx_longlong_get_nbi(
rocshmem_ctx_t ctx, long long *dest, const long long *source,
size_t nelems, int pe);
__device__ ATTR_NO_INLINE void rocshmem_longlong_get_nbi(
long long *dest, const long long *source, size_t nelems, int pe);
__host__ void rocshmem_ctx_longlong_get_nbi(
rocshmem_ctx_t ctx, long long *dest, const long long *source,
size_t nelems, int pe);
__host__ void rocshmem_longlong_get_nbi(long long *dest,
const long long *source, size_t nelems, int pe);
// Element type: unsigned char ("uchar").
__device__ ATTR_NO_INLINE void rocshmem_ctx_uchar_get_nbi(
rocshmem_ctx_t ctx, unsigned char *dest, const unsigned char *source,
size_t nelems, int pe);
__device__ ATTR_NO_INLINE void rocshmem_uchar_get_nbi(
unsigned char *dest, const unsigned char *source, size_t nelems, int pe);
__host__ void rocshmem_ctx_uchar_get_nbi(
rocshmem_ctx_t ctx, unsigned char *dest, const unsigned char *source,
size_t nelems, int pe);
__host__ void rocshmem_uchar_get_nbi(unsigned char *dest,
const unsigned char *source, size_t nelems, int pe);
// Element type: unsigned short ("ushort").
__device__ ATTR_NO_INLINE void rocshmem_ctx_ushort_get_nbi(
rocshmem_ctx_t ctx, unsigned short *dest, const unsigned short *source,
size_t nelems, int pe);
__device__ ATTR_NO_INLINE void rocshmem_ushort_get_nbi(
unsigned short *dest, const unsigned short *source, size_t nelems, int pe);
__host__ void rocshmem_ctx_ushort_get_nbi(
rocshmem_ctx_t ctx, unsigned short *dest, const unsigned short *source,
size_t nelems, int pe);
__host__ void rocshmem_ushort_get_nbi(unsigned short *dest,
const unsigned short *source, size_t nelems, int pe);
// Element type: unsigned int ("uint").
__device__ ATTR_NO_INLINE void rocshmem_ctx_uint_get_nbi(
rocshmem_ctx_t ctx, unsigned int *dest, const unsigned int *source,
size_t nelems, int pe);
__device__ ATTR_NO_INLINE void rocshmem_uint_get_nbi(
unsigned int *dest, const unsigned int *source, size_t nelems, int pe);
__host__ void rocshmem_ctx_uint_get_nbi(
rocshmem_ctx_t ctx, unsigned int *dest, const unsigned int *source,
size_t nelems, int pe);
__host__ void rocshmem_uint_get_nbi(unsigned int *dest,
const unsigned int *source, size_t nelems, int pe);
// Element type: unsigned long ("ulong"), __device__ overloads.
__device__ ATTR_NO_INLINE void rocshmem_ctx_ulong_get_nbi(
rocshmem_ctx_t ctx, unsigned long *dest, const unsigned long *source,
size_t nelems, int pe);
__device__ ATTR_NO_INLINE void rocshmem_ulong_get_nbi(
unsigned long *dest, const unsigned long *source, size_t nelems, int pe);
__host__ void rocshmem_ctx_ulong_get_nbi(
rocshmem_ctx_t ctx, unsigned long *dest, const unsigned long *source,
size_t nelems, int pe);
__host__ void rocshmem_ulong_get_nbi(unsigned long *dest,
const unsigned long *source, size_t nelems, int pe);
__device__ ATTR_NO_INLINE void rocshmem_ctx_ulonglong_get_nbi(
rocshmem_ctx_t ctx, unsigned long long *dest, const unsigned long long *source,
size_t nelems, int pe);
__device__ ATTR_NO_INLINE void rocshmem_ulonglong_get_nbi(
unsigned long long *dest, const unsigned long long *source, size_t nelems, int pe);
__host__ void rocshmem_ctx_ulonglong_get_nbi(
rocshmem_ctx_t ctx, unsigned long long *dest, const unsigned long long *source,
size_t nelems, int pe);
__host__ void rocshmem_ulonglong_get_nbi(unsigned long long *dest,
const unsigned long long *source, size_t nelems, int pe);
/**
* @brief Device-side non-blocking get of raw bytes. Reads contiguous data of
* \p nelems bytes from \p source on \p pe to \p dest on the calling PE. The
* operation is not blocking. The caller will return as soon as the request is
* posted. The caller must call rocshmem_quiet() on the same context if
* completion notification is required.
*
* This function can be called from divergent control paths at per-thread
* granularity. However, performance may be improved if the caller can
* coalesce contiguous messages and elect a leader thread to call into the
* ROCSHMEM function.
*
* rocshmem_getmem_nbi() takes the same arguments without \p ctx.
*
* @param[in] ctx Context with which to perform this operation.
* @param[in] dest Destination address. Must be an address on the symmetric
* heap.
* @param[in] source Source address. Must be an address on the symmetric heap.
* @param[in] nelems Size of the transfer in bytes.
* @param[in] pe PE of the remote process.
*
* @return void.
*/
__device__ ATTR_NO_INLINE void rocshmem_ctx_getmem_nbi(rocshmem_ctx_t ctx,
void *dest,
const void *source,
size_t nelems, int pe);
__device__ ATTR_NO_INLINE void rocshmem_getmem_nbi(void *dest,
const void *source,
size_t nelems, int pe);
// NOTE(review): the divergent-control-path paragraph in the comment below is
// identical to the device-side text; confirm that it is meant to apply to
// host callers.
/**
* @brief Host-side non-blocking get of raw bytes. Reads contiguous data of
* \p nelems bytes from \p source on \p pe to \p dest on the calling PE. The
* operation is not blocking. The caller will return as soon as the request is
* posted. The caller must call the __host__ version of rocshmem_quiet() on
* the same context if completion notification is required.
*
* This function can be called from divergent control paths at per-thread
* granularity. However, performance may be improved if the caller can
* coalesce contiguous messages and elect a leader thread to call into the
* ROCSHMEM function.
*
* rocshmem_getmem_nbi() takes the same arguments without \p ctx.
*
* @param[in] ctx Context with which to perform this operation.
* @param[in] dest Destination address. Must be an address on the symmetric
* heap.
* @param[in] source Source address. Must be an address on the symmetric heap.
* @param[in] nelems Size of the transfer in bytes.
* @param[in] pe PE of the remote process.
*
* @return void.
*/
__host__ void rocshmem_ctx_getmem_nbi(rocshmem_ctx_t ctx, void *dest,
const void *source, size_t nelems,
int pe);
__host__ void rocshmem_getmem_nbi(void *dest, const void *source,
size_t nelems, int pe);
} // namespace rocshmem
#endif // LIBRARY_INCLUDE_ROCSHMEM_RMA_HPP
/******************************************************************************
* Copyright (c) Advanced Micro Devices, Inc. All rights reserved.
*
* SPDX-License-Identifier: MIT
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to
* deal in the Software without restriction, including without limitation the
* rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
* sell copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
* IN THE SOFTWARE.
*****************************************************************************/
#ifndef LIBRARY_INCLUDE_ROCSHMEM_RMA_X_HPP
#define LIBRARY_INCLUDE_ROCSHMEM_RMA_X_HPP
namespace rocshmem {
/**
* @brief Writes contiguous data of \p nelems elements from \p source on the
* calling PE to \p dest at \p pe. The caller will block until the operation
* completes locally (it is safe to reuse \p source). The caller must
* call into rocshmem_quiet() if remote completion is required.
*
* This function can be called from divergent control paths at per-wave
* granularity. However, all threads in a wave must collectively participate
* in the call using the same arguments.
*
* The same pair of prototypes (with and without \p ctx) is declared below for
* each element type.
*
* @param[in] ctx Context with which to perform this operation.
* @param[in] dest Destination address. Must be an address on the symmetric
* heap.
* @param[in] source Source address. Must be an address on the symmetric heap.
* @param[in] nelems Size of the transfer in number of elements.
* @param[in] pe PE of the remote process.
*
* @return void.
*/
__device__ ATTR_NO_INLINE void rocshmem_ctx_float_put_wave(
rocshmem_ctx_t ctx, float *dest, const float *source,
size_t nelems, int pe);
__device__ ATTR_NO_INLINE void rocshmem_float_put_wave(
float *dest, const float *source, size_t nelems, int pe);
__device__ ATTR_NO_INLINE void rocshmem_ctx_double_put_wave(
rocshmem_ctx_t ctx, double *dest, const double *source,
size_t nelems, int pe);
__device__ ATTR_NO_INLINE void rocshmem_double_put_wave(
double *dest, const double *source, size_t nelems, int pe);
__device__ ATTR_NO_INLINE void rocshmem_ctx_char_put_wave(
rocshmem_ctx_t ctx, char *dest, const char *source,
size_t nelems, int pe);
__device__ ATTR_NO_INLINE void rocshmem_char_put_wave(
char *dest, const char *source, size_t nelems, int pe);
__device__ ATTR_NO_INLINE void rocshmem_ctx_schar_put_wave(
rocshmem_ctx_t ctx, signed char *dest, const signed char *source,
size_t nelems, int pe);
__device__ ATTR_NO_INLINE void rocshmem_schar_put_wave(
signed char *dest, const signed char *source, size_t nelems, int pe);
__device__ ATTR_NO_INLINE void rocshmem_ctx_short_put_wave(
rocshmem_ctx_t ctx, short *dest, const short *source,
size_t nelems, int pe);
__device__ ATTR_NO_INLINE void rocshmem_short_put_wave(
short *dest, const short *source, size_t nelems, int pe);
__device__ ATTR_NO_INLINE void rocshmem_ctx_int_put_wave(
rocshmem_ctx_t ctx, int *dest, const int *source,
size_t nelems, int pe);
__device__ ATTR_NO_INLINE void rocshmem_int_put_wave(
int *dest, const int *source, size_t nelems, int pe);
__device__ ATTR_NO_INLINE void rocshmem_ctx_long_put_wave(
rocshmem_ctx_t ctx, long *dest, const long *source,
size_t nelems, int pe);
__device__ ATTR_NO_INLINE void rocshmem_long_put_wave(
long *dest, const long *source, size_t nelems, int pe);
__device__ ATTR_NO_INLINE void rocshmem_ctx_longlong_put_wave(
rocshmem_ctx_t ctx, long long *dest, const long long *source,
size_t nelems, int pe);
__device__ ATTR_NO_INLINE void rocshmem_longlong_put_wave(
long long *dest, const long long *source, size_t nelems, int pe);
__device__ ATTR_NO_INLINE void rocshmem_ctx_uchar_put_wave(
rocshmem_ctx_t ctx, unsigned char *dest, const unsigned char *source,
size_t nelems, int pe);
__device__ ATTR_NO_INLINE void rocshmem_uchar_put_wave(
unsigned char *dest, const unsigned char *source, size_t nelems, int pe);
__device__ ATTR_NO_INLINE void rocshmem_ctx_ushort_put_wave(
rocshmem_ctx_t ctx, unsigned short *dest, const unsigned short *source,
size_t nelems, int pe);
__device__ ATTR_NO_INLINE void rocshmem_ushort_put_wave(
unsigned short *dest, const unsigned short *source, size_t nelems, int pe);
__device__ ATTR_NO_INLINE void rocshmem_ctx_uint_put_wave(
rocshmem_ctx_t ctx, unsigned int *dest, const unsigned int *source,
size_t nelems, int pe);
__device__ ATTR_NO_INLINE void rocshmem_uint_put_wave(
unsigned int *dest, const unsigned int *source, size_t nelems, int pe);
__device__ ATTR_NO_INLINE void rocshmem_ctx_ulong_put_wave(
rocshmem_ctx_t ctx, unsigned long *dest, const unsigned long *source,
size_t nelems, int pe);
__device__ ATTR_NO_INLINE void rocshmem_ulong_put_wave(
unsigned long *dest, const unsigned long *source, size_t nelems, int pe);
__device__ ATTR_NO_INLINE void rocshmem_ctx_ulonglong_put_wave(
rocshmem_ctx_t ctx, unsigned long long *dest, const unsigned long long *source,
size_t nelems, int pe);
__device__ ATTR_NO_INLINE void rocshmem_ulonglong_put_wave(
unsigned long long *dest, const unsigned long long *source, size_t nelems, int pe);
/**
* @brief Writes contiguous data of \p nelems elements from \p source on the
* calling PE to \p dest at \p pe. The caller will block until the operation
* completes locally (it is safe to reuse \p source). The caller must
* call into rocshmem_quiet() if remote completion is required.
*
* This function can be called from divergent control paths at per-workgroup
* (WG) granularity. However, all threads in a WG must collectively participate
* in the call using the same arguments.
*
* The same pair of prototypes (with and without \p ctx) is declared below for
* each element type.
*
* @param[in] ctx Context with which to perform this operation.
* @param[in] dest Destination address. Must be an address on the symmetric
* heap.
* @param[in] source Source address. Must be an address on the symmetric heap.
* @param[in] nelems Size of the transfer in number of elements.
* @param[in] pe PE of the remote process.
*
* @return void.
*/
__device__ ATTR_NO_INLINE void rocshmem_ctx_float_put_wg(
rocshmem_ctx_t ctx, float *dest, const float *source,
size_t nelems, int pe);
__device__ ATTR_NO_INLINE void rocshmem_float_put_wg(
float *dest, const float *source, size_t nelems, int pe);
__device__ ATTR_NO_INLINE void rocshmem_ctx_double_put_wg(
rocshmem_ctx_t ctx, double *dest, const double *source,
size_t nelems, int pe);
__device__ ATTR_NO_INLINE void rocshmem_double_put_wg(
double *dest, const double *source, size_t nelems, int pe);
__device__ ATTR_NO_INLINE void rocshmem_ctx_char_put_wg(
rocshmem_ctx_t ctx, char *dest, const char *source,
size_t nelems, int pe);
__device__ ATTR_NO_INLINE void rocshmem_char_put_wg(
char *dest, const char *source, size_t nelems, int pe);
__device__ ATTR_NO_INLINE void rocshmem_ctx_schar_put_wg(
rocshmem_ctx_t ctx, signed char *dest, const signed char *source,
size_t nelems, int pe);
__device__ ATTR_NO_INLINE void rocshmem_schar_put_wg(
signed char *dest, const signed char *source, size_t nelems, int pe);
__device__ ATTR_NO_INLINE void rocshmem_ctx_short_put_wg(
rocshmem_ctx_t ctx, short *dest, const short *source,
size_t nelems, int pe);
__device__ ATTR_NO_INLINE void rocshmem_short_put_wg(
short *dest, const short *source, size_t nelems, int pe);
__device__ ATTR_NO_INLINE void rocshmem_ctx_int_put_wg(
rocshmem_ctx_t ctx, int *dest, const int *source,
size_t nelems, int pe);
__device__ ATTR_NO_INLINE void rocshmem_int_put_wg(
int *dest, const int *source, size_t nelems, int pe);
__device__ ATTR_NO_INLINE void rocshmem_ctx_long_put_wg(
rocshmem_ctx_t ctx, long *dest, const long *source,
size_t nelems, int pe);
__device__ ATTR_NO_INLINE void rocshmem_long_put_wg(
long *dest, const long *source, size_t nelems, int pe);
__device__ ATTR_NO_INLINE void rocshmem_ctx_longlong_put_wg(
rocshmem_ctx_t ctx, long long *dest, const long long *source,
size_t nelems, int pe);
__device__ ATTR_NO_INLINE void rocshmem_longlong_put_wg(
long long *dest, const long long *source, size_t nelems, int pe);
__device__ ATTR_NO_INLINE void rocshmem_ctx_uchar_put_wg(
rocshmem_ctx_t ctx, unsigned char *dest, const unsigned char *source,
size_t nelems, int pe);
__device__ ATTR_NO_INLINE void rocshmem_uchar_put_wg(
unsigned char *dest, const unsigned char *source, size_t nelems, int pe);
__device__ ATTR_NO_INLINE void rocshmem_ctx_ushort_put_wg(
rocshmem_ctx_t ctx, unsigned short *dest, const unsigned short *source,
size_t nelems, int pe);
__device__ ATTR_NO_INLINE void rocshmem_ushort_put_wg(
unsigned short *dest, const unsigned short *source, size_t nelems, int pe);
__device__ ATTR_NO_INLINE void rocshmem_ctx_uint_put_wg(
rocshmem_ctx_t ctx, unsigned int *dest, const unsigned int *source,
size_t nelems, int pe);
__device__ ATTR_NO_INLINE void rocshmem_uint_put_wg(
unsigned int *dest, const unsigned int *source, size_t nelems, int pe);
__device__ ATTR_NO_INLINE void rocshmem_ctx_ulong_put_wg(
rocshmem_ctx_t ctx, unsigned long *dest, const unsigned long *source,
size_t nelems, int pe);
__device__ ATTR_NO_INLINE void rocshmem_ulong_put_wg(
unsigned long *dest, const unsigned long *source, size_t nelems, int pe);
__device__ ATTR_NO_INLINE void rocshmem_ctx_ulonglong_put_wg(
rocshmem_ctx_t ctx, unsigned long long *dest, const unsigned long long *source,
size_t nelems, int pe);
__device__ ATTR_NO_INLINE void rocshmem_ulonglong_put_wg(
unsigned long long *dest, const unsigned long long *source, size_t nelems, int pe);
/**
* @brief Writes contiguous data of \p nelems bytes from \p source on the
* calling PE to \p dest at \p pe. The caller will block until the operation
* completes locally (it is safe to reuse \p source). The caller must
* call into rocshmem_quiet() if remote completion is required.
*
* This function can be called from divergent control paths at per-wave
* granularity. However, all threads in a wave must participate in the
* call using the same parameters.
*
* rocshmem_putmem_wave() takes the same arguments without \p ctx.
*
* @param[in] ctx Context with which to perform this operation.
* @param[in] dest Destination address. Must be an address on the symmetric
* heap.
* @param[in] source Source address. Must be an address on the symmetric heap.
* @param[in] nelems Size of the transfer in bytes.
* @param[in] pe PE of the remote process.
*
* @return void.
*/
__device__ ATTR_NO_INLINE void rocshmem_ctx_putmem_wave(
rocshmem_ctx_t ctx, void *dest, const void *source, size_t nelems, int pe);
__device__ ATTR_NO_INLINE void rocshmem_putmem_wave(void *dest,
const void *source,
size_t nelems, int pe);
/**
* @brief Writes contiguous data of \p nelems bytes from \p source on the
* calling PE to \p dest at \p pe. The caller will block until the operation
* completes locally (it is safe to reuse \p source). The caller must
* call into rocshmem_quiet() if remote completion is required.
*
* This function can be called from divergent control paths at per-workgroup
* (WG) granularity. However, all threads in the workgroup must participate in
* the call using the same parameters.
*
* rocshmem_putmem_wg() takes the same arguments without \p ctx.
*
* @param[in] ctx Context with which to perform this operation.
* @param[in] dest Destination address. Must be an address on the symmetric
* heap.
* @param[in] source Source address. Must be an address on the symmetric heap.
* @param[in] nelems Size of the transfer in bytes.
* @param[in] pe PE of the remote process.
*
* @return void.
*/
__device__ ATTR_NO_INLINE void rocshmem_ctx_putmem_wg(rocshmem_ctx_t ctx,
void *dest,
const void *source,
size_t nelems, int pe);
__device__ ATTR_NO_INLINE void rocshmem_putmem_wg(void *dest,
const void *source,
size_t nelems, int pe);
/**
* @brief Reads contiguous data of \p nelems elements from \p source on \p pe
* to \p dest on the calling PE. The calling wave will block until the
* operation completes (data has been placed in \p dest).
*
* This function can be called from divergent control paths at per-wave
* granularity. However, all threads in the wave must participate in the
* call using the same parameters.
*
* The same pair of prototypes (with and without \p ctx) is declared below for
* each element type.
*
* @param[in] ctx Context with which to perform this operation.
* @param[in] dest Destination address. Must be an address on the symmetric
* heap.
* @param[in] source Source address. Must be an address on the symmetric heap.
* @param[in] nelems Size of the transfer in number of elements.
* @param[in] pe PE of the remote process.
*
* @return void.
*/
__device__ ATTR_NO_INLINE void rocshmem_ctx_float_get_wave(
rocshmem_ctx_t ctx, float *dest, const float *source,
size_t nelems, int pe);
__device__ ATTR_NO_INLINE void rocshmem_float_get_wave(
float *dest, const float *source, size_t nelems, int pe);
__device__ ATTR_NO_INLINE void rocshmem_ctx_double_get_wave(
rocshmem_ctx_t ctx, double *dest, const double *source,
size_t nelems, int pe);
__device__ ATTR_NO_INLINE void rocshmem_double_get_wave(
double *dest, const double *source, size_t nelems, int pe);
__device__ ATTR_NO_INLINE void rocshmem_ctx_char_get_wave(
rocshmem_ctx_t ctx, char *dest, const char *source,
size_t nelems, int pe);
__device__ ATTR_NO_INLINE void rocshmem_char_get_wave(
char *dest, const char *source, size_t nelems, int pe);
__device__ ATTR_NO_INLINE void rocshmem_ctx_schar_get_wave(
rocshmem_ctx_t ctx, signed char *dest, const signed char *source,
size_t nelems, int pe);
__device__ ATTR_NO_INLINE void rocshmem_schar_get_wave(
signed char *dest, const signed char *source, size_t nelems, int pe);
__device__ ATTR_NO_INLINE void rocshmem_ctx_short_get_wave(
rocshmem_ctx_t ctx, short *dest, const short *source,
size_t nelems, int pe);
__device__ ATTR_NO_INLINE void rocshmem_short_get_wave(
short *dest, const short *source, size_t nelems, int pe);
__device__ ATTR_NO_INLINE void rocshmem_ctx_int_get_wave(
rocshmem_ctx_t ctx, int *dest, const int *source,
size_t nelems, int pe);
__device__ ATTR_NO_INLINE void rocshmem_int_get_wave(
int *dest, const int *source, size_t nelems, int pe);
__device__ ATTR_NO_INLINE void rocshmem_ctx_long_get_wave(
rocshmem_ctx_t ctx, long *dest, const long *source,
size_t nelems, int pe);
__device__ ATTR_NO_INLINE void rocshmem_long_get_wave(
long *dest, const long *source, size_t nelems, int pe);
__device__ ATTR_NO_INLINE void rocshmem_ctx_longlong_get_wave(
rocshmem_ctx_t ctx, long long *dest, const long long *source,
size_t nelems, int pe);
__device__ ATTR_NO_INLINE void rocshmem_longlong_get_wave(
long long *dest, const long long *source, size_t nelems, int pe);
__device__ ATTR_NO_INLINE void rocshmem_ctx_uchar_get_wave(
rocshmem_ctx_t ctx, unsigned char *dest, const unsigned char *source,
size_t nelems, int pe);
__device__ ATTR_NO_INLINE void rocshmem_uchar_get_wave(
unsigned char *dest, const unsigned char *source, size_t nelems, int pe);
__device__ ATTR_NO_INLINE void rocshmem_ctx_ushort_get_wave(
rocshmem_ctx_t ctx, unsigned short *dest, const unsigned short *source,
size_t nelems, int pe);
__device__ ATTR_NO_INLINE void rocshmem_ushort_get_wave(
unsigned short *dest, const unsigned short *source, size_t nelems, int pe);
__device__ ATTR_NO_INLINE void rocshmem_ctx_uint_get_wave(
rocshmem_ctx_t ctx, unsigned int *dest, const unsigned int *source,
size_t nelems, int pe);
__device__ ATTR_NO_INLINE void rocshmem_uint_get_wave(
unsigned int *dest, const unsigned int *source, size_t nelems, int pe);
__device__ ATTR_NO_INLINE void rocshmem_ctx_ulong_get_wave(
rocshmem_ctx_t ctx, unsigned long *dest, const unsigned long *source,
size_t nelems, int pe);
__device__ ATTR_NO_INLINE void rocshmem_ulong_get_wave(
unsigned long *dest, const unsigned long *source, size_t nelems, int pe);
__device__ ATTR_NO_INLINE void rocshmem_ctx_ulonglong_get_wave(
rocshmem_ctx_t ctx, unsigned long long *dest, const unsigned long long *source,
size_t nelems, int pe);
__device__ ATTR_NO_INLINE void rocshmem_ulonglong_get_wave(
unsigned long long *dest, const unsigned long long *source, size_t nelems, int pe);
/**
* @brief Reads contiguous data of \p nelems elements from \p source on \p pe
* to \p dest on the calling PE. The calling work-group will block until the
* operation completes (data has been placed in \p dest).
*
* This function can be called from divergent control paths at per-workgroup
* granularity. However, all threads in the workgroup must participate in
* the call using the same parameters.
*
* The same pair of prototypes (with and without \p ctx) is declared below for
* each element type.
*
* @param[in] ctx Context with which to perform this operation.
* @param[in] dest Destination address. Must be an address on the symmetric
* heap.
* @param[in] source Source address. Must be an address on the symmetric heap.
* @param[in] nelems Size of the transfer in number of elements.
* @param[in] pe PE of the remote process.
*
* @return void.
*/
__device__ ATTR_NO_INLINE void rocshmem_ctx_float_get_wg(
rocshmem_ctx_t ctx, float *dest, const float *source,
size_t nelems, int pe);
__device__ ATTR_NO_INLINE void rocshmem_float_get_wg(
float *dest, const float *source, size_t nelems, int pe);
__device__ ATTR_NO_INLINE void rocshmem_ctx_double_get_wg(
rocshmem_ctx_t ctx, double *dest, const double *source,
size_t nelems, int pe);
__device__ ATTR_NO_INLINE void rocshmem_double_get_wg(
double *dest, const double *source, size_t nelems, int pe);
__device__ ATTR_NO_INLINE void rocshmem_ctx_char_get_wg(
rocshmem_ctx_t ctx, char *dest, const char *source,
size_t nelems, int pe);
__device__ ATTR_NO_INLINE void rocshmem_char_get_wg(
char *dest, const char *source, size_t nelems, int pe);
__device__ ATTR_NO_INLINE void rocshmem_ctx_schar_get_wg(
rocshmem_ctx_t ctx, signed char *dest, const signed char *source,
size_t nelems, int pe);
__device__ ATTR_NO_INLINE void rocshmem_schar_get_wg(
signed char *dest, const signed char *source, size_t nelems, int pe);
__device__ ATTR_NO_INLINE void rocshmem_ctx_short_get_wg(
rocshmem_ctx_t ctx, short *dest, const short *source,
size_t nelems, int pe);
__device__ ATTR_NO_INLINE void rocshmem_short_get_wg(
short *dest, const short *source, size_t nelems, int pe);
__device__ ATTR_NO_INLINE void rocshmem_ctx_int_get_wg(
rocshmem_ctx_t ctx, int *dest, const int *source,
size_t nelems, int pe);
__device__ ATTR_NO_INLINE void rocshmem_int_get_wg(
int *dest, const int *source, size_t nelems, int pe);
__device__ ATTR_NO_INLINE void rocshmem_ctx_long_get_wg(
rocshmem_ctx_t ctx, long *dest, const long *source,
size_t nelems, int pe);
__device__ ATTR_NO_INLINE void rocshmem_long_get_wg(
long *dest, const long *source, size_t nelems, int pe);
__device__ ATTR_NO_INLINE void rocshmem_ctx_longlong_get_wg(
rocshmem_ctx_t ctx, long long *dest, const long long *source,
size_t nelems, int pe);
__device__ ATTR_NO_INLINE void rocshmem_longlong_get_wg(
long long *dest, const long long *source, size_t nelems, int pe);
__device__ ATTR_NO_INLINE void rocshmem_ctx_uchar_get_wg(
rocshmem_ctx_t ctx, unsigned char *dest, const unsigned char *source,
size_t nelems, int pe);
__device__ ATTR_NO_INLINE void rocshmem_uchar_get_wg(
unsigned char *dest, const unsigned char *source, size_t nelems, int pe);
__device__ ATTR_NO_INLINE void rocshmem_ctx_ushort_get_wg(
rocshmem_ctx_t ctx, unsigned short *dest, const unsigned short *source,
size_t nelems, int pe);
__device__ ATTR_NO_INLINE void rocshmem_ushort_get_wg(
unsigned short *dest, const unsigned short *source, size_t nelems, int pe);
__device__ ATTR_NO_INLINE void rocshmem_ctx_uint_get_wg(
rocshmem_ctx_t ctx, unsigned int *dest, const unsigned int *source,
size_t nelems, int pe);
__device__ ATTR_NO_INLINE void rocshmem_uint_get_wg(
unsigned int *dest, const unsigned int *source, size_t nelems, int pe);
__device__ ATTR_NO_INLINE void rocshmem_ctx_ulong_get_wg(
rocshmem_ctx_t ctx, unsigned long *dest, const unsigned long *source,
size_t nelems, int pe);
__device__ ATTR_NO_INLINE void rocshmem_ulong_get_wg(
unsigned long *dest, const unsigned long *source, size_t nelems, int pe);
__device__ ATTR_NO_INLINE void rocshmem_ctx_ulonglong_get_wg(
rocshmem_ctx_t ctx, unsigned long long *dest, const unsigned long long *source,
size_t nelems, int pe);
__device__ ATTR_NO_INLINE void rocshmem_ulonglong_get_wg(
unsigned long long *dest, const unsigned long long *source, size_t nelems, int pe);
/**
* @brief Reads contiguous data of \p nelems bytes from \p source on \p pe
* to \p dest on the calling PE. The calling wave will block until the
* operation completes (data has been placed in \p dest).
*
* This function can be called from divergent control paths at per-wave
* granularity. However, all threads in the wave must participate in the
* call using the same parameters.
*
* rocshmem_getmem_wave() takes the same arguments without \p ctx.
*
* @param[in] ctx Context with which to perform this operation.
* @param[in] dest Destination address. Must be an address on the symmetric
* heap.
* @param[in] source Source address. Must be an address on the symmetric heap.
* @param[in] nelems Size of the transfer in bytes.
* @param[in] pe PE of the remote process.
*
* @return void.
*/
__device__ ATTR_NO_INLINE void rocshmem_ctx_getmem_wave(
rocshmem_ctx_t ctx, void *dest, const void *source, size_t nelems, int pe);
__device__ ATTR_NO_INLINE void rocshmem_getmem_wave(void *dest,
const void *source,
size_t nelems, int pe);
/**
* @brief Reads contiguous data of \p nelems bytes from \p source on \p pe
* to \p dest on the calling PE. The calling work-group will block until the
* operation completes (data has been placed in \p dest).
*
* This function can be called from divergent control paths at per-workgroup
* (WG) granularity. However, all threads in the workgroup must participate
* in the call using the same parameters.
*
* rocshmem_getmem_wg() takes the same arguments without \p ctx.
*
* @param[in] ctx Context with which to perform this operation.
* @param[in] dest Destination address. Must be an address on the symmetric
* heap.
* @param[in] source Source address. Must be an address on the symmetric heap.
* @param[in] nelems Size of the transfer in bytes.
* @param[in] pe PE of the remote process.
*
* @return void.
*/
__device__ ATTR_NO_INLINE void rocshmem_ctx_getmem_wg(rocshmem_ctx_t ctx,
void *dest,
const void *source,
size_t nelems, int pe);
__device__ ATTR_NO_INLINE void rocshmem_getmem_wg(void *dest,
const void *source,
size_t nelems, int pe);
/**
* @brief Writes contiguous data of \p nelems elements from \p source on the
* calling PE to \p dest on \p pe. The operation is not blocking. The caller
* will return as soon as the request is posted. The caller must call
* rocshmem_quiet() on the same context if completion notification is
* required.
*
* This function can be called from divergent control paths at per-wave
* granularity. However, all threads in the wave must call in with the same
* arguments.
*
* The same pair of prototypes (with and without \p ctx) is declared below for
* each element type.
*
* @param[in] ctx Context with which to perform this operation.
* @param[in] dest Destination address. Must be an address on the symmetric
* heap.
* @param[in] source Source address. Must be an address on the symmetric heap.
* @param[in] nelems Size of the transfer in number of elements.
* @param[in] pe PE of the remote process.
*
* @return void.
*/
__device__ ATTR_NO_INLINE void rocshmem_ctx_float_put_nbi_wave(
rocshmem_ctx_t ctx, float *dest, const float *source,
size_t nelems, int pe);
__device__ ATTR_NO_INLINE void rocshmem_float_put_nbi_wave(
float *dest, const float *source, size_t nelems, int pe);
__device__ ATTR_NO_INLINE void rocshmem_ctx_double_put_nbi_wave(
rocshmem_ctx_t ctx, double *dest, const double *source,
size_t nelems, int pe);
__device__ ATTR_NO_INLINE void rocshmem_double_put_nbi_wave(
double *dest, const double *source, size_t nelems, int pe);
__device__ ATTR_NO_INLINE void rocshmem_ctx_char_put_nbi_wave(
rocshmem_ctx_t ctx, char *dest, const char *source,
size_t nelems, int pe);
__device__ ATTR_NO_INLINE void rocshmem_char_put_nbi_wave(
char *dest, const char *source, size_t nelems, int pe);
__device__ ATTR_NO_INLINE void rocshmem_ctx_schar_put_nbi_wave(
rocshmem_ctx_t ctx, signed char *dest, const signed char *source,
size_t nelems, int pe);
__device__ ATTR_NO_INLINE void rocshmem_schar_put_nbi_wave(
signed char *dest, const signed char *source, size_t nelems, int pe);
__device__ ATTR_NO_INLINE void rocshmem_ctx_short_put_nbi_wave(
rocshmem_ctx_t ctx, short *dest, const short *source,
size_t nelems, int pe);
__device__ ATTR_NO_INLINE void rocshmem_short_put_nbi_wave(
short *dest, const short *source, size_t nelems, int pe);
__device__ ATTR_NO_INLINE void rocshmem_ctx_int_put_nbi_wave(
rocshmem_ctx_t ctx, int *dest, const int *source,
size_t nelems, int pe);
__device__ ATTR_NO_INLINE void rocshmem_int_put_nbi_wave(
int *dest, const int *source, size_t nelems, int pe);
__device__ ATTR_NO_INLINE void rocshmem_ctx_long_put_nbi_wave(
rocshmem_ctx_t ctx, long *dest, const long *source,
size_t nelems, int pe);
__device__ ATTR_NO_INLINE void rocshmem_long_put_nbi_wave(
long *dest, const long *source, size_t nelems, int pe);
__device__ ATTR_NO_INLINE void rocshmem_ctx_longlong_put_nbi_wave(
rocshmem_ctx_t ctx, long long *dest, const long long *source,
size_t nelems, int pe);
__device__ ATTR_NO_INLINE void rocshmem_longlong_put_nbi_wave(
long long *dest, const long long *source, size_t nelems, int pe);
__device__ ATTR_NO_INLINE void rocshmem_ctx_uchar_put_nbi_wave(
rocshmem_ctx_t ctx, unsigned char *dest, const unsigned char *source,
size_t nelems, int pe);
__device__ ATTR_NO_INLINE void rocshmem_uchar_put_nbi_wave(
unsigned char *dest, const unsigned char *source, size_t nelems, int pe);
__device__ ATTR_NO_INLINE void rocshmem_ctx_ushort_put_nbi_wave(
rocshmem_ctx_t ctx, unsigned short *dest, const unsigned short *source,
size_t nelems, int pe);
__device__ ATTR_NO_INLINE void rocshmem_ushort_put_nbi_wave(
unsigned short *dest, const unsigned short *source, size_t nelems, int pe);
__device__ ATTR_NO_INLINE void rocshmem_ctx_uint_put_nbi_wave(
rocshmem_ctx_t ctx, unsigned int *dest, const unsigned int *source,
size_t nelems, int pe);
__device__ ATTR_NO_INLINE void rocshmem_uint_put_nbi_wave(
unsigned int *dest, const unsigned int *source, size_t nelems, int pe);
__device__ ATTR_NO_INLINE void rocshmem_ctx_ulong_put_nbi_wave(
rocshmem_ctx_t ctx, unsigned long *dest, const unsigned long *source,
size_t nelems, int pe);
__device__ ATTR_NO_INLINE void rocshmem_ulong_put_nbi_wave(
unsigned long *dest, const unsigned long *source, size_t nelems, int pe);
__device__ ATTR_NO_INLINE void rocshmem_ctx_ulonglong_put_nbi_wave(
rocshmem_ctx_t ctx, unsigned long long *dest, const unsigned long long *source,
size_t nelems, int pe);
__device__ ATTR_NO_INLINE void rocshmem_ulonglong_put_nbi_wave(
unsigned long long *dest, const unsigned long long *source, size_t nelems, int pe);
/**
* @brief Writes contiguous data of \p nelems elements from \p source on the
* calling PE to \p dest on \p pe. The operation is not blocking. The caller
* will return as soon as the request is posted. The caller must call
* rocshmem_quiet() on the same context if completion notification is
* required.
*
* This function can be called from divergent control paths at per-workgroup
* granularity. However, all threads in the WG must call in with the same
* arguments.
*
* The element type is given by the <type> part of the function name and by
* the pointer type of \p dest and \p source. Each
* rocshmem_<type>_put_nbi_wg form takes the same arguments as its
* rocshmem_ctx_<type>_put_nbi_wg counterpart, without \p ctx.
*
* @param[in] ctx Context with which to perform this operation.
* @param[in] dest Destination address. Must be an address on the symmetric
* heap.
* @param[in] source Source address. Must be an address on the symmetric heap.
* @param[in] nelems Number of elements to transfer.
* @param[in] pe PE of the remote process.
*
* @return void.
*/
__device__ ATTR_NO_INLINE void rocshmem_ctx_float_put_nbi_wg(
rocshmem_ctx_t ctx, float *dest, const float *source,
size_t nelems, int pe);
__device__ ATTR_NO_INLINE void rocshmem_float_put_nbi_wg(
float *dest, const float *source, size_t nelems, int pe);
__device__ ATTR_NO_INLINE void rocshmem_ctx_double_put_nbi_wg(
rocshmem_ctx_t ctx, double *dest, const double *source,
size_t nelems, int pe);
__device__ ATTR_NO_INLINE void rocshmem_double_put_nbi_wg(
double *dest, const double *source, size_t nelems, int pe);
__device__ ATTR_NO_INLINE void rocshmem_ctx_char_put_nbi_wg(
rocshmem_ctx_t ctx, char *dest, const char *source,
size_t nelems, int pe);
__device__ ATTR_NO_INLINE void rocshmem_char_put_nbi_wg(
char *dest, const char *source, size_t nelems, int pe);
__device__ ATTR_NO_INLINE void rocshmem_ctx_schar_put_nbi_wg(
rocshmem_ctx_t ctx, signed char *dest, const signed char *source,
size_t nelems, int pe);
__device__ ATTR_NO_INLINE void rocshmem_schar_put_nbi_wg(
signed char *dest, const signed char *source, size_t nelems, int pe);
__device__ ATTR_NO_INLINE void rocshmem_ctx_short_put_nbi_wg(
rocshmem_ctx_t ctx, short *dest, const short *source,
size_t nelems, int pe);
__device__ ATTR_NO_INLINE void rocshmem_short_put_nbi_wg(
short *dest, const short *source, size_t nelems, int pe);
__device__ ATTR_NO_INLINE void rocshmem_ctx_int_put_nbi_wg(
rocshmem_ctx_t ctx, int *dest, const int *source,
size_t nelems, int pe);
__device__ ATTR_NO_INLINE void rocshmem_int_put_nbi_wg(
int *dest, const int *source, size_t nelems, int pe);
__device__ ATTR_NO_INLINE void rocshmem_ctx_long_put_nbi_wg(
rocshmem_ctx_t ctx, long *dest, const long *source,
size_t nelems, int pe);
__device__ ATTR_NO_INLINE void rocshmem_long_put_nbi_wg(
long *dest, const long *source, size_t nelems, int pe);
__device__ ATTR_NO_INLINE void rocshmem_ctx_longlong_put_nbi_wg(
rocshmem_ctx_t ctx, long long *dest, const long long *source,
size_t nelems, int pe);
__device__ ATTR_NO_INLINE void rocshmem_longlong_put_nbi_wg(
long long *dest, const long long *source, size_t nelems, int pe);
__device__ ATTR_NO_INLINE void rocshmem_ctx_uchar_put_nbi_wg(
rocshmem_ctx_t ctx, unsigned char *dest, const unsigned char *source,
size_t nelems, int pe);
__device__ ATTR_NO_INLINE void rocshmem_uchar_put_nbi_wg(
unsigned char *dest, const unsigned char *source, size_t nelems, int pe);
__device__ ATTR_NO_INLINE void rocshmem_ctx_ushort_put_nbi_wg(
rocshmem_ctx_t ctx, unsigned short *dest, const unsigned short *source,
size_t nelems, int pe);
__device__ ATTR_NO_INLINE void rocshmem_ushort_put_nbi_wg(
unsigned short *dest, const unsigned short *source, size_t nelems, int pe);
__device__ ATTR_NO_INLINE void rocshmem_ctx_uint_put_nbi_wg(
rocshmem_ctx_t ctx, unsigned int *dest, const unsigned int *source,
size_t nelems, int pe);
__device__ ATTR_NO_INLINE void rocshmem_uint_put_nbi_wg(
unsigned int *dest, const unsigned int *source, size_t nelems, int pe);
__device__ ATTR_NO_INLINE void rocshmem_ctx_ulong_put_nbi_wg(
rocshmem_ctx_t ctx, unsigned long *dest, const unsigned long *source,
size_t nelems, int pe);
__device__ ATTR_NO_INLINE void rocshmem_ulong_put_nbi_wg(
unsigned long *dest, const unsigned long *source, size_t nelems, int pe);
__device__ ATTR_NO_INLINE void rocshmem_ctx_ulonglong_put_nbi_wg(
rocshmem_ctx_t ctx, unsigned long long *dest, const unsigned long long *source,
size_t nelems, int pe);
__device__ ATTR_NO_INLINE void rocshmem_ulonglong_put_nbi_wg(
unsigned long long *dest, const unsigned long long *source, size_t nelems, int pe);
/**
* @brief Writes contiguous data of \p nelems bytes from \p source on the
* calling PE to \p dest on \p pe. The operation is not blocking. The caller
* will return as soon as the request is posted. The caller must call
* rocshmem_quiet() on the same context if completion notification is
* required.
*
* This function can be called from divergent control paths at per-wave
* granularity. However, all threads in a wave must call in with the same
* parameters.
*
* rocshmem_putmem_nbi_wave takes the same arguments as
* rocshmem_ctx_putmem_nbi_wave, without \p ctx.
*
* @param[in] ctx Context with which to perform this operation.
* @param[in] dest Destination address. Must be an address on the symmetric
* heap.
* @param[in] source Source address. Must be an address on the symmetric heap.
* @param[in] nelems Size of the transfer in bytes.
* @param[in] pe PE of the remote process.
*
* @return void.
*/
__device__ ATTR_NO_INLINE void rocshmem_ctx_putmem_nbi_wave(
rocshmem_ctx_t ctx, void *dest, const void *source, size_t nelems, int pe);
__device__ ATTR_NO_INLINE void rocshmem_putmem_nbi_wave(void *dest,
const void *source,
size_t nelems,
int pe);
/**
* @brief Writes contiguous data of \p nelems bytes from \p source on the
* calling PE to \p dest on \p pe. The operation is not blocking. The caller
* will return as soon as the request is posted. The caller must call
* rocshmem_quiet() on the same context if completion notification is
* required.
*
* This function can be called from divergent control paths at per-workgroup
* granularity. However, all threads in a WG must call in with the same
* parameters.
*
* rocshmem_putmem_nbi_wg takes the same arguments as
* rocshmem_ctx_putmem_nbi_wg, without \p ctx.
*
* @param[in] ctx Context with which to perform this operation.
* @param[in] dest Destination address. Must be an address on the symmetric
* heap.
* @param[in] source Source address. Must be an address on the symmetric heap.
* @param[in] nelems Size of the transfer in bytes.
* @param[in] pe PE of the remote process.
*
* @return void.
*/
__device__ ATTR_NO_INLINE void rocshmem_ctx_putmem_nbi_wg(
rocshmem_ctx_t ctx, void *dest, const void *source, size_t nelems, int pe);
__device__ ATTR_NO_INLINE void rocshmem_putmem_nbi_wg(void *dest,
const void *source,
size_t nelems, int pe);
/**
* @brief Reads contiguous data of \p nelems elements from \p source on \p pe
* to \p dest on the calling PE. The operation is not blocking. The caller
* will return as soon as the request is posted. The caller must call
* rocshmem_quiet() on the same context if completion notification is
* required.
*
* This function can be called from divergent control paths at per-wave
* granularity. However, all threads in the wave must call in with the same
* arguments.
*
* The element type is given by the <type> part of the function name and by
* the pointer type of \p dest and \p source. Each
* rocshmem_<type>_get_nbi_wave form takes the same arguments as its
* rocshmem_ctx_<type>_get_nbi_wave counterpart, without \p ctx.
*
* @param[in] ctx Context with which to perform this operation.
* @param[in] dest Destination address. Must be an address on the symmetric
* heap.
* @param[in] source Source address. Must be an address on the symmetric heap.
* @param[in] nelems Number of elements to transfer.
* @param[in] pe PE of the remote process.
*
* @return void.
*/
__device__ ATTR_NO_INLINE void rocshmem_ctx_float_get_nbi_wave(
rocshmem_ctx_t ctx, float *dest, const float *source,
size_t nelems, int pe);
__device__ ATTR_NO_INLINE void rocshmem_float_get_nbi_wave(
float *dest, const float *source, size_t nelems, int pe);
__device__ ATTR_NO_INLINE void rocshmem_ctx_double_get_nbi_wave(
rocshmem_ctx_t ctx, double *dest, const double *source,
size_t nelems, int pe);
__device__ ATTR_NO_INLINE void rocshmem_double_get_nbi_wave(
double *dest, const double *source, size_t nelems, int pe);
__device__ ATTR_NO_INLINE void rocshmem_ctx_char_get_nbi_wave(
rocshmem_ctx_t ctx, char *dest, const char *source,
size_t nelems, int pe);
__device__ ATTR_NO_INLINE void rocshmem_char_get_nbi_wave(
char *dest, const char *source, size_t nelems, int pe);
__device__ ATTR_NO_INLINE void rocshmem_ctx_schar_get_nbi_wave(
rocshmem_ctx_t ctx, signed char *dest, const signed char *source,
size_t nelems, int pe);
__device__ ATTR_NO_INLINE void rocshmem_schar_get_nbi_wave(
signed char *dest, const signed char *source, size_t nelems, int pe);
__device__ ATTR_NO_INLINE void rocshmem_ctx_short_get_nbi_wave(
rocshmem_ctx_t ctx, short *dest, const short *source,
size_t nelems, int pe);
__device__ ATTR_NO_INLINE void rocshmem_short_get_nbi_wave(
short *dest, const short *source, size_t nelems, int pe);
__device__ ATTR_NO_INLINE void rocshmem_ctx_int_get_nbi_wave(
rocshmem_ctx_t ctx, int *dest, const int *source,
size_t nelems, int pe);
__device__ ATTR_NO_INLINE void rocshmem_int_get_nbi_wave(
int *dest, const int *source, size_t nelems, int pe);
__device__ ATTR_NO_INLINE void rocshmem_ctx_long_get_nbi_wave(
rocshmem_ctx_t ctx, long *dest, const long *source,
size_t nelems, int pe);
__device__ ATTR_NO_INLINE void rocshmem_long_get_nbi_wave(
long *dest, const long *source, size_t nelems, int pe);
__device__ ATTR_NO_INLINE void rocshmem_ctx_longlong_get_nbi_wave(
rocshmem_ctx_t ctx, long long *dest, const long long *source,
size_t nelems, int pe);
__device__ ATTR_NO_INLINE void rocshmem_longlong_get_nbi_wave(
long long *dest, const long long *source, size_t nelems, int pe);
__device__ ATTR_NO_INLINE void rocshmem_ctx_uchar_get_nbi_wave(
rocshmem_ctx_t ctx, unsigned char *dest, const unsigned char *source,
size_t nelems, int pe);
__device__ ATTR_NO_INLINE void rocshmem_uchar_get_nbi_wave(
unsigned char *dest, const unsigned char *source, size_t nelems, int pe);
__device__ ATTR_NO_INLINE void rocshmem_ctx_ushort_get_nbi_wave(
rocshmem_ctx_t ctx, unsigned short *dest, const unsigned short *source,
size_t nelems, int pe);
__device__ ATTR_NO_INLINE void rocshmem_ushort_get_nbi_wave(
unsigned short *dest, const unsigned short *source, size_t nelems, int pe);
__device__ ATTR_NO_INLINE void rocshmem_ctx_uint_get_nbi_wave(
rocshmem_ctx_t ctx, unsigned int *dest, const unsigned int *source,
size_t nelems, int pe);
__device__ ATTR_NO_INLINE void rocshmem_uint_get_nbi_wave(
unsigned int *dest, const unsigned int *source, size_t nelems, int pe);
__device__ ATTR_NO_INLINE void rocshmem_ctx_ulong_get_nbi_wave(
rocshmem_ctx_t ctx, unsigned long *dest, const unsigned long *source,
size_t nelems, int pe);
__device__ ATTR_NO_INLINE void rocshmem_ulong_get_nbi_wave(
unsigned long *dest, const unsigned long *source, size_t nelems, int pe);
__device__ ATTR_NO_INLINE void rocshmem_ctx_ulonglong_get_nbi_wave(
rocshmem_ctx_t ctx, unsigned long long *dest, const unsigned long long *source,
size_t nelems, int pe);
__device__ ATTR_NO_INLINE void rocshmem_ulonglong_get_nbi_wave(
unsigned long long *dest, const unsigned long long *source, size_t nelems, int pe);
/**
* @brief Reads contiguous data of \p nelems elements from \p source on \p pe
* to \p dest on the calling PE. The operation is not blocking. The caller
* will return as soon as the request is posted. The caller must call
* rocshmem_quiet() on the same context if completion notification is
* required.
*
* This function can be called from divergent control paths at per-workgroup
* granularity. However, all threads in the WG must call in with the same
* arguments.
*
* The element type is given by the <type> part of the function name and by
* the pointer type of \p dest and \p source. Each
* rocshmem_<type>_get_nbi_wg form takes the same arguments as its
* rocshmem_ctx_<type>_get_nbi_wg counterpart, without \p ctx.
*
* @param[in] ctx Context with which to perform this operation.
* @param[in] dest Destination address. Must be an address on the symmetric
* heap.
* @param[in] source Source address. Must be an address on the symmetric heap.
* @param[in] nelems Number of elements to transfer.
* @param[in] pe PE of the remote process.
*
* @return void.
*/
__device__ ATTR_NO_INLINE void rocshmem_ctx_float_get_nbi_wg(
rocshmem_ctx_t ctx, float *dest, const float *source,
size_t nelems, int pe);
__device__ ATTR_NO_INLINE void rocshmem_float_get_nbi_wg(
float *dest, const float *source, size_t nelems, int pe);
__device__ ATTR_NO_INLINE void rocshmem_ctx_double_get_nbi_wg(
rocshmem_ctx_t ctx, double *dest, const double *source,
size_t nelems, int pe);
__device__ ATTR_NO_INLINE void rocshmem_double_get_nbi_wg(
double *dest, const double *source, size_t nelems, int pe);
__device__ ATTR_NO_INLINE void rocshmem_ctx_char_get_nbi_wg(
rocshmem_ctx_t ctx, char *dest, const char *source,
size_t nelems, int pe);
__device__ ATTR_NO_INLINE void rocshmem_char_get_nbi_wg(
char *dest, const char *source, size_t nelems, int pe);
__device__ ATTR_NO_INLINE void rocshmem_ctx_schar_get_nbi_wg(
rocshmem_ctx_t ctx, signed char *dest, const signed char *source,
size_t nelems, int pe);
__device__ ATTR_NO_INLINE void rocshmem_schar_get_nbi_wg(
signed char *dest, const signed char *source, size_t nelems, int pe);
__device__ ATTR_NO_INLINE void rocshmem_ctx_short_get_nbi_wg(
rocshmem_ctx_t ctx, short *dest, const short *source,
size_t nelems, int pe);
__device__ ATTR_NO_INLINE void rocshmem_short_get_nbi_wg(
short *dest, const short *source, size_t nelems, int pe);
__device__ ATTR_NO_INLINE void rocshmem_ctx_int_get_nbi_wg(
rocshmem_ctx_t ctx, int *dest, const int *source,
size_t nelems, int pe);
__device__ ATTR_NO_INLINE void rocshmem_int_get_nbi_wg(
int *dest, const int *source, size_t nelems, int pe);
__device__ ATTR_NO_INLINE void rocshmem_ctx_long_get_nbi_wg(
rocshmem_ctx_t ctx, long *dest, const long *source,
size_t nelems, int pe);
__device__ ATTR_NO_INLINE void rocshmem_long_get_nbi_wg(
long *dest, const long *source, size_t nelems, int pe);
__device__ ATTR_NO_INLINE void rocshmem_ctx_longlong_get_nbi_wg(
rocshmem_ctx_t ctx, long long *dest, const long long *source,
size_t nelems, int pe);
__device__ ATTR_NO_INLINE void rocshmem_longlong_get_nbi_wg(
long long *dest, const long long *source, size_t nelems, int pe);
__device__ ATTR_NO_INLINE void rocshmem_ctx_uchar_get_nbi_wg(
rocshmem_ctx_t ctx, unsigned char *dest, const unsigned char *source,
size_t nelems, int pe);
__device__ ATTR_NO_INLINE void rocshmem_uchar_get_nbi_wg(
unsigned char *dest, const unsigned char *source, size_t nelems, int pe);
__device__ ATTR_NO_INLINE void rocshmem_ctx_ushort_get_nbi_wg(
rocshmem_ctx_t ctx, unsigned short *dest, const unsigned short *source,
size_t nelems, int pe);
__device__ ATTR_NO_INLINE void rocshmem_ushort_get_nbi_wg(
unsigned short *dest, const unsigned short *source, size_t nelems, int pe);
__device__ ATTR_NO_INLINE void rocshmem_ctx_uint_get_nbi_wg(
rocshmem_ctx_t ctx, unsigned int *dest, const unsigned int *source,
size_t nelems, int pe);
__device__ ATTR_NO_INLINE void rocshmem_uint_get_nbi_wg(
unsigned int *dest, const unsigned int *source, size_t nelems, int pe);
__device__ ATTR_NO_INLINE void rocshmem_ctx_ulong_get_nbi_wg(
rocshmem_ctx_t ctx, unsigned long *dest, const unsigned long *source,
size_t nelems, int pe);
__device__ ATTR_NO_INLINE void rocshmem_ulong_get_nbi_wg(
unsigned long *dest, const unsigned long *source, size_t nelems, int pe);
__device__ ATTR_NO_INLINE void rocshmem_ctx_ulonglong_get_nbi_wg(
rocshmem_ctx_t ctx, unsigned long long *dest, const unsigned long long *source,
size_t nelems, int pe);
__device__ ATTR_NO_INLINE void rocshmem_ulonglong_get_nbi_wg(
unsigned long long *dest, const unsigned long long *source, size_t nelems, int pe);
/**
* @brief Reads contiguous data of \p nelems bytes from \p source on \p pe
* to \p dest on the calling PE. The operation is not blocking. The caller
* will return as soon as the request is posted. The caller must call
* rocshmem_quiet() on the same context if completion notification is
* required.
*
* This function can be called from divergent control paths at per-wave
* granularity. However, all threads in the wave must call in with the same
* arguments.
*
* rocshmem_getmem_nbi_wave takes the same arguments as
* rocshmem_ctx_getmem_nbi_wave, without \p ctx.
*
* @param[in] ctx Context with which to perform this operation.
* @param[in] dest Destination address. Must be an address on the symmetric
* heap.
* @param[in] source Source address. Must be an address on the symmetric heap.
* @param[in] nelems Size of the transfer in bytes.
* @param[in] pe PE of the remote process.
*
* @return void.
*/
__device__ ATTR_NO_INLINE void rocshmem_ctx_getmem_nbi_wave(
rocshmem_ctx_t ctx, void *dest, const void *source, size_t nelems, int pe);
__device__ ATTR_NO_INLINE void rocshmem_getmem_nbi_wave(void *dest,
const void *source,
size_t nelems,
int pe);
/**
* @brief Reads contiguous data of \p nelems bytes from \p source on \p pe
* to \p dest on the calling PE. The operation is not blocking. The caller
* will return as soon as the request is posted. The caller must call
* rocshmem_quiet() on the same context if completion notification is
* required.
*
* This function can be called from divergent control paths at per-workgroup
* granularity. However, all threads in the WG must call in with the same
* arguments.
*
* rocshmem_getmem_nbi_wg takes the same arguments as
* rocshmem_ctx_getmem_nbi_wg, without \p ctx.
*
* @param[in] ctx Context with which to perform this operation.
* @param[in] dest Destination address. Must be an address on the symmetric
* heap.
* @param[in] source Source address. Must be an address on the symmetric heap.
* @param[in] nelems Size of the transfer in bytes.
* @param[in] pe PE of the remote process.
*
* @return void.
*/
__device__ ATTR_NO_INLINE void rocshmem_ctx_getmem_nbi_wg(
rocshmem_ctx_t ctx, void *dest, const void *source, size_t nelems, int pe);
__device__ ATTR_NO_INLINE void rocshmem_getmem_nbi_wg(void *dest,
const void *source,
size_t nelems, int pe);
} // namespace rocshmem
#endif // LIBRARY_INCLUDE_ROCSHMEM_RMA_X_HPP
/******************************************************************************
* Copyright (c) Advanced Micro Devices, Inc. All rights reserved.
*
* SPDX-License-Identifier: MIT
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to
* deal in the Software without restriction, including without limitation the
* rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
* sell copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
* IN THE SOFTWARE.
*****************************************************************************/
#ifndef LIBRARY_INCLUDE_ROCSHMEM_SIG_OP_HPP
#define LIBRARY_INCLUDE_ROCSHMEM_SIG_OP_HPP
namespace rocshmem {
/**
* @brief Put-with-signal declarations without a _wg or _wave suffix.
*
* Every function takes the put-style arguments \p dest, \p source, \p nelems
* and \p pe together with a signal address, a 64-bit signal value and an
* integer signal operation. rocshmem_putmem_signal operates on untyped
* (void *) buffers; the rocshmem_<type>_put_signal functions take pointers
* of the named element type. Each rocshmem_ctx_* form additionally takes a
* rocshmem_ctx_t as its first argument.
*
* NOTE(review): presumably these follow OpenSHMEM put-with-signal semantics
* (the data is written to \p dest on \p pe, then the signal word at
* \p sig_addr is updated with \p signal as selected by \p sig_op), with
* \p nelems counted in bytes for the putmem forms and in elements for the
* typed forms -- confirm against the implementation.
*
* @param[in] ctx Context argument (rocshmem_ctx_* forms only).
* @param[in] dest Destination address.
* @param[in] source Source address.
* @param[in] nelems Size of the transfer (see the note above for units).
* @param[in] sig_addr Address of the 64-bit signal word.
* @param[in] signal Signal value.
* @param[in] sig_op Signal operation selector.
* @param[in] pe PE argument.
*
* @return void.
*/
__device__ ATTR_NO_INLINE void rocshmem_putmem_signal(
void *dest, const void *source, size_t nelems, uint64_t *sig_addr,
uint64_t signal, int sig_op, int pe);
__device__ ATTR_NO_INLINE void rocshmem_ctx_putmem_signal(
rocshmem_ctx_t ctx, void *dest, const void *source, size_t nelems,
uint64_t *sig_addr, uint64_t signal, int sig_op, int pe);
__device__ ATTR_NO_INLINE void rocshmem_ctx_float_put_signal(
rocshmem_ctx_t ctx, float *dest, const float *source, size_t nelems,
uint64_t *sig_addr, uint64_t signal, int sig_op, int pe);
__device__ ATTR_NO_INLINE void rocshmem_float_put_signal(
float *dest, const float *source, size_t nelems, uint64_t *sig_addr,
uint64_t signal, int sig_op, int pe);
__device__ ATTR_NO_INLINE void rocshmem_ctx_double_put_signal(
rocshmem_ctx_t ctx, double *dest, const double *source, size_t nelems,
uint64_t *sig_addr, uint64_t signal, int sig_op, int pe);
__device__ ATTR_NO_INLINE void rocshmem_double_put_signal(
double *dest, const double *source, size_t nelems, uint64_t *sig_addr,
uint64_t signal, int sig_op, int pe);
__device__ ATTR_NO_INLINE void rocshmem_ctx_char_put_signal(
rocshmem_ctx_t ctx, char *dest, const char *source, size_t nelems,
uint64_t *sig_addr, uint64_t signal, int sig_op, int pe);
__device__ ATTR_NO_INLINE void rocshmem_char_put_signal(
char *dest, const char *source, size_t nelems, uint64_t *sig_addr,
uint64_t signal, int sig_op, int pe);
__device__ ATTR_NO_INLINE void rocshmem_ctx_schar_put_signal(
rocshmem_ctx_t ctx, signed char *dest, const signed char *source, size_t nelems,
uint64_t *sig_addr, uint64_t signal, int sig_op, int pe);
__device__ ATTR_NO_INLINE void rocshmem_schar_put_signal(
signed char *dest, const signed char *source, size_t nelems, uint64_t *sig_addr,
uint64_t signal, int sig_op, int pe);
__device__ ATTR_NO_INLINE void rocshmem_ctx_short_put_signal(
rocshmem_ctx_t ctx, short *dest, const short *source, size_t nelems,
uint64_t *sig_addr, uint64_t signal, int sig_op, int pe);
__device__ ATTR_NO_INLINE void rocshmem_short_put_signal(
short *dest, const short *source, size_t nelems, uint64_t *sig_addr,
uint64_t signal, int sig_op, int pe);
__device__ ATTR_NO_INLINE void rocshmem_ctx_int_put_signal(
rocshmem_ctx_t ctx, int *dest, const int *source, size_t nelems,
uint64_t *sig_addr, uint64_t signal, int sig_op, int pe);
__device__ ATTR_NO_INLINE void rocshmem_int_put_signal(
int *dest, const int *source, size_t nelems, uint64_t *sig_addr,
uint64_t signal, int sig_op, int pe);
__device__ ATTR_NO_INLINE void rocshmem_ctx_long_put_signal(
rocshmem_ctx_t ctx, long *dest, const long *source, size_t nelems,
uint64_t *sig_addr, uint64_t signal, int sig_op, int pe);
__device__ ATTR_NO_INLINE void rocshmem_long_put_signal(
long *dest, const long *source, size_t nelems, uint64_t *sig_addr,
uint64_t signal, int sig_op, int pe);
__device__ ATTR_NO_INLINE void rocshmem_ctx_longlong_put_signal(
rocshmem_ctx_t ctx, long long *dest, const long long *source, size_t nelems,
uint64_t *sig_addr, uint64_t signal, int sig_op, int pe);
__device__ ATTR_NO_INLINE void rocshmem_longlong_put_signal(
long long *dest, const long long *source, size_t nelems, uint64_t *sig_addr,
uint64_t signal, int sig_op, int pe);
__device__ ATTR_NO_INLINE void rocshmem_ctx_uchar_put_signal(
rocshmem_ctx_t ctx, unsigned char *dest, const unsigned char *source, size_t nelems,
uint64_t *sig_addr, uint64_t signal, int sig_op, int pe);
__device__ ATTR_NO_INLINE void rocshmem_uchar_put_signal(
unsigned char *dest, const unsigned char *source, size_t nelems, uint64_t *sig_addr,
uint64_t signal, int sig_op, int pe);
__device__ ATTR_NO_INLINE void rocshmem_ctx_ushort_put_signal(
rocshmem_ctx_t ctx, unsigned short *dest, const unsigned short *source, size_t nelems,
uint64_t *sig_addr, uint64_t signal, int sig_op, int pe);
__device__ ATTR_NO_INLINE void rocshmem_ushort_put_signal(
unsigned short *dest, const unsigned short *source, size_t nelems, uint64_t *sig_addr,
uint64_t signal, int sig_op, int pe);
__device__ ATTR_NO_INLINE void rocshmem_ctx_uint_put_signal(
rocshmem_ctx_t ctx, unsigned int *dest, const unsigned int *source, size_t nelems,
uint64_t *sig_addr, uint64_t signal, int sig_op, int pe);
__device__ ATTR_NO_INLINE void rocshmem_uint_put_signal(
unsigned int *dest, const unsigned int *source, size_t nelems, uint64_t *sig_addr,
uint64_t signal, int sig_op, int pe);
__device__ ATTR_NO_INLINE void rocshmem_ctx_ulong_put_signal(
rocshmem_ctx_t ctx, unsigned long *dest, const unsigned long *source, size_t nelems,
uint64_t *sig_addr, uint64_t signal, int sig_op, int pe);
__device__ ATTR_NO_INLINE void rocshmem_ulong_put_signal(
unsigned long *dest, const unsigned long *source, size_t nelems, uint64_t *sig_addr,
uint64_t signal, int sig_op, int pe);
__device__ ATTR_NO_INLINE void rocshmem_ctx_ulonglong_put_signal(
rocshmem_ctx_t ctx, unsigned long long *dest, const unsigned long long *source, size_t nelems,
uint64_t *sig_addr, uint64_t signal, int sig_op, int pe);
__device__ ATTR_NO_INLINE void rocshmem_ulonglong_put_signal(
unsigned long long *dest, const unsigned long long *source, size_t nelems, uint64_t *sig_addr,
uint64_t signal, int sig_op, int pe);
/**
* @brief Put-with-signal declarations carrying the _wg suffix.
*
* The parameter lists match the unsuffixed put_signal declarations: the
* put-style arguments \p dest, \p source, \p nelems and \p pe, plus a signal
* address, a 64-bit signal value and an integer signal operation.
* rocshmem_putmem_signal_wg operates on untyped (void *) buffers; the
* rocshmem_<type>_put_signal_wg functions take pointers of the named
* element type. Each rocshmem_ctx_* form additionally takes a
* rocshmem_ctx_t as its first argument.
*
* NOTE(review): by analogy with the other *_wg RMA routines these are
* presumably called at per-workgroup granularity, with every thread in the
* workgroup passing the same arguments; \p nelems is presumably bytes for
* the putmem forms and an element count for the typed forms -- confirm
* against the implementation.
*
* @param[in] ctx Context argument (rocshmem_ctx_* forms only).
* @param[in] dest Destination address.
* @param[in] source Source address.
* @param[in] nelems Size of the transfer (see the note above for units).
* @param[in] sig_addr Address of the 64-bit signal word.
* @param[in] signal Signal value.
* @param[in] sig_op Signal operation selector.
* @param[in] pe PE argument.
*
* @return void.
*/
__device__ ATTR_NO_INLINE void rocshmem_putmem_signal_wg(
void *dest, const void *source, size_t nelems, uint64_t *sig_addr,
uint64_t signal, int sig_op, int pe);
__device__ ATTR_NO_INLINE void rocshmem_ctx_putmem_signal_wg(
rocshmem_ctx_t ctx, void *dest, const void *source, size_t nelems,
uint64_t *sig_addr, uint64_t signal, int sig_op, int pe);
__device__ ATTR_NO_INLINE void rocshmem_ctx_float_put_signal_wg(
rocshmem_ctx_t ctx, float *dest, const float *source, size_t nelems,
uint64_t *sig_addr, uint64_t signal, int sig_op, int pe);
__device__ ATTR_NO_INLINE void rocshmem_float_put_signal_wg(
float *dest, const float *source, size_t nelems, uint64_t *sig_addr,
uint64_t signal, int sig_op, int pe);
__device__ ATTR_NO_INLINE void rocshmem_ctx_double_put_signal_wg(
rocshmem_ctx_t ctx, double *dest, const double *source, size_t nelems,
uint64_t *sig_addr, uint64_t signal, int sig_op, int pe);
__device__ ATTR_NO_INLINE void rocshmem_double_put_signal_wg(
double *dest, const double *source, size_t nelems, uint64_t *sig_addr,
uint64_t signal, int sig_op, int pe);
__device__ ATTR_NO_INLINE void rocshmem_ctx_char_put_signal_wg(
rocshmem_ctx_t ctx, char *dest, const char *source, size_t nelems,
uint64_t *sig_addr, uint64_t signal, int sig_op, int pe);
__device__ ATTR_NO_INLINE void rocshmem_char_put_signal_wg(
char *dest, const char *source, size_t nelems, uint64_t *sig_addr,
uint64_t signal, int sig_op, int pe);
__device__ ATTR_NO_INLINE void rocshmem_ctx_schar_put_signal_wg(
rocshmem_ctx_t ctx, signed char *dest, const signed char *source, size_t nelems,
uint64_t *sig_addr, uint64_t signal, int sig_op, int pe);
__device__ ATTR_NO_INLINE void rocshmem_schar_put_signal_wg(
signed char *dest, const signed char *source, size_t nelems, uint64_t *sig_addr,
uint64_t signal, int sig_op, int pe);
__device__ ATTR_NO_INLINE void rocshmem_ctx_short_put_signal_wg(
rocshmem_ctx_t ctx, short *dest, const short *source, size_t nelems,
uint64_t *sig_addr, uint64_t signal, int sig_op, int pe);
__device__ ATTR_NO_INLINE void rocshmem_short_put_signal_wg(
short *dest, const short *source, size_t nelems, uint64_t *sig_addr,
uint64_t signal, int sig_op, int pe);
__device__ ATTR_NO_INLINE void rocshmem_ctx_int_put_signal_wg(
rocshmem_ctx_t ctx, int *dest, const int *source, size_t nelems,
uint64_t *sig_addr, uint64_t signal, int sig_op, int pe);
__device__ ATTR_NO_INLINE void rocshmem_int_put_signal_wg(
int *dest, const int *source, size_t nelems, uint64_t *sig_addr,
uint64_t signal, int sig_op, int pe);
__device__ ATTR_NO_INLINE void rocshmem_ctx_long_put_signal_wg(
rocshmem_ctx_t ctx, long *dest, const long *source, size_t nelems,
uint64_t *sig_addr, uint64_t signal, int sig_op, int pe);
__device__ ATTR_NO_INLINE void rocshmem_long_put_signal_wg(
long *dest, const long *source, size_t nelems, uint64_t *sig_addr,
uint64_t signal, int sig_op, int pe);
__device__ ATTR_NO_INLINE void rocshmem_ctx_longlong_put_signal_wg(
rocshmem_ctx_t ctx, long long *dest, const long long *source, size_t nelems,
uint64_t *sig_addr, uint64_t signal, int sig_op, int pe);
__device__ ATTR_NO_INLINE void rocshmem_longlong_put_signal_wg(
long long *dest, const long long *source, size_t nelems, uint64_t *sig_addr,
uint64_t signal, int sig_op, int pe);
__device__ ATTR_NO_INLINE void rocshmem_ctx_uchar_put_signal_wg(
rocshmem_ctx_t ctx, unsigned char *dest, const unsigned char *source, size_t nelems,
uint64_t *sig_addr, uint64_t signal, int sig_op, int pe);
__device__ ATTR_NO_INLINE void rocshmem_uchar_put_signal_wg(
unsigned char *dest, const unsigned char *source, size_t nelems, uint64_t *sig_addr,
uint64_t signal, int sig_op, int pe);
__device__ ATTR_NO_INLINE void rocshmem_ctx_ushort_put_signal_wg(
rocshmem_ctx_t ctx, unsigned short *dest, const unsigned short *source, size_t nelems,
uint64_t *sig_addr, uint64_t signal, int sig_op, int pe);
__device__ ATTR_NO_INLINE void rocshmem_ushort_put_signal_wg(
unsigned short *dest, const unsigned short *source, size_t nelems, uint64_t *sig_addr,
uint64_t signal, int sig_op, int pe);
__device__ ATTR_NO_INLINE void rocshmem_ctx_uint_put_signal_wg(
rocshmem_ctx_t ctx, unsigned int *dest, const unsigned int *source, size_t nelems,
uint64_t *sig_addr, uint64_t signal, int sig_op, int pe);
__device__ ATTR_NO_INLINE void rocshmem_uint_put_signal_wg(
unsigned int *dest, const unsigned int *source, size_t nelems, uint64_t *sig_addr,
uint64_t signal, int sig_op, int pe);
__device__ ATTR_NO_INLINE void rocshmem_ctx_ulong_put_signal_wg(
rocshmem_ctx_t ctx, unsigned long *dest, const unsigned long *source, size_t nelems,
uint64_t *sig_addr, uint64_t signal, int sig_op, int pe);
__device__ ATTR_NO_INLINE void rocshmem_ulong_put_signal_wg(
unsigned long *dest, const unsigned long *source, size_t nelems, uint64_t *sig_addr,
uint64_t signal, int sig_op, int pe);
__device__ ATTR_NO_INLINE void rocshmem_ctx_ulonglong_put_signal_wg(
rocshmem_ctx_t ctx, unsigned long long *dest, const unsigned long long *source, size_t nelems,
uint64_t *sig_addr, uint64_t signal, int sig_op, int pe);
__device__ ATTR_NO_INLINE void rocshmem_ulonglong_put_signal_wg(
unsigned long long *dest, const unsigned long long *source, size_t nelems, uint64_t *sig_addr,
uint64_t signal, int sig_op, int pe);
__device__ ATTR_NO_INLINE void rocshmem_putmem_signal_wave(
void *dest, const void *source, size_t nelems, uint64_t *sig_addr,
uint64_t signal, int sig_op, int pe);
__device__ ATTR_NO_INLINE void rocshmem_ctx_putmem_signal_wave(
rocshmem_ctx_t ctx, void *dest, const void *source, size_t nelems,
uint64_t *sig_addr, uint64_t signal, int sig_op, int pe);
__device__ ATTR_NO_INLINE void rocshmem_ctx_float_put_signal_wave(
rocshmem_ctx_t ctx, float *dest, const float *source, size_t nelems,
uint64_t *sig_addr, uint64_t signal, int sig_op, int pe);
/**
 * Typed `rocshmem_[ctx_]<type>_put_signal_wave` device declarations.
 *
 * This run continues a list that begins earlier in the header. Every
 * prototype returns void and takes
 * (dest, source, nelems, sig_addr, signal, sig_op, pe); the `ctx_` variants
 * take an additional leading rocshmem_ctx_t. Element types covered here:
 * float, double, char, signed char, short, int, long, long long and the
 * unsigned char/short/int/long/long long counterparts.
 *
 * NOTE(review): the behavior is not visible from the prototypes. By name these
 * look like put-with-signal operations issued at wavefront granularity --
 * confirm against the implementation.
 */
__device__ ATTR_NO_INLINE void rocshmem_float_put_signal_wave(
float *dest, const float *source, size_t nelems, uint64_t *sig_addr,
uint64_t signal, int sig_op, int pe);
__device__ ATTR_NO_INLINE void rocshmem_ctx_double_put_signal_wave(
rocshmem_ctx_t ctx, double *dest, const double *source, size_t nelems,
uint64_t *sig_addr, uint64_t signal, int sig_op, int pe);
__device__ ATTR_NO_INLINE void rocshmem_double_put_signal_wave(
double *dest, const double *source, size_t nelems, uint64_t *sig_addr,
uint64_t signal, int sig_op, int pe);
__device__ ATTR_NO_INLINE void rocshmem_ctx_char_put_signal_wave(
rocshmem_ctx_t ctx, char *dest, const char *source, size_t nelems,
uint64_t *sig_addr, uint64_t signal, int sig_op, int pe);
__device__ ATTR_NO_INLINE void rocshmem_char_put_signal_wave(
char *dest, const char *source, size_t nelems, uint64_t *sig_addr,
uint64_t signal, int sig_op, int pe);
__device__ ATTR_NO_INLINE void rocshmem_ctx_schar_put_signal_wave(
rocshmem_ctx_t ctx, signed char *dest, const signed char *source, size_t nelems,
uint64_t *sig_addr, uint64_t signal, int sig_op, int pe);
__device__ ATTR_NO_INLINE void rocshmem_schar_put_signal_wave(
signed char *dest, const signed char *source, size_t nelems, uint64_t *sig_addr,
uint64_t signal, int sig_op, int pe);
__device__ ATTR_NO_INLINE void rocshmem_ctx_short_put_signal_wave(
rocshmem_ctx_t ctx, short *dest, const short *source, size_t nelems,
uint64_t *sig_addr, uint64_t signal, int sig_op, int pe);
__device__ ATTR_NO_INLINE void rocshmem_short_put_signal_wave(
short *dest, const short *source, size_t nelems, uint64_t *sig_addr,
uint64_t signal, int sig_op, int pe);
__device__ ATTR_NO_INLINE void rocshmem_ctx_int_put_signal_wave(
rocshmem_ctx_t ctx, int *dest, const int *source, size_t nelems,
uint64_t *sig_addr, uint64_t signal, int sig_op, int pe);
__device__ ATTR_NO_INLINE void rocshmem_int_put_signal_wave(
int *dest, const int *source, size_t nelems, uint64_t *sig_addr,
uint64_t signal, int sig_op, int pe);
__device__ ATTR_NO_INLINE void rocshmem_ctx_long_put_signal_wave(
rocshmem_ctx_t ctx, long *dest, const long *source, size_t nelems,
uint64_t *sig_addr, uint64_t signal, int sig_op, int pe);
__device__ ATTR_NO_INLINE void rocshmem_long_put_signal_wave(
long *dest, const long *source, size_t nelems, uint64_t *sig_addr,
uint64_t signal, int sig_op, int pe);
__device__ ATTR_NO_INLINE void rocshmem_ctx_longlong_put_signal_wave(
rocshmem_ctx_t ctx, long long *dest, const long long *source, size_t nelems,
uint64_t *sig_addr, uint64_t signal, int sig_op, int pe);
__device__ ATTR_NO_INLINE void rocshmem_longlong_put_signal_wave(
long long *dest, const long long *source, size_t nelems, uint64_t *sig_addr,
uint64_t signal, int sig_op, int pe);
__device__ ATTR_NO_INLINE void rocshmem_ctx_uchar_put_signal_wave(
rocshmem_ctx_t ctx, unsigned char *dest, const unsigned char *source, size_t nelems,
uint64_t *sig_addr, uint64_t signal, int sig_op, int pe);
__device__ ATTR_NO_INLINE void rocshmem_uchar_put_signal_wave(
unsigned char *dest, const unsigned char *source, size_t nelems, uint64_t *sig_addr,
uint64_t signal, int sig_op, int pe);
__device__ ATTR_NO_INLINE void rocshmem_ctx_ushort_put_signal_wave(
rocshmem_ctx_t ctx, unsigned short *dest, const unsigned short *source, size_t nelems,
uint64_t *sig_addr, uint64_t signal, int sig_op, int pe);
__device__ ATTR_NO_INLINE void rocshmem_ushort_put_signal_wave(
unsigned short *dest, const unsigned short *source, size_t nelems, uint64_t *sig_addr,
uint64_t signal, int sig_op, int pe);
__device__ ATTR_NO_INLINE void rocshmem_ctx_uint_put_signal_wave(
rocshmem_ctx_t ctx, unsigned int *dest, const unsigned int *source, size_t nelems,
uint64_t *sig_addr, uint64_t signal, int sig_op, int pe);
__device__ ATTR_NO_INLINE void rocshmem_uint_put_signal_wave(
unsigned int *dest, const unsigned int *source, size_t nelems, uint64_t *sig_addr,
uint64_t signal, int sig_op, int pe);
__device__ ATTR_NO_INLINE void rocshmem_ctx_ulong_put_signal_wave(
rocshmem_ctx_t ctx, unsigned long *dest, const unsigned long *source, size_t nelems,
uint64_t *sig_addr, uint64_t signal, int sig_op, int pe);
__device__ ATTR_NO_INLINE void rocshmem_ulong_put_signal_wave(
unsigned long *dest, const unsigned long *source, size_t nelems, uint64_t *sig_addr,
uint64_t signal, int sig_op, int pe);
__device__ ATTR_NO_INLINE void rocshmem_ctx_ulonglong_put_signal_wave(
rocshmem_ctx_t ctx, unsigned long long *dest, const unsigned long long *source, size_t nelems,
uint64_t *sig_addr, uint64_t signal, int sig_op, int pe);
__device__ ATTR_NO_INLINE void rocshmem_ulonglong_put_signal_wave(
unsigned long long *dest, const unsigned long long *source, size_t nelems, uint64_t *sig_addr,
uint64_t signal, int sig_op, int pe);
/**
 * `*_put_signal_nbi` device declarations.
 *
 * The list opens with the untyped `putmem` pair (void* buffers) and is
 * followed by one pair per element type: float, double, char, signed char,
 * short, int, long, long long and their unsigned counterparts. Within each
 * pair one prototype takes a leading rocshmem_ctx_t and the other does not.
 * All return void and share the trailing arguments
 * (dest, source, nelems, sig_addr, signal, sig_op, pe).
 *
 * NOTE(review): by the usual OpenSHMEM naming, `_nbi` presumably marks the
 * non-blocking form of put-with-signal; the prototypes alone do not establish
 * completion semantics -- confirm against the implementation.
 */
__device__ ATTR_NO_INLINE void rocshmem_putmem_signal_nbi(
void *dest, const void *source, size_t nelems, uint64_t *sig_addr,
uint64_t signal, int sig_op, int pe);
__device__ ATTR_NO_INLINE void rocshmem_ctx_putmem_signal_nbi(
rocshmem_ctx_t ctx, void *dest, const void *source, size_t nelems,
uint64_t *sig_addr, uint64_t signal, int sig_op, int pe);
__device__ ATTR_NO_INLINE void rocshmem_ctx_float_put_signal_nbi(
rocshmem_ctx_t ctx, float *dest, const float *source, size_t nelems,
uint64_t *sig_addr, uint64_t signal, int sig_op, int pe);
__device__ ATTR_NO_INLINE void rocshmem_float_put_signal_nbi(
float *dest, const float *source, size_t nelems, uint64_t *sig_addr,
uint64_t signal, int sig_op, int pe);
__device__ ATTR_NO_INLINE void rocshmem_ctx_double_put_signal_nbi(
rocshmem_ctx_t ctx, double *dest, const double *source, size_t nelems,
uint64_t *sig_addr, uint64_t signal, int sig_op, int pe);
__device__ ATTR_NO_INLINE void rocshmem_double_put_signal_nbi(
double *dest, const double *source, size_t nelems, uint64_t *sig_addr,
uint64_t signal, int sig_op, int pe);
__device__ ATTR_NO_INLINE void rocshmem_ctx_char_put_signal_nbi(
rocshmem_ctx_t ctx, char *dest, const char *source, size_t nelems,
uint64_t *sig_addr, uint64_t signal, int sig_op, int pe);
__device__ ATTR_NO_INLINE void rocshmem_char_put_signal_nbi(
char *dest, const char *source, size_t nelems, uint64_t *sig_addr,
uint64_t signal, int sig_op, int pe);
__device__ ATTR_NO_INLINE void rocshmem_ctx_schar_put_signal_nbi(
rocshmem_ctx_t ctx, signed char *dest, const signed char *source, size_t nelems,
uint64_t *sig_addr, uint64_t signal, int sig_op, int pe);
__device__ ATTR_NO_INLINE void rocshmem_schar_put_signal_nbi(
signed char *dest, const signed char *source, size_t nelems, uint64_t *sig_addr,
uint64_t signal, int sig_op, int pe);
__device__ ATTR_NO_INLINE void rocshmem_ctx_short_put_signal_nbi(
rocshmem_ctx_t ctx, short *dest, const short *source, size_t nelems,
uint64_t *sig_addr, uint64_t signal, int sig_op, int pe);
__device__ ATTR_NO_INLINE void rocshmem_short_put_signal_nbi(
short *dest, const short *source, size_t nelems, uint64_t *sig_addr,
uint64_t signal, int sig_op, int pe);
__device__ ATTR_NO_INLINE void rocshmem_ctx_int_put_signal_nbi(
rocshmem_ctx_t ctx, int *dest, const int *source, size_t nelems,
uint64_t *sig_addr, uint64_t signal, int sig_op, int pe);
__device__ ATTR_NO_INLINE void rocshmem_int_put_signal_nbi(
int *dest, const int *source, size_t nelems, uint64_t *sig_addr,
uint64_t signal, int sig_op, int pe);
__device__ ATTR_NO_INLINE void rocshmem_ctx_long_put_signal_nbi(
rocshmem_ctx_t ctx, long *dest, const long *source, size_t nelems,
uint64_t *sig_addr, uint64_t signal, int sig_op, int pe);
__device__ ATTR_NO_INLINE void rocshmem_long_put_signal_nbi(
long *dest, const long *source, size_t nelems, uint64_t *sig_addr,
uint64_t signal, int sig_op, int pe);
__device__ ATTR_NO_INLINE void rocshmem_ctx_longlong_put_signal_nbi(
rocshmem_ctx_t ctx, long long *dest, const long long *source, size_t nelems,
uint64_t *sig_addr, uint64_t signal, int sig_op, int pe);
__device__ ATTR_NO_INLINE void rocshmem_longlong_put_signal_nbi(
long long *dest, const long long *source, size_t nelems, uint64_t *sig_addr,
uint64_t signal, int sig_op, int pe);
__device__ ATTR_NO_INLINE void rocshmem_ctx_uchar_put_signal_nbi(
rocshmem_ctx_t ctx, unsigned char *dest, const unsigned char *source, size_t nelems,
uint64_t *sig_addr, uint64_t signal, int sig_op, int pe);
__device__ ATTR_NO_INLINE void rocshmem_uchar_put_signal_nbi(
unsigned char *dest, const unsigned char *source, size_t nelems, uint64_t *sig_addr,
uint64_t signal, int sig_op, int pe);
__device__ ATTR_NO_INLINE void rocshmem_ctx_ushort_put_signal_nbi(
rocshmem_ctx_t ctx, unsigned short *dest, const unsigned short *source, size_t nelems,
uint64_t *sig_addr, uint64_t signal, int sig_op, int pe);
__device__ ATTR_NO_INLINE void rocshmem_ushort_put_signal_nbi(
unsigned short *dest, const unsigned short *source, size_t nelems, uint64_t *sig_addr,
uint64_t signal, int sig_op, int pe);
__device__ ATTR_NO_INLINE void rocshmem_ctx_uint_put_signal_nbi(
rocshmem_ctx_t ctx, unsigned int *dest, const unsigned int *source, size_t nelems,
uint64_t *sig_addr, uint64_t signal, int sig_op, int pe);
__device__ ATTR_NO_INLINE void rocshmem_uint_put_signal_nbi(
unsigned int *dest, const unsigned int *source, size_t nelems, uint64_t *sig_addr,
uint64_t signal, int sig_op, int pe);
__device__ ATTR_NO_INLINE void rocshmem_ctx_ulong_put_signal_nbi(
rocshmem_ctx_t ctx, unsigned long *dest, const unsigned long *source, size_t nelems,
uint64_t *sig_addr, uint64_t signal, int sig_op, int pe);
__device__ ATTR_NO_INLINE void rocshmem_ulong_put_signal_nbi(
unsigned long *dest, const unsigned long *source, size_t nelems, uint64_t *sig_addr,
uint64_t signal, int sig_op, int pe);
__device__ ATTR_NO_INLINE void rocshmem_ctx_ulonglong_put_signal_nbi(
rocshmem_ctx_t ctx, unsigned long long *dest, const unsigned long long *source, size_t nelems,
uint64_t *sig_addr, uint64_t signal, int sig_op, int pe);
__device__ ATTR_NO_INLINE void rocshmem_ulonglong_put_signal_nbi(
unsigned long long *dest, const unsigned long long *source, size_t nelems, uint64_t *sig_addr,
uint64_t signal, int sig_op, int pe);
/**
 * `*_put_signal_nbi_wg` device declarations.
 *
 * Same shape as the `*_put_signal_nbi` list: an untyped `putmem` pair
 * (void* buffers) followed by one pair per element type (float, double, char,
 * signed char, short, int, long, long long and their unsigned counterparts),
 * each pair consisting of a rocshmem_ctx_t-taking prototype and a context-free
 * one. All return void and take
 * (dest, source, nelems, sig_addr, signal, sig_op, pe).
 *
 * NOTE(review): the `_wg` suffix presumably denotes a work-group-scoped call;
 * which threads must participate is not visible here -- confirm against the
 * implementation.
 */
__device__ ATTR_NO_INLINE void rocshmem_putmem_signal_nbi_wg(
void *dest, const void *source, size_t nelems, uint64_t *sig_addr,
uint64_t signal, int sig_op, int pe);
__device__ ATTR_NO_INLINE void rocshmem_ctx_putmem_signal_nbi_wg(
rocshmem_ctx_t ctx, void *dest, const void *source, size_t nelems,
uint64_t *sig_addr, uint64_t signal, int sig_op, int pe);
__device__ ATTR_NO_INLINE void rocshmem_ctx_float_put_signal_nbi_wg(
rocshmem_ctx_t ctx, float *dest, const float *source, size_t nelems,
uint64_t *sig_addr, uint64_t signal, int sig_op, int pe);
__device__ ATTR_NO_INLINE void rocshmem_float_put_signal_nbi_wg(
float *dest, const float *source, size_t nelems, uint64_t *sig_addr,
uint64_t signal, int sig_op, int pe);
__device__ ATTR_NO_INLINE void rocshmem_ctx_double_put_signal_nbi_wg(
rocshmem_ctx_t ctx, double *dest, const double *source, size_t nelems,
uint64_t *sig_addr, uint64_t signal, int sig_op, int pe);
__device__ ATTR_NO_INLINE void rocshmem_double_put_signal_nbi_wg(
double *dest, const double *source, size_t nelems, uint64_t *sig_addr,
uint64_t signal, int sig_op, int pe);
__device__ ATTR_NO_INLINE void rocshmem_ctx_char_put_signal_nbi_wg(
rocshmem_ctx_t ctx, char *dest, const char *source, size_t nelems,
uint64_t *sig_addr, uint64_t signal, int sig_op, int pe);
__device__ ATTR_NO_INLINE void rocshmem_char_put_signal_nbi_wg(
char *dest, const char *source, size_t nelems, uint64_t *sig_addr,
uint64_t signal, int sig_op, int pe);
__device__ ATTR_NO_INLINE void rocshmem_ctx_schar_put_signal_nbi_wg(
rocshmem_ctx_t ctx, signed char *dest, const signed char *source, size_t nelems,
uint64_t *sig_addr, uint64_t signal, int sig_op, int pe);
__device__ ATTR_NO_INLINE void rocshmem_schar_put_signal_nbi_wg(
signed char *dest, const signed char *source, size_t nelems, uint64_t *sig_addr,
uint64_t signal, int sig_op, int pe);
__device__ ATTR_NO_INLINE void rocshmem_ctx_short_put_signal_nbi_wg(
rocshmem_ctx_t ctx, short *dest, const short *source, size_t nelems,
uint64_t *sig_addr, uint64_t signal, int sig_op, int pe);
__device__ ATTR_NO_INLINE void rocshmem_short_put_signal_nbi_wg(
short *dest, const short *source, size_t nelems, uint64_t *sig_addr,
uint64_t signal, int sig_op, int pe);
__device__ ATTR_NO_INLINE void rocshmem_ctx_int_put_signal_nbi_wg(
rocshmem_ctx_t ctx, int *dest, const int *source, size_t nelems,
uint64_t *sig_addr, uint64_t signal, int sig_op, int pe);
__device__ ATTR_NO_INLINE void rocshmem_int_put_signal_nbi_wg(
int *dest, const int *source, size_t nelems, uint64_t *sig_addr,
uint64_t signal, int sig_op, int pe);
__device__ ATTR_NO_INLINE void rocshmem_ctx_long_put_signal_nbi_wg(
rocshmem_ctx_t ctx, long *dest, const long *source, size_t nelems,
uint64_t *sig_addr, uint64_t signal, int sig_op, int pe);
__device__ ATTR_NO_INLINE void rocshmem_long_put_signal_nbi_wg(
long *dest, const long *source, size_t nelems, uint64_t *sig_addr,
uint64_t signal, int sig_op, int pe);
__device__ ATTR_NO_INLINE void rocshmem_ctx_longlong_put_signal_nbi_wg(
rocshmem_ctx_t ctx, long long *dest, const long long *source, size_t nelems,
uint64_t *sig_addr, uint64_t signal, int sig_op, int pe);
__device__ ATTR_NO_INLINE void rocshmem_longlong_put_signal_nbi_wg(
long long *dest, const long long *source, size_t nelems, uint64_t *sig_addr,
uint64_t signal, int sig_op, int pe);
__device__ ATTR_NO_INLINE void rocshmem_ctx_uchar_put_signal_nbi_wg(
rocshmem_ctx_t ctx, unsigned char *dest, const unsigned char *source, size_t nelems,
uint64_t *sig_addr, uint64_t signal, int sig_op, int pe);
__device__ ATTR_NO_INLINE void rocshmem_uchar_put_signal_nbi_wg(
unsigned char *dest, const unsigned char *source, size_t nelems, uint64_t *sig_addr,
uint64_t signal, int sig_op, int pe);
__device__ ATTR_NO_INLINE void rocshmem_ctx_ushort_put_signal_nbi_wg(
rocshmem_ctx_t ctx, unsigned short *dest, const unsigned short *source, size_t nelems,
uint64_t *sig_addr, uint64_t signal, int sig_op, int pe);
__device__ ATTR_NO_INLINE void rocshmem_ushort_put_signal_nbi_wg(
unsigned short *dest, const unsigned short *source, size_t nelems, uint64_t *sig_addr,
uint64_t signal, int sig_op, int pe);
__device__ ATTR_NO_INLINE void rocshmem_ctx_uint_put_signal_nbi_wg(
rocshmem_ctx_t ctx, unsigned int *dest, const unsigned int *source, size_t nelems,
uint64_t *sig_addr, uint64_t signal, int sig_op, int pe);
__device__ ATTR_NO_INLINE void rocshmem_uint_put_signal_nbi_wg(
unsigned int *dest, const unsigned int *source, size_t nelems, uint64_t *sig_addr,
uint64_t signal, int sig_op, int pe);
__device__ ATTR_NO_INLINE void rocshmem_ctx_ulong_put_signal_nbi_wg(
rocshmem_ctx_t ctx, unsigned long *dest, const unsigned long *source, size_t nelems,
uint64_t *sig_addr, uint64_t signal, int sig_op, int pe);
__device__ ATTR_NO_INLINE void rocshmem_ulong_put_signal_nbi_wg(
unsigned long *dest, const unsigned long *source, size_t nelems, uint64_t *sig_addr,
uint64_t signal, int sig_op, int pe);
__device__ ATTR_NO_INLINE void rocshmem_ctx_ulonglong_put_signal_nbi_wg(
rocshmem_ctx_t ctx, unsigned long long *dest, const unsigned long long *source, size_t nelems,
uint64_t *sig_addr, uint64_t signal, int sig_op, int pe);
__device__ ATTR_NO_INLINE void rocshmem_ulonglong_put_signal_nbi_wg(
unsigned long long *dest, const unsigned long long *source, size_t nelems, uint64_t *sig_addr,
uint64_t signal, int sig_op, int pe);
/**
 * `*_put_signal_nbi_wave` device declarations.
 *
 * Same shape as the `*_put_signal_nbi` list: an untyped `putmem` pair
 * (void* buffers) followed by one pair per element type (float, double, char,
 * signed char, short, int, long, long long and their unsigned counterparts),
 * each pair consisting of a rocshmem_ctx_t-taking prototype and a context-free
 * one. All return void and take
 * (dest, source, nelems, sig_addr, signal, sig_op, pe).
 *
 * NOTE(review): the `_wave` suffix presumably denotes a wavefront-scoped call;
 * which threads must participate is not visible here -- confirm against the
 * implementation.
 */
__device__ ATTR_NO_INLINE void rocshmem_putmem_signal_nbi_wave(
void *dest, const void *source, size_t nelems, uint64_t *sig_addr,
uint64_t signal, int sig_op, int pe);
__device__ ATTR_NO_INLINE void rocshmem_ctx_putmem_signal_nbi_wave(
rocshmem_ctx_t ctx, void *dest, const void *source, size_t nelems,
uint64_t *sig_addr, uint64_t signal, int sig_op, int pe);
__device__ ATTR_NO_INLINE void rocshmem_ctx_float_put_signal_nbi_wave(
rocshmem_ctx_t ctx, float *dest, const float *source, size_t nelems,
uint64_t *sig_addr, uint64_t signal, int sig_op, int pe);
__device__ ATTR_NO_INLINE void rocshmem_float_put_signal_nbi_wave(
float *dest, const float *source, size_t nelems, uint64_t *sig_addr,
uint64_t signal, int sig_op, int pe);
__device__ ATTR_NO_INLINE void rocshmem_ctx_double_put_signal_nbi_wave(
rocshmem_ctx_t ctx, double *dest, const double *source, size_t nelems,
uint64_t *sig_addr, uint64_t signal, int sig_op, int pe);
__device__ ATTR_NO_INLINE void rocshmem_double_put_signal_nbi_wave(
double *dest, const double *source, size_t nelems, uint64_t *sig_addr,
uint64_t signal, int sig_op, int pe);
__device__ ATTR_NO_INLINE void rocshmem_ctx_char_put_signal_nbi_wave(
rocshmem_ctx_t ctx, char *dest, const char *source, size_t nelems,
uint64_t *sig_addr, uint64_t signal, int sig_op, int pe);
__device__ ATTR_NO_INLINE void rocshmem_char_put_signal_nbi_wave(
char *dest, const char *source, size_t nelems, uint64_t *sig_addr,
uint64_t signal, int sig_op, int pe);
__device__ ATTR_NO_INLINE void rocshmem_ctx_schar_put_signal_nbi_wave(
rocshmem_ctx_t ctx, signed char *dest, const signed char *source, size_t nelems,
uint64_t *sig_addr, uint64_t signal, int sig_op, int pe);
__device__ ATTR_NO_INLINE void rocshmem_schar_put_signal_nbi_wave(
signed char *dest, const signed char *source, size_t nelems, uint64_t *sig_addr,
uint64_t signal, int sig_op, int pe);
__device__ ATTR_NO_INLINE void rocshmem_ctx_short_put_signal_nbi_wave(
rocshmem_ctx_t ctx, short *dest, const short *source, size_t nelems,
uint64_t *sig_addr, uint64_t signal, int sig_op, int pe);
__device__ ATTR_NO_INLINE void rocshmem_short_put_signal_nbi_wave(
short *dest, const short *source, size_t nelems, uint64_t *sig_addr,
uint64_t signal, int sig_op, int pe);
__device__ ATTR_NO_INLINE void rocshmem_ctx_int_put_signal_nbi_wave(
rocshmem_ctx_t ctx, int *dest, const int *source, size_t nelems,
uint64_t *sig_addr, uint64_t signal, int sig_op, int pe);
__device__ ATTR_NO_INLINE void rocshmem_int_put_signal_nbi_wave(
int *dest, const int *source, size_t nelems, uint64_t *sig_addr,
uint64_t signal, int sig_op, int pe);
__device__ ATTR_NO_INLINE void rocshmem_ctx_long_put_signal_nbi_wave(
rocshmem_ctx_t ctx, long *dest, const long *source, size_t nelems,
uint64_t *sig_addr, uint64_t signal, int sig_op, int pe);
__device__ ATTR_NO_INLINE void rocshmem_long_put_signal_nbi_wave(
long *dest, const long *source, size_t nelems, uint64_t *sig_addr,
uint64_t signal, int sig_op, int pe);
__device__ ATTR_NO_INLINE void rocshmem_ctx_longlong_put_signal_nbi_wave(
rocshmem_ctx_t ctx, long long *dest, const long long *source, size_t nelems,
uint64_t *sig_addr, uint64_t signal, int sig_op, int pe);
__device__ ATTR_NO_INLINE void rocshmem_longlong_put_signal_nbi_wave(
long long *dest, const long long *source, size_t nelems, uint64_t *sig_addr,
uint64_t signal, int sig_op, int pe);
__device__ ATTR_NO_INLINE void rocshmem_ctx_uchar_put_signal_nbi_wave(
rocshmem_ctx_t ctx, unsigned char *dest, const unsigned char *source, size_t nelems,
uint64_t *sig_addr, uint64_t signal, int sig_op, int pe);
__device__ ATTR_NO_INLINE void rocshmem_uchar_put_signal_nbi_wave(
unsigned char *dest, const unsigned char *source, size_t nelems, uint64_t *sig_addr,
uint64_t signal, int sig_op, int pe);
__device__ ATTR_NO_INLINE void rocshmem_ctx_ushort_put_signal_nbi_wave(
rocshmem_ctx_t ctx, unsigned short *dest, const unsigned short *source, size_t nelems,
uint64_t *sig_addr, uint64_t signal, int sig_op, int pe);
__device__ ATTR_NO_INLINE void rocshmem_ushort_put_signal_nbi_wave(
unsigned short *dest, const unsigned short *source, size_t nelems, uint64_t *sig_addr,
uint64_t signal, int sig_op, int pe);
__device__ ATTR_NO_INLINE void rocshmem_ctx_uint_put_signal_nbi_wave(
rocshmem_ctx_t ctx, unsigned int *dest, const unsigned int *source, size_t nelems,
uint64_t *sig_addr, uint64_t signal, int sig_op, int pe);
__device__ ATTR_NO_INLINE void rocshmem_uint_put_signal_nbi_wave(
unsigned int *dest, const unsigned int *source, size_t nelems, uint64_t *sig_addr,
uint64_t signal, int sig_op, int pe);
__device__ ATTR_NO_INLINE void rocshmem_ctx_ulong_put_signal_nbi_wave(
rocshmem_ctx_t ctx, unsigned long *dest, const unsigned long *source, size_t nelems,
uint64_t *sig_addr, uint64_t signal, int sig_op, int pe);
__device__ ATTR_NO_INLINE void rocshmem_ulong_put_signal_nbi_wave(
unsigned long *dest, const unsigned long *source, size_t nelems, uint64_t *sig_addr,
uint64_t signal, int sig_op, int pe);
__device__ ATTR_NO_INLINE void rocshmem_ctx_ulonglong_put_signal_nbi_wave(
rocshmem_ctx_t ctx, unsigned long long *dest, const unsigned long long *source, size_t nelems,
uint64_t *sig_addr, uint64_t signal, int sig_op, int pe);
__device__ ATTR_NO_INLINE void rocshmem_ulonglong_put_signal_nbi_wave(
unsigned long long *dest, const unsigned long long *source, size_t nelems, uint64_t *sig_addr,
uint64_t signal, int sig_op, int pe);
/**
 * Signal-fetch device declarations: each takes a pointer to a const 64-bit
 * signal word and returns a uint64_t. Three spellings are declared: plain,
 * `_wg` and `_wave`.
 *
 * NOTE(review): presumably the return value is the current content of
 * `*sig_addr`, with the suffixes selecting the calling granularity -- confirm
 * against the implementation.
 */
__device__ ATTR_NO_INLINE uint64_t rocshmem_signal_fetch(const uint64_t *sig_addr);
__device__ ATTR_NO_INLINE uint64_t rocshmem_signal_fetch_wg(const uint64_t *sig_addr);
__device__ ATTR_NO_INLINE uint64_t rocshmem_signal_fetch_wave(const uint64_t *sig_addr);
} // namespace rocshmem
#endif // LIBRARY_INCLUDE_ROCSHMEM_SIG_OP_HPP
/******************************************************************************
* Copyright (c) Advanced Micro Devices, Inc. All rights reserved.
*
* SPDX-License-Identifier: MIT
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to
* deal in the Software without restriction, including without limitation the
* rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
* sell copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
* IN THE SOFTWARE.
*****************************************************************************/
#ifndef LIBRARY_INCLUDE_ROCSHMEM_COMMON_HPP
#define LIBRARY_INCLUDE_ROCSHMEM_COMMON_HPP
#include <array>
#include <cstddef>
#include <cstdint>
namespace rocshmem {
/*
 * ATTR_NO_INLINE decorates the library's device function declarations.
 * It expands to __attribute__((noinline)) when USE_FUNC_CALL is defined and
 * to nothing otherwise.
 */
#ifdef USE_FUNC_CALL
#define ATTR_NO_INLINE __attribute__((noinline))
#else
#define ATTR_NO_INLINE
#endif
/**
 * @brief Status codes: ROCSHMEM_SUCCESS is 0, ROCSHMEM_ERROR is 1.
 */
enum ROCSHMEM_STATUS {
  ROCSHMEM_SUCCESS = 0,
  ROCSHMEM_ERROR  // implicit value 1
};
/**
 * @brief Operation identifiers (sum, max, min, product, and, or, xor,
 * replace), numbered consecutively from 0.
 */
enum ROCSHMEM_OP {
  ROCSHMEM_SUM = 0,
  ROCSHMEM_MAX = 1,
  ROCSHMEM_MIN = 2,
  ROCSHMEM_PROD = 3,
  ROCSHMEM_AND = 4,
  ROCSHMEM_OR = 5,
  ROCSHMEM_XOR = 6,
  ROCSHMEM_REPLACE = 7
};
/**
 * @brief Signal update modes: ROCSHMEM_SIGNAL_SET is 0, ROCSHMEM_SIGNAL_ADD
 * is 1.
 *
 * NOTE(review): presumably the values accepted by the `sig_op` argument of the
 * put-with-signal functions -- confirm against the implementation.
 */
enum ROCSHMEM_SIGNAL_OPS {
  ROCSHMEM_SIGNAL_SET = 0,
  ROCSHMEM_SIGNAL_ADD = 1
};
/**
 * @brief Types defined for rocshmem_wait() operations.
 *
 * Comparison selectors (EQ, NE, GT, GE, LT, LE), numbered consecutively
 * from 0.
 */
enum rocshmem_cmps {
  ROCSHMEM_CMP_EQ = 0,
  ROCSHMEM_CMP_NE = 1,
  ROCSHMEM_CMP_GT = 2,
  ROCSHMEM_CMP_GE = 3,
  ROCSHMEM_CMP_LT = 4,
  ROCSHMEM_CMP_LE = 5
};
/**
 * @brief Thread-support level identifiers, numbered consecutively from 0.
 */
enum rocshmem_thread_ops {
  ROCSHMEM_THREAD_SINGLE = 0,
  ROCSHMEM_THREAD_FUNNELED = 1,
  ROCSHMEM_THREAD_WG_FUNNELED = 2,
  ROCSHMEM_THREAD_SERIALIZED = 3,
  ROCSHMEM_THREAD_MULTIPLE = 4
};
/**
 * @brief Bitwise flags to mask configuration parameters.
 *
 * ROCSHMEM_TEAM_DEFAULT_CONFIGS is 0 (no bits set) and
 * ROCSHMEM_TEAM_NUM_CONTEXTS is 1 (bit 0).
 */
enum rocshmem_team_configs {
  ROCSHMEM_TEAM_DEFAULT_CONFIGS = 0,
  ROCSHMEM_TEAM_NUM_CONTEXTS = 1
};
/**
 * @brief Team configuration record; currently holds a single integer field,
 * num_contexts.
 *
 * NOTE(review): presumably consumed together with the rocshmem_team_configs
 * mask to say which fields are meaningful -- confirm against the team-creation
 * API.
 */
typedef struct {
int num_contexts;
} rocshmem_team_config_t;
/*
 * Compile-time sizes for collective work/sync buffers plus the sync
 * initializer value (0).
 * NOTE(review): units are not stated here; presumably element counts of the
 * respective arrays -- confirm against the collective implementations.
 */
constexpr size_t ROCSHMEM_REDUCE_MIN_WRKDATA_SIZE = 1024;
constexpr size_t ROCSHMEM_BARRIER_SYNC_SIZE = 256;
constexpr size_t ROCSHMEM_REDUCE_SYNC_SIZE = 256;
// Internally calls sync function, which matches barrier implementation
constexpr size_t ROCSHMEM_BCAST_SYNC_SIZE = ROCSHMEM_BARRIER_SYNC_SIZE;
// One slot more than the barrier sync size (257).
constexpr size_t ROCSHMEM_ALLTOALL_SYNC_SIZE = ROCSHMEM_BARRIER_SYNC_SIZE + 1;
// Same size as the all-to-all sync buffer.
constexpr size_t ROCSHMEM_FCOLLECT_SYNC_SIZE = ROCSHMEM_ALLTOALL_SYNC_SIZE;
constexpr size_t ROCSHMEM_SYNC_VALUE = 0;
/*
 * Context option values. Apart from ROCSHMEM_CTX_ZERO (0) each value is a
 * distinct power of two (1, 2, 4, 8).
 * NOTE(review): the power-of-two layout suggests these are meant to be OR-ed
 * into an options word at context creation -- confirm against the
 * context-creation API.
 */
const int ROCSHMEM_CTX_ZERO = 0;
const int ROCSHMEM_CTX_NOSTORE = 1;
const int ROCSHMEM_CTX_SERIALIZED = 2;
const int ROCSHMEM_CTX_WG_PRIVATE = 4;
const int ROCSHMEM_CTX_SHARED = 8;
/**
 * @brief GPU side OpenSHMEM context created from each work-groups'
 * rocshmem_wg_handle_t
 *
 * The handle carries two opaque pointers. Two handles compare equal exactly
 * when both pointers are equal; the comparison operators are callable from
 * host and device code.
 */
typedef struct rocshmem_ctx {
  void *ctx_opaque;
  void *team_opaque;

  /** @return true when both opaque pointers match those of @p rhs. */
  __host__ __device__ bool operator==(const struct rocshmem_ctx& rhs) const {
    if (ctx_opaque != rhs.ctx_opaque) {
      return false;
    }
    return team_opaque == rhs.team_opaque;
  }

  /** @return true when at least one opaque pointer differs from @p rhs. */
  __host__ __device__ bool operator!=(const struct rocshmem_ctx& rhs) const {
    return ctx_opaque != rhs.ctx_opaque || team_opaque != rhs.team_opaque;
  }
} rocshmem_ctx_t;
/**
 * Default context handle. Declared as a `__device__` variable with C linkage
 * and default symbol visibility; the definition lives outside this header.
 */
extern "C" __device__ rocshmem_ctx_t __attribute__((visibility("default"))) ROCSHMEM_CTX_DEFAULT;
/**
 * A value corresponding to an invalid communication context. This value can be
 * used to initialize or update context handles to indicate that they do not
 * reference a valid context. When managed in this way, applications can use an
 * equality comparison to test whether a given context handle references a
 * valid context.
 *
 * Declared as a `__constant__` variable; the definition lives outside this
 * header.
 */
extern __constant__ rocshmem_ctx_t ROCSHMEM_CTX_INVALID;
/**
 * Used internally to set the default context.
 *
 * @param ctx Pointer to a context handle.
 *            NOTE(review): presumably the handle that becomes
 *            ROCSHMEM_CTX_DEFAULT -- confirm against the definition.
 */
void set_internal_ctx(rocshmem_ctx_t *ctx);
/**
 * Team handle: a pointer to uint64_t.
 * NOTE(review): presumably an opaque handle not meant to be dereferenced by
 * callers -- confirm against the team implementation.
 */
typedef uint64_t *rocshmem_team_t;
/** Predefined team handle, defined outside this header. */
extern rocshmem_team_t ROCSHMEM_TEAM_WORLD;
/** Null handle, usable to mark a team variable as not referring to a team. */
const rocshmem_team_t ROCSHMEM_TEAM_INVALID = nullptr;
/** Size in bytes of a rocshmem_uniqueid_t. */
#define ROCSHMEM_UNIQUE_ID_BYTES 128
/**
 * @brief Data structure defining the unique ID.
 *
 * Unique ID for a process. This is a ROCSHMEM_UNIQUE_ID_BYTES byte array that
 * uniquely identifies a process.
 */
using rocshmem_uniqueid_t = std::array<uint8_t, ROCSHMEM_UNIQUE_ID_BYTES>;
/**
 * @brief Data structure used for attribute based
 * initialization
 *
 * Fields:
 *  - rank:     32-bit signed rank value.
 *  - nranks:   32-bit signed rank count.
 *  - uid:      rocshmem_uniqueid_t byte array.
 *  - mpi_comm: untyped pointer.
 *              NOTE(review): presumably points to an MPI communicator and is
 *              only read for MPI-based initialization -- confirm against the
 *              init routine.
 */
struct rocshmem_init_attr_t {
int32_t rank;
int32_t nranks;
rocshmem_uniqueid_t uid;
void* mpi_comm;
};
// Redundant in C++ (the struct name is already a type name); kept as-is.
typedef struct rocshmem_init_attr_t rocshmem_init_attr_t;
/*
 * Initialization mode selectors (0 and 1).
 * NOTE(review): presumably passed alongside a rocshmem_init_attr_t to choose
 * between its mpi_comm and uid fields -- confirm against the init routine.
 */
constexpr unsigned int ROCSHMEM_INIT_WITH_MPI_COMM = 0;
constexpr unsigned int ROCSHMEM_INIT_WITH_UNIQUEID = 1;
} // namespace rocshmem
#endif // LIBRARY_INCLUDE_ROCSHMEM_COMMON_HPP
/******************************************************************************
* Copyright (c) Advanced Micro Devices, Inc. All rights reserved.
*
* SPDX-License-Identifier: MIT
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to
* deal in the Software without restriction, including without limitation the
* rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
* sell copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
* IN THE SOFTWARE.
*****************************************************************************/
/*
 * Build configuration switches. An enabled option appears as a plain
 * `#define NAME`; an option that is off appears as a commented-out
 * `#undef NAME` line. Enabled here: USE_GDA, USE_HEAP_DEVICE_FINEGRAIN,
 * USE_ALLOC_DLMALLOC, GDA_MLX5 and HAVE_EXTERNAL_MPI.
 * NOTE(review): this looks like configure-time generated output -- confirm
 * before editing by hand.
 */
/* #undef DEBUG */
/* #undef PROFILE */
/* #undef USE_RO */
/* #undef USE_IPC */
#define USE_GDA
/* #undef USE_THREADS */
/* #undef USE_SHARED_CTX */
/* #undef USE_WF_COAL */
#define USE_HEAP_DEVICE_FINEGRAIN
/* #undef USE_HEAP_DEVICE_UNCACHED */
/* #undef USE_HEAP_DEVICE_COARSEGRAIN */
/* #undef USE_HEAP_MANAGED */
/* #undef USE_HEAP_HOST_HIP */
/* #undef USE_HEAP_HOST */
#define USE_ALLOC_DLMALLOC
/* #undef USE_ALLOC_POW2BINS */
/* #undef USE_FUNC_CALL */
/* #undef USE_SINGLE_NODE */
/* #undef USE_HDP_FLUSH */
/* #undef USE_HDP_FLUSH_HOST_SIDE */
/* #undef GDA_IONIC */
/* #undef GDA_BNXT */
#define GDA_MLX5
#define HAVE_EXTERNAL_MPI
/******************************************************************************
* Copyright (c) Advanced Micro Devices, Inc. All rights reserved.
*
* SPDX-License-Identifier: MIT
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to
* deal in the Software without restriction, including without limitation the
* rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
* sell copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
* IN THE SOFTWARE.
*****************************************************************************/
#ifndef LIBRARY_INCLUDE_DEBUG_HPP
#define LIBRARY_INCLUDE_DEBUG_HPP
namespace rocshmem {
/**
 * Debug helper taking a destination PE, a source work-group and a CQE index.
 * NOTE(review): presumably prints one completion-queue entry -- confirm
 * against the definition.
 */
void debug_print_cq(int dest_pe, int src_wg, int cqe_index);
/**
 * Debug helper taking a destination PE, a source work-group and a WQE index.
 * NOTE(review): presumably prints one send-queue entry -- confirm against the
 * definition.
 */
void debug_print_sq(int dest_pe, int src_wg, int index_wqe);
} // namespace rocshmem
#endif // LIBRARY_INCLUDE_DEBUG_HPP
/******************************************************************************
* Copyright (c) Advanced Micro Devices, Inc. All rights reserved.
*
* SPDX-License-Identifier: MIT
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to
* deal in the Software without restriction, including without limitation the
* rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
* sell copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
* IN THE SOFTWARE.
*****************************************************************************/
#ifndef LIBRARY_INCLUDE_ROCSHMEM_MPI_HPP
#define LIBRARY_INCLUDE_ROCSHMEM_MPI_HPP
#if defined(HAVE_EXTERNAL_MPI)
#include <mpi.h>
#endif
#if defined(c_plusplus) || defined(__cplusplus)
extern "C" {
#endif
#if !defined(MPI_VERSION)
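// Fallback declarations using Open MPI based values for the constants,
// handles, etc. The check is on MPI_VERSION rather than HAVE_EXTERNAL_MPI
// because, even when this file did not include an external MPI header,
// the includer may have (e.g., a unit test).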
/* Fallback MPI handle types: every handle is represented as an untyped pointer. */
typedef void* MPI_Comm;
typedef void* MPI_Win;
typedef void* MPI_Group;
typedef void* MPI_Op;
typedef void* MPI_Datatype;
typedef void* MPI_Request;
typedef void* MPI_Info;
/*
 * Fallback MPI_Status layout: three public int fields (MPI_SOURCE, MPI_TAG,
 * MPI_ERROR) followed by two underscore-prefixed fields.
 * NOTE(review): presumably mirrors Open MPI's ompi_status_public_t so the
 * struct stays layout-compatible with the real library -- confirm against the
 * Open MPI version in use.
 */
struct ompi_status_public_t {
int MPI_SOURCE;
int MPI_TAG;
int MPI_ERROR;
int _cancelled;
size_t _ucount;
};
typedef struct ompi_status_public_t MPI_Status;
/*
 * Fallback MPI scalar constants and helpers, used only when no MPI header has
 * defined MPI_VERSION. Values follow Open MPI.
 *
 * Every macro that expands to an expression is fully parenthesized so that it
 * behaves as a single operand wherever it is used (e.g. `sizeof MPI_IN_PLACE`).
 */
#define MPI_Aint uint64_t
#define MPI_UNDEFINED (-32766)
#define MPI_THREAD_MULTIPLE 3
#define MPI_SUCCESS 0
#define MPI_IN_PLACE ((void*)1)
#define MPI_MODE_NOCHECK 1
#define MPI_COMM_TYPE_SHARED 0
/* Byte distance addr1 - addr2, converted to MPI_Aint. */
#define MPI_Aint_diff(addr1, addr2) ((MPI_Aint) ((char *) (addr1) - (char *) (addr2)))
/*
 * Table of untyped pointers, one per Open MPI predefined object
 * (communicators, null handles, reduction ops, datatypes). The MPI_* handle
 * macros defined after this struct read their values from the single global
 * instance `ompi_symbols_`.
 * NOTE(review): presumably filled in at runtime by the library (e.g. via
 * symbol lookup in the MPI shared object) -- confirm against the definition
 * of ompi_symbols_.
 */
struct ompi_internal_symbols_t {
void *ompi_mpi_comm_world;
void *ompi_mpi_comm_null;
void *ompi_request_null;
void *ompi_mpi_info_null;
void *ompi_mpi_datatype_null;
void *ompi_mpi_op_max;
void *ompi_mpi_op_min;
void *ompi_mpi_op_sum;
void *ompi_mpi_op_prod;
void *ompi_mpi_op_band;
void *ompi_mpi_op_bor;
void *ompi_mpi_op_bxor;
void *ompi_mpi_op_replace;
void *ompi_mpi_op_no_op;
void *ompi_mpi_char;
void *ompi_mpi_unsigned_char;
void *ompi_mpi_signed_char;
void *ompi_mpi_short;
void *ompi_mpi_unsigned_short;
void *ompi_mpi_int;
void *ompi_mpi_unsigned;
void *ompi_mpi_long;
void *ompi_mpi_unsigned_long;
void *ompi_mpi_long_long_int;
void *ompi_mpi_unsigned_long_long;
void *ompi_mpi_float;
void *ompi_mpi_double;
void *ompi_mpi_long_double;
};
/* The single global instance; defined outside this header. */
extern struct ompi_internal_symbols_t ompi_symbols_;
/*
 * Fallback definitions of the predefined MPI handles. Each macro casts the
 * matching pointer stored in `ompi_symbols_` to the handle type.
 * OMPI_PREDEFINED_GLOBAL uses static_cast, so these macros can only be
 * expanded in C++ code.
 */
#define OMPI_PREDEFINED_GLOBAL(type, global) (static_cast<type> (global))
#define MPI_COMM_WORLD OMPI_PREDEFINED_GLOBAL(MPI_Comm, ompi_symbols_.ompi_mpi_comm_world)
#define MPI_COMM_NULL OMPI_PREDEFINED_GLOBAL(MPI_Comm, ompi_symbols_.ompi_mpi_comm_null)
#define MPI_REQUEST_NULL OMPI_PREDEFINED_GLOBAL(MPI_Request, ompi_symbols_.ompi_request_null)
// NOTE(review): struct ompi_internal_symbols_t declares no `ompi_mpi_win_null`
// member, so any expansion of MPI_WIN_NULL fails to compile. Fixing it needs
// the member added to the struct AND populated wherever ompi_symbols_ is
// filled in.
#define MPI_WIN_NULL OMPI_PREDEFINED_GLOBAL(MPI_Win, ompi_symbols_.ompi_mpi_win_null)
#define MPI_INFO_NULL OMPI_PREDEFINED_GLOBAL(MPI_Info, ompi_symbols_.ompi_mpi_info_null)
#define MPI_MAX OMPI_PREDEFINED_GLOBAL(MPI_Op, ompi_symbols_.ompi_mpi_op_max)
#define MPI_MIN OMPI_PREDEFINED_GLOBAL(MPI_Op, ompi_symbols_.ompi_mpi_op_min)
#define MPI_SUM OMPI_PREDEFINED_GLOBAL(MPI_Op, ompi_symbols_.ompi_mpi_op_sum)
#define MPI_PROD OMPI_PREDEFINED_GLOBAL(MPI_Op, ompi_symbols_.ompi_mpi_op_prod)
#define MPI_BAND OMPI_PREDEFINED_GLOBAL(MPI_Op, ompi_symbols_.ompi_mpi_op_band)
#define MPI_BOR OMPI_PREDEFINED_GLOBAL(MPI_Op, ompi_symbols_.ompi_mpi_op_bor)
#define MPI_BXOR OMPI_PREDEFINED_GLOBAL(MPI_Op, ompi_symbols_.ompi_mpi_op_bxor)
#define MPI_REPLACE OMPI_PREDEFINED_GLOBAL(MPI_Op, ompi_symbols_.ompi_mpi_op_replace)
#define MPI_NO_OP OMPI_PREDEFINED_GLOBAL(MPI_Op, ompi_symbols_.ompi_mpi_op_no_op)
#define MPI_DATATYPE_NULL OMPI_PREDEFINED_GLOBAL(MPI_Datatype, ompi_symbols_.ompi_mpi_datatype_null)
#define MPI_CHAR OMPI_PREDEFINED_GLOBAL(MPI_Datatype, ompi_symbols_.ompi_mpi_char)
#define MPI_UNSIGNED_CHAR OMPI_PREDEFINED_GLOBAL(MPI_Datatype, ompi_symbols_.ompi_mpi_unsigned_char)
#define MPI_SIGNED_CHAR OMPI_PREDEFINED_GLOBAL(MPI_Datatype, ompi_symbols_.ompi_mpi_signed_char)
#define MPI_SHORT OMPI_PREDEFINED_GLOBAL(MPI_Datatype, ompi_symbols_.ompi_mpi_short)
#define MPI_UNSIGNED_SHORT OMPI_PREDEFINED_GLOBAL(MPI_Datatype, ompi_symbols_.ompi_mpi_unsigned_short)
#define MPI_INT OMPI_PREDEFINED_GLOBAL(MPI_Datatype, ompi_symbols_.ompi_mpi_int)
#define MPI_UNSIGNED OMPI_PREDEFINED_GLOBAL(MPI_Datatype, ompi_symbols_.ompi_mpi_unsigned)
#define MPI_LONG OMPI_PREDEFINED_GLOBAL(MPI_Datatype, ompi_symbols_.ompi_mpi_long)
#define MPI_UNSIGNED_LONG OMPI_PREDEFINED_GLOBAL(MPI_Datatype, ompi_symbols_.ompi_mpi_unsigned_long)
#define MPI_LONG_LONG OMPI_PREDEFINED_GLOBAL(MPI_Datatype, ompi_symbols_.ompi_mpi_long_long_int)
#define MPI_UNSIGNED_LONG_LONG OMPI_PREDEFINED_GLOBAL(MPI_Datatype, ompi_symbols_.ompi_mpi_unsigned_long_long)
#define MPI_FLOAT OMPI_PREDEFINED_GLOBAL(MPI_Datatype, ompi_symbols_.ompi_mpi_float)
#define MPI_DOUBLE OMPI_PREDEFINED_GLOBAL(MPI_Datatype, ompi_symbols_.ompi_mpi_double)
#define MPI_LONG_DOUBLE OMPI_PREDEFINED_GLOBAL(MPI_Datatype, ompi_symbols_.ompi_mpi_long_double)
#endif //!defined(MPI_VERSION)
#if defined(c_plusplus) || defined(__cplusplus)
}
#endif
#endif //LIBRARY_INCLUDE_ROCSHMEM_MPI_HPP
MIT License
Copyright (c) Advanced Micro Devices, Inc. All rights reserved.
SPDX-License-Identifier: MIT
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
rocshmem @ f5a87af2
Subproject commit f5a87af2671b6daaea16ae766ca97db867ef996c
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment