Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
DeepEP
Commits
a117adf8
Commit
a117adf8
authored
Nov 19, 2025
by
lishen
Browse files
Merge branch 'main' of
http://112.11.119.99:10068//dcutoolkit/deeplearing/DeepEP
parents
b33659dd
b705eeca
Changes
16
Hide whitespace changes
Inline
Side-by-side
Showing
16 changed files
with
4 additions
and
6888 deletions
+4
-6888
.gitmodules
.gitmodules
+3
-0
rocshmem_dir/bin/rocshmem_info
rocshmem_dir/bin/rocshmem_info
+0
-0
rocshmem_dir/include/rocshmem/rocshmem.hpp
rocshmem_dir/include/rocshmem/rocshmem.hpp
+0
-566
rocshmem_dir/include/rocshmem/rocshmem_AMO.hpp
rocshmem_dir/include/rocshmem/rocshmem_AMO.hpp
+0
-1583
rocshmem_dir/include/rocshmem/rocshmem_COLL.hpp
rocshmem_dir/include/rocshmem/rocshmem_COLL.hpp
+0
-780
rocshmem_dir/include/rocshmem/rocshmem_P2P_SYNC.hpp
rocshmem_dir/include/rocshmem/rocshmem_P2P_SYNC.hpp
+0
-664
rocshmem_dir/include/rocshmem/rocshmem_RMA.hpp
rocshmem_dir/include/rocshmem/rocshmem_RMA.hpp
+0
-1210
rocshmem_dir/include/rocshmem/rocshmem_RMA_X.hpp
rocshmem_dir/include/rocshmem/rocshmem_RMA_X.hpp
+0
-1038
rocshmem_dir/include/rocshmem/rocshmem_SIG_OP.hpp
rocshmem_dir/include/rocshmem/rocshmem_SIG_OP.hpp
+0
-625
rocshmem_dir/include/rocshmem/rocshmem_common.hpp
rocshmem_dir/include/rocshmem/rocshmem_common.hpp
+0
-172
rocshmem_dir/include/rocshmem/rocshmem_config.h
rocshmem_dir/include/rocshmem/rocshmem_config.h
+0
-48
rocshmem_dir/include/rocshmem/rocshmem_debug.hpp
rocshmem_dir/include/rocshmem/rocshmem_debug.hpp
+0
-36
rocshmem_dir/include/rocshmem/rocshmem_mpi.hpp
rocshmem_dir/include/rocshmem/rocshmem_mpi.hpp
+0
-143
rocshmem_dir/lib/librocshmem.a
rocshmem_dir/lib/librocshmem.a
+0
-0
rocshmem_dir/share/doc/rocshmem/LICENSE.md
rocshmem_dir/share/doc/rocshmem/LICENSE.md
+0
-23
third-party/rocshmem
third-party/rocshmem
+1
-0
No files found.
.gitmodules
0 → 100644
View file @
a117adf8
[submodule "third-party/rocshmem"]
path = third-party/rocshmem
url = http://112.11.119.99:10068/dcutoolkit/deeplearing/rocshmem.git
rocshmem_dir/bin/rocshmem_info
deleted
100755 → 0
View file @
b33659dd
File deleted
rocshmem_dir/include/rocshmem/rocshmem.hpp
deleted
100644 → 0
View file @
b33659dd
/******************************************************************************
* Copyright (c) Advanced Micro Devices, Inc. All rights reserved.
*
* SPDX-License-Identifier: MIT
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to
* deal in the Software without restriction, including without limitation the
* rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
* sell copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
* IN THE SOFTWARE.
*****************************************************************************/
#ifndef LIBRARY_INCLUDE_ROCSHMEM_HPP
#define LIBRARY_INCLUDE_ROCSHMEM_HPP
#include <hip/hip_runtime.h>
#include "rocshmem_config.h"
#include "rocshmem_common.hpp"
#include "rocshmem_RMA.hpp"
#include "rocshmem_AMO.hpp"
#include "rocshmem_SIG_OP.hpp"
#include "rocshmem_COLL.hpp"
#include "rocshmem_P2P_SYNC.hpp"
#include "rocshmem_RMA_X.hpp"
#if defined(HAVE_EXTERNAL_MPI)
#include <mpi.h>
#endif
/**
* @file rocshmem.hpp
* @brief Public header for rocSHMEM device and host libraries.
*
* This file contains all the callable functions and data structures for both
* the device-side runtime and host-side runtime.
*
* The comments on these functions are sparse, but the semantics are the same
* as those implemented in OpenSHMEM unless otherwise documented. Please see
* the OpenSHMEM 1.4 standards documentation for more details:
*
* http://openshmem.org/site/sites/default/site_files/OpenSHMEM-1.4.pdf
*/
namespace
rocshmem
{
/** rocSHMEM library version string. */
constexpr char VERSION[] = "3.0.0";
/******************************************************************************
**************************** HOST INTERFACE **********************************
*****************************************************************************/
#if defined(HAVE_EXTERNAL_MPI)
/**
 * @brief Bring up the rocSHMEM runtime together with its transport layer.
 *
 * Deprecated; only declared when HAVE_EXTERNAL_MPI is defined.
 *
 * @param[in] comm MPI communicator rocSHMEM should use. Passing
 *                 MPI_COMM_NULL makes rocSHMEM use MPI_COMM_WORLD.
 */
[[deprecated]] __host__ void rocshmem_init(MPI_Comm comm);
#endif
/**
 * @brief Bring up the rocSHMEM runtime together with its transport layer.
 *
 * Equivalent to the MPI_Comm overload of rocshmem_init, implicitly using
 * MPI_COMM_WORLD for initialization.
 */
__host__ void rocshmem_init(void);
/**
 * @brief Query the rocSHMEM context through the host API.
 *
 * @return The ROCSHMEM_CTX_DEFAULT device pointer. It can be queried from one
 *         instance of the rocshmem host library and used later for dynamic
 *         module initialization in a kernel bitcode device library within
 *         the same application.
 */
__host__ void *rocshmem_get_device_ctx();
/**
 * @brief Query the remote symmetric heap pointer that corresponds to a local
 *        symmetric heap allocation.
 *
 * @param[in] dest Local symmetric heap allocation pointer for the current
 *                 PE/device.
 * @param[in] pe   Remote PE.
 *
 * @return Remote symmetric heap device pointer. It can be used to issue
 *         loads/stores from custom kernels instead of using the rocshmem
 *         device-side get/put APIs for RMA operations.
 */
__host__ void *rocshmem_ptr(const void *dest, int pe);

/** Device-side overload with the same parameter list. */
__device__ ATTR_NO_INLINE void *rocshmem_ptr(const void *dest, int pe);
#if defined(HAVE_EXTERNAL_MPI)
/**
 * @brief Bring up the rocSHMEM runtime and its transport layer while
 *        attempting to enable the requested thread support.
 *
 * Deprecated; only declared when HAVE_EXTERNAL_MPI is defined.
 *
 * @param[in]  requested Thread mode (from rocshmem_thread_ops) requested for
 *                       the host-facing functions.
 * @param[out] provided  Thread mode selected by the runtime; it may differ
 *                       from the requested mode.
 * @param[in]  comm      MPI communicator rocSHMEM should use. Passing
 *                       MPI_COMM_NULL makes rocSHMEM use MPI_COMM_WORLD.
 *
 * @return 0 on success, a nonzero value otherwise.
 */
[[deprecated]] __host__ int rocshmem_init_thread(int requested, int *provided,
                                                 MPI_Comm comm);
#endif
/**
 * @brief Bring up the rocSHMEM runtime and its transport layer using the
 *        supplied initialization mode and attributes.
 *
 * @param[in] flags Initialization method to use. Valid values are
 *                  ROCSHMEM_INIT_WITH_UNIQUEID and
 *                  ROCSHMEM_INIT_WITH_MPI_COMM.
 * @param[in] attr  Attribute structure specifying the input characteristics.
 *
 * @return 0 on success, a nonzero value otherwise.
 */
__host__ int rocshmem_init_attr(unsigned int flags, rocshmem_init_attr_t *attr);
/**
 * @brief Obtain a unique ID.
 *
 * @param uid Unique ID object; presumably filled in by this call
 *            (NOTE(review): direction not documented - confirm).
 *
 * @return 0 on success, a nonzero value otherwise.
 */
__host__ int rocshmem_get_uniqueid(rocshmem_uniqueid_t *uid);
/**
 * @brief Initialize a rocshmem_init_attr_t structure.
 *
 * @param[in]  rank   Rank of the calling process.
 * @param[in]  nranks Number of PEs.
 * @param[in]  uid    Unique ID used to identify the group of processes.
 *                    NOTE(review): the original description broke off after
 *                    "All processes that"; confirm the intended requirement.
 * @param[out] attr   Attribute structure to be passed to rocshmem_init_attr.
 *
 * @return 0 on success, a nonzero value otherwise.
 */
__host__ int rocshmem_set_attr_uniqueid_args(int rank, int nranks,
                                             rocshmem_uniqueid_t *uid,
                                             rocshmem_init_attr_t *attr);
/**
 * @brief Report the thread mode used by the runtime (host side).
 *
 * @param[out] provided Receives the thread mode the runtime is operating in.
 */
__host__ void rocshmem_query_thread(int *provided);
/**
 * @brief Dump the internal stats to stdout.
 */
__host__ void rocshmem_dump_stats();
/**
 * @brief Reset every internal stat.
 */
__host__ void rocshmem_reset_stats();
/**
 * @brief Shut down (finalize) the rocSHMEM runtime.
 */
__host__ void rocshmem_finalize();
/**
 * @brief Allocate \p size bytes from the symmetric heap.
 *
 * Collective operation: every PE must call it.
 *
 * @param[in] size Allocation size in bytes.
 *
 * @return Pointer to the allocated memory on the symmetric heap.
 *
 * @todo Return error code instead of ptr.
 */
__host__ void *rocshmem_malloc(size_t size);
/**
 * @brief Release a symmetric heap allocation.
 *
 * Collective operation: every PE must call it.
 *
 * @param[in] ptr Pointer to memory previously allocated on the symmetric
 *                heap.
 */
__host__ void rocshmem_free(void *ptr);
/**
 * @brief Query how many PEs there are (host side).
 *
 * @return Number of PEs.
 */
__host__ int rocshmem_n_pes();
/**
 * @brief Query the caller's PE ID (host side).
 *
 * @return PE ID of the caller.
 */
__host__ int rocshmem_my_pe();
/**
 * @brief Create an OpenSHMEM context (host side).
 *
 * @param[in]  options Context creation options. Ignored in the current
 *                     design.
 * @param[out] ctx     Context handle.
 *
 * @return Zero on success, nonzero otherwise.
 */
__host__ int rocshmem_ctx_create(int64_t options, rocshmem_ctx_t *ctx);
/**
 * @brief Destroy an OpenSHMEM context (host side).
 *
 * @param[in] ctx Context handle (passed by value).
 */
__host__ void rocshmem_ctx_destroy(rocshmem_ctx_t ctx);
/**
 * @brief Map a PE index from \p src_team into \p dest_team (host side).
 *
 * @param[in] src_team  Handle of the team to translate from.
 * @param[in] src_pe    Index of the PE of interest within \p src_team.
 * @param[in] dest_team Handle of the team to translate to.
 *
 * @return Index of \p src_pe within \p dest_team. Returns -1 if any input is
 *         invalid or if \p src_pe is not a member of both the source and the
 *         destination team.
 */
__host__ int rocshmem_team_translate_pe(rocshmem_team_t src_team, int src_pe,
                                        rocshmem_team_t dest_team);
/**
 * @brief Query how many PEs a team contains.
 *
 * @param[in] team The team to query.
 *
 * @return Number of PEs in the provided team.
 */
__host__ int rocshmem_team_n_pes(rocshmem_team_t team);
/**
 * @brief Query the caller's PE ID within a team.
 *
 * @param[in] team The team in which to look up the PE ID.
 *
 * @return PE ID of the caller in the provided team.
 */
__host__ int rocshmem_team_my_pe(rocshmem_team_t team);
/**
 * @brief Create a new team of PEs. Must be called by all PEs in the parent
 *        team.
 *
 * @param[in]  parent_team The team to split from.
 * @param[in]  start       Lowest PE number of the subset of parent-team PEs
 *                         that will form the new team.
 * @param[in]  stride      Stride between team PE members in the parent team
 *                         that make up the subset forming the new team.
 * @param[in]  size        Number of PEs in the new team.
 * @param[in]  config      Pointer to the config parameters for the new team.
 * @param[in]  config_mask Bitwise mask selecting which parameters of
 *                         \p config to use.
 * @param[out] new_team    Pointer to the newly created team. If team
 *                         creation fails, or if the calling PE of the parent
 *                         team is not part of the new team, the value will
 *                         be ROCSHMEM_TEAM_INVALID.
 *
 * @return Zero on successful team creation; non-zero on error.
 */
__host__ int rocshmem_team_split_strided(rocshmem_team_t parent_team, int start,
                                         int stride, int size,
                                         const rocshmem_team_config_t *config,
                                         long config_mask,
                                         rocshmem_team_t *new_team);
/**
 * @brief Destroy a team. Must be called by all PEs in the team.
 *
 * All private contexts created in the team must be destroyed by the user
 * before the team itself is destroyed; otherwise the behavior is undefined.
 * This call destroys only the shareable contexts created from the
 * referenced team.
 *
 * @param[in] team The team to destroy. Behavior is undefined when the team
 *                 is ROCSHMEM_TEAM_WORLD or any other invalid team. When the
 *                 team is ROCSHMEM_TEAM_INVALID the function performs no
 *                 operation.
 */
__host__ void rocshmem_team_destroy(rocshmem_team_t team);
/**
 * @brief Enforce ordering between messages in this context, following
 *        OpenSHMEM semantics (host side).
 *
 * @param[in] ctx Context with which to perform this operation.
 */
__host__ void rocshmem_ctx_fence(rocshmem_ctx_t ctx);

/** Variant of rocshmem_ctx_fence that takes no explicit context argument. */
__host__ void rocshmem_fence();
/**
 * @brief Complete all previous operations posted on the host.
 *
 * @param[in] ctx Context with which to perform this operation.
 */
__host__ void rocshmem_ctx_quiet(rocshmem_ctx_t ctx);

/** Variant of rocshmem_ctx_quiet that takes no explicit context argument. */
__host__ void rocshmem_quiet();
/**
 * @brief Perform a collective barrier across all PEs in the system.
 *
 * The caller blocks until the barrier is resolved.
 */
__host__ void rocshmem_barrier_all();
/**
 * @brief Enqueue a collective barrier on the given stream.
 *
 * @param[in] stream Stream on which the barrier is enqueued.
 */
__host__ void rocshmem_barrier_all_on_stream(hipStream_t stream);
/**
 * @brief Register the arrival of a PE at a barrier.
 *
 * The caller blocks until the synchronization is resolved.
 *
 * Unlike the shmem_barrier_all routine, shmem_sync_all only ensures
 * completion and visibility of previously issued memory stores; it does not
 * ensure completion of remote memory updates issued via OpenSHMEM routines.
 */
__host__ void rocshmem_sync_all();
/**
 * @brief Let any PE force the termination of the entire program.
 *
 * @param[in] status Exit status from the main program.
 */
__host__ void rocshmem_global_exit(int status);
/******************************************************************************
**************************** DEVICE INTERFACE ********************************
*****************************************************************************/
/**
 * @brief Set up the device-side rocSHMEM resources (deprecated).
 *
 * Must run before any thread in this work-group invokes other rocSHMEM
 * functions, and must be called collectively by all threads in the
 * work-group.
 */
[[deprecated]] __device__ void rocshmem_wg_init();
/**
 * @brief Tear down the device-side rocSHMEM resources (deprecated).
 *
 * Must be called before work-group completion if the work-group also called
 * rocshmem_wg_init(), and must be called collectively by all threads in the
 * work-group.
 */
[[deprecated]] __device__ void rocshmem_wg_finalize();
/**
 * @brief Set up the device-side rocSHMEM resources with a requested
 *        threading mode (deprecated).
 *
 * Variant of rocshmem_wg_init that lets the caller request a threading mode.
 * Must run before any thread in this work-group invokes other rocSHMEM
 * functions, and must be called collectively by all threads in the
 * work-group.
 *
 * @param[in]  requested Requested thread mode from rocshmem_thread_ops.
 * @param[out] provided  Thread mode selected by the runtime; it may differ
 *                       from the requested mode.
 */
[[deprecated]] __device__ void rocshmem_wg_init_thread(int requested,
                                                       int *provided);
/**
 * @brief Report the thread mode used by the runtime (device side).
 *
 * @param[out] provided Receives the thread mode the runtime is operating in.
 */
__device__ void rocshmem_query_thread(int *provided);
/**
 * @brief Create an OpenSHMEM context that, by design, is private to the
 *        calling work-group.
 *
 * Must be called collectively by all threads in the work-group.
 *
 * @param[in]  options Context creation options. Ignored in the current
 *                     design.
 * @param[out] ctx     Context handle.
 *
 * @return 0 on every thread when the context was created successfully. If
 *         any thread returns a non-zero value the operation failed and a
 *         higher `ROCSHMEM_MAX_NUM_CONTEXTS` is required.
 */
__device__ ATTR_NO_INLINE int rocshmem_wg_ctx_create(int64_t options,
                                                     rocshmem_ctx_t *ctx);

/** Variant that additionally takes the team for the context. */
__device__ ATTR_NO_INLINE int rocshmem_wg_team_create_ctx(rocshmem_team_t team,
                                                          long options,
                                                          rocshmem_ctx_t *ctx);
/**
 * @brief Destroy an OpenSHMEM context (device side).
 *
 * Must be called collectively by all threads in the work-group.
 *
 * @param[in] ctx Pointer to the context to destroy.
 */
__device__ ATTR_NO_INLINE void rocshmem_wg_ctx_destroy(rocshmem_ctx_t *ctx);
/**
 * @brief Enforce ordering between messages in this context, following
 *        OpenSHMEM semantics (device side).
 *
 * May be called from divergent control paths at per-thread granularity.
 * Performance may nevertheless improve when the caller coalesces contiguous
 * messages and elects a leader thread to call into the rocSHMEM function.
 *
 * @param[in] ctx Context with which to perform this operation.
 */
__device__ ATTR_NO_INLINE void rocshmem_ctx_fence(rocshmem_ctx_t ctx);

/** Variant of rocshmem_ctx_fence that takes no explicit context argument. */
__device__ ATTR_NO_INLINE void rocshmem_fence();
/**
 * @brief Enforce ordering between messages in this context, following
 *        OpenSHMEM semantics, for a single destination PE.
 *
 * This is an extension: it has the same semantics as the default fence API
 * but applies per PE.
 *
 * @param[in] ctx Context with which to perform this operation.
 * @param[in] pe  Destination PE.
 */
__device__ ATTR_NO_INLINE void rocshmem_ctx_fence(rocshmem_ctx_t ctx, int pe);

/** Per-PE fence variant that takes no explicit context argument. */
__device__ ATTR_NO_INLINE void rocshmem_fence(int pe);
/**
 * @brief Complete all previous operations posted to this context
 *        (device side).
 *
 * May be called from divergent control paths at per-thread granularity.
 * Performance may nevertheless improve when the caller coalesces contiguous
 * messages and elects a leader thread to call into the rocSHMEM function.
 *
 * @param[in] ctx Context with which to perform this operation.
 */
__device__ ATTR_NO_INLINE void rocshmem_ctx_quiet(rocshmem_ctx_t ctx);

/** Variant of rocshmem_ctx_quiet that takes no explicit context argument. */
__device__ ATTR_NO_INLINE void rocshmem_quiet();
/**
 * @brief Complete all previous operations posted to this context for the PEs
 *        listed in the `target_pes` array.
 *
 * @param[in] ctx        Context with which to perform this operation.
 * @param[in] target_pes Address of the array of target PEs for which the
 *                       operations need to be completed.
 * @param[in] npes       Number of PEs in the target PE array.
 */
__device__ ATTR_NO_INLINE void rocshmem_ctx_pe_quiet(rocshmem_ctx_t ctx,
                                                     const int *target_pes,
                                                     size_t npes);

/** Variant of rocshmem_ctx_pe_quiet that takes no explicit context argument. */
__device__ ATTR_NO_INLINE void rocshmem_pe_quiet(const int *target_pes,
                                                 size_t npes);
/**
 * @brief Query the total number of PEs (device side).
 *
 * Can be called per thread with no performance penalty.
 *
 * @param[in] ctx GPU side handle.
 *
 * @return Total number of PEs.
 */
__device__ int rocshmem_ctx_n_pes(rocshmem_ctx_t ctx);

/** Variant of rocshmem_ctx_n_pes that takes no explicit context argument. */
__device__ int rocshmem_n_pes();
/**
 * @brief Query the caller's PE ID (device side).
 *
 * Can be called per thread with no performance penalty.
 *
 * @param[in] ctx GPU side handle.
 *
 * @return PE ID of the caller.
 */
__device__ int rocshmem_ctx_my_pe(rocshmem_ctx_t ctx);

/** Variant of rocshmem_ctx_my_pe that takes no explicit context argument. */
__device__ int rocshmem_my_pe();
/**
 * @brief Map a PE index from \p src_team into \p dest_team (device side).
 *
 * @param[in] src_team  Handle of the team to translate from.
 * @param[in] src_pe    Index of the PE of interest within \p src_team.
 * @param[in] dest_team Handle of the team to translate to.
 *
 * @return Index of \p src_pe within \p dest_team. Returns -1 if any input is
 *         invalid or if \p src_pe is not a member of both the source and the
 *         destination team.
 */
__device__ int rocshmem_team_translate_pe(rocshmem_team_t src_team, int src_pe,
                                          rocshmem_team_t dest_team);
/**
 * Device-side threadfence_system entry points, with and without an explicit
 * context argument.
 *
 * NOTE(review): no description was provided for these; presumably they issue
 * a system-scope memory fence - confirm against the implementation.
 */
__device__ ATTR_NO_INLINE void rocshmem_ctx_threadfence_system(rocshmem_ctx_t ctx);
__device__ ATTR_NO_INLINE void rocshmem_threadfence_system();
}
// namespace rocshmem
#endif // LIBRARY_INCLUDE_ROCSHMEM_HPP
rocshmem_dir/include/rocshmem/rocshmem_AMO.hpp
deleted
100644 → 0
View file @
b33659dd
/******************************************************************************
* Copyright (c) Advanced Micro Devices, Inc. All rights reserved.
*
* SPDX-License-Identifier: MIT
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to
* deal in the Software without restriction, including without limitation the
* rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
* sell copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
* IN THE SOFTWARE.
*****************************************************************************/
#ifndef LIBRARY_INCLUDE_ROCSHMEM_AMO_HPP
#define LIBRARY_INCLUDE_ROCSHMEM_AMO_HPP
namespace
rocshmem
{
/**
 * @name SHMEM_ATOMIC_FETCH
 * @brief Atomically return the value of \p source to the calling PE.
 *
 * The operation is blocking.
 *
 * Callable from divergent control paths at per-thread granularity.
 *
 * @param[in] ctx    Context with which to perform this operation (only on
 *                   the rocshmem_ctx_* variants).
 * @param[in] source Address to fetch from. Must be an address on the
 *                   symmetric heap.
 * @param[in] pe     PE of the remote process.
 *
 * @return The value of \p source.
 */
__device__ ATTR_NO_INLINE float rocshmem_ctx_float_atomic_fetch(
    rocshmem_ctx_t ctx, float *source, int pe);
__device__ ATTR_NO_INLINE float rocshmem_float_atomic_fetch(float *source, int pe);
__host__ float rocshmem_ctx_float_atomic_fetch(rocshmem_ctx_t ctx, float *source, int pe);
__host__ float rocshmem_float_atomic_fetch(float *source, int pe);
/* atomic_fetch declarations for double: device and host, with and without ctx. */
__device__ ATTR_NO_INLINE double rocshmem_ctx_double_atomic_fetch(
    rocshmem_ctx_t ctx, double *source, int pe);
__device__ ATTR_NO_INLINE double rocshmem_double_atomic_fetch(double *source, int pe);
__host__ double rocshmem_ctx_double_atomic_fetch(rocshmem_ctx_t ctx, double *source, int pe);
__host__ double rocshmem_double_atomic_fetch(double *source, int pe);
/* atomic_fetch declarations for int: device and host, with and without ctx. */
__device__ ATTR_NO_INLINE int rocshmem_ctx_int_atomic_fetch(
    rocshmem_ctx_t ctx, int *source, int pe);
__device__ ATTR_NO_INLINE int rocshmem_int_atomic_fetch(int *source, int pe);
__host__ int rocshmem_ctx_int_atomic_fetch(rocshmem_ctx_t ctx, int *source, int pe);
__host__ int rocshmem_int_atomic_fetch(int *source, int pe);
/* atomic_fetch declarations for long: device and host, with and without ctx. */
__device__ ATTR_NO_INLINE long rocshmem_ctx_long_atomic_fetch(
    rocshmem_ctx_t ctx, long *source, int pe);
__device__ ATTR_NO_INLINE long rocshmem_long_atomic_fetch(long *source, int pe);
__host__ long rocshmem_ctx_long_atomic_fetch(rocshmem_ctx_t ctx, long *source, int pe);
__host__ long rocshmem_long_atomic_fetch(long *source, int pe);
/* atomic_fetch declarations for long long: device and host, with and without ctx. */
__device__ ATTR_NO_INLINE long long rocshmem_ctx_longlong_atomic_fetch(
    rocshmem_ctx_t ctx, long long *source, int pe);
__device__ ATTR_NO_INLINE long long rocshmem_longlong_atomic_fetch(
    long long *source, int pe);
__host__ long long rocshmem_ctx_longlong_atomic_fetch(
    rocshmem_ctx_t ctx, long long *source, int pe);
__host__ long long rocshmem_longlong_atomic_fetch(long long *source, int pe);
/* atomic_fetch declarations for unsigned int: device and host, with and without ctx. */
__device__ ATTR_NO_INLINE unsigned int rocshmem_ctx_uint_atomic_fetch(
    rocshmem_ctx_t ctx, unsigned int *source, int pe);
__device__ ATTR_NO_INLINE unsigned int rocshmem_uint_atomic_fetch(
    unsigned int *source, int pe);
__host__ unsigned int rocshmem_ctx_uint_atomic_fetch(
    rocshmem_ctx_t ctx, unsigned int *source, int pe);
__host__ unsigned int rocshmem_uint_atomic_fetch(unsigned int *source, int pe);
/* atomic_fetch declarations for unsigned long: device and host, with and without ctx. */
__device__ ATTR_NO_INLINE unsigned long rocshmem_ctx_ulong_atomic_fetch(
    rocshmem_ctx_t ctx, unsigned long *source, int pe);
__device__ ATTR_NO_INLINE unsigned long rocshmem_ulong_atomic_fetch(
    unsigned long *source, int pe);
__host__ unsigned long rocshmem_ctx_ulong_atomic_fetch(
    rocshmem_ctx_t ctx, unsigned long *source, int pe);
__host__ unsigned long rocshmem_ulong_atomic_fetch(unsigned long *source, int pe);
/* atomic_fetch declarations for unsigned long long: device and host, with and without ctx. */
__device__ ATTR_NO_INLINE unsigned long long rocshmem_ctx_ulonglong_atomic_fetch(
    rocshmem_ctx_t ctx, unsigned long long *source, int pe);
__device__ ATTR_NO_INLINE unsigned long long rocshmem_ulonglong_atomic_fetch(
    unsigned long long *source, int pe);
__host__ unsigned long long rocshmem_ctx_ulonglong_atomic_fetch(
    rocshmem_ctx_t ctx, unsigned long long *source, int pe);
__host__ unsigned long long rocshmem_ulonglong_atomic_fetch(
    unsigned long long *source, int pe);
/* atomic_fetch declarations for int32_t: device and host, with and without ctx. */
__device__ ATTR_NO_INLINE int32_t rocshmem_ctx_int32_atomic_fetch(
    rocshmem_ctx_t ctx, int32_t *source, int pe);
__device__ ATTR_NO_INLINE int32_t rocshmem_int32_atomic_fetch(int32_t *source, int pe);
__host__ int32_t rocshmem_ctx_int32_atomic_fetch(rocshmem_ctx_t ctx, int32_t *source, int pe);
__host__ int32_t rocshmem_int32_atomic_fetch(int32_t *source, int pe);
/* atomic_fetch declarations for int64_t: device and host, with and without ctx. */
__device__ ATTR_NO_INLINE int64_t rocshmem_ctx_int64_atomic_fetch(
    rocshmem_ctx_t ctx, int64_t *source, int pe);
__device__ ATTR_NO_INLINE int64_t rocshmem_int64_atomic_fetch(int64_t *source, int pe);
__host__ int64_t rocshmem_ctx_int64_atomic_fetch(rocshmem_ctx_t ctx, int64_t *source, int pe);
__host__ int64_t rocshmem_int64_atomic_fetch(int64_t *source, int pe);
/* atomic_fetch declarations for uint32_t: device and host, with and without ctx. */
__device__ ATTR_NO_INLINE uint32_t rocshmem_ctx_uint32_atomic_fetch(
    rocshmem_ctx_t ctx, uint32_t *source, int pe);
__device__ ATTR_NO_INLINE uint32_t rocshmem_uint32_atomic_fetch(uint32_t *source, int pe);
__host__ uint32_t rocshmem_ctx_uint32_atomic_fetch(rocshmem_ctx_t ctx, uint32_t *source, int pe);
__host__ uint32_t rocshmem_uint32_atomic_fetch(uint32_t *source, int pe);
/* atomic_fetch declarations for uint64_t: device and host, with and without ctx. */
__device__ ATTR_NO_INLINE uint64_t rocshmem_ctx_uint64_atomic_fetch(
    rocshmem_ctx_t ctx, uint64_t *source, int pe);
__device__ ATTR_NO_INLINE uint64_t rocshmem_uint64_atomic_fetch(uint64_t *source, int pe);
__host__ uint64_t rocshmem_ctx_uint64_atomic_fetch(rocshmem_ctx_t ctx, uint64_t *source, int pe);
__host__ uint64_t rocshmem_uint64_atomic_fetch(uint64_t *source, int pe);
/* atomic_fetch declarations for size_t: device and host, with and without ctx. */
__device__ ATTR_NO_INLINE size_t rocshmem_ctx_size_atomic_fetch(
    rocshmem_ctx_t ctx, size_t *source, int pe);
__device__ ATTR_NO_INLINE size_t rocshmem_size_atomic_fetch(size_t *source, int pe);
__host__ size_t rocshmem_ctx_size_atomic_fetch(rocshmem_ctx_t ctx, size_t *source, int pe);
__host__ size_t rocshmem_size_atomic_fetch(size_t *source, int pe);
/* atomic_fetch declarations for ptrdiff_t: device and host, with and without ctx. */
__device__ ATTR_NO_INLINE ptrdiff_t rocshmem_ctx_ptrdiff_atomic_fetch(
    rocshmem_ctx_t ctx, ptrdiff_t *source, int pe);
__device__ ATTR_NO_INLINE ptrdiff_t rocshmem_ptrdiff_atomic_fetch(ptrdiff_t *source, int pe);
__host__ ptrdiff_t rocshmem_ctx_ptrdiff_atomic_fetch(
    rocshmem_ctx_t ctx, ptrdiff_t *source, int pe);
__host__ ptrdiff_t rocshmem_ptrdiff_atomic_fetch(ptrdiff_t *source, int pe);
/**
 * @name SHMEM_ATOMIC_SET
 * @brief Atomically set \p dest on \p pe to \p value.
 *
 * The operation is blocking.
 *
 * Callable from divergent control paths at per-thread granularity.
 *
 * @param[in] ctx   Context with which to perform this operation (only on the
 *                  rocshmem_ctx_* variants).
 * @param[in] dest  Destination address. Must be an address on the symmetric
 *                  heap.
 * @param[in] value The value to be atomically set.
 * @param[in] pe    PE of the remote process.
 */
__device__ ATTR_NO_INLINE void rocshmem_ctx_float_atomic_set(
    rocshmem_ctx_t ctx, float *dest, float value, int pe);
__device__ ATTR_NO_INLINE void rocshmem_float_atomic_set(float *dest, float value, int pe);
__host__ void rocshmem_ctx_float_atomic_set(
    rocshmem_ctx_t ctx, float *dest, float value, int pe);
__host__ void rocshmem_float_atomic_set(float *dest, float value, int pe);
/* atomic_set declarations for double: device and host, with and without ctx. */
__device__ ATTR_NO_INLINE void rocshmem_ctx_double_atomic_set(
    rocshmem_ctx_t ctx, double *dest, double value, int pe);
__device__ ATTR_NO_INLINE void rocshmem_double_atomic_set(double *dest, double value, int pe);
__host__ void rocshmem_ctx_double_atomic_set(
    rocshmem_ctx_t ctx, double *dest, double value, int pe);
__host__ void rocshmem_double_atomic_set(double *dest, double value, int pe);
/* atomic_set declarations for int: device and host, with and without ctx. */
__device__ ATTR_NO_INLINE void rocshmem_ctx_int_atomic_set(
    rocshmem_ctx_t ctx, int *dest, int value, int pe);
__device__ ATTR_NO_INLINE void rocshmem_int_atomic_set(int *dest, int value, int pe);
__host__ void rocshmem_ctx_int_atomic_set(rocshmem_ctx_t ctx, int *dest, int value, int pe);
__host__ void rocshmem_int_atomic_set(int *dest, int value, int pe);
/* atomic_set declarations for long: device and host, with and without ctx. */
__device__ ATTR_NO_INLINE void rocshmem_ctx_long_atomic_set(
    rocshmem_ctx_t ctx, long *dest, long value, int pe);
__device__ ATTR_NO_INLINE void rocshmem_long_atomic_set(long *dest, long value, int pe);
__host__ void rocshmem_ctx_long_atomic_set(rocshmem_ctx_t ctx, long *dest, long value, int pe);
__host__ void rocshmem_long_atomic_set(long *dest, long value, int pe);
/* atomic_set declarations for long long: device and host, with and without ctx. */
__device__ ATTR_NO_INLINE void rocshmem_ctx_longlong_atomic_set(
    rocshmem_ctx_t ctx, long long *dest, long long value, int pe);
__device__ ATTR_NO_INLINE void rocshmem_longlong_atomic_set(
    long long *dest, long long value, int pe);
__host__ void rocshmem_ctx_longlong_atomic_set(
    rocshmem_ctx_t ctx, long long *dest, long long value, int pe);
__host__ void rocshmem_longlong_atomic_set(long long *dest, long long value, int pe);
/* atomic_set declarations for unsigned int: device and host, with and without ctx. */
__device__ ATTR_NO_INLINE void rocshmem_ctx_uint_atomic_set(
    rocshmem_ctx_t ctx, unsigned int *dest, unsigned int value, int pe);
__device__ ATTR_NO_INLINE void rocshmem_uint_atomic_set(
    unsigned int *dest, unsigned int value, int pe);
__host__ void rocshmem_ctx_uint_atomic_set(
    rocshmem_ctx_t ctx, unsigned int *dest, unsigned int value, int pe);
__host__ void rocshmem_uint_atomic_set(unsigned int *dest, unsigned int value, int pe);
/* atomic_set declarations for unsigned long: device and host, with and without ctx. */
__device__ ATTR_NO_INLINE void rocshmem_ctx_ulong_atomic_set(
    rocshmem_ctx_t ctx, unsigned long *dest, unsigned long value, int pe);
__device__ ATTR_NO_INLINE void rocshmem_ulong_atomic_set(
    unsigned long *dest, unsigned long value, int pe);
__host__ void rocshmem_ctx_ulong_atomic_set(
    rocshmem_ctx_t ctx, unsigned long *dest, unsigned long value, int pe);
__host__ void rocshmem_ulong_atomic_set(unsigned long *dest, unsigned long value, int pe);
/* atomic_set declarations for unsigned long long: device and host, with and without ctx. */
__device__ ATTR_NO_INLINE void rocshmem_ctx_ulonglong_atomic_set(
    rocshmem_ctx_t ctx, unsigned long long *dest, unsigned long long value, int pe);
__device__ ATTR_NO_INLINE void rocshmem_ulonglong_atomic_set(
    unsigned long long *dest, unsigned long long value, int pe);
__host__ void rocshmem_ctx_ulonglong_atomic_set(
    rocshmem_ctx_t ctx, unsigned long long *dest, unsigned long long value, int pe);
__host__ void rocshmem_ulonglong_atomic_set(
    unsigned long long *dest, unsigned long long value, int pe);
/* atomic_set declarations for int32_t: device and host, with and without ctx. */
__device__ ATTR_NO_INLINE void rocshmem_ctx_int32_atomic_set(
    rocshmem_ctx_t ctx, int32_t *dest, int32_t value, int pe);
__device__ ATTR_NO_INLINE void rocshmem_int32_atomic_set(int32_t *dest, int32_t value, int pe);
__host__ void rocshmem_ctx_int32_atomic_set(
    rocshmem_ctx_t ctx, int32_t *dest, int32_t value, int pe);
__host__ void rocshmem_int32_atomic_set(int32_t *dest, int32_t value, int pe);
/* atomic_set declarations for int64_t: device and host, with and without ctx. */
__device__ ATTR_NO_INLINE void rocshmem_ctx_int64_atomic_set(
    rocshmem_ctx_t ctx, int64_t *dest, int64_t value, int pe);
__device__ ATTR_NO_INLINE void rocshmem_int64_atomic_set(int64_t *dest, int64_t value, int pe);
__host__ void rocshmem_ctx_int64_atomic_set(
    rocshmem_ctx_t ctx, int64_t *dest, int64_t value, int pe);
__host__ void rocshmem_int64_atomic_set(int64_t *dest, int64_t value, int pe);
__device__
ATTR_NO_INLINE
void
rocshmem_ctx_uint32_atomic_set
(
rocshmem_ctx_t
ctx
,
uint32_t
*
dest
,
uint32_t
value
,
int
pe
);
__device__
ATTR_NO_INLINE
void
rocshmem_uint32_atomic_set
(
uint32_t
*
dest
,
uint32_t
value
,
int
pe
);
__host__
void
rocshmem_ctx_uint32_atomic_set
(
rocshmem_ctx_t
ctx
,
uint32_t
*
dest
,
uint32_t
value
,
int
pe
);
__host__
void
rocshmem_uint32_atomic_set
(
uint32_t
*
dest
,
uint32_t
value
,
int
pe
);
__device__
ATTR_NO_INLINE
void
rocshmem_ctx_uint64_atomic_set
(
rocshmem_ctx_t
ctx
,
uint64_t
*
dest
,
uint64_t
value
,
int
pe
);
__device__
ATTR_NO_INLINE
void
rocshmem_uint64_atomic_set
(
uint64_t
*
dest
,
uint64_t
value
,
int
pe
);
__host__
void
rocshmem_ctx_uint64_atomic_set
(
rocshmem_ctx_t
ctx
,
uint64_t
*
dest
,
uint64_t
value
,
int
pe
);
__host__
void
rocshmem_uint64_atomic_set
(
uint64_t
*
dest
,
uint64_t
value
,
int
pe
);
__device__
ATTR_NO_INLINE
void
rocshmem_ctx_size_atomic_set
(
rocshmem_ctx_t
ctx
,
size_t
*
dest
,
size_t
value
,
int
pe
);
__device__
ATTR_NO_INLINE
void
rocshmem_size_atomic_set
(
size_t
*
dest
,
size_t
value
,
int
pe
);
__host__
void
rocshmem_ctx_size_atomic_set
(
rocshmem_ctx_t
ctx
,
size_t
*
dest
,
size_t
value
,
int
pe
);
__host__
void
rocshmem_size_atomic_set
(
size_t
*
dest
,
size_t
value
,
int
pe
);
__device__
ATTR_NO_INLINE
void
rocshmem_ctx_ptrdiff_atomic_set
(
rocshmem_ctx_t
ctx
,
ptrdiff_t
*
dest
,
ptrdiff_t
value
,
int
pe
);
__device__
ATTR_NO_INLINE
void
rocshmem_ptrdiff_atomic_set
(
ptrdiff_t
*
dest
,
ptrdiff_t
value
,
int
pe
);
__host__
void
rocshmem_ctx_ptrdiff_atomic_set
(
rocshmem_ctx_t
ctx
,
ptrdiff_t
*
dest
,
ptrdiff_t
value
,
int
pe
);
__host__
void
rocshmem_ptrdiff_atomic_set
(
ptrdiff_t
*
dest
,
ptrdiff_t
value
,
int
pe
);
/**
* @name SHMEM_ATOMIC_COMPARE_SWAP
* @brief Atomically compares the value in \p dest with \p cond and, if they
* are equal, stores \p value in \p dest. The operation returns the previous
* value of \p dest to the calling PE.
*
* The operation is blocking.
*
* This function can be called from divergent control paths at per-thread
* granularity.
*
* @param[in] ctx Context with which to perform this operation.
* @param[in] dest Destination address. Must be an address on the symmetric
* heap.
* @param[in] cond The value to compare against the current value of \p dest.
* @param[in] value The value to be atomically swapped in.
* @param[in] pe PE of the remote process.
*
* @return The old value of \p dest.
*/
// Per type: device with context, device without context, host with context,
// host without context.

// int
__device__ ATTR_NO_INLINE int rocshmem_ctx_int_atomic_compare_swap(
    rocshmem_ctx_t ctx, int *dest, int cond, int value, int pe);
__device__ ATTR_NO_INLINE int rocshmem_int_atomic_compare_swap(
    int *dest, int cond, int value, int pe);
__host__ int rocshmem_ctx_int_atomic_compare_swap(
    rocshmem_ctx_t ctx, int *dest, int cond, int value, int pe);
__host__ int rocshmem_int_atomic_compare_swap(
    int *dest, int cond, int value, int pe);

// long
__device__ ATTR_NO_INLINE long rocshmem_ctx_long_atomic_compare_swap(
    rocshmem_ctx_t ctx, long *dest, long cond, long value, int pe);
__device__ ATTR_NO_INLINE long rocshmem_long_atomic_compare_swap(
    long *dest, long cond, long value, int pe);
__host__ long rocshmem_ctx_long_atomic_compare_swap(
    rocshmem_ctx_t ctx, long *dest, long cond, long value, int pe);
__host__ long rocshmem_long_atomic_compare_swap(
    long *dest, long cond, long value, int pe);

// long long
__device__ ATTR_NO_INLINE long long rocshmem_ctx_longlong_atomic_compare_swap(
    rocshmem_ctx_t ctx, long long *dest, long long cond, long long value,
    int pe);
__device__ ATTR_NO_INLINE long long rocshmem_longlong_atomic_compare_swap(
    long long *dest, long long cond, long long value, int pe);
__host__ long long rocshmem_ctx_longlong_atomic_compare_swap(
    rocshmem_ctx_t ctx, long long *dest, long long cond, long long value,
    int pe);
__host__ long long rocshmem_longlong_atomic_compare_swap(
    long long *dest, long long cond, long long value, int pe);

// unsigned int
__device__ ATTR_NO_INLINE unsigned int rocshmem_ctx_uint_atomic_compare_swap(
    rocshmem_ctx_t ctx, unsigned int *dest, unsigned int cond,
    unsigned int value, int pe);
__device__ ATTR_NO_INLINE unsigned int rocshmem_uint_atomic_compare_swap(
    unsigned int *dest, unsigned int cond, unsigned int value, int pe);
__host__ unsigned int rocshmem_ctx_uint_atomic_compare_swap(
    rocshmem_ctx_t ctx, unsigned int *dest, unsigned int cond,
    unsigned int value, int pe);
__host__ unsigned int rocshmem_uint_atomic_compare_swap(
    unsigned int *dest, unsigned int cond, unsigned int value, int pe);

// unsigned long
__device__ ATTR_NO_INLINE unsigned long rocshmem_ctx_ulong_atomic_compare_swap(
    rocshmem_ctx_t ctx, unsigned long *dest, unsigned long cond,
    unsigned long value, int pe);
__device__ ATTR_NO_INLINE unsigned long rocshmem_ulong_atomic_compare_swap(
    unsigned long *dest, unsigned long cond, unsigned long value, int pe);
__host__ unsigned long rocshmem_ctx_ulong_atomic_compare_swap(
    rocshmem_ctx_t ctx, unsigned long *dest, unsigned long cond,
    unsigned long value, int pe);
__host__ unsigned long rocshmem_ulong_atomic_compare_swap(
    unsigned long *dest, unsigned long cond, unsigned long value, int pe);

// unsigned long long
__device__ ATTR_NO_INLINE unsigned long long
rocshmem_ctx_ulonglong_atomic_compare_swap(
    rocshmem_ctx_t ctx, unsigned long long *dest, unsigned long long cond,
    unsigned long long value, int pe);
__device__ ATTR_NO_INLINE unsigned long long
rocshmem_ulonglong_atomic_compare_swap(
    unsigned long long *dest, unsigned long long cond,
    unsigned long long value, int pe);
__host__ unsigned long long rocshmem_ctx_ulonglong_atomic_compare_swap(
    rocshmem_ctx_t ctx, unsigned long long *dest, unsigned long long cond,
    unsigned long long value, int pe);
__host__ unsigned long long rocshmem_ulonglong_atomic_compare_swap(
    unsigned long long *dest, unsigned long long cond,
    unsigned long long value, int pe);

// int32_t
__device__ ATTR_NO_INLINE int32_t rocshmem_ctx_int32_atomic_compare_swap(
    rocshmem_ctx_t ctx, int32_t *dest, int32_t cond, int32_t value, int pe);
__device__ ATTR_NO_INLINE int32_t rocshmem_int32_atomic_compare_swap(
    int32_t *dest, int32_t cond, int32_t value, int pe);
__host__ int32_t rocshmem_ctx_int32_atomic_compare_swap(
    rocshmem_ctx_t ctx, int32_t *dest, int32_t cond, int32_t value, int pe);
__host__ int32_t rocshmem_int32_atomic_compare_swap(
    int32_t *dest, int32_t cond, int32_t value, int pe);

// int64_t
__device__ ATTR_NO_INLINE int64_t rocshmem_ctx_int64_atomic_compare_swap(
    rocshmem_ctx_t ctx, int64_t *dest, int64_t cond, int64_t value, int pe);
__device__ ATTR_NO_INLINE int64_t rocshmem_int64_atomic_compare_swap(
    int64_t *dest, int64_t cond, int64_t value, int pe);
__host__ int64_t rocshmem_ctx_int64_atomic_compare_swap(
    rocshmem_ctx_t ctx, int64_t *dest, int64_t cond, int64_t value, int pe);
__host__ int64_t rocshmem_int64_atomic_compare_swap(
    int64_t *dest, int64_t cond, int64_t value, int pe);

// uint32_t
__device__ ATTR_NO_INLINE uint32_t rocshmem_ctx_uint32_atomic_compare_swap(
    rocshmem_ctx_t ctx, uint32_t *dest, uint32_t cond, uint32_t value, int pe);
__device__ ATTR_NO_INLINE uint32_t rocshmem_uint32_atomic_compare_swap(
    uint32_t *dest, uint32_t cond, uint32_t value, int pe);
__host__ uint32_t rocshmem_ctx_uint32_atomic_compare_swap(
    rocshmem_ctx_t ctx, uint32_t *dest, uint32_t cond, uint32_t value, int pe);
__host__ uint32_t rocshmem_uint32_atomic_compare_swap(
    uint32_t *dest, uint32_t cond, uint32_t value, int pe);

// uint64_t
__device__ ATTR_NO_INLINE uint64_t rocshmem_ctx_uint64_atomic_compare_swap(
    rocshmem_ctx_t ctx, uint64_t *dest, uint64_t cond, uint64_t value, int pe);
__device__ ATTR_NO_INLINE uint64_t rocshmem_uint64_atomic_compare_swap(
    uint64_t *dest, uint64_t cond, uint64_t value, int pe);
__host__ uint64_t rocshmem_ctx_uint64_atomic_compare_swap(
    rocshmem_ctx_t ctx, uint64_t *dest, uint64_t cond, uint64_t value, int pe);
__host__ uint64_t rocshmem_uint64_atomic_compare_swap(
    uint64_t *dest, uint64_t cond, uint64_t value, int pe);

// size_t
__device__ ATTR_NO_INLINE size_t rocshmem_ctx_size_atomic_compare_swap(
    rocshmem_ctx_t ctx, size_t *dest, size_t cond, size_t value, int pe);
__device__ ATTR_NO_INLINE size_t rocshmem_size_atomic_compare_swap(
    size_t *dest, size_t cond, size_t value, int pe);
__host__ size_t rocshmem_ctx_size_atomic_compare_swap(
    rocshmem_ctx_t ctx, size_t *dest, size_t cond, size_t value, int pe);
__host__ size_t rocshmem_size_atomic_compare_swap(
    size_t *dest, size_t cond, size_t value, int pe);

// ptrdiff_t
__device__ ATTR_NO_INLINE ptrdiff_t rocshmem_ctx_ptrdiff_atomic_compare_swap(
    rocshmem_ctx_t ctx, ptrdiff_t *dest, ptrdiff_t cond, ptrdiff_t value,
    int pe);
__device__ ATTR_NO_INLINE ptrdiff_t rocshmem_ptrdiff_atomic_compare_swap(
    ptrdiff_t *dest, ptrdiff_t cond, ptrdiff_t value, int pe);
__host__ ptrdiff_t rocshmem_ctx_ptrdiff_atomic_compare_swap(
    rocshmem_ctx_t ctx, ptrdiff_t *dest, ptrdiff_t cond, ptrdiff_t value,
    int pe);
__host__ ptrdiff_t rocshmem_ptrdiff_atomic_compare_swap(
    ptrdiff_t *dest, ptrdiff_t cond, ptrdiff_t value, int pe);
/**
* @name SHMEM_ATOMIC_SWAP
* @brief Atomically swaps the value \p value into \p dest on \p pe and
* returns the value that \p dest held before the swap.
*
* The operation is blocking.
*
* This function can be called from divergent control paths at per-thread
* granularity.
*
* @param[in] ctx Context with which to perform this operation.
* @param[in] dest Destination address. Must be an address on the symmetric
* heap.
* @param[in] value The value to be atomically swapped in.
* @param[in] pe PE of the remote process.
*
* @return The original value of \p dest.
*/
// Per type: device with context, device without context, host with context,
// host without context.

// float
__device__ ATTR_NO_INLINE float rocshmem_ctx_float_atomic_swap(
    rocshmem_ctx_t ctx, float *dest, float value, int pe);
__device__ ATTR_NO_INLINE float rocshmem_float_atomic_swap(
    float *dest, float value, int pe);
__host__ float rocshmem_ctx_float_atomic_swap(
    rocshmem_ctx_t ctx, float *dest, float value, int pe);
__host__ float rocshmem_float_atomic_swap(
    float *dest, float value, int pe);

// double
__device__ ATTR_NO_INLINE double rocshmem_ctx_double_atomic_swap(
    rocshmem_ctx_t ctx, double *dest, double value, int pe);
__device__ ATTR_NO_INLINE double rocshmem_double_atomic_swap(
    double *dest, double value, int pe);
__host__ double rocshmem_ctx_double_atomic_swap(
    rocshmem_ctx_t ctx, double *dest, double value, int pe);
__host__ double rocshmem_double_atomic_swap(
    double *dest, double value, int pe);

// int
__device__ ATTR_NO_INLINE int rocshmem_ctx_int_atomic_swap(
    rocshmem_ctx_t ctx, int *dest, int value, int pe);
__device__ ATTR_NO_INLINE int rocshmem_int_atomic_swap(
    int *dest, int value, int pe);
__host__ int rocshmem_ctx_int_atomic_swap(
    rocshmem_ctx_t ctx, int *dest, int value, int pe);
__host__ int rocshmem_int_atomic_swap(
    int *dest, int value, int pe);

// long
__device__ ATTR_NO_INLINE long rocshmem_ctx_long_atomic_swap(
    rocshmem_ctx_t ctx, long *dest, long value, int pe);
__device__ ATTR_NO_INLINE long rocshmem_long_atomic_swap(
    long *dest, long value, int pe);
__host__ long rocshmem_ctx_long_atomic_swap(
    rocshmem_ctx_t ctx, long *dest, long value, int pe);
__host__ long rocshmem_long_atomic_swap(
    long *dest, long value, int pe);

// long long
__device__ ATTR_NO_INLINE long long rocshmem_ctx_longlong_atomic_swap(
    rocshmem_ctx_t ctx, long long *dest, long long value, int pe);
__device__ ATTR_NO_INLINE long long rocshmem_longlong_atomic_swap(
    long long *dest, long long value, int pe);
__host__ long long rocshmem_ctx_longlong_atomic_swap(
    rocshmem_ctx_t ctx, long long *dest, long long value, int pe);
__host__ long long rocshmem_longlong_atomic_swap(
    long long *dest, long long value, int pe);

// unsigned int
__device__ ATTR_NO_INLINE unsigned int rocshmem_ctx_uint_atomic_swap(
    rocshmem_ctx_t ctx, unsigned int *dest, unsigned int value, int pe);
__device__ ATTR_NO_INLINE unsigned int rocshmem_uint_atomic_swap(
    unsigned int *dest, unsigned int value, int pe);
__host__ unsigned int rocshmem_ctx_uint_atomic_swap(
    rocshmem_ctx_t ctx, unsigned int *dest, unsigned int value, int pe);
__host__ unsigned int rocshmem_uint_atomic_swap(
    unsigned int *dest, unsigned int value, int pe);

// unsigned long
__device__ ATTR_NO_INLINE unsigned long rocshmem_ctx_ulong_atomic_swap(
    rocshmem_ctx_t ctx, unsigned long *dest, unsigned long value, int pe);
__device__ ATTR_NO_INLINE unsigned long rocshmem_ulong_atomic_swap(
    unsigned long *dest, unsigned long value, int pe);
__host__ unsigned long rocshmem_ctx_ulong_atomic_swap(
    rocshmem_ctx_t ctx, unsigned long *dest, unsigned long value, int pe);
__host__ unsigned long rocshmem_ulong_atomic_swap(
    unsigned long *dest, unsigned long value, int pe);

// unsigned long long
__device__ ATTR_NO_INLINE unsigned long long rocshmem_ctx_ulonglong_atomic_swap(
    rocshmem_ctx_t ctx, unsigned long long *dest, unsigned long long value,
    int pe);
__device__ ATTR_NO_INLINE unsigned long long rocshmem_ulonglong_atomic_swap(
    unsigned long long *dest, unsigned long long value, int pe);
__host__ unsigned long long rocshmem_ctx_ulonglong_atomic_swap(
    rocshmem_ctx_t ctx, unsigned long long *dest, unsigned long long value,
    int pe);
__host__ unsigned long long rocshmem_ulonglong_atomic_swap(
    unsigned long long *dest, unsigned long long value, int pe);

// int32_t
__device__ ATTR_NO_INLINE int32_t rocshmem_ctx_int32_atomic_swap(
    rocshmem_ctx_t ctx, int32_t *dest, int32_t value, int pe);
__device__ ATTR_NO_INLINE int32_t rocshmem_int32_atomic_swap(
    int32_t *dest, int32_t value, int pe);
__host__ int32_t rocshmem_ctx_int32_atomic_swap(
    rocshmem_ctx_t ctx, int32_t *dest, int32_t value, int pe);
__host__ int32_t rocshmem_int32_atomic_swap(
    int32_t *dest, int32_t value, int pe);

// int64_t
__device__ ATTR_NO_INLINE int64_t rocshmem_ctx_int64_atomic_swap(
    rocshmem_ctx_t ctx, int64_t *dest, int64_t value, int pe);
__device__ ATTR_NO_INLINE int64_t rocshmem_int64_atomic_swap(
    int64_t *dest, int64_t value, int pe);
__host__ int64_t rocshmem_ctx_int64_atomic_swap(
    rocshmem_ctx_t ctx, int64_t *dest, int64_t value, int pe);
__host__ int64_t rocshmem_int64_atomic_swap(
    int64_t *dest, int64_t value, int pe);

// uint32_t
__device__ ATTR_NO_INLINE uint32_t rocshmem_ctx_uint32_atomic_swap(
    rocshmem_ctx_t ctx, uint32_t *dest, uint32_t value, int pe);
__device__ ATTR_NO_INLINE uint32_t rocshmem_uint32_atomic_swap(
    uint32_t *dest, uint32_t value, int pe);
__host__ uint32_t rocshmem_ctx_uint32_atomic_swap(
    rocshmem_ctx_t ctx, uint32_t *dest, uint32_t value, int pe);
__host__ uint32_t rocshmem_uint32_atomic_swap(
    uint32_t *dest, uint32_t value, int pe);

// uint64_t
__device__ ATTR_NO_INLINE uint64_t rocshmem_ctx_uint64_atomic_swap(
    rocshmem_ctx_t ctx, uint64_t *dest, uint64_t value, int pe);
__device__ ATTR_NO_INLINE uint64_t rocshmem_uint64_atomic_swap(
    uint64_t *dest, uint64_t value, int pe);
__host__ uint64_t rocshmem_ctx_uint64_atomic_swap(
    rocshmem_ctx_t ctx, uint64_t *dest, uint64_t value, int pe);
__host__ uint64_t rocshmem_uint64_atomic_swap(
    uint64_t *dest, uint64_t value, int pe);

// size_t
__device__ ATTR_NO_INLINE size_t rocshmem_ctx_size_atomic_swap(
    rocshmem_ctx_t ctx, size_t *dest, size_t value, int pe);
__device__ ATTR_NO_INLINE size_t rocshmem_size_atomic_swap(
    size_t *dest, size_t value, int pe);
__host__ size_t rocshmem_ctx_size_atomic_swap(
    rocshmem_ctx_t ctx, size_t *dest, size_t value, int pe);
__host__ size_t rocshmem_size_atomic_swap(
    size_t *dest, size_t value, int pe);

// ptrdiff_t
__device__ ATTR_NO_INLINE ptrdiff_t rocshmem_ctx_ptrdiff_atomic_swap(
    rocshmem_ctx_t ctx, ptrdiff_t *dest, ptrdiff_t value, int pe);
__device__ ATTR_NO_INLINE ptrdiff_t rocshmem_ptrdiff_atomic_swap(
    ptrdiff_t *dest, ptrdiff_t value, int pe);
__host__ ptrdiff_t rocshmem_ctx_ptrdiff_atomic_swap(
    rocshmem_ctx_t ctx, ptrdiff_t *dest, ptrdiff_t value, int pe);
__host__ ptrdiff_t rocshmem_ptrdiff_atomic_swap(
    ptrdiff_t *dest, ptrdiff_t value, int pe);
/**
* @name SHMEM_ATOMIC_FETCH_INC
* @brief Atomically adds 1 to \p dest on \p pe. The operation
* returns the previous value of \p dest to the calling PE.
*
* The operation is blocking.
*
* This function can be called from divergent control paths at per-thread
* granularity.
*
* @param[in] ctx Context with which to perform this operation.
* @param[in] dest Destination address. Must be an address on the symmetric
* heap.
* @param[in] pe PE of the remote process.
*
* @return The old value of \p dest before it was incremented by 1.
*/
// Per type: device with context, device without context, host with context,
// host without context.

// int
__device__ ATTR_NO_INLINE int rocshmem_ctx_int_atomic_fetch_inc(
    rocshmem_ctx_t ctx, int *dest, int pe);
__device__ ATTR_NO_INLINE int rocshmem_int_atomic_fetch_inc(
    int *dest, int pe);
__host__ int rocshmem_ctx_int_atomic_fetch_inc(
    rocshmem_ctx_t ctx, int *dest, int pe);
__host__ int rocshmem_int_atomic_fetch_inc(
    int *dest, int pe);

// long
__device__ ATTR_NO_INLINE long rocshmem_ctx_long_atomic_fetch_inc(
    rocshmem_ctx_t ctx, long *dest, int pe);
__device__ ATTR_NO_INLINE long rocshmem_long_atomic_fetch_inc(
    long *dest, int pe);
__host__ long rocshmem_ctx_long_atomic_fetch_inc(
    rocshmem_ctx_t ctx, long *dest, int pe);
__host__ long rocshmem_long_atomic_fetch_inc(
    long *dest, int pe);

// long long
__device__ ATTR_NO_INLINE long long rocshmem_ctx_longlong_atomic_fetch_inc(
    rocshmem_ctx_t ctx, long long *dest, int pe);
__device__ ATTR_NO_INLINE long long rocshmem_longlong_atomic_fetch_inc(
    long long *dest, int pe);
__host__ long long rocshmem_ctx_longlong_atomic_fetch_inc(
    rocshmem_ctx_t ctx, long long *dest, int pe);
__host__ long long rocshmem_longlong_atomic_fetch_inc(
    long long *dest, int pe);

// unsigned int
__device__ ATTR_NO_INLINE unsigned int rocshmem_ctx_uint_atomic_fetch_inc(
    rocshmem_ctx_t ctx, unsigned int *dest, int pe);
__device__ ATTR_NO_INLINE unsigned int rocshmem_uint_atomic_fetch_inc(
    unsigned int *dest, int pe);
__host__ unsigned int rocshmem_ctx_uint_atomic_fetch_inc(
    rocshmem_ctx_t ctx, unsigned int *dest, int pe);
__host__ unsigned int rocshmem_uint_atomic_fetch_inc(
    unsigned int *dest, int pe);

// unsigned long
__device__ ATTR_NO_INLINE unsigned long rocshmem_ctx_ulong_atomic_fetch_inc(
    rocshmem_ctx_t ctx, unsigned long *dest, int pe);
__device__ ATTR_NO_INLINE unsigned long rocshmem_ulong_atomic_fetch_inc(
    unsigned long *dest, int pe);
__host__ unsigned long rocshmem_ctx_ulong_atomic_fetch_inc(
    rocshmem_ctx_t ctx, unsigned long *dest, int pe);
__host__ unsigned long rocshmem_ulong_atomic_fetch_inc(
    unsigned long *dest, int pe);

// unsigned long long
__device__ ATTR_NO_INLINE unsigned long long
rocshmem_ctx_ulonglong_atomic_fetch_inc(
    rocshmem_ctx_t ctx, unsigned long long *dest, int pe);
__device__ ATTR_NO_INLINE unsigned long long
rocshmem_ulonglong_atomic_fetch_inc(
    unsigned long long *dest, int pe);
__host__ unsigned long long rocshmem_ctx_ulonglong_atomic_fetch_inc(
    rocshmem_ctx_t ctx, unsigned long long *dest, int pe);
__host__ unsigned long long rocshmem_ulonglong_atomic_fetch_inc(
    unsigned long long *dest, int pe);

// int32_t
__device__ ATTR_NO_INLINE int32_t rocshmem_ctx_int32_atomic_fetch_inc(
    rocshmem_ctx_t ctx, int32_t *dest, int pe);
__device__ ATTR_NO_INLINE int32_t rocshmem_int32_atomic_fetch_inc(
    int32_t *dest, int pe);
__host__ int32_t rocshmem_ctx_int32_atomic_fetch_inc(
    rocshmem_ctx_t ctx, int32_t *dest, int pe);
__host__ int32_t rocshmem_int32_atomic_fetch_inc(
    int32_t *dest, int pe);

// int64_t
__device__ ATTR_NO_INLINE int64_t rocshmem_ctx_int64_atomic_fetch_inc(
    rocshmem_ctx_t ctx, int64_t *dest, int pe);
__device__ ATTR_NO_INLINE int64_t rocshmem_int64_atomic_fetch_inc(
    int64_t *dest, int pe);
__host__ int64_t rocshmem_ctx_int64_atomic_fetch_inc(
    rocshmem_ctx_t ctx, int64_t *dest, int pe);
__host__ int64_t rocshmem_int64_atomic_fetch_inc(
    int64_t *dest, int pe);

// uint32_t
__device__ ATTR_NO_INLINE uint32_t rocshmem_ctx_uint32_atomic_fetch_inc(
    rocshmem_ctx_t ctx, uint32_t *dest, int pe);
__device__ ATTR_NO_INLINE uint32_t rocshmem_uint32_atomic_fetch_inc(
    uint32_t *dest, int pe);
__host__ uint32_t rocshmem_ctx_uint32_atomic_fetch_inc(
    rocshmem_ctx_t ctx, uint32_t *dest, int pe);
__host__ uint32_t rocshmem_uint32_atomic_fetch_inc(
    uint32_t *dest, int pe);

// uint64_t
__device__ ATTR_NO_INLINE uint64_t rocshmem_ctx_uint64_atomic_fetch_inc(
    rocshmem_ctx_t ctx, uint64_t *dest, int pe);
__device__ ATTR_NO_INLINE uint64_t rocshmem_uint64_atomic_fetch_inc(
    uint64_t *dest, int pe);
__host__ uint64_t rocshmem_ctx_uint64_atomic_fetch_inc(
    rocshmem_ctx_t ctx, uint64_t *dest, int pe);
__host__ uint64_t rocshmem_uint64_atomic_fetch_inc(
    uint64_t *dest, int pe);

// size_t
__device__ ATTR_NO_INLINE size_t rocshmem_ctx_size_atomic_fetch_inc(
    rocshmem_ctx_t ctx, size_t *dest, int pe);
__device__ ATTR_NO_INLINE size_t rocshmem_size_atomic_fetch_inc(
    size_t *dest, int pe);
__host__ size_t rocshmem_ctx_size_atomic_fetch_inc(
    rocshmem_ctx_t ctx, size_t *dest, int pe);
__host__ size_t rocshmem_size_atomic_fetch_inc(
    size_t *dest, int pe);

// ptrdiff_t
__device__ ATTR_NO_INLINE ptrdiff_t rocshmem_ctx_ptrdiff_atomic_fetch_inc(
    rocshmem_ctx_t ctx, ptrdiff_t *dest, int pe);
__device__ ATTR_NO_INLINE ptrdiff_t rocshmem_ptrdiff_atomic_fetch_inc(
    ptrdiff_t *dest, int pe);
__host__ ptrdiff_t rocshmem_ctx_ptrdiff_atomic_fetch_inc(
    rocshmem_ctx_t ctx, ptrdiff_t *dest, int pe);
__host__ ptrdiff_t rocshmem_ptrdiff_atomic_fetch_inc(
    ptrdiff_t *dest, int pe);
/**
* @name SHMEM_ATOMIC_INC
* @brief Atomically adds 1 to \p dest on \p pe.
*
* The operation is blocking.
*
* This function can be called from divergent control paths at per-thread
* granularity.
*
* @param[in] ctx Context with which to perform this operation.
* @param[in] dest Destination address. Must be an address on the symmetric
* heap.
* @param[in] pe PE of the remote process.
*
* @return void
*/
// Per type: device with context, device without context, host with context,
// host without context.

// int
__device__ ATTR_NO_INLINE void rocshmem_ctx_int_atomic_inc(
    rocshmem_ctx_t ctx, int *dest, int pe);
__device__ ATTR_NO_INLINE void rocshmem_int_atomic_inc(
    int *dest, int pe);
__host__ void rocshmem_ctx_int_atomic_inc(
    rocshmem_ctx_t ctx, int *dest, int pe);
__host__ void rocshmem_int_atomic_inc(
    int *dest, int pe);

// long
__device__ ATTR_NO_INLINE void rocshmem_ctx_long_atomic_inc(
    rocshmem_ctx_t ctx, long *dest, int pe);
__device__ ATTR_NO_INLINE void rocshmem_long_atomic_inc(
    long *dest, int pe);
__host__ void rocshmem_ctx_long_atomic_inc(
    rocshmem_ctx_t ctx, long *dest, int pe);
__host__ void rocshmem_long_atomic_inc(
    long *dest, int pe);

// long long
__device__ ATTR_NO_INLINE void rocshmem_ctx_longlong_atomic_inc(
    rocshmem_ctx_t ctx, long long *dest, int pe);
__device__ ATTR_NO_INLINE void rocshmem_longlong_atomic_inc(
    long long *dest, int pe);
__host__ void rocshmem_ctx_longlong_atomic_inc(
    rocshmem_ctx_t ctx, long long *dest, int pe);
__host__ void rocshmem_longlong_atomic_inc(
    long long *dest, int pe);

// unsigned int
__device__ ATTR_NO_INLINE void rocshmem_ctx_uint_atomic_inc(
    rocshmem_ctx_t ctx, unsigned int *dest, int pe);
__device__ ATTR_NO_INLINE void rocshmem_uint_atomic_inc(
    unsigned int *dest, int pe);
__host__ void rocshmem_ctx_uint_atomic_inc(
    rocshmem_ctx_t ctx, unsigned int *dest, int pe);
__host__ void rocshmem_uint_atomic_inc(
    unsigned int *dest, int pe);

// unsigned long
__device__ ATTR_NO_INLINE void rocshmem_ctx_ulong_atomic_inc(
    rocshmem_ctx_t ctx, unsigned long *dest, int pe);
__device__ ATTR_NO_INLINE void rocshmem_ulong_atomic_inc(
    unsigned long *dest, int pe);
__host__ void rocshmem_ctx_ulong_atomic_inc(
    rocshmem_ctx_t ctx, unsigned long *dest, int pe);
__host__ void rocshmem_ulong_atomic_inc(
    unsigned long *dest, int pe);

// unsigned long long
__device__ ATTR_NO_INLINE void rocshmem_ctx_ulonglong_atomic_inc(
    rocshmem_ctx_t ctx, unsigned long long *dest, int pe);
__device__ ATTR_NO_INLINE void rocshmem_ulonglong_atomic_inc(
    unsigned long long *dest, int pe);
__host__ void rocshmem_ctx_ulonglong_atomic_inc(
    rocshmem_ctx_t ctx, unsigned long long *dest, int pe);
__host__ void rocshmem_ulonglong_atomic_inc(
    unsigned long long *dest, int pe);

// int32_t
__device__ ATTR_NO_INLINE void rocshmem_ctx_int32_atomic_inc(
    rocshmem_ctx_t ctx, int32_t *dest, int pe);
__device__ ATTR_NO_INLINE void rocshmem_int32_atomic_inc(
    int32_t *dest, int pe);
__host__ void rocshmem_ctx_int32_atomic_inc(
    rocshmem_ctx_t ctx, int32_t *dest, int pe);
__host__ void rocshmem_int32_atomic_inc(
    int32_t *dest, int pe);

// int64_t
__device__ ATTR_NO_INLINE void rocshmem_ctx_int64_atomic_inc(
    rocshmem_ctx_t ctx, int64_t *dest, int pe);
__device__ ATTR_NO_INLINE void rocshmem_int64_atomic_inc(
    int64_t *dest, int pe);
__host__ void rocshmem_ctx_int64_atomic_inc(
    rocshmem_ctx_t ctx, int64_t *dest, int pe);
__host__ void rocshmem_int64_atomic_inc(
    int64_t *dest, int pe);

// uint32_t
__device__ ATTR_NO_INLINE void rocshmem_ctx_uint32_atomic_inc(
    rocshmem_ctx_t ctx, uint32_t *dest, int pe);
__device__ ATTR_NO_INLINE void rocshmem_uint32_atomic_inc(
    uint32_t *dest, int pe);
__host__ void rocshmem_ctx_uint32_atomic_inc(
    rocshmem_ctx_t ctx, uint32_t *dest, int pe);
__host__ void rocshmem_uint32_atomic_inc(
    uint32_t *dest, int pe);

// uint64_t
__device__ ATTR_NO_INLINE void rocshmem_ctx_uint64_atomic_inc(
    rocshmem_ctx_t ctx, uint64_t *dest, int pe);
__device__ ATTR_NO_INLINE void rocshmem_uint64_atomic_inc(
    uint64_t *dest, int pe);
__host__ void rocshmem_ctx_uint64_atomic_inc(
    rocshmem_ctx_t ctx, uint64_t *dest, int pe);
__host__ void rocshmem_uint64_atomic_inc(
    uint64_t *dest, int pe);

// size_t
__device__ ATTR_NO_INLINE void rocshmem_ctx_size_atomic_inc(
    rocshmem_ctx_t ctx, size_t *dest, int pe);
__device__ ATTR_NO_INLINE void rocshmem_size_atomic_inc(
    size_t *dest, int pe);
__host__ void rocshmem_ctx_size_atomic_inc(
    rocshmem_ctx_t ctx, size_t *dest, int pe);
__host__ void rocshmem_size_atomic_inc(
    size_t *dest, int pe);

// ptrdiff_t
__device__ ATTR_NO_INLINE void rocshmem_ctx_ptrdiff_atomic_inc(
    rocshmem_ctx_t ctx, ptrdiff_t *dest, int pe);
__device__ ATTR_NO_INLINE void rocshmem_ptrdiff_atomic_inc(
    ptrdiff_t *dest, int pe);
__host__ void rocshmem_ctx_ptrdiff_atomic_inc(
    rocshmem_ctx_t ctx, ptrdiff_t *dest, int pe);
__host__ void rocshmem_ptrdiff_atomic_inc(
    ptrdiff_t *dest, int pe);
/**
* @name SHMEM_ATOMIC_FETCH_ADD
* @brief Atomically adds the value \p value to \p dest on \p pe. The
* operation returns the previous value of \p dest to the calling PE.
*
* The operation is blocking.
*
* This function can be called from divergent control paths at per-thread
* granularity.
*
* @param[in] ctx Context with which to perform this operation.
* @param[in] dest Destination address. Must be an address on the symmetric
* heap.
* @param[in] value The value to be atomically added.
* @param[in] pe PE of the remote process.
*
* @return The old value of \p dest before \p value was added.
*/
// Per type: device with context, device without context, host with context,
// host without context.

// int
__device__ ATTR_NO_INLINE int rocshmem_ctx_int_atomic_fetch_add(
    rocshmem_ctx_t ctx, int *dest, int value, int pe);
__device__ ATTR_NO_INLINE int rocshmem_int_atomic_fetch_add(
    int *dest, int value, int pe);
__host__ int rocshmem_ctx_int_atomic_fetch_add(
    rocshmem_ctx_t ctx, int *dest, int value, int pe);
__host__ int rocshmem_int_atomic_fetch_add(
    int *dest, int value, int pe);

// long
__device__ ATTR_NO_INLINE long rocshmem_ctx_long_atomic_fetch_add(
    rocshmem_ctx_t ctx, long *dest, long value, int pe);
__device__ ATTR_NO_INLINE long rocshmem_long_atomic_fetch_add(
    long *dest, long value, int pe);
__host__ long rocshmem_ctx_long_atomic_fetch_add(
    rocshmem_ctx_t ctx, long *dest, long value, int pe);
__host__ long rocshmem_long_atomic_fetch_add(
    long *dest, long value, int pe);

// long long
__device__ ATTR_NO_INLINE long long rocshmem_ctx_longlong_atomic_fetch_add(
    rocshmem_ctx_t ctx, long long *dest, long long value, int pe);
__device__ ATTR_NO_INLINE long long rocshmem_longlong_atomic_fetch_add(
    long long *dest, long long value, int pe);
__host__ long long rocshmem_ctx_longlong_atomic_fetch_add(
    rocshmem_ctx_t ctx, long long *dest, long long value, int pe);
__host__ long long rocshmem_longlong_atomic_fetch_add(
    long long *dest, long long value, int pe);

// unsigned int
__device__ ATTR_NO_INLINE unsigned int rocshmem_ctx_uint_atomic_fetch_add(
    rocshmem_ctx_t ctx, unsigned int *dest, unsigned int value, int pe);
__device__ ATTR_NO_INLINE unsigned int rocshmem_uint_atomic_fetch_add(
    unsigned int *dest, unsigned int value, int pe);
__host__ unsigned int rocshmem_ctx_uint_atomic_fetch_add(
    rocshmem_ctx_t ctx, unsigned int *dest, unsigned int value, int pe);
__host__ unsigned int rocshmem_uint_atomic_fetch_add(
    unsigned int *dest, unsigned int value, int pe);

// unsigned long
__device__ ATTR_NO_INLINE unsigned long rocshmem_ctx_ulong_atomic_fetch_add(
    rocshmem_ctx_t ctx, unsigned long *dest, unsigned long value, int pe);
__device__ ATTR_NO_INLINE unsigned long rocshmem_ulong_atomic_fetch_add(
    unsigned long *dest, unsigned long value, int pe);
__host__ unsigned long rocshmem_ctx_ulong_atomic_fetch_add(
    rocshmem_ctx_t ctx, unsigned long *dest, unsigned long value, int pe);
__host__ unsigned long rocshmem_ulong_atomic_fetch_add(
    unsigned long *dest, unsigned long value, int pe);

// unsigned long long
__device__ ATTR_NO_INLINE unsigned long long
rocshmem_ctx_ulonglong_atomic_fetch_add(
    rocshmem_ctx_t ctx, unsigned long long *dest, unsigned long long value,
    int pe);
__device__ ATTR_NO_INLINE unsigned long long
rocshmem_ulonglong_atomic_fetch_add(
    unsigned long long *dest, unsigned long long value, int pe);
__host__ unsigned long long rocshmem_ctx_ulonglong_atomic_fetch_add(
    rocshmem_ctx_t ctx, unsigned long long *dest, unsigned long long value,
    int pe);
__host__ unsigned long long rocshmem_ulonglong_atomic_fetch_add(
    unsigned long long *dest, unsigned long long value, int pe);

// int32_t
__device__ ATTR_NO_INLINE int32_t rocshmem_ctx_int32_atomic_fetch_add(
    rocshmem_ctx_t ctx, int32_t *dest, int32_t value, int pe);
__device__ ATTR_NO_INLINE int32_t rocshmem_int32_atomic_fetch_add(
    int32_t *dest, int32_t value, int pe);
__host__ int32_t rocshmem_ctx_int32_atomic_fetch_add(
    rocshmem_ctx_t ctx, int32_t *dest, int32_t value, int pe);
__host__ int32_t rocshmem_int32_atomic_fetch_add(
    int32_t *dest, int32_t value, int pe);

// int64_t
__device__ ATTR_NO_INLINE int64_t rocshmem_ctx_int64_atomic_fetch_add(
    rocshmem_ctx_t ctx, int64_t *dest, int64_t value, int pe);
__device__ ATTR_NO_INLINE int64_t rocshmem_int64_atomic_fetch_add(
    int64_t *dest, int64_t value, int pe);
__host__ int64_t rocshmem_ctx_int64_atomic_fetch_add(
    rocshmem_ctx_t ctx, int64_t *dest, int64_t value, int pe);
__host__ int64_t rocshmem_int64_atomic_fetch_add(
    int64_t *dest, int64_t value, int pe);

// uint32_t
__device__ ATTR_NO_INLINE uint32_t rocshmem_ctx_uint32_atomic_fetch_add(
    rocshmem_ctx_t ctx, uint32_t *dest, uint32_t value, int pe);
__device__ ATTR_NO_INLINE uint32_t rocshmem_uint32_atomic_fetch_add(
    uint32_t *dest, uint32_t value, int pe);
__host__ uint32_t rocshmem_ctx_uint32_atomic_fetch_add(
    rocshmem_ctx_t ctx, uint32_t *dest, uint32_t value, int pe);
__host__ uint32_t rocshmem_uint32_atomic_fetch_add(
    uint32_t *dest, uint32_t value, int pe);

// uint64_t
__device__ ATTR_NO_INLINE uint64_t rocshmem_ctx_uint64_atomic_fetch_add(
    rocshmem_ctx_t ctx, uint64_t *dest, uint64_t value, int pe);
__device__ ATTR_NO_INLINE uint64_t rocshmem_uint64_atomic_fetch_add(
    uint64_t *dest, uint64_t value, int pe);
__host__ uint64_t rocshmem_ctx_uint64_atomic_fetch_add(
    rocshmem_ctx_t ctx, uint64_t *dest, uint64_t value, int pe);
__host__ uint64_t rocshmem_uint64_atomic_fetch_add(
    uint64_t *dest, uint64_t value, int pe);

// size_t
__device__ ATTR_NO_INLINE size_t rocshmem_ctx_size_atomic_fetch_add(
    rocshmem_ctx_t ctx, size_t *dest, size_t value, int pe);
__device__ ATTR_NO_INLINE size_t rocshmem_size_atomic_fetch_add(
    size_t *dest, size_t value, int pe);
__host__ size_t rocshmem_ctx_size_atomic_fetch_add(
    rocshmem_ctx_t ctx, size_t *dest, size_t value, int pe);
__host__ size_t rocshmem_size_atomic_fetch_add(
    size_t *dest, size_t value, int pe);

// ptrdiff_t
__device__ ATTR_NO_INLINE ptrdiff_t rocshmem_ctx_ptrdiff_atomic_fetch_add(
    rocshmem_ctx_t ctx, ptrdiff_t *dest, ptrdiff_t value, int pe);
__device__ ATTR_NO_INLINE ptrdiff_t rocshmem_ptrdiff_atomic_fetch_add(
    ptrdiff_t *dest, ptrdiff_t value, int pe);
__host__ ptrdiff_t rocshmem_ctx_ptrdiff_atomic_fetch_add(
    rocshmem_ctx_t ctx, ptrdiff_t *dest, ptrdiff_t value, int pe);
__host__ ptrdiff_t rocshmem_ptrdiff_atomic_fetch_add(
    ptrdiff_t *dest, ptrdiff_t value, int pe);
/**
* @name SHMEM_ATOMIC_ADD
* @brief Atomically add the value \p value to \p dest on \p pe.
*
* The operation is blocking.
*
* This function can be called from divergent control paths at per-thread
* granularity.
*
* @param[in] ctx Context with which to perform this operation
* (rocshmem_ctx_* variants only).
* @param[in] dest Destination address. Must be an address on the symmetric
* heap.
* @param[in] value The value to be atomically added.
* @param[in] pe PE of the remote process.
*
* @return void
*/
__device__
ATTR_NO_INLINE
void
rocshmem_ctx_int_atomic_add
(
rocshmem_ctx_t
ctx
,
int
*
dest
,
int
value
,
int
pe
);
__device__
ATTR_NO_INLINE
void
rocshmem_int_atomic_add
(
int
*
dest
,
int
value
,
int
pe
);
__host__
void
rocshmem_ctx_int_atomic_add
(
rocshmem_ctx_t
ctx
,
int
*
dest
,
int
value
,
int
pe
);
__host__
void
rocshmem_int_atomic_add
(
int
*
dest
,
int
value
,
int
pe
);
__device__
ATTR_NO_INLINE
void
rocshmem_ctx_long_atomic_add
(
rocshmem_ctx_t
ctx
,
long
*
dest
,
long
value
,
int
pe
);
__device__
ATTR_NO_INLINE
void
rocshmem_long_atomic_add
(
long
*
dest
,
long
value
,
int
pe
);
__host__
void
rocshmem_ctx_long_atomic_add
(
rocshmem_ctx_t
ctx
,
long
*
dest
,
long
value
,
int
pe
);
__host__
void
rocshmem_long_atomic_add
(
long
*
dest
,
long
value
,
int
pe
);
__device__
ATTR_NO_INLINE
void
rocshmem_ctx_longlong_atomic_add
(
rocshmem_ctx_t
ctx
,
long
long
*
dest
,
long
long
value
,
int
pe
);
__device__
ATTR_NO_INLINE
void
rocshmem_longlong_atomic_add
(
long
long
*
dest
,
long
long
value
,
int
pe
);
__host__
void
rocshmem_ctx_longlong_atomic_add
(
rocshmem_ctx_t
ctx
,
long
long
*
dest
,
long
long
value
,
int
pe
);
__host__
void
rocshmem_longlong_atomic_add
(
long
long
*
dest
,
long
long
value
,
int
pe
);
__device__
ATTR_NO_INLINE
void
rocshmem_ctx_uint_atomic_add
(
rocshmem_ctx_t
ctx
,
unsigned
int
*
dest
,
unsigned
int
value
,
int
pe
);
__device__
ATTR_NO_INLINE
void
rocshmem_uint_atomic_add
(
unsigned
int
*
dest
,
unsigned
int
value
,
int
pe
);
__host__
void
rocshmem_ctx_uint_atomic_add
(
rocshmem_ctx_t
ctx
,
unsigned
int
*
dest
,
unsigned
int
value
,
int
pe
);
__host__
void
rocshmem_uint_atomic_add
(
unsigned
int
*
dest
,
unsigned
int
value
,
int
pe
);
__device__
ATTR_NO_INLINE
void
rocshmem_ctx_ulong_atomic_add
(
rocshmem_ctx_t
ctx
,
unsigned
long
*
dest
,
unsigned
long
value
,
int
pe
);
__device__
ATTR_NO_INLINE
void
rocshmem_ulong_atomic_add
(
unsigned
long
*
dest
,
unsigned
long
value
,
int
pe
);
__host__
void
rocshmem_ctx_ulong_atomic_add
(
rocshmem_ctx_t
ctx
,
unsigned
long
*
dest
,
unsigned
long
value
,
int
pe
);
__host__
void
rocshmem_ulong_atomic_add
(
unsigned
long
*
dest
,
unsigned
long
value
,
int
pe
);
__device__
ATTR_NO_INLINE
void
rocshmem_ctx_ulonglong_atomic_add
(
rocshmem_ctx_t
ctx
,
unsigned
long
long
*
dest
,
unsigned
long
long
value
,
int
pe
);
__device__
ATTR_NO_INLINE
void
rocshmem_ulonglong_atomic_add
(
unsigned
long
long
*
dest
,
unsigned
long
long
value
,
int
pe
);
__host__
void
rocshmem_ctx_ulonglong_atomic_add
(
rocshmem_ctx_t
ctx
,
unsigned
long
long
*
dest
,
unsigned
long
long
value
,
int
pe
);
__host__
void
rocshmem_ulonglong_atomic_add
(
unsigned
long
long
*
dest
,
unsigned
long
long
value
,
int
pe
);
__device__
ATTR_NO_INLINE
void
rocshmem_ctx_int32_atomic_add
(
rocshmem_ctx_t
ctx
,
int32_t
*
dest
,
int32_t
value
,
int
pe
);
__device__
ATTR_NO_INLINE
void
rocshmem_int32_atomic_add
(
int32_t
*
dest
,
int32_t
value
,
int
pe
);
__host__
void
rocshmem_ctx_int32_atomic_add
(
rocshmem_ctx_t
ctx
,
int32_t
*
dest
,
int32_t
value
,
int
pe
);
__host__
void
rocshmem_int32_atomic_add
(
int32_t
*
dest
,
int32_t
value
,
int
pe
);
__device__
ATTR_NO_INLINE
void
rocshmem_ctx_int64_atomic_add
(
rocshmem_ctx_t
ctx
,
int64_t
*
dest
,
int64_t
value
,
int
pe
);
__device__
ATTR_NO_INLINE
void
rocshmem_int64_atomic_add
(
int64_t
*
dest
,
int64_t
value
,
int
pe
);
__host__
void
rocshmem_ctx_int64_atomic_add
(
rocshmem_ctx_t
ctx
,
int64_t
*
dest
,
int64_t
value
,
int
pe
);
__host__
void
rocshmem_int64_atomic_add
(
int64_t
*
dest
,
int64_t
value
,
int
pe
);
__device__
ATTR_NO_INLINE
void
rocshmem_ctx_uint32_atomic_add
(
rocshmem_ctx_t
ctx
,
uint32_t
*
dest
,
uint32_t
value
,
int
pe
);
__device__
ATTR_NO_INLINE
void
rocshmem_uint32_atomic_add
(
uint32_t
*
dest
,
uint32_t
value
,
int
pe
);
__host__
void
rocshmem_ctx_uint32_atomic_add
(
rocshmem_ctx_t
ctx
,
uint32_t
*
dest
,
uint32_t
value
,
int
pe
);
__host__
void
rocshmem_uint32_atomic_add
(
uint32_t
*
dest
,
uint32_t
value
,
int
pe
);
__device__
ATTR_NO_INLINE
void
rocshmem_ctx_uint64_atomic_add
(
rocshmem_ctx_t
ctx
,
uint64_t
*
dest
,
uint64_t
value
,
int
pe
);
__device__
ATTR_NO_INLINE
void
rocshmem_uint64_atomic_add
(
uint64_t
*
dest
,
uint64_t
value
,
int
pe
);
__host__
void
rocshmem_ctx_uint64_atomic_add
(
rocshmem_ctx_t
ctx
,
uint64_t
*
dest
,
uint64_t
value
,
int
pe
);
__host__
void
rocshmem_uint64_atomic_add
(
uint64_t
*
dest
,
uint64_t
value
,
int
pe
);
__device__
ATTR_NO_INLINE
void
rocshmem_ctx_size_atomic_add
(
rocshmem_ctx_t
ctx
,
size_t
*
dest
,
size_t
value
,
int
pe
);
__device__
ATTR_NO_INLINE
void
rocshmem_size_atomic_add
(
size_t
*
dest
,
size_t
value
,
int
pe
);
__host__
void
rocshmem_ctx_size_atomic_add
(
rocshmem_ctx_t
ctx
,
size_t
*
dest
,
size_t
value
,
int
pe
);
__host__
void
rocshmem_size_atomic_add
(
size_t
*
dest
,
size_t
value
,
int
pe
);
__device__
ATTR_NO_INLINE
void
rocshmem_ctx_ptrdiff_atomic_add
(
rocshmem_ctx_t
ctx
,
ptrdiff_t
*
dest
,
ptrdiff_t
value
,
int
pe
);
__device__
ATTR_NO_INLINE
void
rocshmem_ptrdiff_atomic_add
(
ptrdiff_t
*
dest
,
ptrdiff_t
value
,
int
pe
);
__host__
void
rocshmem_ctx_ptrdiff_atomic_add
(
rocshmem_ctx_t
ctx
,
ptrdiff_t
*
dest
,
ptrdiff_t
value
,
int
pe
);
__host__
void
rocshmem_ptrdiff_atomic_add
(
ptrdiff_t
*
dest
,
ptrdiff_t
value
,
int
pe
);
/**
* @name SHMEM_ATOMIC_FETCH_AND
* @brief Atomically bitwise-AND the value \p value into \p dest on \p pe and
* return the original value.
*
* The operation is blocking.
*
* This function can be called from divergent control paths at per-thread
* granularity.
*
* @param[in] ctx Context with which to perform this operation
* (rocshmem_ctx_* variants only).
* @param[in] dest Destination address. Must be an address on the symmetric
* heap.
* @param[in] value The value to be atomically AND-ed with \p dest.
* @param[in] pe PE of the remote process.
*
* @return original value
*/
__device__
ATTR_NO_INLINE
unsigned
int
rocshmem_ctx_uint_atomic_fetch_and
(
rocshmem_ctx_t
ctx
,
unsigned
int
*
dest
,
unsigned
int
value
,
int
pe
);
__device__
ATTR_NO_INLINE
unsigned
int
rocshmem_uint_atomic_fetch_and
(
unsigned
int
*
dest
,
unsigned
int
value
,
int
pe
);
__host__
unsigned
int
rocshmem_ctx_uint_atomic_fetch_and
(
rocshmem_ctx_t
ctx
,
unsigned
int
*
dest
,
unsigned
int
value
,
int
pe
);
__host__
unsigned
int
rocshmem_uint_atomic_fetch_and
(
unsigned
int
*
dest
,
unsigned
int
value
,
int
pe
);
__device__
ATTR_NO_INLINE
unsigned
long
rocshmem_ctx_ulong_atomic_fetch_and
(
rocshmem_ctx_t
ctx
,
unsigned
long
*
dest
,
unsigned
long
value
,
int
pe
);
__device__
ATTR_NO_INLINE
unsigned
long
rocshmem_ulong_atomic_fetch_and
(
unsigned
long
*
dest
,
unsigned
long
value
,
int
pe
);
__host__
unsigned
long
rocshmem_ctx_ulong_atomic_fetch_and
(
rocshmem_ctx_t
ctx
,
unsigned
long
*
dest
,
unsigned
long
value
,
int
pe
);
__host__
unsigned
long
rocshmem_ulong_atomic_fetch_and
(
unsigned
long
*
dest
,
unsigned
long
value
,
int
pe
);
__device__
ATTR_NO_INLINE
unsigned
long
long
rocshmem_ctx_ulonglong_atomic_fetch_and
(
rocshmem_ctx_t
ctx
,
unsigned
long
long
*
dest
,
unsigned
long
long
value
,
int
pe
);
__device__
ATTR_NO_INLINE
unsigned
long
long
rocshmem_ulonglong_atomic_fetch_and
(
unsigned
long
long
*
dest
,
unsigned
long
long
value
,
int
pe
);
__host__
unsigned
long
long
rocshmem_ctx_ulonglong_atomic_fetch_and
(
rocshmem_ctx_t
ctx
,
unsigned
long
long
*
dest
,
unsigned
long
long
value
,
int
pe
);
__host__
unsigned
long
long
rocshmem_ulonglong_atomic_fetch_and
(
unsigned
long
long
*
dest
,
unsigned
long
long
value
,
int
pe
);
__device__
ATTR_NO_INLINE
int32_t
rocshmem_ctx_int32_atomic_fetch_and
(
rocshmem_ctx_t
ctx
,
int32_t
*
dest
,
int32_t
value
,
int
pe
);
__device__
ATTR_NO_INLINE
int32_t
rocshmem_int32_atomic_fetch_and
(
int32_t
*
dest
,
int32_t
value
,
int
pe
);
__host__
int32_t
rocshmem_ctx_int32_atomic_fetch_and
(
rocshmem_ctx_t
ctx
,
int32_t
*
dest
,
int32_t
value
,
int
pe
);
__host__
int32_t
rocshmem_int32_atomic_fetch_and
(
int32_t
*
dest
,
int32_t
value
,
int
pe
);
__device__
ATTR_NO_INLINE
int64_t
rocshmem_ctx_int64_atomic_fetch_and
(
rocshmem_ctx_t
ctx
,
int64_t
*
dest
,
int64_t
value
,
int
pe
);
__device__
ATTR_NO_INLINE
int64_t
rocshmem_int64_atomic_fetch_and
(
int64_t
*
dest
,
int64_t
value
,
int
pe
);
__host__
int64_t
rocshmem_ctx_int64_atomic_fetch_and
(
rocshmem_ctx_t
ctx
,
int64_t
*
dest
,
int64_t
value
,
int
pe
);
__host__
int64_t
rocshmem_int64_atomic_fetch_and
(
int64_t
*
dest
,
int64_t
value
,
int
pe
);
__device__
ATTR_NO_INLINE
uint32_t
rocshmem_ctx_uint32_atomic_fetch_and
(
rocshmem_ctx_t
ctx
,
uint32_t
*
dest
,
uint32_t
value
,
int
pe
);
__device__
ATTR_NO_INLINE
uint32_t
rocshmem_uint32_atomic_fetch_and
(
uint32_t
*
dest
,
uint32_t
value
,
int
pe
);
__host__
uint32_t
rocshmem_ctx_uint32_atomic_fetch_and
(
rocshmem_ctx_t
ctx
,
uint32_t
*
dest
,
uint32_t
value
,
int
pe
);
__host__
uint32_t
rocshmem_uint32_atomic_fetch_and
(
uint32_t
*
dest
,
uint32_t
value
,
int
pe
);
__device__
ATTR_NO_INLINE
uint64_t
rocshmem_ctx_uint64_atomic_fetch_and
(
rocshmem_ctx_t
ctx
,
uint64_t
*
dest
,
uint64_t
value
,
int
pe
);
__device__
ATTR_NO_INLINE
uint64_t
rocshmem_uint64_atomic_fetch_and
(
uint64_t
*
dest
,
uint64_t
value
,
int
pe
);
__host__
uint64_t
rocshmem_ctx_uint64_atomic_fetch_and
(
rocshmem_ctx_t
ctx
,
uint64_t
*
dest
,
uint64_t
value
,
int
pe
);
__host__
uint64_t
rocshmem_uint64_atomic_fetch_and
(
uint64_t
*
dest
,
uint64_t
value
,
int
pe
);
/**
* @name SHMEM_ATOMIC_AND
* @brief Atomically bitwise-AND the value \p value into \p dest on \p pe.
*
* The operation is blocking.
*
* This function can be called from divergent control paths at per-thread
* granularity.
*
* @param[in] ctx Context with which to perform this operation
* (rocshmem_ctx_* variants only).
* @param[in] dest Destination address. Must be an address on the symmetric
* heap.
* @param[in] value The value to be atomically AND-ed with \p dest.
* @param[in] pe PE of the remote process.
*
* @return void
*/
__device__
ATTR_NO_INLINE
void
rocshmem_ctx_uint_atomic_and
(
rocshmem_ctx_t
ctx
,
unsigned
int
*
dest
,
unsigned
int
value
,
int
pe
);
__device__
ATTR_NO_INLINE
void
rocshmem_uint_atomic_and
(
unsigned
int
*
dest
,
unsigned
int
value
,
int
pe
);
__host__
void
rocshmem_ctx_uint_atomic_and
(
rocshmem_ctx_t
ctx
,
unsigned
int
*
dest
,
unsigned
int
value
,
int
pe
);
__host__
void
rocshmem_uint_atomic_and
(
unsigned
int
*
dest
,
unsigned
int
value
,
int
pe
);
__device__
ATTR_NO_INLINE
void
rocshmem_ctx_ulong_atomic_and
(
rocshmem_ctx_t
ctx
,
unsigned
long
*
dest
,
unsigned
long
value
,
int
pe
);
__device__
ATTR_NO_INLINE
void
rocshmem_ulong_atomic_and
(
unsigned
long
*
dest
,
unsigned
long
value
,
int
pe
);
__host__
void
rocshmem_ctx_ulong_atomic_and
(
rocshmem_ctx_t
ctx
,
unsigned
long
*
dest
,
unsigned
long
value
,
int
pe
);
__host__
void
rocshmem_ulong_atomic_and
(
unsigned
long
*
dest
,
unsigned
long
value
,
int
pe
);
__device__
ATTR_NO_INLINE
void
rocshmem_ctx_ulonglong_atomic_and
(
rocshmem_ctx_t
ctx
,
unsigned
long
long
*
dest
,
unsigned
long
long
value
,
int
pe
);
__device__
ATTR_NO_INLINE
void
rocshmem_ulonglong_atomic_and
(
unsigned
long
long
*
dest
,
unsigned
long
long
value
,
int
pe
);
__host__
void
rocshmem_ctx_ulonglong_atomic_and
(
rocshmem_ctx_t
ctx
,
unsigned
long
long
*
dest
,
unsigned
long
long
value
,
int
pe
);
__host__
void
rocshmem_ulonglong_atomic_and
(
unsigned
long
long
*
dest
,
unsigned
long
long
value
,
int
pe
);
__device__
ATTR_NO_INLINE
void
rocshmem_ctx_int32_atomic_and
(
rocshmem_ctx_t
ctx
,
int32_t
*
dest
,
int32_t
value
,
int
pe
);
__device__
ATTR_NO_INLINE
void
rocshmem_int32_atomic_and
(
int32_t
*
dest
,
int32_t
value
,
int
pe
);
__host__
void
rocshmem_ctx_int32_atomic_and
(
rocshmem_ctx_t
ctx
,
int32_t
*
dest
,
int32_t
value
,
int
pe
);
__host__
void
rocshmem_int32_atomic_and
(
int32_t
*
dest
,
int32_t
value
,
int
pe
);
__device__
ATTR_NO_INLINE
void
rocshmem_ctx_int64_atomic_and
(
rocshmem_ctx_t
ctx
,
int64_t
*
dest
,
int64_t
value
,
int
pe
);
__device__
ATTR_NO_INLINE
void
rocshmem_int64_atomic_and
(
int64_t
*
dest
,
int64_t
value
,
int
pe
);
__host__
void
rocshmem_ctx_int64_atomic_and
(
rocshmem_ctx_t
ctx
,
int64_t
*
dest
,
int64_t
value
,
int
pe
);
__host__
void
rocshmem_int64_atomic_and
(
int64_t
*
dest
,
int64_t
value
,
int
pe
);
__device__
ATTR_NO_INLINE
void
rocshmem_ctx_uint32_atomic_and
(
rocshmem_ctx_t
ctx
,
uint32_t
*
dest
,
uint32_t
value
,
int
pe
);
__device__
ATTR_NO_INLINE
void
rocshmem_uint32_atomic_and
(
uint32_t
*
dest
,
uint32_t
value
,
int
pe
);
__host__
void
rocshmem_ctx_uint32_atomic_and
(
rocshmem_ctx_t
ctx
,
uint32_t
*
dest
,
uint32_t
value
,
int
pe
);
__host__
void
rocshmem_uint32_atomic_and
(
uint32_t
*
dest
,
uint32_t
value
,
int
pe
);
__device__
ATTR_NO_INLINE
void
rocshmem_ctx_uint64_atomic_and
(
rocshmem_ctx_t
ctx
,
uint64_t
*
dest
,
uint64_t
value
,
int
pe
);
__device__
ATTR_NO_INLINE
void
rocshmem_uint64_atomic_and
(
uint64_t
*
dest
,
uint64_t
value
,
int
pe
);
__host__
void
rocshmem_ctx_uint64_atomic_and
(
rocshmem_ctx_t
ctx
,
uint64_t
*
dest
,
uint64_t
value
,
int
pe
);
__host__
void
rocshmem_uint64_atomic_and
(
uint64_t
*
dest
,
uint64_t
value
,
int
pe
);
/**
* @name SHMEM_ATOMIC_FETCH_OR
* @brief Atomically bitwise-OR the value \p value into \p dest on \p pe and
* return the original value.
*
* The operation is blocking.
*
* This function can be called from divergent control paths at per-thread
* granularity.
*
* @param[in] ctx Context with which to perform this operation
* (rocshmem_ctx_* variants only).
* @param[in] dest Destination address. Must be an address on the symmetric
* heap.
* @param[in] value The value to be atomically OR-ed with \p dest.
* @param[in] pe PE of the remote process.
*
* @return original value
*/
__device__
ATTR_NO_INLINE
unsigned
int
rocshmem_ctx_uint_atomic_fetch_or
(
rocshmem_ctx_t
ctx
,
unsigned
int
*
dest
,
unsigned
int
value
,
int
pe
);
__device__
ATTR_NO_INLINE
unsigned
int
rocshmem_uint_atomic_fetch_or
(
unsigned
int
*
dest
,
unsigned
int
value
,
int
pe
);
__host__
unsigned
int
rocshmem_ctx_uint_atomic_fetch_or
(
rocshmem_ctx_t
ctx
,
unsigned
int
*
dest
,
unsigned
int
value
,
int
pe
);
__host__
unsigned
int
rocshmem_uint_atomic_fetch_or
(
unsigned
int
*
dest
,
unsigned
int
value
,
int
pe
);
__device__
ATTR_NO_INLINE
unsigned
long
rocshmem_ctx_ulong_atomic_fetch_or
(
rocshmem_ctx_t
ctx
,
unsigned
long
*
dest
,
unsigned
long
value
,
int
pe
);
__device__
ATTR_NO_INLINE
unsigned
long
rocshmem_ulong_atomic_fetch_or
(
unsigned
long
*
dest
,
unsigned
long
value
,
int
pe
);
__host__
unsigned
long
rocshmem_ctx_ulong_atomic_fetch_or
(
rocshmem_ctx_t
ctx
,
unsigned
long
*
dest
,
unsigned
long
value
,
int
pe
);
__host__
unsigned
long
rocshmem_ulong_atomic_fetch_or
(
unsigned
long
*
dest
,
unsigned
long
value
,
int
pe
);
__device__
ATTR_NO_INLINE
unsigned
long
long
rocshmem_ctx_ulonglong_atomic_fetch_or
(
rocshmem_ctx_t
ctx
,
unsigned
long
long
*
dest
,
unsigned
long
long
value
,
int
pe
);
__device__
ATTR_NO_INLINE
unsigned
long
long
rocshmem_ulonglong_atomic_fetch_or
(
unsigned
long
long
*
dest
,
unsigned
long
long
value
,
int
pe
);
__host__
unsigned
long
long
rocshmem_ctx_ulonglong_atomic_fetch_or
(
rocshmem_ctx_t
ctx
,
unsigned
long
long
*
dest
,
unsigned
long
long
value
,
int
pe
);
__host__
unsigned
long
long
rocshmem_ulonglong_atomic_fetch_or
(
unsigned
long
long
*
dest
,
unsigned
long
long
value
,
int
pe
);
__device__
ATTR_NO_INLINE
int32_t
rocshmem_ctx_int32_atomic_fetch_or
(
rocshmem_ctx_t
ctx
,
int32_t
*
dest
,
int32_t
value
,
int
pe
);
__device__
ATTR_NO_INLINE
int32_t
rocshmem_int32_atomic_fetch_or
(
int32_t
*
dest
,
int32_t
value
,
int
pe
);
__host__
int32_t
rocshmem_ctx_int32_atomic_fetch_or
(
rocshmem_ctx_t
ctx
,
int32_t
*
dest
,
int32_t
value
,
int
pe
);
__host__
int32_t
rocshmem_int32_atomic_fetch_or
(
int32_t
*
dest
,
int32_t
value
,
int
pe
);
__device__
ATTR_NO_INLINE
int64_t
rocshmem_ctx_int64_atomic_fetch_or
(
rocshmem_ctx_t
ctx
,
int64_t
*
dest
,
int64_t
value
,
int
pe
);
__device__
ATTR_NO_INLINE
int64_t
rocshmem_int64_atomic_fetch_or
(
int64_t
*
dest
,
int64_t
value
,
int
pe
);
__host__
int64_t
rocshmem_ctx_int64_atomic_fetch_or
(
rocshmem_ctx_t
ctx
,
int64_t
*
dest
,
int64_t
value
,
int
pe
);
__host__
int64_t
rocshmem_int64_atomic_fetch_or
(
int64_t
*
dest
,
int64_t
value
,
int
pe
);
__device__
ATTR_NO_INLINE
uint32_t
rocshmem_ctx_uint32_atomic_fetch_or
(
rocshmem_ctx_t
ctx
,
uint32_t
*
dest
,
uint32_t
value
,
int
pe
);
__device__
ATTR_NO_INLINE
uint32_t
rocshmem_uint32_atomic_fetch_or
(
uint32_t
*
dest
,
uint32_t
value
,
int
pe
);
__host__
uint32_t
rocshmem_ctx_uint32_atomic_fetch_or
(
rocshmem_ctx_t
ctx
,
uint32_t
*
dest
,
uint32_t
value
,
int
pe
);
__host__
uint32_t
rocshmem_uint32_atomic_fetch_or
(
uint32_t
*
dest
,
uint32_t
value
,
int
pe
);
__device__
ATTR_NO_INLINE
uint64_t
rocshmem_ctx_uint64_atomic_fetch_or
(
rocshmem_ctx_t
ctx
,
uint64_t
*
dest
,
uint64_t
value
,
int
pe
);
__device__
ATTR_NO_INLINE
uint64_t
rocshmem_uint64_atomic_fetch_or
(
uint64_t
*
dest
,
uint64_t
value
,
int
pe
);
__host__
uint64_t
rocshmem_ctx_uint64_atomic_fetch_or
(
rocshmem_ctx_t
ctx
,
uint64_t
*
dest
,
uint64_t
value
,
int
pe
);
__host__
uint64_t
rocshmem_uint64_atomic_fetch_or
(
uint64_t
*
dest
,
uint64_t
value
,
int
pe
);
/**
* @name SHMEM_ATOMIC_OR
* @brief Atomically bitwise-OR the value \p value into \p dest on \p pe.
*
* The operation is blocking.
*
* This function can be called from divergent control paths at per-thread
* granularity.
*
* @param[in] ctx Context with which to perform this operation
* (rocshmem_ctx_* variants only).
* @param[in] dest Destination address. Must be an address on the symmetric
* heap.
* @param[in] value The value to be atomically OR-ed with \p dest.
* @param[in] pe PE of the remote process.
*
* @return void
*/
__device__
ATTR_NO_INLINE
void
rocshmem_ctx_uint_atomic_or
(
rocshmem_ctx_t
ctx
,
unsigned
int
*
dest
,
unsigned
int
value
,
int
pe
);
__device__
ATTR_NO_INLINE
void
rocshmem_uint_atomic_or
(
unsigned
int
*
dest
,
unsigned
int
value
,
int
pe
);
__host__
void
rocshmem_ctx_uint_atomic_or
(
rocshmem_ctx_t
ctx
,
unsigned
int
*
dest
,
unsigned
int
value
,
int
pe
);
__host__
void
rocshmem_uint_atomic_or
(
unsigned
int
*
dest
,
unsigned
int
value
,
int
pe
);
__device__
ATTR_NO_INLINE
void
rocshmem_ctx_ulong_atomic_or
(
rocshmem_ctx_t
ctx
,
unsigned
long
*
dest
,
unsigned
long
value
,
int
pe
);
__device__
ATTR_NO_INLINE
void
rocshmem_ulong_atomic_or
(
unsigned
long
*
dest
,
unsigned
long
value
,
int
pe
);
__host__
void
rocshmem_ctx_ulong_atomic_or
(
rocshmem_ctx_t
ctx
,
unsigned
long
*
dest
,
unsigned
long
value
,
int
pe
);
__host__
void
rocshmem_ulong_atomic_or
(
unsigned
long
*
dest
,
unsigned
long
value
,
int
pe
);
__device__
ATTR_NO_INLINE
void
rocshmem_ctx_ulonglong_atomic_or
(
rocshmem_ctx_t
ctx
,
unsigned
long
long
*
dest
,
unsigned
long
long
value
,
int
pe
);
__device__
ATTR_NO_INLINE
void
rocshmem_ulonglong_atomic_or
(
unsigned
long
long
*
dest
,
unsigned
long
long
value
,
int
pe
);
__host__
void
rocshmem_ctx_ulonglong_atomic_or
(
rocshmem_ctx_t
ctx
,
unsigned
long
long
*
dest
,
unsigned
long
long
value
,
int
pe
);
__host__
void
rocshmem_ulonglong_atomic_or
(
unsigned
long
long
*
dest
,
unsigned
long
long
value
,
int
pe
);
__device__
ATTR_NO_INLINE
void
rocshmem_ctx_int32_atomic_or
(
rocshmem_ctx_t
ctx
,
int32_t
*
dest
,
int32_t
value
,
int
pe
);
__device__
ATTR_NO_INLINE
void
rocshmem_int32_atomic_or
(
int32_t
*
dest
,
int32_t
value
,
int
pe
);
__host__
void
rocshmem_ctx_int32_atomic_or
(
rocshmem_ctx_t
ctx
,
int32_t
*
dest
,
int32_t
value
,
int
pe
);
__host__
void
rocshmem_int32_atomic_or
(
int32_t
*
dest
,
int32_t
value
,
int
pe
);
__device__
ATTR_NO_INLINE
void
rocshmem_ctx_int64_atomic_or
(
rocshmem_ctx_t
ctx
,
int64_t
*
dest
,
int64_t
value
,
int
pe
);
__device__
ATTR_NO_INLINE
void
rocshmem_int64_atomic_or
(
int64_t
*
dest
,
int64_t
value
,
int
pe
);
__host__
void
rocshmem_ctx_int64_atomic_or
(
rocshmem_ctx_t
ctx
,
int64_t
*
dest
,
int64_t
value
,
int
pe
);
__host__
void
rocshmem_int64_atomic_or
(
int64_t
*
dest
,
int64_t
value
,
int
pe
);
__device__
ATTR_NO_INLINE
void
rocshmem_ctx_uint32_atomic_or
(
rocshmem_ctx_t
ctx
,
uint32_t
*
dest
,
uint32_t
value
,
int
pe
);
__device__
ATTR_NO_INLINE
void
rocshmem_uint32_atomic_or
(
uint32_t
*
dest
,
uint32_t
value
,
int
pe
);
__host__
void
rocshmem_ctx_uint32_atomic_or
(
rocshmem_ctx_t
ctx
,
uint32_t
*
dest
,
uint32_t
value
,
int
pe
);
__host__
void
rocshmem_uint32_atomic_or
(
uint32_t
*
dest
,
uint32_t
value
,
int
pe
);
__device__
ATTR_NO_INLINE
void
rocshmem_ctx_uint64_atomic_or
(
rocshmem_ctx_t
ctx
,
uint64_t
*
dest
,
uint64_t
value
,
int
pe
);
__device__
ATTR_NO_INLINE
void
rocshmem_uint64_atomic_or
(
uint64_t
*
dest
,
uint64_t
value
,
int
pe
);
__host__
void
rocshmem_ctx_uint64_atomic_or
(
rocshmem_ctx_t
ctx
,
uint64_t
*
dest
,
uint64_t
value
,
int
pe
);
__host__
void
rocshmem_uint64_atomic_or
(
uint64_t
*
dest
,
uint64_t
value
,
int
pe
);
/**
* @name SHMEM_ATOMIC_FETCH_XOR
* @brief Atomically bitwise-XOR the value \p value into \p dest on \p pe and
* return the original value.
*
* The operation is blocking.
*
* This function can be called from divergent control paths at per-thread
* granularity.
*
* @param[in] ctx Context with which to perform this operation
* (rocshmem_ctx_* variants only).
* @param[in] dest Destination address. Must be an address on the symmetric
* heap.
* @param[in] value The value to be atomically XOR-ed with \p dest.
* @param[in] pe PE of the remote process.
*
* @return original value
*/
__device__
ATTR_NO_INLINE
unsigned
int
rocshmem_ctx_uint_atomic_fetch_xor
(
rocshmem_ctx_t
ctx
,
unsigned
int
*
dest
,
unsigned
int
value
,
int
pe
);
__device__
ATTR_NO_INLINE
unsigned
int
rocshmem_uint_atomic_fetch_xor
(
unsigned
int
*
dest
,
unsigned
int
value
,
int
pe
);
__host__
unsigned
int
rocshmem_ctx_uint_atomic_fetch_xor
(
rocshmem_ctx_t
ctx
,
unsigned
int
*
dest
,
unsigned
int
value
,
int
pe
);
__host__
unsigned
int
rocshmem_uint_atomic_fetch_xor
(
unsigned
int
*
dest
,
unsigned
int
value
,
int
pe
);
__device__
ATTR_NO_INLINE
unsigned
long
rocshmem_ctx_ulong_atomic_fetch_xor
(
rocshmem_ctx_t
ctx
,
unsigned
long
*
dest
,
unsigned
long
value
,
int
pe
);
__device__
ATTR_NO_INLINE
unsigned
long
rocshmem_ulong_atomic_fetch_xor
(
unsigned
long
*
dest
,
unsigned
long
value
,
int
pe
);
__host__
unsigned
long
rocshmem_ctx_ulong_atomic_fetch_xor
(
rocshmem_ctx_t
ctx
,
unsigned
long
*
dest
,
unsigned
long
value
,
int
pe
);
__host__
unsigned
long
rocshmem_ulong_atomic_fetch_xor
(
unsigned
long
*
dest
,
unsigned
long
value
,
int
pe
);
__device__
ATTR_NO_INLINE
unsigned
long
long
rocshmem_ctx_ulonglong_atomic_fetch_xor
(
rocshmem_ctx_t
ctx
,
unsigned
long
long
*
dest
,
unsigned
long
long
value
,
int
pe
);
__device__
ATTR_NO_INLINE
unsigned
long
long
rocshmem_ulonglong_atomic_fetch_xor
(
unsigned
long
long
*
dest
,
unsigned
long
long
value
,
int
pe
);
__host__
unsigned
long
long
rocshmem_ctx_ulonglong_atomic_fetch_xor
(
rocshmem_ctx_t
ctx
,
unsigned
long
long
*
dest
,
unsigned
long
long
value
,
int
pe
);
__host__
unsigned
long
long
rocshmem_ulonglong_atomic_fetch_xor
(
unsigned
long
long
*
dest
,
unsigned
long
long
value
,
int
pe
);
__device__
ATTR_NO_INLINE
int32_t
rocshmem_ctx_int32_atomic_fetch_xor
(
rocshmem_ctx_t
ctx
,
int32_t
*
dest
,
int32_t
value
,
int
pe
);
__device__
ATTR_NO_INLINE
int32_t
rocshmem_int32_atomic_fetch_xor
(
int32_t
*
dest
,
int32_t
value
,
int
pe
);
__host__
int32_t
rocshmem_ctx_int32_atomic_fetch_xor
(
rocshmem_ctx_t
ctx
,
int32_t
*
dest
,
int32_t
value
,
int
pe
);
__host__
int32_t
rocshmem_int32_atomic_fetch_xor
(
int32_t
*
dest
,
int32_t
value
,
int
pe
);
__device__
ATTR_NO_INLINE
int64_t
rocshmem_ctx_int64_atomic_fetch_xor
(
rocshmem_ctx_t
ctx
,
int64_t
*
dest
,
int64_t
value
,
int
pe
);
__device__
ATTR_NO_INLINE
int64_t
rocshmem_int64_atomic_fetch_xor
(
int64_t
*
dest
,
int64_t
value
,
int
pe
);
__host__
int64_t
rocshmem_ctx_int64_atomic_fetch_xor
(
rocshmem_ctx_t
ctx
,
int64_t
*
dest
,
int64_t
value
,
int
pe
);
__host__
int64_t
rocshmem_int64_atomic_fetch_xor
(
int64_t
*
dest
,
int64_t
value
,
int
pe
);
__device__
ATTR_NO_INLINE
uint32_t
rocshmem_ctx_uint32_atomic_fetch_xor
(
rocshmem_ctx_t
ctx
,
uint32_t
*
dest
,
uint32_t
value
,
int
pe
);
__device__
ATTR_NO_INLINE
uint32_t
rocshmem_uint32_atomic_fetch_xor
(
uint32_t
*
dest
,
uint32_t
value
,
int
pe
);
__host__
uint32_t
rocshmem_ctx_uint32_atomic_fetch_xor
(
rocshmem_ctx_t
ctx
,
uint32_t
*
dest
,
uint32_t
value
,
int
pe
);
__host__
uint32_t
rocshmem_uint32_atomic_fetch_xor
(
uint32_t
*
dest
,
uint32_t
value
,
int
pe
);
__device__
ATTR_NO_INLINE
uint64_t
rocshmem_ctx_uint64_atomic_fetch_xor
(
rocshmem_ctx_t
ctx
,
uint64_t
*
dest
,
uint64_t
value
,
int
pe
);
__device__
ATTR_NO_INLINE
uint64_t
rocshmem_uint64_atomic_fetch_xor
(
uint64_t
*
dest
,
uint64_t
value
,
int
pe
);
__host__
uint64_t
rocshmem_ctx_uint64_atomic_fetch_xor
(
rocshmem_ctx_t
ctx
,
uint64_t
*
dest
,
uint64_t
value
,
int
pe
);
__host__
uint64_t
rocshmem_uint64_atomic_fetch_xor
(
uint64_t
*
dest
,
uint64_t
value
,
int
pe
);
/**
* @name SHMEM_ATOMIC_XOR
* @brief Atomically bitwise-XOR the value \p value into \p dest on \p pe.
*
* The operation is blocking.
*
* This function can be called from divergent control paths at per-thread
* granularity.
*
* @param[in] ctx Context with which to perform this operation
* (rocshmem_ctx_* variants only).
* @param[in] dest Destination address. Must be an address on the symmetric
* heap.
* @param[in] value The value to be atomically XOR-ed with \p dest.
* @param[in] pe PE of the remote process.
*
* @return void
*/
__device__
ATTR_NO_INLINE
void
rocshmem_ctx_uint_atomic_xor
(
rocshmem_ctx_t
ctx
,
unsigned
int
*
dest
,
unsigned
int
value
,
int
pe
);
__device__
ATTR_NO_INLINE
void
rocshmem_uint_atomic_xor
(
unsigned
int
*
dest
,
unsigned
int
value
,
int
pe
);
__host__
void
rocshmem_ctx_uint_atomic_xor
(
rocshmem_ctx_t
ctx
,
unsigned
int
*
dest
,
unsigned
int
value
,
int
pe
);
__host__
void
rocshmem_uint_atomic_xor
(
unsigned
int
*
dest
,
unsigned
int
value
,
int
pe
);
__device__
ATTR_NO_INLINE
void
rocshmem_ctx_ulong_atomic_xor
(
rocshmem_ctx_t
ctx
,
unsigned
long
*
dest
,
unsigned
long
value
,
int
pe
);
__device__
ATTR_NO_INLINE
void
rocshmem_ulong_atomic_xor
(
unsigned
long
*
dest
,
unsigned
long
value
,
int
pe
);
__host__
void
rocshmem_ctx_ulong_atomic_xor
(
rocshmem_ctx_t
ctx
,
unsigned
long
*
dest
,
unsigned
long
value
,
int
pe
);
__host__
void
rocshmem_ulong_atomic_xor
(
unsigned
long
*
dest
,
unsigned
long
value
,
int
pe
);
__device__
ATTR_NO_INLINE
void
rocshmem_ctx_ulonglong_atomic_xor
(
rocshmem_ctx_t
ctx
,
unsigned
long
long
*
dest
,
unsigned
long
long
value
,
int
pe
);
__device__
ATTR_NO_INLINE
void
rocshmem_ulonglong_atomic_xor
(
unsigned
long
long
*
dest
,
unsigned
long
long
value
,
int
pe
);
__host__
void
rocshmem_ctx_ulonglong_atomic_xor
(
rocshmem_ctx_t
ctx
,
unsigned
long
long
*
dest
,
unsigned
long
long
value
,
int
pe
);
__host__
void
rocshmem_ulonglong_atomic_xor
(
unsigned
long
long
*
dest
,
unsigned
long
long
value
,
int
pe
);
__device__
ATTR_NO_INLINE
void
rocshmem_ctx_int32_atomic_xor
(
rocshmem_ctx_t
ctx
,
int32_t
*
dest
,
int32_t
value
,
int
pe
);
__device__
ATTR_NO_INLINE
void
rocshmem_int32_atomic_xor
(
int32_t
*
dest
,
int32_t
value
,
int
pe
);
__host__
void
rocshmem_ctx_int32_atomic_xor
(
rocshmem_ctx_t
ctx
,
int32_t
*
dest
,
int32_t
value
,
int
pe
);
__host__
void
rocshmem_int32_atomic_xor
(
int32_t
*
dest
,
int32_t
value
,
int
pe
);
__device__
ATTR_NO_INLINE
void
rocshmem_ctx_int64_atomic_xor
(
rocshmem_ctx_t
ctx
,
int64_t
*
dest
,
int64_t
value
,
int
pe
);
__device__
ATTR_NO_INLINE
void
rocshmem_int64_atomic_xor
(
int64_t
*
dest
,
int64_t
value
,
int
pe
);
__host__
void
rocshmem_ctx_int64_atomic_xor
(
rocshmem_ctx_t
ctx
,
int64_t
*
dest
,
int64_t
value
,
int
pe
);
__host__
void
rocshmem_int64_atomic_xor
(
int64_t
*
dest
,
int64_t
value
,
int
pe
);
__device__
ATTR_NO_INLINE
void
rocshmem_ctx_uint32_atomic_xor
(
rocshmem_ctx_t
ctx
,
uint32_t
*
dest
,
uint32_t
value
,
int
pe
);
__device__
ATTR_NO_INLINE
void
rocshmem_uint32_atomic_xor
(
uint32_t
*
dest
,
uint32_t
value
,
int
pe
);
__host__
void
rocshmem_ctx_uint32_atomic_xor
(
rocshmem_ctx_t
ctx
,
uint32_t
*
dest
,
uint32_t
value
,
int
pe
);
__host__
void
rocshmem_uint32_atomic_xor
(
uint32_t
*
dest
,
uint32_t
value
,
int
pe
);
__device__
ATTR_NO_INLINE
void
rocshmem_ctx_uint64_atomic_xor
(
rocshmem_ctx_t
ctx
,
uint64_t
*
dest
,
uint64_t
value
,
int
pe
);
__device__
ATTR_NO_INLINE
void
rocshmem_uint64_atomic_xor
(
uint64_t
*
dest
,
uint64_t
value
,
int
pe
);
__host__
void
rocshmem_ctx_uint64_atomic_xor
(
rocshmem_ctx_t
ctx
,
uint64_t
*
dest
,
uint64_t
value
,
int
pe
);
__host__
void
rocshmem_uint64_atomic_xor
(
uint64_t
*
dest
,
uint64_t
value
,
int
pe
);
}
// namespace rocshmem
#endif // LIBRARY_INCLUDE_ROCSHMEM_AMO_HPP
rocshmem_dir/include/rocshmem/rocshmem_COLL.hpp
deleted
100644 → 0
View file @
b33659dd
/******************************************************************************
* Copyright (c) Advanced Micro Devices, Inc. All rights reserved.
*
* SPDX-License-Identifier: MIT
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to
* deal in the Software without restriction, including without limitation the
* rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
* sell copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
* IN THE SOFTWARE.
*****************************************************************************/
#ifndef LIBRARY_INCLUDE_ROCSHMEM_COLL_HPP
#define LIBRARY_INCLUDE_ROCSHMEM_COLL_HPP
namespace
rocshmem
{
/**
* @name SHMEM_ALLTOALL
* @brief Exchanges a fixed amount of contiguous data blocks between all pairs
* of PEs participating in the collective routine.
*
* This function must be called as a work-group collective.
*
* @param[in] team The team participating in the collective.
* @param[in] dest Destination address. Must be an address on the
* symmetric heap.
* @param[in] source Source address. Must be an address on the symmetric
* heap.
* @param[in] nelems Number of data blocks transferred per pair of PEs.
*
* @return void
*/
// Work-group collective alltoall over `float` elements (device side).
__device__ ATTR_NO_INLINE void rocshmem_ctx_float_alltoall_wg(
    rocshmem_ctx_t ctx, rocshmem_team_t team,
    float *dest, const float *source, int nelems);
// Work-group collective alltoall over `double` elements (device side).
__device__ ATTR_NO_INLINE void rocshmem_ctx_double_alltoall_wg(
    rocshmem_ctx_t ctx, rocshmem_team_t team,
    double *dest, const double *source, int nelems);
// Work-group collective alltoall over `char` elements (device side).
__device__ ATTR_NO_INLINE void rocshmem_ctx_char_alltoall_wg(
    rocshmem_ctx_t ctx, rocshmem_team_t team,
    char *dest, const char *source, int nelems);
// Work-group collective alltoall over `signed char` elements (device side).
__device__ ATTR_NO_INLINE void rocshmem_ctx_schar_alltoall_wg(
    rocshmem_ctx_t ctx, rocshmem_team_t team,
    signed char *dest, const signed char *source, int nelems);
// Work-group collective alltoall over `short` elements (device side).
__device__ ATTR_NO_INLINE void rocshmem_ctx_short_alltoall_wg(
    rocshmem_ctx_t ctx, rocshmem_team_t team,
    short *dest, const short *source, int nelems);
// Work-group collective alltoall over `int` elements (device side).
__device__ ATTR_NO_INLINE void rocshmem_ctx_int_alltoall_wg(
    rocshmem_ctx_t ctx, rocshmem_team_t team,
    int *dest, const int *source, int nelems);
// Work-group collective alltoall over `long` elements (device side).
__device__ ATTR_NO_INLINE void rocshmem_ctx_long_alltoall_wg(
    rocshmem_ctx_t ctx, rocshmem_team_t team,
    long *dest, const long *source, int nelems);
// Work-group collective alltoall over `long long` elements (device side).
__device__ ATTR_NO_INLINE void rocshmem_ctx_longlong_alltoall_wg(
    rocshmem_ctx_t ctx, rocshmem_team_t team,
    long long *dest, const long long *source, int nelems);
// Work-group collective alltoall over `unsigned char` elements (device side).
__device__ ATTR_NO_INLINE void rocshmem_ctx_uchar_alltoall_wg(
    rocshmem_ctx_t ctx, rocshmem_team_t team,
    unsigned char *dest, const unsigned char *source, int nelems);
// Work-group collective alltoall over `unsigned short` elements (device side).
__device__ ATTR_NO_INLINE void rocshmem_ctx_ushort_alltoall_wg(
    rocshmem_ctx_t ctx, rocshmem_team_t team,
    unsigned short *dest, const unsigned short *source, int nelems);
// Work-group collective alltoall over `unsigned int` elements (device side).
__device__ ATTR_NO_INLINE void rocshmem_ctx_uint_alltoall_wg(
    rocshmem_ctx_t ctx, rocshmem_team_t team,
    unsigned int *dest, const unsigned int *source, int nelems);
// Work-group collective alltoall over `unsigned long` elements (device side).
__device__ ATTR_NO_INLINE void rocshmem_ctx_ulong_alltoall_wg(
    rocshmem_ctx_t ctx, rocshmem_team_t team,
    unsigned long *dest, const unsigned long *source, int nelems);
// Work-group collective alltoall over `unsigned long long` elements
// (device side).
__device__ ATTR_NO_INLINE void rocshmem_ctx_ulonglong_alltoall_wg(
    rocshmem_ctx_t ctx, rocshmem_team_t team,
    unsigned long long *dest, const unsigned long long *source, int nelems);
/**
* @name SHMEM_BROADCAST
* @brief Perform a broadcast between PEs in the active set. The caller
* is blocked until the broadcast completes.
*
* This function must be called as a work-group collective.
*
* @param[in] dest Destination address. Must be an address on the
* symmetric heap.
* @param[in] source Source address. Must be an address on the symmetric
* heap.
* @param[in] nelems Size of the buffer to participate in the broadcast.
* @param[in] pe_root Zero-based ordinal of the PE, with respect to the
* active set, from which the data is copied.
* @param[in] pe_start PE to start the broadcast.
* @param[in] log_pe_stride Stride of PEs participating in the broadcast.
* @param[in] pe_size Number of PEs participating in the broadcast.
* @param[in] p_sync Temporary sync buffer provided to ROCSHMEM. Must
* be of size at least ROCSHMEM_REDUCE_SYNC_SIZE.
*
* @return void
*/
// Work-group collective broadcast of `float` elements (device side).
__device__ ATTR_NO_INLINE void rocshmem_ctx_float_broadcast_wg(
    rocshmem_ctx_t ctx, rocshmem_team_t team,
    float *dest, const float *source, int nelems, int pe_root);

// Host-side broadcast of `float` elements; the participating PEs are given by
// pe_start / log_pe_stride / pe_size and p_sync is the caller-supplied sync
// buffer.
__host__ void rocshmem_ctx_float_broadcast(
    rocshmem_ctx_t ctx, float *dest, const float *source, int nelems,
    int pe_root, int pe_start, int log_pe_stride, int pe_size, long *p_sync);

// Host-side broadcast of `float` elements; team-based overload.
__host__ void rocshmem_ctx_float_broadcast(
    rocshmem_ctx_t ctx, rocshmem_team_t team,
    float *dest, const float *source, int nelems, int pe_root);
// Work-group collective broadcast of `double` elements (device side).
__device__ ATTR_NO_INLINE void rocshmem_ctx_double_broadcast_wg(
    rocshmem_ctx_t ctx, rocshmem_team_t team,
    double *dest, const double *source, int nelems, int pe_root);

// Host-side broadcast of `double` elements; the participating PEs are given by
// pe_start / log_pe_stride / pe_size and p_sync is the caller-supplied sync
// buffer.
__host__ void rocshmem_ctx_double_broadcast(
    rocshmem_ctx_t ctx, double *dest, const double *source, int nelems,
    int pe_root, int pe_start, int log_pe_stride, int pe_size, long *p_sync);

// Host-side broadcast of `double` elements; team-based overload.
__host__ void rocshmem_ctx_double_broadcast(
    rocshmem_ctx_t ctx, rocshmem_team_t team,
    double *dest, const double *source, int nelems, int pe_root);
// Work-group collective broadcast of `char` elements (device side).
__device__ ATTR_NO_INLINE void rocshmem_ctx_char_broadcast_wg(
    rocshmem_ctx_t ctx, rocshmem_team_t team,
    char *dest, const char *source, int nelems, int pe_root);

// Host-side broadcast of `char` elements; the participating PEs are given by
// pe_start / log_pe_stride / pe_size and p_sync is the caller-supplied sync
// buffer.
__host__ void rocshmem_ctx_char_broadcast(
    rocshmem_ctx_t ctx, char *dest, const char *source, int nelems,
    int pe_root, int pe_start, int log_pe_stride, int pe_size, long *p_sync);

// Host-side broadcast of `char` elements; team-based overload.
__host__ void rocshmem_ctx_char_broadcast(
    rocshmem_ctx_t ctx, rocshmem_team_t team,
    char *dest, const char *source, int nelems, int pe_root);
// Work-group collective broadcast of `signed char` elements (device side).
__device__ ATTR_NO_INLINE void rocshmem_ctx_schar_broadcast_wg(
    rocshmem_ctx_t ctx, rocshmem_team_t team,
    signed char *dest, const signed char *source, int nelems, int pe_root);

// Host-side broadcast of `signed char` elements; the participating PEs are
// given by pe_start / log_pe_stride / pe_size and p_sync is the
// caller-supplied sync buffer.
__host__ void rocshmem_ctx_schar_broadcast(
    rocshmem_ctx_t ctx, signed char *dest, const signed char *source,
    int nelems, int pe_root, int pe_start, int log_pe_stride, int pe_size,
    long *p_sync);

// Host-side broadcast of `signed char` elements; team-based overload.
__host__ void rocshmem_ctx_schar_broadcast(
    rocshmem_ctx_t ctx, rocshmem_team_t team,
    signed char *dest, const signed char *source, int nelems, int pe_root);
// Work-group collective broadcast of `short` elements (device side).
__device__ ATTR_NO_INLINE void rocshmem_ctx_short_broadcast_wg(
    rocshmem_ctx_t ctx, rocshmem_team_t team,
    short *dest, const short *source, int nelems, int pe_root);

// Host-side broadcast of `short` elements; the participating PEs are given by
// pe_start / log_pe_stride / pe_size and p_sync is the caller-supplied sync
// buffer.
__host__ void rocshmem_ctx_short_broadcast(
    rocshmem_ctx_t ctx, short *dest, const short *source, int nelems,
    int pe_root, int pe_start, int log_pe_stride, int pe_size, long *p_sync);

// Host-side broadcast of `short` elements; team-based overload.
__host__ void rocshmem_ctx_short_broadcast(
    rocshmem_ctx_t ctx, rocshmem_team_t team,
    short *dest, const short *source, int nelems, int pe_root);
// Work-group collective broadcast of `int` elements (device side).
__device__ ATTR_NO_INLINE void rocshmem_ctx_int_broadcast_wg(
    rocshmem_ctx_t ctx, rocshmem_team_t team,
    int *dest, const int *source, int nelems, int pe_root);

// Host-side broadcast of `int` elements; the participating PEs are given by
// pe_start / log_pe_stride / pe_size and p_sync is the caller-supplied sync
// buffer.
__host__ void rocshmem_ctx_int_broadcast(
    rocshmem_ctx_t ctx, int *dest, const int *source, int nelems,
    int pe_root, int pe_start, int log_pe_stride, int pe_size, long *p_sync);

// Host-side broadcast of `int` elements; team-based overload.
__host__ void rocshmem_ctx_int_broadcast(
    rocshmem_ctx_t ctx, rocshmem_team_t team,
    int *dest, const int *source, int nelems, int pe_root);
// Work-group collective broadcast of `long` elements (device side).
__device__ ATTR_NO_INLINE void rocshmem_ctx_long_broadcast_wg(
    rocshmem_ctx_t ctx, rocshmem_team_t team,
    long *dest, const long *source, int nelems, int pe_root);

// Host-side broadcast of `long` elements; the participating PEs are given by
// pe_start / log_pe_stride / pe_size and p_sync is the caller-supplied sync
// buffer.
__host__ void rocshmem_ctx_long_broadcast(
    rocshmem_ctx_t ctx, long *dest, const long *source, int nelems,
    int pe_root, int pe_start, int log_pe_stride, int pe_size, long *p_sync);

// Host-side broadcast of `long` elements; team-based overload.
__host__ void rocshmem_ctx_long_broadcast(
    rocshmem_ctx_t ctx, rocshmem_team_t team,
    long *dest, const long *source, int nelems, int pe_root);
// Work-group collective broadcast of `long long` elements (device side).
__device__ ATTR_NO_INLINE void rocshmem_ctx_longlong_broadcast_wg(
    rocshmem_ctx_t ctx, rocshmem_team_t team,
    long long *dest, const long long *source, int nelems, int pe_root);

// Host-side broadcast of `long long` elements; the participating PEs are
// given by pe_start / log_pe_stride / pe_size and p_sync is the
// caller-supplied sync buffer.
__host__ void rocshmem_ctx_longlong_broadcast(
    rocshmem_ctx_t ctx, long long *dest, const long long *source,
    int nelems, int pe_root, int pe_start, int log_pe_stride, int pe_size,
    long *p_sync);

// Host-side broadcast of `long long` elements; team-based overload.
__host__ void rocshmem_ctx_longlong_broadcast(
    rocshmem_ctx_t ctx, rocshmem_team_t team,
    long long *dest, const long long *source, int nelems, int pe_root);
// Work-group collective broadcast of `unsigned char` elements (device side).
__device__ ATTR_NO_INLINE void rocshmem_ctx_uchar_broadcast_wg(
    rocshmem_ctx_t ctx, rocshmem_team_t team,
    unsigned char *dest, const unsigned char *source, int nelems,
    int pe_root);

// Host-side broadcast of `unsigned char` elements; the participating PEs are
// given by pe_start / log_pe_stride / pe_size and p_sync is the
// caller-supplied sync buffer.
__host__ void rocshmem_ctx_uchar_broadcast(
    rocshmem_ctx_t ctx, unsigned char *dest, const unsigned char *source,
    int nelems, int pe_root, int pe_start, int log_pe_stride, int pe_size,
    long *p_sync);

// Host-side broadcast of `unsigned char` elements; team-based overload.
__host__ void rocshmem_ctx_uchar_broadcast(
    rocshmem_ctx_t ctx, rocshmem_team_t team,
    unsigned char *dest, const unsigned char *source, int nelems,
    int pe_root);
// Work-group collective broadcast of `unsigned short` elements (device side).
__device__ ATTR_NO_INLINE void rocshmem_ctx_ushort_broadcast_wg(
    rocshmem_ctx_t ctx, rocshmem_team_t team,
    unsigned short *dest, const unsigned short *source, int nelems,
    int pe_root);

// Host-side broadcast of `unsigned short` elements; the participating PEs are
// given by pe_start / log_pe_stride / pe_size and p_sync is the
// caller-supplied sync buffer.
__host__ void rocshmem_ctx_ushort_broadcast(
    rocshmem_ctx_t ctx, unsigned short *dest, const unsigned short *source,
    int nelems, int pe_root, int pe_start, int log_pe_stride, int pe_size,
    long *p_sync);

// Host-side broadcast of `unsigned short` elements; team-based overload.
__host__ void rocshmem_ctx_ushort_broadcast(
    rocshmem_ctx_t ctx, rocshmem_team_t team,
    unsigned short *dest, const unsigned short *source, int nelems,
    int pe_root);
// Work-group collective broadcast of `unsigned int` elements (device side).
__device__ ATTR_NO_INLINE void rocshmem_ctx_uint_broadcast_wg(
    rocshmem_ctx_t ctx, rocshmem_team_t team,
    unsigned int *dest, const unsigned int *source, int nelems, int pe_root);

// Host-side broadcast of `unsigned int` elements; the participating PEs are
// given by pe_start / log_pe_stride / pe_size and p_sync is the
// caller-supplied sync buffer.
__host__ void rocshmem_ctx_uint_broadcast(
    rocshmem_ctx_t ctx, unsigned int *dest, const unsigned int *source,
    int nelems, int pe_root, int pe_start, int log_pe_stride, int pe_size,
    long *p_sync);

// Host-side broadcast of `unsigned int` elements; team-based overload.
__host__ void rocshmem_ctx_uint_broadcast(
    rocshmem_ctx_t ctx, rocshmem_team_t team,
    unsigned int *dest, const unsigned int *source, int nelems, int pe_root);
// Work-group collective broadcast of `unsigned long` elements (device side).
__device__ ATTR_NO_INLINE void rocshmem_ctx_ulong_broadcast_wg(
    rocshmem_ctx_t ctx, rocshmem_team_t team,
    unsigned long *dest, const unsigned long *source, int nelems,
    int pe_root);

// Host-side broadcast of `unsigned long` elements; the participating PEs are
// given by pe_start / log_pe_stride / pe_size and p_sync is the
// caller-supplied sync buffer.
__host__ void rocshmem_ctx_ulong_broadcast(
    rocshmem_ctx_t ctx, unsigned long *dest, const unsigned long *source,
    int nelems, int pe_root, int pe_start, int log_pe_stride, int pe_size,
    long *p_sync);

// Host-side broadcast of `unsigned long` elements; team-based overload.
__host__ void rocshmem_ctx_ulong_broadcast(
    rocshmem_ctx_t ctx, rocshmem_team_t team,
    unsigned long *dest, const unsigned long *source, int nelems,
    int pe_root);
// Work-group collective broadcast of `unsigned long long` elements
// (device side).
__device__ ATTR_NO_INLINE void rocshmem_ctx_ulonglong_broadcast_wg(
    rocshmem_ctx_t ctx, rocshmem_team_t team,
    unsigned long long *dest, const unsigned long long *source, int nelems,
    int pe_root);

// Host-side broadcast of `unsigned long long` elements; the participating PEs
// are given by pe_start / log_pe_stride / pe_size and p_sync is the
// caller-supplied sync buffer.
__host__ void rocshmem_ctx_ulonglong_broadcast(
    rocshmem_ctx_t ctx, unsigned long long *dest,
    const unsigned long long *source, int nelems, int pe_root, int pe_start,
    int log_pe_stride, int pe_size, long *p_sync);

// Host-side broadcast of `unsigned long long` elements; team-based overload.
__host__ void rocshmem_ctx_ulonglong_broadcast(
    rocshmem_ctx_t ctx, rocshmem_team_t team,
    unsigned long long *dest, const unsigned long long *source, int nelems,
    int pe_root);
/**
* @name SHMEM_FCOLLECT
* @brief Concatenates blocks of data from multiple PEs to an array in every
* PE participating in the collective routine.
*
* This function must be called as a work-group collective.
*
* @param[in] team The team participating in the collective.
* @param[in] dest Destination address. Must be an address on the
* symmetric heap.
* @param[in] source Source address. Must be an address on the symmetric
* heap.
* @param[in] nelems Number of data blocks in source array.
*
* @return void
*/
// Work-group collective fcollect over `float` elements (device side).
__device__ ATTR_NO_INLINE void rocshmem_ctx_float_fcollect_wg(
    rocshmem_ctx_t ctx, rocshmem_team_t team,
    float *dest, const float *source, int nelems);
// Work-group collective fcollect over `double` elements (device side).
__device__ ATTR_NO_INLINE void rocshmem_ctx_double_fcollect_wg(
    rocshmem_ctx_t ctx, rocshmem_team_t team,
    double *dest, const double *source, int nelems);
// Work-group collective fcollect over `char` elements (device side).
__device__ ATTR_NO_INLINE void rocshmem_ctx_char_fcollect_wg(
    rocshmem_ctx_t ctx, rocshmem_team_t team,
    char *dest, const char *source, int nelems);
// Work-group collective fcollect over `signed char` elements (device side).
__device__ ATTR_NO_INLINE void rocshmem_ctx_schar_fcollect_wg(
    rocshmem_ctx_t ctx, rocshmem_team_t team,
    signed char *dest, const signed char *source, int nelems);
// Work-group collective fcollect over `short` elements (device side).
__device__ ATTR_NO_INLINE void rocshmem_ctx_short_fcollect_wg(
    rocshmem_ctx_t ctx, rocshmem_team_t team,
    short *dest, const short *source, int nelems);
// Work-group collective fcollect over `int` elements (device side).
__device__ ATTR_NO_INLINE void rocshmem_ctx_int_fcollect_wg(
    rocshmem_ctx_t ctx, rocshmem_team_t team,
    int *dest, const int *source, int nelems);
// Work-group collective fcollect over `long` elements (device side).
__device__ ATTR_NO_INLINE void rocshmem_ctx_long_fcollect_wg(
    rocshmem_ctx_t ctx, rocshmem_team_t team,
    long *dest, const long *source, int nelems);
// Work-group collective fcollect over `long long` elements (device side).
__device__ ATTR_NO_INLINE void rocshmem_ctx_longlong_fcollect_wg(
    rocshmem_ctx_t ctx, rocshmem_team_t team,
    long long *dest, const long long *source, int nelems);
// Work-group collective fcollect over `unsigned char` elements (device side).
__device__ ATTR_NO_INLINE void rocshmem_ctx_uchar_fcollect_wg(
    rocshmem_ctx_t ctx, rocshmem_team_t team,
    unsigned char *dest, const unsigned char *source, int nelems);
// Work-group collective fcollect over `unsigned short` elements (device side).
__device__ ATTR_NO_INLINE void rocshmem_ctx_ushort_fcollect_wg(
    rocshmem_ctx_t ctx, rocshmem_team_t team,
    unsigned short *dest, const unsigned short *source, int nelems);
// Work-group collective fcollect over `unsigned int` elements (device side).
__device__ ATTR_NO_INLINE void rocshmem_ctx_uint_fcollect_wg(
    rocshmem_ctx_t ctx, rocshmem_team_t team,
    unsigned int *dest, const unsigned int *source, int nelems);
// Work-group collective fcollect over `unsigned long` elements (device side).
__device__ ATTR_NO_INLINE void rocshmem_ctx_ulong_fcollect_wg(
    rocshmem_ctx_t ctx, rocshmem_team_t team,
    unsigned long *dest, const unsigned long *source, int nelems);
// Work-group collective fcollect over `unsigned long long` elements
// (device side).
__device__ ATTR_NO_INLINE void rocshmem_ctx_ulonglong_fcollect_wg(
    rocshmem_ctx_t ctx, rocshmem_team_t team,
    unsigned long long *dest, const unsigned long long *source, int nelems);
/**
* @name SHMEM_REDUCTIONS
* @brief Perform an allreduce between PEs in the active set. The caller
* is blocked until the reduction completes.
*
* This function must be called as a work-group collective.
*
* @param[in] team The team participating in the collective.
* @param[in] dest Destination address. Must be an address on the
* symmetric heap.
* @param[in] source Source address. Must be an address on the symmetric
* heap.
* @param[in] nreduce Size of the buffer to participate in the reduction.
*
* @return int (Zero on successful local completion. Nonzero otherwise.)
*/
// `short` reductions. Every operation is declared twice: a device-side
// work-group collective (`_wg` suffix) followed by its host-side counterpart.

// sum
__device__ ATTR_NO_INLINE int rocshmem_ctx_short_sum_reduce_wg(
    rocshmem_ctx_t ctx, rocshmem_team_t team,
    short *dest, const short *source, int nreduce);
__host__ int rocshmem_ctx_short_sum_reduce(
    rocshmem_ctx_t ctx, rocshmem_team_t team,
    short *dest, const short *source, int nreduce);

// min
__device__ ATTR_NO_INLINE int rocshmem_ctx_short_min_reduce_wg(
    rocshmem_ctx_t ctx, rocshmem_team_t team,
    short *dest, const short *source, int nreduce);
__host__ int rocshmem_ctx_short_min_reduce(
    rocshmem_ctx_t ctx, rocshmem_team_t team,
    short *dest, const short *source, int nreduce);

// max
__device__ ATTR_NO_INLINE int rocshmem_ctx_short_max_reduce_wg(
    rocshmem_ctx_t ctx, rocshmem_team_t team,
    short *dest, const short *source, int nreduce);
__host__ int rocshmem_ctx_short_max_reduce(
    rocshmem_ctx_t ctx, rocshmem_team_t team,
    short *dest, const short *source, int nreduce);

// prod
__device__ ATTR_NO_INLINE int rocshmem_ctx_short_prod_reduce_wg(
    rocshmem_ctx_t ctx, rocshmem_team_t team,
    short *dest, const short *source, int nreduce);
__host__ int rocshmem_ctx_short_prod_reduce(
    rocshmem_ctx_t ctx, rocshmem_team_t team,
    short *dest, const short *source, int nreduce);

// or
__device__ ATTR_NO_INLINE int rocshmem_ctx_short_or_reduce_wg(
    rocshmem_ctx_t ctx, rocshmem_team_t team,
    short *dest, const short *source, int nreduce);
__host__ int rocshmem_ctx_short_or_reduce(
    rocshmem_ctx_t ctx, rocshmem_team_t team,
    short *dest, const short *source, int nreduce);

// and
__device__ ATTR_NO_INLINE int rocshmem_ctx_short_and_reduce_wg(
    rocshmem_ctx_t ctx, rocshmem_team_t team,
    short *dest, const short *source, int nreduce);
__host__ int rocshmem_ctx_short_and_reduce(
    rocshmem_ctx_t ctx, rocshmem_team_t team,
    short *dest, const short *source, int nreduce);

// xor
__device__ ATTR_NO_INLINE int rocshmem_ctx_short_xor_reduce_wg(
    rocshmem_ctx_t ctx, rocshmem_team_t team,
    short *dest, const short *source, int nreduce);
__host__ int rocshmem_ctx_short_xor_reduce(
    rocshmem_ctx_t ctx, rocshmem_team_t team,
    short *dest, const short *source, int nreduce);
// `int` reductions. Every operation is declared twice: a device-side
// work-group collective (`_wg` suffix) followed by its host-side counterpart.

// sum
__device__ ATTR_NO_INLINE int rocshmem_ctx_int_sum_reduce_wg(
    rocshmem_ctx_t ctx, rocshmem_team_t team,
    int *dest, const int *source, int nreduce);
__host__ int rocshmem_ctx_int_sum_reduce(
    rocshmem_ctx_t ctx, rocshmem_team_t team,
    int *dest, const int *source, int nreduce);

// min
__device__ ATTR_NO_INLINE int rocshmem_ctx_int_min_reduce_wg(
    rocshmem_ctx_t ctx, rocshmem_team_t team,
    int *dest, const int *source, int nreduce);
__host__ int rocshmem_ctx_int_min_reduce(
    rocshmem_ctx_t ctx, rocshmem_team_t team,
    int *dest, const int *source, int nreduce);

// max
__device__ ATTR_NO_INLINE int rocshmem_ctx_int_max_reduce_wg(
    rocshmem_ctx_t ctx, rocshmem_team_t team,
    int *dest, const int *source, int nreduce);
__host__ int rocshmem_ctx_int_max_reduce(
    rocshmem_ctx_t ctx, rocshmem_team_t team,
    int *dest, const int *source, int nreduce);

// prod
__device__ ATTR_NO_INLINE int rocshmem_ctx_int_prod_reduce_wg(
    rocshmem_ctx_t ctx, rocshmem_team_t team,
    int *dest, const int *source, int nreduce);
__host__ int rocshmem_ctx_int_prod_reduce(
    rocshmem_ctx_t ctx, rocshmem_team_t team,
    int *dest, const int *source, int nreduce);

// or
__device__ ATTR_NO_INLINE int rocshmem_ctx_int_or_reduce_wg(
    rocshmem_ctx_t ctx, rocshmem_team_t team,
    int *dest, const int *source, int nreduce);
__host__ int rocshmem_ctx_int_or_reduce(
    rocshmem_ctx_t ctx, rocshmem_team_t team,
    int *dest, const int *source, int nreduce);

// and
__device__ ATTR_NO_INLINE int rocshmem_ctx_int_and_reduce_wg(
    rocshmem_ctx_t ctx, rocshmem_team_t team,
    int *dest, const int *source, int nreduce);
__host__ int rocshmem_ctx_int_and_reduce(
    rocshmem_ctx_t ctx, rocshmem_team_t team,
    int *dest, const int *source, int nreduce);

// xor
__device__ ATTR_NO_INLINE int rocshmem_ctx_int_xor_reduce_wg(
    rocshmem_ctx_t ctx, rocshmem_team_t team,
    int *dest, const int *source, int nreduce);
__host__ int rocshmem_ctx_int_xor_reduce(
    rocshmem_ctx_t ctx, rocshmem_team_t team,
    int *dest, const int *source, int nreduce);
// `long` reductions. Every operation is declared twice: a device-side
// work-group collective (`_wg` suffix) followed by its host-side counterpart.

// sum
__device__ ATTR_NO_INLINE int rocshmem_ctx_long_sum_reduce_wg(
    rocshmem_ctx_t ctx, rocshmem_team_t team,
    long *dest, const long *source, int nreduce);
__host__ int rocshmem_ctx_long_sum_reduce(
    rocshmem_ctx_t ctx, rocshmem_team_t team,
    long *dest, const long *source, int nreduce);

// min
__device__ ATTR_NO_INLINE int rocshmem_ctx_long_min_reduce_wg(
    rocshmem_ctx_t ctx, rocshmem_team_t team,
    long *dest, const long *source, int nreduce);
__host__ int rocshmem_ctx_long_min_reduce(
    rocshmem_ctx_t ctx, rocshmem_team_t team,
    long *dest, const long *source, int nreduce);

// max
__device__ ATTR_NO_INLINE int rocshmem_ctx_long_max_reduce_wg(
    rocshmem_ctx_t ctx, rocshmem_team_t team,
    long *dest, const long *source, int nreduce);
__host__ int rocshmem_ctx_long_max_reduce(
    rocshmem_ctx_t ctx, rocshmem_team_t team,
    long *dest, const long *source, int nreduce);

// prod
__device__ ATTR_NO_INLINE int rocshmem_ctx_long_prod_reduce_wg(
    rocshmem_ctx_t ctx, rocshmem_team_t team,
    long *dest, const long *source, int nreduce);
__host__ int rocshmem_ctx_long_prod_reduce(
    rocshmem_ctx_t ctx, rocshmem_team_t team,
    long *dest, const long *source, int nreduce);

// or
__device__ ATTR_NO_INLINE int rocshmem_ctx_long_or_reduce_wg(
    rocshmem_ctx_t ctx, rocshmem_team_t team,
    long *dest, const long *source, int nreduce);
__host__ int rocshmem_ctx_long_or_reduce(
    rocshmem_ctx_t ctx, rocshmem_team_t team,
    long *dest, const long *source, int nreduce);

// and
__device__ ATTR_NO_INLINE int rocshmem_ctx_long_and_reduce_wg(
    rocshmem_ctx_t ctx, rocshmem_team_t team,
    long *dest, const long *source, int nreduce);
__host__ int rocshmem_ctx_long_and_reduce(
    rocshmem_ctx_t ctx, rocshmem_team_t team,
    long *dest, const long *source, int nreduce);

// xor
__device__ ATTR_NO_INLINE int rocshmem_ctx_long_xor_reduce_wg(
    rocshmem_ctx_t ctx, rocshmem_team_t team,
    long *dest, const long *source, int nreduce);
__host__ int rocshmem_ctx_long_xor_reduce(
    rocshmem_ctx_t ctx, rocshmem_team_t team,
    long *dest, const long *source, int nreduce);
// `long long` reductions. Every operation is declared twice: a device-side
// work-group collective (`_wg` suffix) followed by its host-side counterpart.

// sum
__device__ ATTR_NO_INLINE int rocshmem_ctx_longlong_sum_reduce_wg(
    rocshmem_ctx_t ctx, rocshmem_team_t team,
    long long *dest, const long long *source, int nreduce);
__host__ int rocshmem_ctx_longlong_sum_reduce(
    rocshmem_ctx_t ctx, rocshmem_team_t team,
    long long *dest, const long long *source, int nreduce);

// min
__device__ ATTR_NO_INLINE int rocshmem_ctx_longlong_min_reduce_wg(
    rocshmem_ctx_t ctx, rocshmem_team_t team,
    long long *dest, const long long *source, int nreduce);
__host__ int rocshmem_ctx_longlong_min_reduce(
    rocshmem_ctx_t ctx, rocshmem_team_t team,
    long long *dest, const long long *source, int nreduce);

// max
__device__ ATTR_NO_INLINE int rocshmem_ctx_longlong_max_reduce_wg(
    rocshmem_ctx_t ctx, rocshmem_team_t team,
    long long *dest, const long long *source, int nreduce);
__host__ int rocshmem_ctx_longlong_max_reduce(
    rocshmem_ctx_t ctx, rocshmem_team_t team,
    long long *dest, const long long *source, int nreduce);

// prod
__device__ ATTR_NO_INLINE int rocshmem_ctx_longlong_prod_reduce_wg(
    rocshmem_ctx_t ctx, rocshmem_team_t team,
    long long *dest, const long long *source, int nreduce);
__host__ int rocshmem_ctx_longlong_prod_reduce(
    rocshmem_ctx_t ctx, rocshmem_team_t team,
    long long *dest, const long long *source, int nreduce);

// or
__device__ ATTR_NO_INLINE int rocshmem_ctx_longlong_or_reduce_wg(
    rocshmem_ctx_t ctx, rocshmem_team_t team,
    long long *dest, const long long *source, int nreduce);
__host__ int rocshmem_ctx_longlong_or_reduce(
    rocshmem_ctx_t ctx, rocshmem_team_t team,
    long long *dest, const long long *source, int nreduce);

// and
__device__ ATTR_NO_INLINE int rocshmem_ctx_longlong_and_reduce_wg(
    rocshmem_ctx_t ctx, rocshmem_team_t team,
    long long *dest, const long long *source, int nreduce);
__host__ int rocshmem_ctx_longlong_and_reduce(
    rocshmem_ctx_t ctx, rocshmem_team_t team,
    long long *dest, const long long *source, int nreduce);

// xor
__device__ ATTR_NO_INLINE int rocshmem_ctx_longlong_xor_reduce_wg(
    rocshmem_ctx_t ctx, rocshmem_team_t team,
    long long *dest, const long long *source, int nreduce);
__host__ int rocshmem_ctx_longlong_xor_reduce(
    rocshmem_ctx_t ctx, rocshmem_team_t team,
    long long *dest, const long long *source, int nreduce);
// `float` reductions (sum, min, max, prod only). Every operation is declared
// twice: a device-side work-group collective (`_wg` suffix) followed by its
// host-side counterpart.

// sum
__device__ ATTR_NO_INLINE int rocshmem_ctx_float_sum_reduce_wg(
    rocshmem_ctx_t ctx, rocshmem_team_t team,
    float *dest, const float *source, int nreduce);
__host__ int rocshmem_ctx_float_sum_reduce(
    rocshmem_ctx_t ctx, rocshmem_team_t team,
    float *dest, const float *source, int nreduce);

// min
__device__ ATTR_NO_INLINE int rocshmem_ctx_float_min_reduce_wg(
    rocshmem_ctx_t ctx, rocshmem_team_t team,
    float *dest, const float *source, int nreduce);
__host__ int rocshmem_ctx_float_min_reduce(
    rocshmem_ctx_t ctx, rocshmem_team_t team,
    float *dest, const float *source, int nreduce);

// max
__device__ ATTR_NO_INLINE int rocshmem_ctx_float_max_reduce_wg(
    rocshmem_ctx_t ctx, rocshmem_team_t team,
    float *dest, const float *source, int nreduce);
__host__ int rocshmem_ctx_float_max_reduce(
    rocshmem_ctx_t ctx, rocshmem_team_t team,
    float *dest, const float *source, int nreduce);

// prod
__device__ ATTR_NO_INLINE int rocshmem_ctx_float_prod_reduce_wg(
    rocshmem_ctx_t ctx, rocshmem_team_t team,
    float *dest, const float *source, int nreduce);
__host__ int rocshmem_ctx_float_prod_reduce(
    rocshmem_ctx_t ctx, rocshmem_team_t team,
    float *dest, const float *source, int nreduce);
// `double` reductions (sum, min, max, prod only). Every operation is declared
// twice: a device-side work-group collective (`_wg` suffix) followed by its
// host-side counterpart.

// sum
__device__ ATTR_NO_INLINE int rocshmem_ctx_double_sum_reduce_wg(
    rocshmem_ctx_t ctx, rocshmem_team_t team,
    double *dest, const double *source, int nreduce);
__host__ int rocshmem_ctx_double_sum_reduce(
    rocshmem_ctx_t ctx, rocshmem_team_t team,
    double *dest, const double *source, int nreduce);

// min
__device__ ATTR_NO_INLINE int rocshmem_ctx_double_min_reduce_wg(
    rocshmem_ctx_t ctx, rocshmem_team_t team,
    double *dest, const double *source, int nreduce);
__host__ int rocshmem_ctx_double_min_reduce(
    rocshmem_ctx_t ctx, rocshmem_team_t team,
    double *dest, const double *source, int nreduce);

// max
__device__ ATTR_NO_INLINE int rocshmem_ctx_double_max_reduce_wg(
    rocshmem_ctx_t ctx, rocshmem_team_t team,
    double *dest, const double *source, int nreduce);
__host__ int rocshmem_ctx_double_max_reduce(
    rocshmem_ctx_t ctx, rocshmem_team_t team,
    double *dest, const double *source, int nreduce);

// prod
__device__ ATTR_NO_INLINE int rocshmem_ctx_double_prod_reduce_wg(
    rocshmem_ctx_t ctx, rocshmem_team_t team,
    double *dest, const double *source, int nreduce);
__host__ int rocshmem_ctx_double_prod_reduce(
    rocshmem_ctx_t ctx, rocshmem_team_t team,
    double *dest, const double *source, int nreduce);
/**
 * @brief Kernel that performs a barrier synchronization.
 *
 * The caller enqueues this kernel on the given stream.
 *
 * @return void
 */
__global__ ATTR_NO_INLINE void rocshmem_barrier_all_kernel();
/**
 * @brief Collective barrier across all PEs in the system. The caller blocks
 * until the barrier is resolved.
 *
 * Must be invoked by a single thread within the PE.
 *
 * @return void
 */
__device__ ATTR_NO_INLINE void rocshmem_barrier_all();
/**
 * @brief Collective barrier across all PEs in the system. The caller blocks
 * until the barrier is resolved.
 *
 * Must be called as a wave-front collective.
 *
 * @return void
 */
__device__ ATTR_NO_INLINE void rocshmem_barrier_all_wave();
/**
 * @brief Collective barrier across all PEs in the system. The caller blocks
 * until the barrier is resolved.
 *
 * Must be called as a work-group collective.
 *
 * @return void
 */
__device__ ATTR_NO_INLINE void rocshmem_barrier_all_wg();
/**
 * @brief Collective barrier across all PEs in the team. The caller blocks
 * until the barrier is resolved.
 *
 * Must be invoked by a single thread within the PE.
 *
 * @param[in] ctx  GPU side handle.
 * @param[in] team The team on which to perform barrier synchronization.
 *
 * @return void
 */
__device__ void rocshmem_ctx_barrier(rocshmem_ctx_t ctx, rocshmem_team_t team);
/**
 * @brief Collective barrier across all PEs in the team. The caller blocks
 * until the barrier is resolved.
 *
 * Must be called as a wave-front collective.
 *
 * @param[in] ctx  GPU side handle.
 * @param[in] team The team on which to perform barrier synchronization.
 *
 * @return void
 */
__device__ void rocshmem_ctx_barrier_wave(
    rocshmem_ctx_t ctx, rocshmem_team_t team);
/**
 * @brief Collective barrier across all PEs in the team. The caller blocks
 * until the barrier is resolved.
 *
 * Must be called as a work-group collective.
 *
 * @param[in] ctx  GPU side handle.
 * @param[in] team The team on which to perform barrier synchronization.
 *
 * @return void
 */
__device__ void rocshmem_ctx_barrier_wg(
    rocshmem_ctx_t ctx, rocshmem_team_t team);
/**
 * @brief Registers the arrival of a PE at a barrier. The caller blocks until
 * the synchronization is resolved.
 *
 * Unlike shmem_barrier_all, shmem_sync_all only guarantees completion and
 * visibility of previously issued memory stores; it does not guarantee
 * completion of remote memory updates issued via OpenSHMEM routines.
 *
 * Must be invoked by a single thread within the PE.
 *
 * @return void
 */
__device__ ATTR_NO_INLINE void rocshmem_sync_all();
/**
 * @brief Registers the arrival of a PE at a barrier. The caller blocks until
 * the synchronization is resolved.
 *
 * Unlike shmem_barrier_all, shmem_sync_all only guarantees completion and
 * visibility of previously issued memory stores; it does not guarantee
 * completion of remote memory updates issued via OpenSHMEM routines.
 *
 * Must be called as a wave-front collective.
 *
 * @return void
 */
__device__ ATTR_NO_INLINE void rocshmem_sync_all_wave();
/**
 * @brief Registers the arrival of a PE at a barrier. The caller blocks until
 * the synchronization is resolved.
 *
 * Unlike shmem_barrier_all, shmem_sync_all only guarantees completion and
 * visibility of previously issued memory stores; it does not guarantee
 * completion of remote memory updates issued via OpenSHMEM routines.
 *
 * Must be called as a work-group collective.
 *
 * @return void
 */
__device__ ATTR_NO_INLINE void rocshmem_sync_all_wg();
/**
 * @brief Registers the arrival of a PE at a barrier. The caller blocks until
 * the synchronization is resolved.
 *
 * Unlike shmem_barrier_all, shmem_team_sync only guarantees completion and
 * visibility of previously issued memory stores; it does not guarantee
 * completion of remote memory updates issued via OpenSHMEM routines.
 *
 * Must be invoked by a single thread within the PE.
 *
 * @param[in] ctx  GPU side handle.
 * @param[in] team Handle of the team being synchronized.
 *
 * @return void
 */
__device__ ATTR_NO_INLINE void rocshmem_ctx_sync(
    rocshmem_ctx_t ctx, rocshmem_team_t team);
/**
 * @brief Registers the arrival of a PE at a barrier. The caller blocks until
 * the synchronization is resolved.
 *
 * Unlike shmem_barrier_all, shmem_team_sync only guarantees completion and
 * visibility of previously issued memory stores; it does not guarantee
 * completion of remote memory updates issued via OpenSHMEM routines.
 *
 * Must be called as a wave-front collective.
 *
 * @param[in] ctx  GPU side handle.
 * @param[in] team Handle of the team being synchronized.
 *
 * @return void
 */
__device__ ATTR_NO_INLINE void rocshmem_ctx_sync_wave(
    rocshmem_ctx_t ctx, rocshmem_team_t team);
/**
 * @brief Registers the arrival of a PE at a barrier. The caller blocks until
 * the synchronization is resolved.
 *
 * Unlike shmem_barrier_all, shmem_team_sync only guarantees completion and
 * visibility of previously issued memory stores; it does not guarantee
 * completion of remote memory updates issued via OpenSHMEM routines.
 *
 * Must be called as a work-group collective.
 *
 * @param[in] ctx  GPU side handle.
 * @param[in] team Handle of the team being synchronized.
 *
 * @return void
 */
__device__ ATTR_NO_INLINE void rocshmem_ctx_sync_wg(
    rocshmem_ctx_t ctx, rocshmem_team_t team);
}
// namespace rocshmem
#endif // LIBRARY_INCLUDE_ROCSHMEM_COLL_HPP
rocshmem_dir/include/rocshmem/rocshmem_P2P_SYNC.hpp
deleted
100644 → 0
View file @
b33659dd
/******************************************************************************
* Copyright (c) Advanced Micro Devices, Inc. All rights reserved.
*
* SPDX-License-Identifier: MIT
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to
* deal in the Software without restriction, including without limitation the
* rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
* sell copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
* IN THE SOFTWARE.
*****************************************************************************/
#ifndef LIBRARY_INCLUDE_ROCSHMEM_P2P_SYNC_HPP
#define LIBRARY_INCLUDE_ROCSHMEM_P2P_SYNC_HPP
namespace
rocshmem
{
/**
* @name SHMEM_WAIT_UNTIL
* @brief Block the caller until the condition (* \p ivars \p cmp \p val) is
* true.
*
* This function can be called from divergent control paths at per-thread
* granularity. However, performance may be improved if the caller can
* coalesce contiguous messages and elect a leader thread to call into the
* ROCSHMEM function.
*
* @param[in] ivars Pointer to memory on the symmetric heap to wait for.
* @param[in] cmp Operation for the comparison.
* @param[in] val Value to compare the memory at \p ivars to.
*
* @return void
*/
__device__
void
rocshmem_float_wait_until
(
float
*
ivars
,
int
cmp
,
float
val
);
__device__
size_t
rocshmem_float_wait_until_any
(
float
*
ivars
,
size_t
nelems
,
const
int
*
status
,
int
cmp
,
float
val
);
__device__
void
rocshmem_float_wait_until_all
(
float
*
ivars
,
size_t
nelems
,
const
int
*
status
,
int
cmp
,
float
val
);
__device__
size_t
rocshmem_float_wait_until_some
(
float
*
ivars
,
size_t
nelems
,
size_t
*
indices
,
const
int
*
status
,
int
cmp
,
float
val
);
__device__
size_t
rocshmem_float_wait_until_any_vector
(
float
*
ivars
,
size_t
nelems
,
const
int
*
status
,
int
cmp
,
float
val
);
__device__
void
rocshmem_float_wait_until_all_vector
(
float
*
ivars
,
size_t
nelems
,
const
int
*
status
,
int
cmp
,
float
val
);
__device__
size_t
rocshmem_float_wait_until_some_vector
(
float
*
ivars
,
size_t
nelems
,
size_t
*
indices
,
const
int
*
status
,
int
cmp
,
float
val
);
__host__
void
rocshmem_float_wait_until
(
float
*
ivars
,
int
cmp
,
float
val
);
__host__
size_t
rocshmem_float_wait_until_any
(
float
*
ivars
,
size_t
nelems
,
const
int
*
status
,
int
cmp
,
float
val
);
__host__
void
rocshmem_float_wait_until_all
(
float
*
ivars
,
size_t
nelems
,
const
int
*
status
,
int
cmp
,
float
val
);
__host__
size_t
rocshmem_float_wait_until_some
(
float
*
ivars
,
size_t
nelems
,
size_t
*
indices
,
const
int
*
status
,
int
cmp
,
float
val
);
__host__
size_t
rocshmem_float_wait_until_any_vector
(
float
*
ivars
,
size_t
nelems
,
const
int
*
status
,
int
cmp
,
float
val
);
__host__
void
rocshmem_float_wait_until_all_vector
(
float
*
ivars
,
size_t
nelems
,
const
int
*
status
,
int
cmp
,
float
val
);
__host__
size_t
rocshmem_float_wait_until_some_vector
(
float
*
ivars
,
size_t
nelems
,
size_t
*
indices
,
const
int
*
status
,
int
cmp
,
float
val
);
__device__
void
rocshmem_double_wait_until
(
double
*
ivars
,
int
cmp
,
double
val
);
__device__
size_t
rocshmem_double_wait_until_any
(
double
*
ivars
,
size_t
nelems
,
const
int
*
status
,
int
cmp
,
double
val
);
__device__
void
rocshmem_double_wait_until_all
(
double
*
ivars
,
size_t
nelems
,
const
int
*
status
,
int
cmp
,
double
val
);
__device__
size_t
rocshmem_double_wait_until_some
(
double
*
ivars
,
size_t
nelems
,
size_t
*
indices
,
const
int
*
status
,
int
cmp
,
double
val
);
__device__
size_t
rocshmem_double_wait_until_any_vector
(
double
*
ivars
,
size_t
nelems
,
const
int
*
status
,
int
cmp
,
double
val
);
__device__
void
rocshmem_double_wait_until_all_vector
(
double
*
ivars
,
size_t
nelems
,
const
int
*
status
,
int
cmp
,
double
val
);
__device__
size_t
rocshmem_double_wait_until_some_vector
(
double
*
ivars
,
size_t
nelems
,
size_t
*
indices
,
const
int
*
status
,
int
cmp
,
double
val
);
__host__
void
rocshmem_double_wait_until
(
double
*
ivars
,
int
cmp
,
double
val
);
__host__
size_t
rocshmem_double_wait_until_any
(
double
*
ivars
,
size_t
nelems
,
const
int
*
status
,
int
cmp
,
double
val
);
__host__
void
rocshmem_double_wait_until_all
(
double
*
ivars
,
size_t
nelems
,
const
int
*
status
,
int
cmp
,
double
val
);
__host__
size_t
rocshmem_double_wait_until_some
(
double
*
ivars
,
size_t
nelems
,
size_t
*
indices
,
const
int
*
status
,
int
cmp
,
double
val
);
__host__
size_t
rocshmem_double_wait_until_any_vector
(
double
*
ivars
,
size_t
nelems
,
const
int
*
status
,
int
cmp
,
double
val
);
__host__
void
rocshmem_double_wait_until_all_vector
(
double
*
ivars
,
size_t
nelems
,
const
int
*
status
,
int
cmp
,
double
val
);
__host__
size_t
rocshmem_double_wait_until_some_vector
(
double
*
ivars
,
size_t
nelems
,
size_t
*
indices
,
const
int
*
status
,
int
cmp
,
double
val
);
__device__
void
rocshmem_char_wait_until
(
char
*
ivars
,
int
cmp
,
char
val
);
__device__
size_t
rocshmem_char_wait_until_any
(
char
*
ivars
,
size_t
nelems
,
const
int
*
status
,
int
cmp
,
char
val
);
__device__
void
rocshmem_char_wait_until_all
(
char
*
ivars
,
size_t
nelems
,
const
int
*
status
,
int
cmp
,
char
val
);
__device__
size_t
rocshmem_char_wait_until_some
(
char
*
ivars
,
size_t
nelems
,
size_t
*
indices
,
const
int
*
status
,
int
cmp
,
char
val
);
__device__
size_t
rocshmem_char_wait_until_any_vector
(
char
*
ivars
,
size_t
nelems
,
const
int
*
status
,
int
cmp
,
char
val
);
__device__
void
rocshmem_char_wait_until_all_vector
(
char
*
ivars
,
size_t
nelems
,
const
int
*
status
,
int
cmp
,
char
val
);
__device__
size_t
rocshmem_char_wait_until_some_vector
(
char
*
ivars
,
size_t
nelems
,
size_t
*
indices
,
const
int
*
status
,
int
cmp
,
char
val
);
__host__
void
rocshmem_char_wait_until
(
char
*
ivars
,
int
cmp
,
char
val
);
__host__
size_t
rocshmem_char_wait_until_any
(
char
*
ivars
,
size_t
nelems
,
const
int
*
status
,
int
cmp
,
char
val
);
__host__
void
rocshmem_char_wait_until_all
(
char
*
ivars
,
size_t
nelems
,
const
int
*
status
,
int
cmp
,
char
val
);
__host__
size_t
rocshmem_char_wait_until_some
(
char
*
ivars
,
size_t
nelems
,
size_t
*
indices
,
const
int
*
status
,
int
cmp
,
char
val
);
__host__
size_t
rocshmem_char_wait_until_any_vector
(
char
*
ivars
,
size_t
nelems
,
const
int
*
status
,
int
cmp
,
char
val
);
__host__
void
rocshmem_char_wait_until_all_vector
(
char
*
ivars
,
size_t
nelems
,
const
int
*
status
,
int
cmp
,
char
val
);
__host__
size_t
rocshmem_char_wait_until_some_vector
(
char
*
ivars
,
size_t
nelems
,
size_t
*
indices
,
const
int
*
status
,
int
cmp
,
char
val
);
__device__
void
rocshmem_schar_wait_until
(
signed
char
*
ivars
,
int
cmp
,
signed
char
val
);
__device__
size_t
rocshmem_schar_wait_until_any
(
signed
char
*
ivars
,
size_t
nelems
,
const
int
*
status
,
int
cmp
,
signed
char
val
);
__device__
void
rocshmem_schar_wait_until_all
(
signed
char
*
ivars
,
size_t
nelems
,
const
int
*
status
,
int
cmp
,
signed
char
val
);
__device__
size_t
rocshmem_schar_wait_until_some
(
signed
char
*
ivars
,
size_t
nelems
,
size_t
*
indices
,
const
int
*
status
,
int
cmp
,
signed
char
val
);
__device__
size_t
rocshmem_schar_wait_until_any_vector
(
signed
char
*
ivars
,
size_t
nelems
,
const
int
*
status
,
int
cmp
,
signed
char
val
);
__device__
void
rocshmem_schar_wait_until_all_vector
(
signed
char
*
ivars
,
size_t
nelems
,
const
int
*
status
,
int
cmp
,
signed
char
val
);
__device__
size_t
rocshmem_schar_wait_until_some_vector
(
signed
char
*
ivars
,
size_t
nelems
,
size_t
*
indices
,
const
int
*
status
,
int
cmp
,
signed
char
val
);
__host__
void
rocshmem_schar_wait_until
(
signed
char
*
ivars
,
int
cmp
,
signed
char
val
);
__host__
size_t
rocshmem_schar_wait_until_any
(
signed
char
*
ivars
,
size_t
nelems
,
const
int
*
status
,
int
cmp
,
signed
char
val
);
__host__
void
rocshmem_schar_wait_until_all
(
signed
char
*
ivars
,
size_t
nelems
,
const
int
*
status
,
int
cmp
,
signed
char
val
);
__host__
size_t
rocshmem_schar_wait_until_some
(
signed
char
*
ivars
,
size_t
nelems
,
size_t
*
indices
,
const
int
*
status
,
int
cmp
,
signed
char
val
);
__host__
size_t
rocshmem_schar_wait_until_any_vector
(
signed
char
*
ivars
,
size_t
nelems
,
const
int
*
status
,
int
cmp
,
signed
char
val
);
__host__
void
rocshmem_schar_wait_until_all_vector
(
signed
char
*
ivars
,
size_t
nelems
,
const
int
*
status
,
int
cmp
,
signed
char
val
);
__host__
size_t
rocshmem_schar_wait_until_some_vector
(
signed
char
*
ivars
,
size_t
nelems
,
size_t
*
indices
,
const
int
*
status
,
int
cmp
,
signed
char
val
);
__device__
void
rocshmem_short_wait_until
(
short
*
ivars
,
int
cmp
,
short
val
);
__device__
size_t
rocshmem_short_wait_until_any
(
short
*
ivars
,
size_t
nelems
,
const
int
*
status
,
int
cmp
,
short
val
);
__device__
void
rocshmem_short_wait_until_all
(
short
*
ivars
,
size_t
nelems
,
const
int
*
status
,
int
cmp
,
short
val
);
__device__
size_t
rocshmem_short_wait_until_some
(
short
*
ivars
,
size_t
nelems
,
size_t
*
indices
,
const
int
*
status
,
int
cmp
,
short
val
);
__device__
size_t
rocshmem_short_wait_until_any_vector
(
short
*
ivars
,
size_t
nelems
,
const
int
*
status
,
int
cmp
,
short
val
);
__device__
void
rocshmem_short_wait_until_all_vector
(
short
*
ivars
,
size_t
nelems
,
const
int
*
status
,
int
cmp
,
short
val
);
__device__
size_t
rocshmem_short_wait_until_some_vector
(
short
*
ivars
,
size_t
nelems
,
size_t
*
indices
,
const
int
*
status
,
int
cmp
,
short
val
);
__host__
void
rocshmem_short_wait_until
(
short
*
ivars
,
int
cmp
,
short
val
);
__host__
size_t
rocshmem_short_wait_until_any
(
short
*
ivars
,
size_t
nelems
,
const
int
*
status
,
int
cmp
,
short
val
);
__host__
void
rocshmem_short_wait_until_all
(
short
*
ivars
,
size_t
nelems
,
const
int
*
status
,
int
cmp
,
short
val
);
__host__
size_t
rocshmem_short_wait_until_some
(
short
*
ivars
,
size_t
nelems
,
size_t
*
indices
,
const
int
*
status
,
int
cmp
,
short
val
);
__host__
size_t
rocshmem_short_wait_until_any_vector
(
short
*
ivars
,
size_t
nelems
,
const
int
*
status
,
int
cmp
,
short
val
);
__host__
void
rocshmem_short_wait_until_all_vector
(
short
*
ivars
,
size_t
nelems
,
const
int
*
status
,
int
cmp
,
short
val
);
__host__
size_t
rocshmem_short_wait_until_some_vector
(
short
*
ivars
,
size_t
nelems
,
size_t
*
indices
,
const
int
*
status
,
int
cmp
,
short
val
);
__device__
void
rocshmem_int_wait_until
(
int
*
ivars
,
int
cmp
,
int
val
);
__device__
size_t
rocshmem_int_wait_until_any
(
int
*
ivars
,
size_t
nelems
,
const
int
*
status
,
int
cmp
,
int
val
);
__device__
void
rocshmem_int_wait_until_all
(
int
*
ivars
,
size_t
nelems
,
const
int
*
status
,
int
cmp
,
int
val
);
__device__
size_t
rocshmem_int_wait_until_some
(
int
*
ivars
,
size_t
nelems
,
size_t
*
indices
,
const
int
*
status
,
int
cmp
,
int
val
);
__device__
size_t
rocshmem_int_wait_until_any_vector
(
int
*
ivars
,
size_t
nelems
,
const
int
*
status
,
int
cmp
,
int
val
);
__device__
void
rocshmem_int_wait_until_all_vector
(
int
*
ivars
,
size_t
nelems
,
const
int
*
status
,
int
cmp
,
int
val
);
__device__
size_t
rocshmem_int_wait_until_some_vector
(
int
*
ivars
,
size_t
nelems
,
size_t
*
indices
,
const
int
*
status
,
int
cmp
,
int
val
);
__host__
void
rocshmem_int_wait_until
(
int
*
ivars
,
int
cmp
,
int
val
);
__host__
size_t
rocshmem_int_wait_until_any
(
int
*
ivars
,
size_t
nelems
,
const
int
*
status
,
int
cmp
,
int
val
);
__host__
void
rocshmem_int_wait_until_all
(
int
*
ivars
,
size_t
nelems
,
const
int
*
status
,
int
cmp
,
int
val
);
__host__
size_t
rocshmem_int_wait_until_some
(
int
*
ivars
,
size_t
nelems
,
size_t
*
indices
,
const
int
*
status
,
int
cmp
,
int
val
);
__host__
size_t
rocshmem_int_wait_until_any_vector
(
int
*
ivars
,
size_t
nelems
,
const
int
*
status
,
int
cmp
,
int
val
);
__host__
void
rocshmem_int_wait_until_all_vector
(
int
*
ivars
,
size_t
nelems
,
const
int
*
status
,
int
cmp
,
int
val
);
__host__
size_t
rocshmem_int_wait_until_some_vector
(
int
*
ivars
,
size_t
nelems
,
size_t
*
indices
,
const
int
*
status
,
int
cmp
,
int
val
);
__device__
void
rocshmem_long_wait_until
(
long
*
ivars
,
int
cmp
,
long
val
);
__device__
size_t
rocshmem_long_wait_until_any
(
long
*
ivars
,
size_t
nelems
,
const
int
*
status
,
int
cmp
,
long
val
);
__device__
void
rocshmem_long_wait_until_all
(
long
*
ivars
,
size_t
nelems
,
const
int
*
status
,
int
cmp
,
long
val
);
__device__
size_t
rocshmem_long_wait_until_some
(
long
*
ivars
,
size_t
nelems
,
size_t
*
indices
,
const
int
*
status
,
int
cmp
,
long
val
);
__device__
size_t
rocshmem_long_wait_until_any_vector
(
long
*
ivars
,
size_t
nelems
,
const
int
*
status
,
int
cmp
,
long
val
);
__device__
void
rocshmem_long_wait_until_all_vector
(
long
*
ivars
,
size_t
nelems
,
const
int
*
status
,
int
cmp
,
long
val
);
__device__
size_t
rocshmem_long_wait_until_some_vector
(
long
*
ivars
,
size_t
nelems
,
size_t
*
indices
,
const
int
*
status
,
int
cmp
,
long
val
);
__host__
void
rocshmem_long_wait_until
(
long
*
ivars
,
int
cmp
,
long
val
);
__host__
size_t
rocshmem_long_wait_until_any
(
long
*
ivars
,
size_t
nelems
,
const
int
*
status
,
int
cmp
,
long
val
);
__host__
void
rocshmem_long_wait_until_all
(
long
*
ivars
,
size_t
nelems
,
const
int
*
status
,
int
cmp
,
long
val
);
__host__
size_t
rocshmem_long_wait_until_some
(
long
*
ivars
,
size_t
nelems
,
size_t
*
indices
,
const
int
*
status
,
int
cmp
,
long
val
);
__host__
size_t
rocshmem_long_wait_until_any_vector
(
long
*
ivars
,
size_t
nelems
,
const
int
*
status
,
int
cmp
,
long
val
);
__host__
void
rocshmem_long_wait_until_all_vector
(
long
*
ivars
,
size_t
nelems
,
const
int
*
status
,
int
cmp
,
long
val
);
__host__
size_t
rocshmem_long_wait_until_some_vector
(
long
*
ivars
,
size_t
nelems
,
size_t
*
indices
,
const
int
*
status
,
int
cmp
,
long
val
);
__device__
void
rocshmem_longlong_wait_until
(
long
long
*
ivars
,
int
cmp
,
long
long
val
);
__device__
size_t
rocshmem_longlong_wait_until_any
(
long
long
*
ivars
,
size_t
nelems
,
const
int
*
status
,
int
cmp
,
long
long
val
);
__device__
void
rocshmem_longlong_wait_until_all
(
long
long
*
ivars
,
size_t
nelems
,
const
int
*
status
,
int
cmp
,
long
long
val
);
__device__
size_t
rocshmem_longlong_wait_until_some
(
long
long
*
ivars
,
size_t
nelems
,
size_t
*
indices
,
const
int
*
status
,
int
cmp
,
long
long
val
);
__device__
size_t
rocshmem_longlong_wait_until_any_vector
(
long
long
*
ivars
,
size_t
nelems
,
const
int
*
status
,
int
cmp
,
long
long
val
);
__device__
void
rocshmem_longlong_wait_until_all_vector
(
long
long
*
ivars
,
size_t
nelems
,
const
int
*
status
,
int
cmp
,
long
long
val
);
__device__
size_t
rocshmem_longlong_wait_until_some_vector
(
long
long
*
ivars
,
size_t
nelems
,
size_t
*
indices
,
const
int
*
status
,
int
cmp
,
long
long
val
);
__host__
void
rocshmem_longlong_wait_until
(
long
long
*
ivars
,
int
cmp
,
long
long
val
);
__host__
size_t
rocshmem_longlong_wait_until_any
(
long
long
*
ivars
,
size_t
nelems
,
const
int
*
status
,
int
cmp
,
long
long
val
);
__host__
void
rocshmem_longlong_wait_until_all
(
long
long
*
ivars
,
size_t
nelems
,
const
int
*
status
,
int
cmp
,
long
long
val
);
__host__
size_t
rocshmem_longlong_wait_until_some
(
long
long
*
ivars
,
size_t
nelems
,
size_t
*
indices
,
const
int
*
status
,
int
cmp
,
long
long
val
);
__host__
size_t
rocshmem_longlong_wait_until_any_vector
(
long
long
*
ivars
,
size_t
nelems
,
const
int
*
status
,
int
cmp
,
long
long
val
);
__host__
void
rocshmem_longlong_wait_until_all_vector
(
long
long
*
ivars
,
size_t
nelems
,
const
int
*
status
,
int
cmp
,
long
long
val
);
__host__
size_t
rocshmem_longlong_wait_until_some_vector
(
long
long
*
ivars
,
size_t
nelems
,
size_t
*
indices
,
const
int
*
status
,
int
cmp
,
long
long
val
);
__device__
void
rocshmem_uchar_wait_until
(
unsigned
char
*
ivars
,
int
cmp
,
unsigned
char
val
);
__device__
size_t
rocshmem_uchar_wait_until_any
(
unsigned
char
*
ivars
,
size_t
nelems
,
const
int
*
status
,
int
cmp
,
unsigned
char
val
);
__device__
void
rocshmem_uchar_wait_until_all
(
unsigned
char
*
ivars
,
size_t
nelems
,
const
int
*
status
,
int
cmp
,
unsigned
char
val
);
__device__
size_t
rocshmem_uchar_wait_until_some
(
unsigned
char
*
ivars
,
size_t
nelems
,
size_t
*
indices
,
const
int
*
status
,
int
cmp
,
unsigned
char
val
);
__device__
size_t
rocshmem_uchar_wait_until_any_vector
(
unsigned
char
*
ivars
,
size_t
nelems
,
const
int
*
status
,
int
cmp
,
unsigned
char
val
);
__device__
void
rocshmem_uchar_wait_until_all_vector
(
unsigned
char
*
ivars
,
size_t
nelems
,
const
int
*
status
,
int
cmp
,
unsigned
char
val
);
__device__
size_t
rocshmem_uchar_wait_until_some_vector
(
unsigned
char
*
ivars
,
size_t
nelems
,
size_t
*
indices
,
const
int
*
status
,
int
cmp
,
unsigned
char
val
);
__host__
void
rocshmem_uchar_wait_until
(
unsigned
char
*
ivars
,
int
cmp
,
unsigned
char
val
);
__host__
size_t
rocshmem_uchar_wait_until_any
(
unsigned
char
*
ivars
,
size_t
nelems
,
const
int
*
status
,
int
cmp
,
unsigned
char
val
);
__host__
void
rocshmem_uchar_wait_until_all
(
unsigned
char
*
ivars
,
size_t
nelems
,
const
int
*
status
,
int
cmp
,
unsigned
char
val
);
__host__
size_t
rocshmem_uchar_wait_until_some
(
unsigned
char
*
ivars
,
size_t
nelems
,
size_t
*
indices
,
const
int
*
status
,
int
cmp
,
unsigned
char
val
);
__host__
size_t
rocshmem_uchar_wait_until_any_vector
(
unsigned
char
*
ivars
,
size_t
nelems
,
const
int
*
status
,
int
cmp
,
unsigned
char
val
);
__host__
void
rocshmem_uchar_wait_until_all_vector
(
unsigned
char
*
ivars
,
size_t
nelems
,
const
int
*
status
,
int
cmp
,
unsigned
char
val
);
__host__
size_t
rocshmem_uchar_wait_until_some_vector
(
unsigned
char
*
ivars
,
size_t
nelems
,
size_t
*
indices
,
const
int
*
status
,
int
cmp
,
unsigned
char
val
);
__device__
void
rocshmem_ushort_wait_until
(
unsigned
short
*
ivars
,
int
cmp
,
unsigned
short
val
);
__device__
size_t
rocshmem_ushort_wait_until_any
(
unsigned
short
*
ivars
,
size_t
nelems
,
const
int
*
status
,
int
cmp
,
unsigned
short
val
);
__device__
void
rocshmem_ushort_wait_until_all
(
unsigned
short
*
ivars
,
size_t
nelems
,
const
int
*
status
,
int
cmp
,
unsigned
short
val
);
__device__
size_t
rocshmem_ushort_wait_until_some
(
unsigned
short
*
ivars
,
size_t
nelems
,
size_t
*
indices
,
const
int
*
status
,
int
cmp
,
unsigned
short
val
);
__device__
size_t
rocshmem_ushort_wait_until_any_vector
(
unsigned
short
*
ivars
,
size_t
nelems
,
const
int
*
status
,
int
cmp
,
unsigned
short
val
);
__device__
void
rocshmem_ushort_wait_until_all_vector
(
unsigned
short
*
ivars
,
size_t
nelems
,
const
int
*
status
,
int
cmp
,
unsigned
short
val
);
__device__
size_t
rocshmem_ushort_wait_until_some_vector
(
unsigned
short
*
ivars
,
size_t
nelems
,
size_t
*
indices
,
const
int
*
status
,
int
cmp
,
unsigned
short
val
);
__host__
void
rocshmem_ushort_wait_until
(
unsigned
short
*
ivars
,
int
cmp
,
unsigned
short
val
);
__host__
size_t
rocshmem_ushort_wait_until_any
(
unsigned
short
*
ivars
,
size_t
nelems
,
const
int
*
status
,
int
cmp
,
unsigned
short
val
);
__host__
void
rocshmem_ushort_wait_until_all
(
unsigned
short
*
ivars
,
size_t
nelems
,
const
int
*
status
,
int
cmp
,
unsigned
short
val
);
__host__
size_t
rocshmem_ushort_wait_until_some
(
unsigned
short
*
ivars
,
size_t
nelems
,
size_t
*
indices
,
const
int
*
status
,
int
cmp
,
unsigned
short
val
);
__host__
size_t
rocshmem_ushort_wait_until_any_vector
(
unsigned
short
*
ivars
,
size_t
nelems
,
const
int
*
status
,
int
cmp
,
unsigned
short
val
);
__host__
void
rocshmem_ushort_wait_until_all_vector
(
unsigned
short
*
ivars
,
size_t
nelems
,
const
int
*
status
,
int
cmp
,
unsigned
short
val
);
__host__
size_t
rocshmem_ushort_wait_until_some_vector
(
unsigned
short
*
ivars
,
size_t
nelems
,
size_t
*
indices
,
const
int
*
status
,
int
cmp
,
unsigned
short
val
);
__device__
void
rocshmem_uint_wait_until
(
unsigned
int
*
ivars
,
int
cmp
,
unsigned
int
val
);
__device__
size_t
rocshmem_uint_wait_until_any
(
unsigned
int
*
ivars
,
size_t
nelems
,
const
int
*
status
,
int
cmp
,
unsigned
int
val
);
__device__
void
rocshmem_uint_wait_until_all
(
unsigned
int
*
ivars
,
size_t
nelems
,
const
int
*
status
,
int
cmp
,
unsigned
int
val
);
__device__
size_t
rocshmem_uint_wait_until_some
(
unsigned
int
*
ivars
,
size_t
nelems
,
size_t
*
indices
,
const
int
*
status
,
int
cmp
,
unsigned
int
val
);
__device__
size_t
rocshmem_uint_wait_until_any_vector
(
unsigned
int
*
ivars
,
size_t
nelems
,
const
int
*
status
,
int
cmp
,
unsigned
int
val
);
__device__
void
rocshmem_uint_wait_until_all_vector
(
unsigned
int
*
ivars
,
size_t
nelems
,
const
int
*
status
,
int
cmp
,
unsigned
int
val
);
__device__
size_t
rocshmem_uint_wait_until_some_vector
(
unsigned
int
*
ivars
,
size_t
nelems
,
size_t
*
indices
,
const
int
*
status
,
int
cmp
,
unsigned
int
val
);
__host__
void
rocshmem_uint_wait_until
(
unsigned
int
*
ivars
,
int
cmp
,
unsigned
int
val
);
__host__
size_t
rocshmem_uint_wait_until_any
(
unsigned
int
*
ivars
,
size_t
nelems
,
const
int
*
status
,
int
cmp
,
unsigned
int
val
);
__host__
void
rocshmem_uint_wait_until_all
(
unsigned
int
*
ivars
,
size_t
nelems
,
const
int
*
status
,
int
cmp
,
unsigned
int
val
);
__host__
size_t
rocshmem_uint_wait_until_some
(
unsigned
int
*
ivars
,
size_t
nelems
,
size_t
*
indices
,
const
int
*
status
,
int
cmp
,
unsigned
int
val
);
__host__
size_t
rocshmem_uint_wait_until_any_vector
(
unsigned
int
*
ivars
,
size_t
nelems
,
const
int
*
status
,
int
cmp
,
unsigned
int
val
);
__host__
void
rocshmem_uint_wait_until_all_vector
(
unsigned
int
*
ivars
,
size_t
nelems
,
const
int
*
status
,
int
cmp
,
unsigned
int
val
);
__host__
size_t
rocshmem_uint_wait_until_some_vector
(
unsigned
int
*
ivars
,
size_t
nelems
,
size_t
*
indices
,
const
int
*
status
,
int
cmp
,
unsigned
int
val
);
__device__
void
rocshmem_ulong_wait_until
(
unsigned
long
*
ivars
,
int
cmp
,
unsigned
long
val
);
__device__
size_t
rocshmem_ulong_wait_until_any
(
unsigned
long
*
ivars
,
size_t
nelems
,
const
int
*
status
,
int
cmp
,
unsigned
long
val
);
__device__
void
rocshmem_ulong_wait_until_all
(
unsigned
long
*
ivars
,
size_t
nelems
,
const
int
*
status
,
int
cmp
,
unsigned
long
val
);
__device__
size_t
rocshmem_ulong_wait_until_some
(
unsigned
long
*
ivars
,
size_t
nelems
,
size_t
*
indices
,
const
int
*
status
,
int
cmp
,
unsigned
long
val
);
__device__
size_t
rocshmem_ulong_wait_until_any_vector
(
unsigned
long
*
ivars
,
size_t
nelems
,
const
int
*
status
,
int
cmp
,
unsigned
long
val
);
__device__
void
rocshmem_ulong_wait_until_all_vector
(
unsigned
long
*
ivars
,
size_t
nelems
,
const
int
*
status
,
int
cmp
,
unsigned
long
val
);
__device__
size_t
rocshmem_ulong_wait_until_some_vector
(
unsigned
long
*
ivars
,
size_t
nelems
,
size_t
*
indices
,
const
int
*
status
,
int
cmp
,
unsigned
long
val
);
__host__
void
rocshmem_ulong_wait_until
(
unsigned
long
*
ivars
,
int
cmp
,
unsigned
long
val
);
__host__
size_t
rocshmem_ulong_wait_until_any
(
unsigned
long
*
ivars
,
size_t
nelems
,
const
int
*
status
,
int
cmp
,
unsigned
long
val
);
__host__
void
rocshmem_ulong_wait_until_all
(
unsigned
long
*
ivars
,
size_t
nelems
,
const
int
*
status
,
int
cmp
,
unsigned
long
val
);
__host__
size_t
rocshmem_ulong_wait_until_some
(
unsigned
long
*
ivars
,
size_t
nelems
,
size_t
*
indices
,
const
int
*
status
,
int
cmp
,
unsigned
long
val
);
__host__
size_t
rocshmem_ulong_wait_until_any_vector
(
unsigned
long
*
ivars
,
size_t
nelems
,
const
int
*
status
,
int
cmp
,
unsigned
long
val
);
__host__
void
rocshmem_ulong_wait_until_all_vector
(
unsigned
long
*
ivars
,
size_t
nelems
,
const
int
*
status
,
int
cmp
,
unsigned
long
val
);
__host__
size_t
rocshmem_ulong_wait_until_some_vector
(
unsigned
long
*
ivars
,
size_t
nelems
,
size_t
*
indices
,
const
int
*
status
,
int
cmp
,
unsigned
long
val
);
__device__
void
rocshmem_ulonglong_wait_until
(
unsigned
long
long
*
ivars
,
int
cmp
,
unsigned
long
long
val
);
__device__
size_t
rocshmem_ulonglong_wait_until_any
(
unsigned
long
long
*
ivars
,
size_t
nelems
,
const
int
*
status
,
int
cmp
,
unsigned
long
long
val
);
__device__
void
rocshmem_ulonglong_wait_until_all
(
unsigned
long
long
*
ivars
,
size_t
nelems
,
const
int
*
status
,
int
cmp
,
unsigned
long
long
val
);
__device__
size_t
rocshmem_ulonglong_wait_until_some
(
unsigned
long
long
*
ivars
,
size_t
nelems
,
size_t
*
indices
,
const
int
*
status
,
int
cmp
,
unsigned
long
long
val
);
__device__
size_t
rocshmem_ulonglong_wait_until_any_vector
(
unsigned
long
long
*
ivars
,
size_t
nelems
,
const
int
*
status
,
int
cmp
,
unsigned
long
long
val
);
__device__
void
rocshmem_ulonglong_wait_until_all_vector
(
unsigned
long
long
*
ivars
,
size_t
nelems
,
const
int
*
status
,
int
cmp
,
unsigned
long
long
val
);
__device__
size_t
rocshmem_ulonglong_wait_until_some_vector
(
unsigned
long
long
*
ivars
,
size_t
nelems
,
size_t
*
indices
,
const
int
*
status
,
int
cmp
,
unsigned
long
long
val
);
__host__
void
rocshmem_ulonglong_wait_until
(
unsigned
long
long
*
ivars
,
int
cmp
,
unsigned
long
long
val
);
__host__
size_t
rocshmem_ulonglong_wait_until_any
(
unsigned
long
long
*
ivars
,
size_t
nelems
,
const
int
*
status
,
int
cmp
,
unsigned
long
long
val
);
__host__
void
rocshmem_ulonglong_wait_until_all
(
unsigned
long
long
*
ivars
,
size_t
nelems
,
const
int
*
status
,
int
cmp
,
unsigned
long
long
val
);
__host__
size_t
rocshmem_ulonglong_wait_until_some
(
unsigned
long
long
*
ivars
,
size_t
nelems
,
size_t
*
indices
,
const
int
*
status
,
int
cmp
,
unsigned
long
long
val
);
__host__
size_t
rocshmem_ulonglong_wait_until_any_vector
(
unsigned
long
long
*
ivars
,
size_t
nelems
,
const
int
*
status
,
int
cmp
,
unsigned
long
long
val
);
__host__
void
rocshmem_ulonglong_wait_until_all_vector
(
unsigned
long
long
*
ivars
,
size_t
nelems
,
const
int
*
status
,
int
cmp
,
unsigned
long
long
val
);
__host__
size_t
rocshmem_ulonglong_wait_until_some_vector
(
unsigned
long
long
*
ivars
,
size_t
nelems
,
size_t
*
indices
,
const
int
*
status
,
int
cmp
,
unsigned
long
long
val
);
/**
* @name SHMEM_TEST
* @brief Test whether the condition (* \p ivars \p cmp \p val) is
* true.
*
* This function can be called from divergent control paths at per-thread
* granularity. However, performance may be improved if the caller can
* coalesce contiguous messages and elect a leader thread to call into the
* ROCSHMEM function.
*
* @param[in] ivars Pointer to memory on the symmetric heap to test.
* @param[in] cmp Operation for the comparison.
* @param[in] val Value to compare the memory at \p ivars to.
*
* @return 1 if the evaluation is true else 0
*/
__device__
int
rocshmem_float_test
(
float
*
ivars
,
int
cmp
,
float
val
);
__host__
int
rocshmem_float_test
(
float
*
ivars
,
int
cmp
,
float
val
);
__device__
int
rocshmem_double_test
(
double
*
ivars
,
int
cmp
,
double
val
);
__host__
int
rocshmem_double_test
(
double
*
ivars
,
int
cmp
,
double
val
);
__device__
int
rocshmem_char_test
(
char
*
ivars
,
int
cmp
,
char
val
);
__host__
int
rocshmem_char_test
(
char
*
ivars
,
int
cmp
,
char
val
);
__device__
int
rocshmem_schar_test
(
signed
char
*
ivars
,
int
cmp
,
signed
char
val
);
__host__
int
rocshmem_schar_test
(
signed
char
*
ivars
,
int
cmp
,
signed
char
val
);
__device__
int
rocshmem_short_test
(
short
*
ivars
,
int
cmp
,
short
val
);
__host__
int
rocshmem_short_test
(
short
*
ivars
,
int
cmp
,
short
val
);
__device__
int
rocshmem_int_test
(
int
*
ivars
,
int
cmp
,
int
val
);
__host__
int
rocshmem_int_test
(
int
*
ivars
,
int
cmp
,
int
val
);
__device__
int
rocshmem_long_test
(
long
*
ivars
,
int
cmp
,
long
val
);
__host__
int
rocshmem_long_test
(
long
*
ivars
,
int
cmp
,
long
val
);
__device__
int
rocshmem_longlong_test
(
long
long
*
ivars
,
int
cmp
,
long
long
val
);
__host__
int
rocshmem_longlong_test
(
long
long
*
ivars
,
int
cmp
,
long
long
val
);
__device__
int
rocshmem_uchar_test
(
unsigned
char
*
ivars
,
int
cmp
,
unsigned
char
val
);
__host__
int
rocshmem_uchar_test
(
unsigned
char
*
ivars
,
int
cmp
,
unsigned
char
val
);
__device__
int
rocshmem_ushort_test
(
unsigned
short
*
ivars
,
int
cmp
,
unsigned
short
val
);
__host__
int
rocshmem_ushort_test
(
unsigned
short
*
ivars
,
int
cmp
,
unsigned
short
val
);
__device__
int
rocshmem_uint_test
(
unsigned
int
*
ivars
,
int
cmp
,
unsigned
int
val
);
__host__
int
rocshmem_uint_test
(
unsigned
int
*
ivars
,
int
cmp
,
unsigned
int
val
);
__device__
int
rocshmem_ulong_test
(
unsigned
long
*
ivars
,
int
cmp
,
unsigned
long
val
);
__host__
int
rocshmem_ulong_test
(
unsigned
long
*
ivars
,
int
cmp
,
unsigned
long
val
);
__device__
int
rocshmem_ulonglong_test
(
unsigned
long
long
*
ivars
,
int
cmp
,
unsigned
long
long
val
);
__host__
int
rocshmem_ulonglong_test
(
unsigned
long
long
*
ivars
,
int
cmp
,
unsigned
long
long
val
);
}
// namespace rocshmem
#endif // LIBRARY_INCLUDE_ROCSHMEM_P2P_SYNC_HPP
rocshmem_dir/include/rocshmem/rocshmem_RMA.hpp
deleted
100644 → 0
View file @
b33659dd
/******************************************************************************
* Copyright (c) Advanced Micro Devices, Inc. All rights reserved.
*
* SPDX-License-Identifier: MIT
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to
* deal in the Software without restriction, including without limitation the
* rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
* sell copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
* IN THE SOFTWARE.
*****************************************************************************/
#ifndef LIBRARY_INCLUDE_ROCSHMEM_RMA_HPP
#define LIBRARY_INCLUDE_ROCSHMEM_RMA_HPP
namespace
rocshmem
{
/**
* @name SHMEM_PUT
* @brief Writes contiguous data of \p nelems elements from \p source on the
* calling PE to \p dest at \p pe. The caller will block until the operation
* completes locally (it is safe to reuse \p source). The caller must
* call into rocshmem_quiet() if remote completion is required.
*
* This function can be called from divergent control paths at per-thread
* granularity. However, performance may be improved if the caller can
* coalesce contiguous messages and elect a leader thread to call into the
* ROCSHMEM function.
*
* @param[in] ctx Context with which to perform this operation.
* @param[in] dest Destination address. Must be an address on the symmetric
* heap.
* @param[in] source Source address. Must be an address on the symmetric heap.
* @param[in] nelems Size of the transfer in number of elements.
* @param[in] pe PE of the remote process.
*
* @return void.
*/
__device__
ATTR_NO_INLINE
void
rocshmem_ctx_float_put
(
rocshmem_ctx_t
ctx
,
float
*
dest
,
const
float
*
source
,
size_t
nelems
,
int
pe
);
__device__
ATTR_NO_INLINE
void
rocshmem_float_put
(
float
*
dest
,
const
float
*
source
,
size_t
nelems
,
int
pe
);
__host__
void
rocshmem_ctx_float_put
(
rocshmem_ctx_t
ctx
,
float
*
dest
,
const
float
*
source
,
size_t
nelems
,
int
pe
);
__host__
void
rocshmem_float_put
(
float
*
dest
,
const
float
*
source
,
size_t
nelems
,
int
pe
);
__device__
ATTR_NO_INLINE
void
rocshmem_ctx_double_put
(
rocshmem_ctx_t
ctx
,
double
*
dest
,
const
double
*
source
,
size_t
nelems
,
int
pe
);
__device__
ATTR_NO_INLINE
void
rocshmem_double_put
(
double
*
dest
,
const
double
*
source
,
size_t
nelems
,
int
pe
);
__host__
void
rocshmem_ctx_double_put
(
rocshmem_ctx_t
ctx
,
double
*
dest
,
const
double
*
source
,
size_t
nelems
,
int
pe
);
__host__
void
rocshmem_double_put
(
double
*
dest
,
const
double
*
source
,
size_t
nelems
,
int
pe
);
__device__
ATTR_NO_INLINE
void
rocshmem_ctx_char_put
(
rocshmem_ctx_t
ctx
,
char
*
dest
,
const
char
*
source
,
size_t
nelems
,
int
pe
);
__device__
ATTR_NO_INLINE
void
rocshmem_char_put
(
char
*
dest
,
const
char
*
source
,
size_t
nelems
,
int
pe
);
__host__
void
rocshmem_ctx_char_put
(
rocshmem_ctx_t
ctx
,
char
*
dest
,
const
char
*
source
,
size_t
nelems
,
int
pe
);
__host__
void
rocshmem_char_put
(
char
*
dest
,
const
char
*
source
,
size_t
nelems
,
int
pe
);
__device__
ATTR_NO_INLINE
void
rocshmem_ctx_schar_put
(
rocshmem_ctx_t
ctx
,
signed
char
*
dest
,
const
signed
char
*
source
,
size_t
nelems
,
int
pe
);
__device__
ATTR_NO_INLINE
void
rocshmem_schar_put
(
signed
char
*
dest
,
const
signed
char
*
source
,
size_t
nelems
,
int
pe
);
__host__
void
rocshmem_ctx_schar_put
(
rocshmem_ctx_t
ctx
,
signed
char
*
dest
,
const
signed
char
*
source
,
size_t
nelems
,
int
pe
);
__host__
void
rocshmem_schar_put
(
signed
char
*
dest
,
const
signed
char
*
source
,
size_t
nelems
,
int
pe
);
__device__
ATTR_NO_INLINE
void
rocshmem_ctx_short_put
(
rocshmem_ctx_t
ctx
,
short
*
dest
,
const
short
*
source
,
size_t
nelems
,
int
pe
);
__device__
ATTR_NO_INLINE
void
rocshmem_short_put
(
short
*
dest
,
const
short
*
source
,
size_t
nelems
,
int
pe
);
__host__
void
rocshmem_ctx_short_put
(
rocshmem_ctx_t
ctx
,
short
*
dest
,
const
short
*
source
,
size_t
nelems
,
int
pe
);
__host__
void
rocshmem_short_put
(
short
*
dest
,
const
short
*
source
,
size_t
nelems
,
int
pe
);
__device__
ATTR_NO_INLINE
void
rocshmem_ctx_int_put
(
rocshmem_ctx_t
ctx
,
int
*
dest
,
const
int
*
source
,
size_t
nelems
,
int
pe
);
__device__
ATTR_NO_INLINE
void
rocshmem_int_put
(
int
*
dest
,
const
int
*
source
,
size_t
nelems
,
int
pe
);
__host__
void
rocshmem_ctx_int_put
(
rocshmem_ctx_t
ctx
,
int
*
dest
,
const
int
*
source
,
size_t
nelems
,
int
pe
);
__host__
void
rocshmem_int_put
(
int
*
dest
,
const
int
*
source
,
size_t
nelems
,
int
pe
);
__device__
ATTR_NO_INLINE
void
rocshmem_ctx_long_put
(
rocshmem_ctx_t
ctx
,
long
*
dest
,
const
long
*
source
,
size_t
nelems
,
int
pe
);
__device__
ATTR_NO_INLINE
void
rocshmem_long_put
(
long
*
dest
,
const
long
*
source
,
size_t
nelems
,
int
pe
);
__host__
void
rocshmem_ctx_long_put
(
rocshmem_ctx_t
ctx
,
long
*
dest
,
const
long
*
source
,
size_t
nelems
,
int
pe
);
__host__
void
rocshmem_long_put
(
long
*
dest
,
const
long
*
source
,
size_t
nelems
,
int
pe
);
__device__
ATTR_NO_INLINE
void
rocshmem_ctx_longlong_put
(
rocshmem_ctx_t
ctx
,
long
long
*
dest
,
const
long
long
*
source
,
size_t
nelems
,
int
pe
);
__device__
ATTR_NO_INLINE
void
rocshmem_longlong_put
(
long
long
*
dest
,
const
long
long
*
source
,
size_t
nelems
,
int
pe
);
__host__
void
rocshmem_ctx_longlong_put
(
rocshmem_ctx_t
ctx
,
long
long
*
dest
,
const
long
long
*
source
,
size_t
nelems
,
int
pe
);
__host__
void
rocshmem_longlong_put
(
long
long
*
dest
,
const
long
long
*
source
,
size_t
nelems
,
int
pe
);
__device__
ATTR_NO_INLINE
void
rocshmem_ctx_uchar_put
(
rocshmem_ctx_t
ctx
,
unsigned
char
*
dest
,
const
unsigned
char
*
source
,
size_t
nelems
,
int
pe
);
__device__
ATTR_NO_INLINE
void
rocshmem_uchar_put
(
unsigned
char
*
dest
,
const
unsigned
char
*
source
,
size_t
nelems
,
int
pe
);
__host__
void
rocshmem_ctx_uchar_put
(
rocshmem_ctx_t
ctx
,
unsigned
char
*
dest
,
const
unsigned
char
*
source
,
size_t
nelems
,
int
pe
);
__host__
void
rocshmem_uchar_put
(
unsigned
char
*
dest
,
const
unsigned
char
*
source
,
size_t
nelems
,
int
pe
);
__device__
ATTR_NO_INLINE
void
rocshmem_ctx_ushort_put
(
rocshmem_ctx_t
ctx
,
unsigned
short
*
dest
,
const
unsigned
short
*
source
,
size_t
nelems
,
int
pe
);
__device__
ATTR_NO_INLINE
void
rocshmem_ushort_put
(
unsigned
short
*
dest
,
const
unsigned
short
*
source
,
size_t
nelems
,
int
pe
);
__host__
void
rocshmem_ctx_ushort_put
(
rocshmem_ctx_t
ctx
,
unsigned
short
*
dest
,
const
unsigned
short
*
source
,
size_t
nelems
,
int
pe
);
__host__
void
rocshmem_ushort_put
(
unsigned
short
*
dest
,
const
unsigned
short
*
source
,
size_t
nelems
,
int
pe
);
__device__
ATTR_NO_INLINE
void
rocshmem_ctx_uint_put
(
rocshmem_ctx_t
ctx
,
unsigned
int
*
dest
,
const
unsigned
int
*
source
,
size_t
nelems
,
int
pe
);
__device__
ATTR_NO_INLINE
void
rocshmem_uint_put
(
unsigned
int
*
dest
,
const
unsigned
int
*
source
,
size_t
nelems
,
int
pe
);
__host__
void
rocshmem_ctx_uint_put
(
rocshmem_ctx_t
ctx
,
unsigned
int
*
dest
,
const
unsigned
int
*
source
,
size_t
nelems
,
int
pe
);
__host__
void
rocshmem_uint_put
(
unsigned
int
*
dest
,
const
unsigned
int
*
source
,
size_t
nelems
,
int
pe
);
__device__
ATTR_NO_INLINE
void
rocshmem_ctx_ulong_put
(
rocshmem_ctx_t
ctx
,
unsigned
long
*
dest
,
const
unsigned
long
*
source
,
size_t
nelems
,
int
pe
);
__device__
ATTR_NO_INLINE
void
rocshmem_ulong_put
(
unsigned
long
*
dest
,
const
unsigned
long
*
source
,
size_t
nelems
,
int
pe
);
__host__
void
rocshmem_ctx_ulong_put
(
rocshmem_ctx_t
ctx
,
unsigned
long
*
dest
,
const
unsigned
long
*
source
,
size_t
nelems
,
int
pe
);
__host__
void
rocshmem_ulong_put
(
unsigned
long
*
dest
,
const
unsigned
long
*
source
,
size_t
nelems
,
int
pe
);
__device__
ATTR_NO_INLINE
void
rocshmem_ctx_ulonglong_put
(
rocshmem_ctx_t
ctx
,
unsigned
long
long
*
dest
,
const
unsigned
long
long
*
source
,
size_t
nelems
,
int
pe
);
__device__
ATTR_NO_INLINE
void
rocshmem_ulonglong_put
(
unsigned
long
long
*
dest
,
const
unsigned
long
long
*
source
,
size_t
nelems
,
int
pe
);
__host__
void
rocshmem_ctx_ulonglong_put
(
rocshmem_ctx_t
ctx
,
unsigned
long
long
*
dest
,
const
unsigned
long
long
*
source
,
size_t
nelems
,
int
pe
);
__host__
void
rocshmem_ulonglong_put
(
unsigned
long
long
*
dest
,
const
unsigned
long
long
*
source
,
size_t
nelems
,
int
pe
);
/**
* @brief Writes contiguous data of \p nelems bytes from \p source on the
* calling PE to \p dest at \p pe. The caller will block until the operation
* completes locally (it is safe to reuse \p source). The caller must
* call into rocshmem_quiet() if remote completion is required.
*
* This function can be called from divergent control paths at per-thread
* granularity. However, performance may be improved if the caller can
* coalesce contiguous messages and elect a leader thread to call into the
* ROCSHMEM function.
*
* @param[in] ctx Context with which to perform this operation.
* @param[in] dest Destination address. Must be an address on the symmetric
* heap.
* @param[in] source Source address. Must be an address on the symmetric heap.
* @param[in] nelems Size of the transfer in bytes.
* @param[in] pe PE of the remote process.
*
* @return void.
*/
__device__
ATTR_NO_INLINE
void
rocshmem_ctx_putmem
(
rocshmem_ctx_t
ctx
,
void
*
dest
,
const
void
*
source
,
size_t
nelems
,
int
pe
);
__device__
ATTR_NO_INLINE
void
rocshmem_putmem
(
void
*
dest
,
const
void
*
source
,
size_t
nelems
,
int
pe
);
/**
* @brief Writes contiguous data of \p nelems bytes from \p source on the
* calling PE to \p dest at \p pe. The caller will block until the operation
* completes locally (it is safe to reuse \p source). The caller must
* call into __host__ rocshmem_quiet() if remote completion is required.
*
* @param[in] ctx Context with which to perform this operation.
* @param[in] dest Destination address. Must be an address on the symmetric
* heap.
* @param[in] source Source address. Must be an address on the symmetric heap.
* @param[in] nelems Size of the transfer in number of elements.
* @param[in] pe PE of the remote process.
*
* @return void.
*/
__host__
void
rocshmem_ctx_putmem
(
rocshmem_ctx_t
ctx
,
void
*
dest
,
const
void
*
source
,
size_t
nelems
,
int
pe
);
__host__
void
rocshmem_putmem
(
void
*
dest
,
const
void
*
source
,
size_t
nelems
,
int
pe
);
/**
* @name SHMEM_P
* @brief Writes the single value \p value to \p dest at \p pe.
* The caller must call into rocshmem_quiet() if remote completion is
* required.
*
* This function can be called from divergent control paths at per-thread
* granularity. However, performance may be improved if the caller can
* coalesce contiguous messages and elect a leader thread to call into the
* ROCSHMEM function.
*
* @param[in] ctx Context with which to perform this operation.
* @param[in] dest Destination address. Must be an address on the symmetric
* heap.
* @param[in] value Value to write to dest at \p pe.
* @param[in] pe PE of the remote process.
*
* @return void.
*/
__device__
ATTR_NO_INLINE
void
rocshmem_ctx_float_p
(
rocshmem_ctx_t
ctx
,
float
*
dest
,
float
value
,
int
pe
);
__device__
ATTR_NO_INLINE
void
rocshmem_float_p
(
float
*
dest
,
float
value
,
int
pe
);
__host__
void
rocshmem_ctx_float_p
(
rocshmem_ctx_t
ctx
,
float
*
dest
,
float
value
,
int
pe
);
__host__
void
rocshmem_float_p
(
float
*
dest
,
float
value
,
int
pe
);
__device__
ATTR_NO_INLINE
void
rocshmem_ctx_double_p
(
rocshmem_ctx_t
ctx
,
double
*
dest
,
double
value
,
int
pe
);
__device__
ATTR_NO_INLINE
void
rocshmem_double_p
(
double
*
dest
,
double
value
,
int
pe
);
__host__
void
rocshmem_ctx_double_p
(
rocshmem_ctx_t
ctx
,
double
*
dest
,
double
value
,
int
pe
);
__host__
void
rocshmem_double_p
(
double
*
dest
,
double
value
,
int
pe
);
__device__
ATTR_NO_INLINE
void
rocshmem_ctx_char_p
(
rocshmem_ctx_t
ctx
,
char
*
dest
,
char
value
,
int
pe
);
__device__
ATTR_NO_INLINE
void
rocshmem_char_p
(
char
*
dest
,
char
value
,
int
pe
);
__host__
void
rocshmem_ctx_char_p
(
rocshmem_ctx_t
ctx
,
char
*
dest
,
char
value
,
int
pe
);
__host__
void
rocshmem_char_p
(
char
*
dest
,
char
value
,
int
pe
);
__device__
ATTR_NO_INLINE
void
rocshmem_ctx_schar_p
(
rocshmem_ctx_t
ctx
,
signed
char
*
dest
,
signed
char
value
,
int
pe
);
__device__
ATTR_NO_INLINE
void
rocshmem_schar_p
(
signed
char
*
dest
,
signed
char
value
,
int
pe
);
__host__
void
rocshmem_ctx_schar_p
(
rocshmem_ctx_t
ctx
,
signed
char
*
dest
,
signed
char
value
,
int
pe
);
__host__
void
rocshmem_schar_p
(
signed
char
*
dest
,
signed
char
value
,
int
pe
);
__device__
ATTR_NO_INLINE
void
rocshmem_ctx_short_p
(
rocshmem_ctx_t
ctx
,
short
*
dest
,
short
value
,
int
pe
);
__device__
ATTR_NO_INLINE
void
rocshmem_short_p
(
short
*
dest
,
short
value
,
int
pe
);
__host__
void
rocshmem_ctx_short_p
(
rocshmem_ctx_t
ctx
,
short
*
dest
,
short
value
,
int
pe
);
__host__
void
rocshmem_short_p
(
short
*
dest
,
short
value
,
int
pe
);
__device__
ATTR_NO_INLINE
void
rocshmem_ctx_int_p
(
rocshmem_ctx_t
ctx
,
int
*
dest
,
int
value
,
int
pe
);
__device__
ATTR_NO_INLINE
void
rocshmem_int_p
(
int
*
dest
,
int
value
,
int
pe
);
__host__
void
rocshmem_ctx_int_p
(
rocshmem_ctx_t
ctx
,
int
*
dest
,
int
value
,
int
pe
);
__host__
void
rocshmem_int_p
(
int
*
dest
,
int
value
,
int
pe
);
__device__
ATTR_NO_INLINE
void
rocshmem_ctx_long_p
(
rocshmem_ctx_t
ctx
,
long
*
dest
,
long
value
,
int
pe
);
__device__
ATTR_NO_INLINE
void
rocshmem_long_p
(
long
*
dest
,
long
value
,
int
pe
);
__host__
void
rocshmem_ctx_long_p
(
rocshmem_ctx_t
ctx
,
long
*
dest
,
long
value
,
int
pe
);
__host__
void
rocshmem_long_p
(
long
*
dest
,
long
value
,
int
pe
);
__device__
ATTR_NO_INLINE
void
rocshmem_ctx_longlong_p
(
rocshmem_ctx_t
ctx
,
long
long
*
dest
,
long
long
value
,
int
pe
);
__device__
ATTR_NO_INLINE
void
rocshmem_longlong_p
(
long
long
*
dest
,
long
long
value
,
int
pe
);
__host__
void
rocshmem_ctx_longlong_p
(
rocshmem_ctx_t
ctx
,
long
long
*
dest
,
long
long
value
,
int
pe
);
__host__
void
rocshmem_longlong_p
(
long
long
*
dest
,
long
long
value
,
int
pe
);
__device__
ATTR_NO_INLINE
void
rocshmem_ctx_uchar_p
(
rocshmem_ctx_t
ctx
,
unsigned
char
*
dest
,
unsigned
char
value
,
int
pe
);
__device__
ATTR_NO_INLINE
void
rocshmem_uchar_p
(
unsigned
char
*
dest
,
unsigned
char
value
,
int
pe
);
__host__
void
rocshmem_ctx_uchar_p
(
rocshmem_ctx_t
ctx
,
unsigned
char
*
dest
,
unsigned
char
value
,
int
pe
);
__host__
void
rocshmem_uchar_p
(
unsigned
char
*
dest
,
unsigned
char
value
,
int
pe
);
__device__
ATTR_NO_INLINE
void
rocshmem_ctx_ushort_p
(
rocshmem_ctx_t
ctx
,
unsigned
short
*
dest
,
unsigned
short
value
,
int
pe
);
__device__
ATTR_NO_INLINE
void
rocshmem_ushort_p
(
unsigned
short
*
dest
,
unsigned
short
value
,
int
pe
);
__host__
void
rocshmem_ctx_ushort_p
(
rocshmem_ctx_t
ctx
,
unsigned
short
*
dest
,
unsigned
short
value
,
int
pe
);
__host__
void
rocshmem_ushort_p
(
unsigned
short
*
dest
,
unsigned
short
value
,
int
pe
);
__device__
ATTR_NO_INLINE
void
rocshmem_ctx_uint_p
(
rocshmem_ctx_t
ctx
,
unsigned
int
*
dest
,
unsigned
int
value
,
int
pe
);
__device__
ATTR_NO_INLINE
void
rocshmem_uint_p
(
unsigned
int
*
dest
,
unsigned
int
value
,
int
pe
);
__host__
void
rocshmem_ctx_uint_p
(
rocshmem_ctx_t
ctx
,
unsigned
int
*
dest
,
unsigned
int
value
,
int
pe
);
__host__
void
rocshmem_uint_p
(
unsigned
int
*
dest
,
unsigned
int
value
,
int
pe
);
__device__
ATTR_NO_INLINE
void
rocshmem_ctx_ulong_p
(
rocshmem_ctx_t
ctx
,
unsigned
long
*
dest
,
unsigned
long
value
,
int
pe
);
__device__
ATTR_NO_INLINE
void
rocshmem_ulong_p
(
unsigned
long
*
dest
,
unsigned
long
value
,
int
pe
);
__host__
void
rocshmem_ctx_ulong_p
(
rocshmem_ctx_t
ctx
,
unsigned
long
*
dest
,
unsigned
long
value
,
int
pe
);
__host__
void
rocshmem_ulong_p
(
unsigned
long
*
dest
,
unsigned
long
value
,
int
pe
);
__device__
ATTR_NO_INLINE
void
rocshmem_ctx_ulonglong_p
(
rocshmem_ctx_t
ctx
,
unsigned
long
long
*
dest
,
unsigned
long
long
value
,
int
pe
);
__device__
ATTR_NO_INLINE
void
rocshmem_ulonglong_p
(
unsigned
long
long
*
dest
,
unsigned
long
long
value
,
int
pe
);
__host__
void
rocshmem_ctx_ulonglong_p
(
rocshmem_ctx_t
ctx
,
unsigned
long
long
*
dest
,
unsigned
long
long
value
,
int
pe
);
__host__
void
rocshmem_ulonglong_p
(
unsigned
long
long
*
dest
,
unsigned
long
long
value
,
int
pe
);
/**
* @name SHMEM_GET
* @brief Reads contiguous data of \p nelems elements from \p source on \p pe
* to \p dest on the calling PE. The calling work-group will block until the
* operation completes (data has been placed in \p dest).
*
* This function can be called from divergent control paths at per-thread
* granularity. However, performance may be improved if the caller can
* coalesce contiguous messages and elect a leader thread to call into the
* ROCSHMEM function.
*
* @param[in] ctx Context with which to perform this operation.
* @param[in] dest Destination address. Must be an address on the symmetric
* heap.
* @param[in] source Source address. Must be an address on the symmetric heap.
* @param[in] nelems Size of the transfer in number of elements.
* @param[in] pe PE of the remote process.
*
* @return void.
*/
__device__
ATTR_NO_INLINE
void
rocshmem_ctx_float_get
(
rocshmem_ctx_t
ctx
,
float
*
dest
,
const
float
*
source
,
size_t
nelems
,
int
pe
);
__device__
ATTR_NO_INLINE
void
rocshmem_float_get
(
float
*
dest
,
const
float
*
source
,
size_t
nelems
,
int
pe
);
__host__
void
rocshmem_ctx_float_get
(
rocshmem_ctx_t
ctx
,
float
*
dest
,
const
float
*
source
,
size_t
nelems
,
int
pe
);
__host__
void
rocshmem_float_get
(
float
*
dest
,
const
float
*
source
,
size_t
nelems
,
int
pe
);
__device__
ATTR_NO_INLINE
void
rocshmem_ctx_double_get
(
rocshmem_ctx_t
ctx
,
double
*
dest
,
const
double
*
source
,
size_t
nelems
,
int
pe
);
__device__
ATTR_NO_INLINE
void
rocshmem_double_get
(
double
*
dest
,
const
double
*
source
,
size_t
nelems
,
int
pe
);
__host__
void
rocshmem_ctx_double_get
(
rocshmem_ctx_t
ctx
,
double
*
dest
,
const
double
*
source
,
size_t
nelems
,
int
pe
);
__host__
void
rocshmem_double_get
(
double
*
dest
,
const
double
*
source
,
size_t
nelems
,
int
pe
);
__device__
ATTR_NO_INLINE
void
rocshmem_ctx_char_get
(
rocshmem_ctx_t
ctx
,
char
*
dest
,
const
char
*
source
,
size_t
nelems
,
int
pe
);
__device__
ATTR_NO_INLINE
void
rocshmem_char_get
(
char
*
dest
,
const
char
*
source
,
size_t
nelems
,
int
pe
);
__host__
void
rocshmem_ctx_char_get
(
rocshmem_ctx_t
ctx
,
char
*
dest
,
const
char
*
source
,
size_t
nelems
,
int
pe
);
__host__
void
rocshmem_char_get
(
char
*
dest
,
const
char
*
source
,
size_t
nelems
,
int
pe
);
__device__
ATTR_NO_INLINE
void
rocshmem_ctx_schar_get
(
rocshmem_ctx_t
ctx
,
signed
char
*
dest
,
const
signed
char
*
source
,
size_t
nelems
,
int
pe
);
__device__
ATTR_NO_INLINE
void
rocshmem_schar_get
(
signed
char
*
dest
,
const
signed
char
*
source
,
size_t
nelems
,
int
pe
);
__host__
void
rocshmem_ctx_schar_get
(
rocshmem_ctx_t
ctx
,
signed
char
*
dest
,
const
signed
char
*
source
,
size_t
nelems
,
int
pe
);
__host__
void
rocshmem_schar_get
(
signed
char
*
dest
,
const
signed
char
*
source
,
size_t
nelems
,
int
pe
);
__device__
ATTR_NO_INLINE
void
rocshmem_ctx_short_get
(
rocshmem_ctx_t
ctx
,
short
*
dest
,
const
short
*
source
,
size_t
nelems
,
int
pe
);
__device__
ATTR_NO_INLINE
void
rocshmem_short_get
(
short
*
dest
,
const
short
*
source
,
size_t
nelems
,
int
pe
);
__host__
void
rocshmem_ctx_short_get
(
rocshmem_ctx_t
ctx
,
short
*
dest
,
const
short
*
source
,
size_t
nelems
,
int
pe
);
__host__
void
rocshmem_short_get
(
short
*
dest
,
const
short
*
source
,
size_t
nelems
,
int
pe
);
__device__
ATTR_NO_INLINE
void
rocshmem_ctx_int_get
(
rocshmem_ctx_t
ctx
,
int
*
dest
,
const
int
*
source
,
size_t
nelems
,
int
pe
);
__device__
ATTR_NO_INLINE
void
rocshmem_int_get
(
int
*
dest
,
const
int
*
source
,
size_t
nelems
,
int
pe
);
__host__
void
rocshmem_ctx_int_get
(
rocshmem_ctx_t
ctx
,
int
*
dest
,
const
int
*
source
,
size_t
nelems
,
int
pe
);
__host__
void
rocshmem_int_get
(
int
*
dest
,
const
int
*
source
,
size_t
nelems
,
int
pe
);
__device__
ATTR_NO_INLINE
void
rocshmem_ctx_long_get
(
rocshmem_ctx_t
ctx
,
long
*
dest
,
const
long
*
source
,
size_t
nelems
,
int
pe
);
__device__
ATTR_NO_INLINE
void
rocshmem_long_get
(
long
*
dest
,
const
long
*
source
,
size_t
nelems
,
int
pe
);
__host__
void
rocshmem_ctx_long_get
(
rocshmem_ctx_t
ctx
,
long
*
dest
,
const
long
*
source
,
size_t
nelems
,
int
pe
);
__host__
void
rocshmem_long_get
(
long
*
dest
,
const
long
*
source
,
size_t
nelems
,
int
pe
);
__device__
ATTR_NO_INLINE
void
rocshmem_ctx_longlong_get
(
rocshmem_ctx_t
ctx
,
long
long
*
dest
,
const
long
long
*
source
,
size_t
nelems
,
int
pe
);
__device__
ATTR_NO_INLINE
void
rocshmem_longlong_get
(
long
long
*
dest
,
const
long
long
*
source
,
size_t
nelems
,
int
pe
);
__host__
void
rocshmem_ctx_longlong_get
(
rocshmem_ctx_t
ctx
,
long
long
*
dest
,
const
long
long
*
source
,
size_t
nelems
,
int
pe
);
__host__
void
rocshmem_longlong_get
(
long
long
*
dest
,
const
long
long
*
source
,
size_t
nelems
,
int
pe
);
__device__
ATTR_NO_INLINE
void
rocshmem_ctx_uchar_get
(
rocshmem_ctx_t
ctx
,
unsigned
char
*
dest
,
const
unsigned
char
*
source
,
size_t
nelems
,
int
pe
);
__device__
ATTR_NO_INLINE
void
rocshmem_uchar_get
(
unsigned
char
*
dest
,
const
unsigned
char
*
source
,
size_t
nelems
,
int
pe
);
__host__
void
rocshmem_ctx_uchar_get
(
rocshmem_ctx_t
ctx
,
unsigned
char
*
dest
,
const
unsigned
char
*
source
,
size_t
nelems
,
int
pe
);
__host__
void
rocshmem_uchar_get
(
unsigned
char
*
dest
,
const
unsigned
char
*
source
,
size_t
nelems
,
int
pe
);
__device__
ATTR_NO_INLINE
void
rocshmem_ctx_ushort_get
(
rocshmem_ctx_t
ctx
,
unsigned
short
*
dest
,
const
unsigned
short
*
source
,
size_t
nelems
,
int
pe
);
__device__
ATTR_NO_INLINE
void
rocshmem_ushort_get
(
unsigned
short
*
dest
,
const
unsigned
short
*
source
,
size_t
nelems
,
int
pe
);
__host__
void
rocshmem_ctx_ushort_get
(
rocshmem_ctx_t
ctx
,
unsigned
short
*
dest
,
const
unsigned
short
*
source
,
size_t
nelems
,
int
pe
);
__host__
void
rocshmem_ushort_get
(
unsigned
short
*
dest
,
const
unsigned
short
*
source
,
size_t
nelems
,
int
pe
);
__device__
ATTR_NO_INLINE
void
rocshmem_ctx_uint_get
(
rocshmem_ctx_t
ctx
,
unsigned
int
*
dest
,
const
unsigned
int
*
source
,
size_t
nelems
,
int
pe
);
__device__
ATTR_NO_INLINE
void
rocshmem_uint_get
(
unsigned
int
*
dest
,
const
unsigned
int
*
source
,
size_t
nelems
,
int
pe
);
__host__
void
rocshmem_ctx_uint_get
(
rocshmem_ctx_t
ctx
,
unsigned
int
*
dest
,
const
unsigned
int
*
source
,
size_t
nelems
,
int
pe
);
__host__
void
rocshmem_uint_get
(
unsigned
int
*
dest
,
const
unsigned
int
*
source
,
size_t
nelems
,
int
pe
);
__device__
ATTR_NO_INLINE
void
rocshmem_ctx_ulong_get
(
rocshmem_ctx_t
ctx
,
unsigned
long
*
dest
,
const
unsigned
long
*
source
,
size_t
nelems
,
int
pe
);
__device__
ATTR_NO_INLINE
void
rocshmem_ulong_get
(
unsigned
long
*
dest
,
const
unsigned
long
*
source
,
size_t
nelems
,
int
pe
);
__host__
void
rocshmem_ctx_ulong_get
(
rocshmem_ctx_t
ctx
,
unsigned
long
*
dest
,
const
unsigned
long
*
source
,
size_t
nelems
,
int
pe
);
__host__
void
rocshmem_ulong_get
(
unsigned
long
*
dest
,
const
unsigned
long
*
source
,
size_t
nelems
,
int
pe
);
__device__
ATTR_NO_INLINE
void
rocshmem_ctx_ulonglong_get
(
rocshmem_ctx_t
ctx
,
unsigned
long
long
*
dest
,
const
unsigned
long
long
*
source
,
size_t
nelems
,
int
pe
);
__device__
ATTR_NO_INLINE
void
rocshmem_ulonglong_get
(
unsigned
long
long
*
dest
,
const
unsigned
long
long
*
source
,
size_t
nelems
,
int
pe
);
__host__
void
rocshmem_ctx_ulonglong_get
(
rocshmem_ctx_t
ctx
,
unsigned
long
long
*
dest
,
const
unsigned
long
long
*
source
,
size_t
nelems
,
int
pe
);
__host__
void
rocshmem_ulonglong_get
(
unsigned
long
long
*
dest
,
const
unsigned
long
long
*
source
,
size_t
nelems
,
int
pe
);
/**
* @brief Reads contiguous data of \p nelems bytes from \p source on \p pe
* to \p dest on the calling PE. The calling work-group will block until the
* operation completes (data has been placed in \p dest).
*
* This function can be called from divergent control paths at per-thread
* granularity. However, performance may be improved if the caller can
* coalesce contiguous messages and elect a leader thread to call into the
* ROCSHMEM function.
*
* @param[in] ctx Context with which to perform this operation.
* @param[in] dest Destination address. Must be an address on the symmetric
* heap.
* @param[in] source Source address. Must be an address on the symmetric heap.
* @param[in] nelems Size of the transfer in bytes.
* @param[in] pe PE of the remote process.
*
* @return void.
*/
__device__
ATTR_NO_INLINE
void
rocshmem_ctx_getmem
(
rocshmem_ctx_t
ctx
,
void
*
dest
,
const
void
*
source
,
size_t
nelems
,
int
pe
);
__device__
ATTR_NO_INLINE
void
rocshmem_getmem
(
void
*
dest
,
const
void
*
source
,
size_t
nelems
,
int
pe
);
/**
* @brief Reads contiguous data of \p nelems bytes from \p source on \p pe
* to \p dest on the calling PE. The calling work-group will block until the
* operation completes (data has been placed in \p dest).
*
* This function can be called from divergent control paths at per-thread
* granularity. However, performance may be improved if the caller can
* coalesce contiguous messages and elect a leader thread to call into the
* ROCSHMEM function.
*
* @param[in] ctx Context with which to perform this operation.
* @param[in] dest Destination address. Must be an address on the symmetric
* heap.
* @param[in] source Source address. Must be an address on the symmetric heap.
* @param[in] nelems Size of the transfer in bytes.
* @param[in] pe PE of the remote process.
*
* @return void.
*/
__host__
void
rocshmem_ctx_getmem
(
rocshmem_ctx_t
ctx
,
void
*
dest
,
const
void
*
source
,
size_t
nelems
,
int
pe
);
__host__
void
rocshmem_getmem
(
void
*
dest
,
const
void
*
source
,
size_t
nelems
,
int
pe
);
/**
* @name SHMEM_G
* @brief Reads and returns a single value from \p source at \p pe.
* The calling work-group/thread will block until the operation completes.
*
* This function can be called from divergent control paths at per-thread
* granularity. However, performance may be improved if the caller can
* coalesce contiguous messages and elect a leader thread to call into the
* ROCSHMEM function.
*
* @param[in] ctx Context with which to perform this operation.
* @param[in] source Source address. Must be an address on the symmetric
* heap.
* @param[in] pe PE of the remote process.
*
* @return the value read from remote \p source at \p pe.
*/
__device__
ATTR_NO_INLINE
float
rocshmem_ctx_float_g
(
rocshmem_ctx_t
ctx
,
const
float
*
source
,
int
pe
);
__device__
ATTR_NO_INLINE
float
rocshmem_float_g
(
const
float
*
source
,
int
pe
);
__host__
float
rocshmem_ctx_float_g
(
rocshmem_ctx_t
ctx
,
const
float
*
source
,
int
pe
);
__host__
float
rocshmem_float_g
(
const
float
*
source
,
int
pe
);
__device__
ATTR_NO_INLINE
double
rocshmem_ctx_double_g
(
rocshmem_ctx_t
ctx
,
const
double
*
source
,
int
pe
);
__device__
ATTR_NO_INLINE
double
rocshmem_double_g
(
const
double
*
source
,
int
pe
);
__host__
double
rocshmem_ctx_double_g
(
rocshmem_ctx_t
ctx
,
const
double
*
source
,
int
pe
);
__host__
double
rocshmem_double_g
(
const
double
*
source
,
int
pe
);
__device__
ATTR_NO_INLINE
char
rocshmem_ctx_char_g
(
rocshmem_ctx_t
ctx
,
const
char
*
source
,
int
pe
);
__device__
ATTR_NO_INLINE
char
rocshmem_char_g
(
const
char
*
source
,
int
pe
);
__host__
char
rocshmem_ctx_char_g
(
rocshmem_ctx_t
ctx
,
const
char
*
source
,
int
pe
);
__host__
char
rocshmem_char_g
(
const
char
*
source
,
int
pe
);
__device__
ATTR_NO_INLINE
signed
char
rocshmem_ctx_schar_g
(
rocshmem_ctx_t
ctx
,
const
signed
char
*
source
,
int
pe
);
__device__
ATTR_NO_INLINE
signed
char
rocshmem_schar_g
(
const
signed
char
*
source
,
int
pe
);
__host__
signed
char
rocshmem_ctx_schar_g
(
rocshmem_ctx_t
ctx
,
const
signed
char
*
source
,
int
pe
);
__host__
signed
char
rocshmem_schar_g
(
const
signed
char
*
source
,
int
pe
);
__device__
ATTR_NO_INLINE
short
rocshmem_ctx_short_g
(
rocshmem_ctx_t
ctx
,
const
short
*
source
,
int
pe
);
__device__
ATTR_NO_INLINE
short
rocshmem_short_g
(
const
short
*
source
,
int
pe
);
__host__
short
rocshmem_ctx_short_g
(
rocshmem_ctx_t
ctx
,
const
short
*
source
,
int
pe
);
__host__
short
rocshmem_short_g
(
const
short
*
source
,
int
pe
);
__device__
ATTR_NO_INLINE
int
rocshmem_ctx_int_g
(
rocshmem_ctx_t
ctx
,
const
int
*
source
,
int
pe
);
__device__
ATTR_NO_INLINE
int
rocshmem_int_g
(
const
int
*
source
,
int
pe
);
__host__
int
rocshmem_ctx_int_g
(
rocshmem_ctx_t
ctx
,
const
int
*
source
,
int
pe
);
__host__
int
rocshmem_int_g
(
const
int
*
source
,
int
pe
);
__device__
ATTR_NO_INLINE
long
rocshmem_ctx_long_g
(
rocshmem_ctx_t
ctx
,
const
long
*
source
,
int
pe
);
__device__
ATTR_NO_INLINE
long
rocshmem_long_g
(
const
long
*
source
,
int
pe
);
__host__
long
rocshmem_ctx_long_g
(
rocshmem_ctx_t
ctx
,
const
long
*
source
,
int
pe
);
__host__
long
rocshmem_long_g
(
const
long
*
source
,
int
pe
);
__device__
ATTR_NO_INLINE
long
long
rocshmem_ctx_longlong_g
(
rocshmem_ctx_t
ctx
,
const
long
long
*
source
,
int
pe
);
__device__
ATTR_NO_INLINE
long
long
rocshmem_longlong_g
(
const
long
long
*
source
,
int
pe
);
__host__
long
long
rocshmem_ctx_longlong_g
(
rocshmem_ctx_t
ctx
,
const
long
long
*
source
,
int
pe
);
__host__
long
long
rocshmem_longlong_g
(
const
long
long
*
source
,
int
pe
);
__device__
ATTR_NO_INLINE
unsigned
char
rocshmem_ctx_uchar_g
(
rocshmem_ctx_t
ctx
,
const
unsigned
char
*
source
,
int
pe
);
__device__
ATTR_NO_INLINE
unsigned
char
rocshmem_uchar_g
(
const
unsigned
char
*
source
,
int
pe
);
__host__
unsigned
char
rocshmem_ctx_uchar_g
(
rocshmem_ctx_t
ctx
,
const
unsigned
char
*
source
,
int
pe
);
__host__
unsigned
char
rocshmem_uchar_g
(
const
unsigned
char
*
source
,
int
pe
);
__device__
ATTR_NO_INLINE
unsigned
short
rocshmem_ctx_ushort_g
(
rocshmem_ctx_t
ctx
,
const
unsigned
short
*
source
,
int
pe
);
__device__
ATTR_NO_INLINE
unsigned
short
rocshmem_ushort_g
(
const
unsigned
short
*
source
,
int
pe
);
__host__
unsigned
short
rocshmem_ctx_ushort_g
(
rocshmem_ctx_t
ctx
,
const
unsigned
short
*
source
,
int
pe
);
__host__
unsigned
short
rocshmem_ushort_g
(
const
unsigned
short
*
source
,
int
pe
);
__device__
ATTR_NO_INLINE
unsigned
int
rocshmem_ctx_uint_g
(
rocshmem_ctx_t
ctx
,
const
unsigned
int
*
source
,
int
pe
);
__device__
ATTR_NO_INLINE
unsigned
int
rocshmem_uint_g
(
const
unsigned
int
*
source
,
int
pe
);
__host__
unsigned
int
rocshmem_ctx_uint_g
(
rocshmem_ctx_t
ctx
,
const
unsigned
int
*
source
,
int
pe
);
__host__
unsigned
int
rocshmem_uint_g
(
const
unsigned
int
*
source
,
int
pe
);
__device__
ATTR_NO_INLINE
unsigned
long
rocshmem_ctx_ulong_g
(
rocshmem_ctx_t
ctx
,
const
unsigned
long
*
source
,
int
pe
);
__device__
ATTR_NO_INLINE
unsigned
long
rocshmem_ulong_g
(
const
unsigned
long
*
source
,
int
pe
);
__host__
unsigned
long
rocshmem_ctx_ulong_g
(
rocshmem_ctx_t
ctx
,
const
unsigned
long
*
source
,
int
pe
);
__host__
unsigned
long
rocshmem_ulong_g
(
const
unsigned
long
*
source
,
int
pe
);
__device__
ATTR_NO_INLINE
unsigned
long
long
rocshmem_ctx_ulonglong_g
(
rocshmem_ctx_t
ctx
,
const
unsigned
long
long
*
source
,
int
pe
);
__device__
ATTR_NO_INLINE
unsigned
long
long
rocshmem_ulonglong_g
(
const
unsigned
long
long
*
source
,
int
pe
);
__host__
unsigned
long
long
rocshmem_ctx_ulonglong_g
(
rocshmem_ctx_t
ctx
,
const
unsigned
long
long
*
source
,
int
pe
);
__host__
unsigned
long
long
rocshmem_ulonglong_g
(
const
unsigned
long
long
*
source
,
int
pe
);
/**
* @name SHMEM_PUT_NBI
* @brief Writes contiguous data of \p nelems elements from \p source on the
* calling PE to \p dest on \p pe. The operation is not blocking. The caller
* will return as soon as the request is posted. The caller must call
* rocshmem_quiet() on the same context if completion notification is
* required.
*
* This function can be called from divergent control paths at per-thread
* granularity. However, performance may be improved if the caller can
* coalesce contiguous messages and elect a leader thread to call into the
* ROCSHMEM function.
*
* @param[in] ctx Context with which to perform this operation.
* @param[in] dest Destination address. Must be an address on the symmetric
* heap.
* @param[in] source Source address. Must be an address on the symmetric heap.
* @param[in] nelems Size of the transfer in number of elements.
* @param[in] pe PE of the remote process.
*
* @return void.
*/
__device__
ATTR_NO_INLINE
void
rocshmem_ctx_float_put_nbi
(
rocshmem_ctx_t
ctx
,
float
*
dest
,
const
float
*
source
,
size_t
nelems
,
int
pe
);
__device__
ATTR_NO_INLINE
void
rocshmem_float_put_nbi
(
float
*
dest
,
const
float
*
source
,
size_t
nelems
,
int
pe
);
__host__
void
rocshmem_ctx_float_put_nbi
(
rocshmem_ctx_t
ctx
,
float
*
dest
,
const
float
*
source
,
size_t
nelems
,
int
pe
);
__host__
void
rocshmem_float_put_nbi
(
float
*
dest
,
const
float
*
source
,
size_t
nelems
,
int
pe
);
__device__
ATTR_NO_INLINE
void
rocshmem_ctx_double_put_nbi
(
rocshmem_ctx_t
ctx
,
double
*
dest
,
const
double
*
source
,
size_t
nelems
,
int
pe
);
__device__
ATTR_NO_INLINE
void
rocshmem_double_put_nbi
(
double
*
dest
,
const
double
*
source
,
size_t
nelems
,
int
pe
);
__host__
void
rocshmem_ctx_double_put_nbi
(
rocshmem_ctx_t
ctx
,
double
*
dest
,
const
double
*
source
,
size_t
nelems
,
int
pe
);
__host__
void
rocshmem_double_put_nbi
(
double
*
dest
,
const
double
*
source
,
size_t
nelems
,
int
pe
);
__device__
ATTR_NO_INLINE
void
rocshmem_ctx_char_put_nbi
(
rocshmem_ctx_t
ctx
,
char
*
dest
,
const
char
*
source
,
size_t
nelems
,
int
pe
);
__device__
ATTR_NO_INLINE
void
rocshmem_char_put_nbi
(
char
*
dest
,
const
char
*
source
,
size_t
nelems
,
int
pe
);
__host__
void
rocshmem_ctx_char_put_nbi
(
rocshmem_ctx_t
ctx
,
char
*
dest
,
const
char
*
source
,
size_t
nelems
,
int
pe
);
__host__
void
rocshmem_char_put_nbi
(
char
*
dest
,
const
char
*
source
,
size_t
nelems
,
int
pe
);
__device__
ATTR_NO_INLINE
void
rocshmem_ctx_schar_put_nbi
(
rocshmem_ctx_t
ctx
,
signed
char
*
dest
,
const
signed
char
*
source
,
size_t
nelems
,
int
pe
);
__device__
ATTR_NO_INLINE
void
rocshmem_schar_put_nbi
(
signed
char
*
dest
,
const
signed
char
*
source
,
size_t
nelems
,
int
pe
);
__host__
void
rocshmem_ctx_schar_put_nbi
(
rocshmem_ctx_t
ctx
,
signed
char
*
dest
,
const
signed
char
*
source
,
size_t
nelems
,
int
pe
);
__host__
void
rocshmem_schar_put_nbi
(
signed
char
*
dest
,
const
signed
char
*
source
,
size_t
nelems
,
int
pe
);
__device__
ATTR_NO_INLINE
void
rocshmem_ctx_short_put_nbi
(
rocshmem_ctx_t
ctx
,
short
*
dest
,
const
short
*
source
,
size_t
nelems
,
int
pe
);
__device__
ATTR_NO_INLINE
void
rocshmem_short_put_nbi
(
short
*
dest
,
const
short
*
source
,
size_t
nelems
,
int
pe
);
__host__
void
rocshmem_ctx_short_put_nbi
(
rocshmem_ctx_t
ctx
,
short
*
dest
,
const
short
*
source
,
size_t
nelems
,
int
pe
);
__host__
void
rocshmem_short_put_nbi
(
short
*
dest
,
const
short
*
source
,
size_t
nelems
,
int
pe
);
__device__
ATTR_NO_INLINE
void
rocshmem_ctx_int_put_nbi
(
rocshmem_ctx_t
ctx
,
int
*
dest
,
const
int
*
source
,
size_t
nelems
,
int
pe
);
__device__
ATTR_NO_INLINE
void
rocshmem_int_put_nbi
(
int
*
dest
,
const
int
*
source
,
size_t
nelems
,
int
pe
);
__host__
void
rocshmem_ctx_int_put_nbi
(
rocshmem_ctx_t
ctx
,
int
*
dest
,
const
int
*
source
,
size_t
nelems
,
int
pe
);
__host__
void
rocshmem_int_put_nbi
(
int
*
dest
,
const
int
*
source
,
size_t
nelems
,
int
pe
);
__device__
ATTR_NO_INLINE
void
rocshmem_ctx_long_put_nbi
(
rocshmem_ctx_t
ctx
,
long
*
dest
,
const
long
*
source
,
size_t
nelems
,
int
pe
);
__device__
ATTR_NO_INLINE
void
rocshmem_long_put_nbi
(
long
*
dest
,
const
long
*
source
,
size_t
nelems
,
int
pe
);
__host__
void
rocshmem_ctx_long_put_nbi
(
rocshmem_ctx_t
ctx
,
long
*
dest
,
const
long
*
source
,
size_t
nelems
,
int
pe
);
__host__
void
rocshmem_long_put_nbi
(
long
*
dest
,
const
long
*
source
,
size_t
nelems
,
int
pe
);
__device__
ATTR_NO_INLINE
void
rocshmem_ctx_longlong_put_nbi
(
rocshmem_ctx_t
ctx
,
long
long
*
dest
,
const
long
long
*
source
,
size_t
nelems
,
int
pe
);
__device__
ATTR_NO_INLINE
void
rocshmem_longlong_put_nbi
(
long
long
*
dest
,
const
long
long
*
source
,
size_t
nelems
,
int
pe
);
__host__
void
rocshmem_ctx_longlong_put_nbi
(
rocshmem_ctx_t
ctx
,
long
long
*
dest
,
const
long
long
*
source
,
size_t
nelems
,
int
pe
);
__host__
void
rocshmem_longlong_put_nbi
(
long
long
*
dest
,
const
long
long
*
source
,
size_t
nelems
,
int
pe
);
__device__
ATTR_NO_INLINE
void
rocshmem_ctx_uchar_put_nbi
(
rocshmem_ctx_t
ctx
,
unsigned
char
*
dest
,
const
unsigned
char
*
source
,
size_t
nelems
,
int
pe
);
__device__
ATTR_NO_INLINE
void
rocshmem_uchar_put_nbi
(
unsigned
char
*
dest
,
const
unsigned
char
*
source
,
size_t
nelems
,
int
pe
);
__host__
void
rocshmem_ctx_uchar_put_nbi
(
rocshmem_ctx_t
ctx
,
unsigned
char
*
dest
,
const
unsigned
char
*
source
,
size_t
nelems
,
int
pe
);
__host__
void
rocshmem_uchar_put_nbi
(
unsigned
char
*
dest
,
const
unsigned
char
*
source
,
size_t
nelems
,
int
pe
);
__device__
ATTR_NO_INLINE
void
rocshmem_ctx_ushort_put_nbi
(
rocshmem_ctx_t
ctx
,
unsigned
short
*
dest
,
const
unsigned
short
*
source
,
size_t
nelems
,
int
pe
);
__device__
ATTR_NO_INLINE
void
rocshmem_ushort_put_nbi
(
unsigned
short
*
dest
,
const
unsigned
short
*
source
,
size_t
nelems
,
int
pe
);
__host__
void
rocshmem_ctx_ushort_put_nbi
(
rocshmem_ctx_t
ctx
,
unsigned
short
*
dest
,
const
unsigned
short
*
source
,
size_t
nelems
,
int
pe
);
__host__
void
rocshmem_ushort_put_nbi
(
unsigned
short
*
dest
,
const
unsigned
short
*
source
,
size_t
nelems
,
int
pe
);
__device__
ATTR_NO_INLINE
void
rocshmem_ctx_uint_put_nbi
(
rocshmem_ctx_t
ctx
,
unsigned
int
*
dest
,
const
unsigned
int
*
source
,
size_t
nelems
,
int
pe
);
__device__
ATTR_NO_INLINE
void
rocshmem_uint_put_nbi
(
unsigned
int
*
dest
,
const
unsigned
int
*
source
,
size_t
nelems
,
int
pe
);
__host__
void
rocshmem_ctx_uint_put_nbi
(
rocshmem_ctx_t
ctx
,
unsigned
int
*
dest
,
const
unsigned
int
*
source
,
size_t
nelems
,
int
pe
);
__host__
void
rocshmem_uint_put_nbi
(
unsigned
int
*
dest
,
const
unsigned
int
*
source
,
size_t
nelems
,
int
pe
);
__device__
ATTR_NO_INLINE
void
rocshmem_ctx_ulong_put_nbi
(
rocshmem_ctx_t
ctx
,
unsigned
long
*
dest
,
const
unsigned
long
*
source
,
size_t
nelems
,
int
pe
);
__device__
ATTR_NO_INLINE
void
rocshmem_ulong_put_nbi
(
unsigned
long
*
dest
,
const
unsigned
long
*
source
,
size_t
nelems
,
int
pe
);
__host__
void
rocshmem_ctx_ulong_put_nbi
(
rocshmem_ctx_t
ctx
,
unsigned
long
*
dest
,
const
unsigned
long
*
source
,
size_t
nelems
,
int
pe
);
__host__
void
rocshmem_ulong_put_nbi
(
unsigned
long
*
dest
,
const
unsigned
long
*
source
,
size_t
nelems
,
int
pe
);
__device__
ATTR_NO_INLINE
void
rocshmem_ctx_ulonglong_put_nbi
(
rocshmem_ctx_t
ctx
,
unsigned
long
long
*
dest
,
const
unsigned
long
long
*
source
,
size_t
nelems
,
int
pe
);
__device__
ATTR_NO_INLINE
void
rocshmem_ulonglong_put_nbi
(
unsigned
long
long
*
dest
,
const
unsigned
long
long
*
source
,
size_t
nelems
,
int
pe
);
__host__
void
rocshmem_ctx_ulonglong_put_nbi
(
rocshmem_ctx_t
ctx
,
unsigned
long
long
*
dest
,
const
unsigned
long
long
*
source
,
size_t
nelems
,
int
pe
);
__host__
void
rocshmem_ulonglong_put_nbi
(
unsigned
long
long
*
dest
,
const
unsigned
long
long
*
source
,
size_t
nelems
,
int
pe
);
/**
* @brief Writes contiguous data of \p nelems bytes from \p source on the
* calling PE to \p dest on \p pe. The operation is not blocking. The caller
* will return as soon as the request is posted. The caller must call
* rocshmem_quiet() on the same context if completion notification is
* required.
*
* This function can be called from divergent control paths at per-thread
* granularity. However, performance may be improved if the caller can
* coalesce contiguous messages and elect a leader thread to call into the
* ROCSHMEM function.
*
* @param[in] ctx Context with which to perform this operation.
* @param[in] dest Destination address. Must be an address on the symmetric
* heap.
* @param[in] source Source address. Must be an address on the symmetric heap.
* @param[in] nelems Size of the transfer in bytes.
* @param[in] pe PE of the remote process.
*
* @return void.
*/
__device__
ATTR_NO_INLINE
void
rocshmem_ctx_putmem_nbi
(
rocshmem_ctx_t
ctx
,
void
*
dest
,
const
void
*
source
,
size_t
nelems
,
int
pe
);
__device__
ATTR_NO_INLINE
void
rocshmem_putmem_nbi
(
void
*
dest
,
const
void
*
source
,
size_t
nelems
,
int
pe
);
/**
* @brief Writes contiguous data of \p nelems bytes from \p source on the
* calling PE to \p dest on \p pe. The operation is not blocking. The caller
* will return as soon as the request is posted. The caller must call
* rocshmem_quiet() if completion notification is required.
*
* @param[in] ctx Context with which to perform this operation.
* @param[in] dest Destination address. Must be an address on the symmetric
* heap.
* @param[in] source Source address. Must be an address on the symmetric heap.
* @param[in] nelems Size of the transfer in bytes.
* @param[in] pe PE of the remote process.
*
* @return void.
*/
__host__
void
rocshmem_ctx_putmem_nbi
(
rocshmem_ctx_t
ctx
,
void
*
dest
,
const
void
*
source
,
size_t
nelems
,
int
pe
);
__host__
void
rocshmem_putmem_nbi
(
void
*
dest
,
const
void
*
source
,
size_t
nelems
,
int
pe
);
/**
* @name SHMEM_GET_NBI
* @brief Reads contiguous data of \p nelems elements from \p source on \p pe
* to \p dest on the calling PE. The operation is not blocking. The caller will
* return as soon as the request is posted. The caller must call
* rocshmem_quiet() on the same context if completion notification is
* required.
*
* This function can be called from divergent control paths at per-thread
* granularity. However, performance may be improved if the caller can
* coalesce contiguous messages and elect a leader thread to call into the
* ROCSHMEM function.
*
* @param[in] ctx Context with which to perform this operation.
* @param[in] dest Destination address. Must be an address on the symmetric
* heap.
* @param[in] source Source address. Must be an address on the symmetric heap.
* @param[in] nelems Size of the transfer in number of elements.
* @param[in] pe PE of the remote process.
*
* @return void.
*/
__device__
ATTR_NO_INLINE
void
rocshmem_ctx_float_get_nbi
(
rocshmem_ctx_t
ctx
,
float
*
dest
,
const
float
*
source
,
size_t
nelems
,
int
pe
);
__device__
ATTR_NO_INLINE
void
rocshmem_float_get_nbi
(
float
*
dest
,
const
float
*
source
,
size_t
nelems
,
int
pe
);
__host__
void
rocshmem_ctx_float_get_nbi
(
rocshmem_ctx_t
ctx
,
float
*
dest
,
const
float
*
source
,
size_t
nelems
,
int
pe
);
__host__
void
rocshmem_float_get_nbi
(
float
*
dest
,
const
float
*
source
,
size_t
nelems
,
int
pe
);
__device__
ATTR_NO_INLINE
void
rocshmem_ctx_double_get_nbi
(
rocshmem_ctx_t
ctx
,
double
*
dest
,
const
double
*
source
,
size_t
nelems
,
int
pe
);
__device__
ATTR_NO_INLINE
void
rocshmem_double_get_nbi
(
double
*
dest
,
const
double
*
source
,
size_t
nelems
,
int
pe
);
__host__
void
rocshmem_ctx_double_get_nbi
(
rocshmem_ctx_t
ctx
,
double
*
dest
,
const
double
*
source
,
size_t
nelems
,
int
pe
);
__host__
void
rocshmem_double_get_nbi
(
double
*
dest
,
const
double
*
source
,
size_t
nelems
,
int
pe
);
__device__
ATTR_NO_INLINE
void
rocshmem_ctx_char_get_nbi
(
rocshmem_ctx_t
ctx
,
char
*
dest
,
const
char
*
source
,
size_t
nelems
,
int
pe
);
__device__
ATTR_NO_INLINE
void
rocshmem_char_get_nbi
(
char
*
dest
,
const
char
*
source
,
size_t
nelems
,
int
pe
);
__host__
void
rocshmem_ctx_char_get_nbi
(
rocshmem_ctx_t
ctx
,
char
*
dest
,
const
char
*
source
,
size_t
nelems
,
int
pe
);
__host__
void
rocshmem_char_get_nbi
(
char
*
dest
,
const
char
*
source
,
size_t
nelems
,
int
pe
);
__device__
ATTR_NO_INLINE
void
rocshmem_ctx_schar_get_nbi
(
rocshmem_ctx_t
ctx
,
signed
char
*
dest
,
const
signed
char
*
source
,
size_t
nelems
,
int
pe
);
__device__
ATTR_NO_INLINE
void
rocshmem_schar_get_nbi
(
signed
char
*
dest
,
const
signed
char
*
source
,
size_t
nelems
,
int
pe
);
__host__
void
rocshmem_ctx_schar_get_nbi
(
rocshmem_ctx_t
ctx
,
signed
char
*
dest
,
const
signed
char
*
source
,
size_t
nelems
,
int
pe
);
__host__
void
rocshmem_schar_get_nbi
(
signed
char
*
dest
,
const
signed
char
*
source
,
size_t
nelems
,
int
pe
);
__device__
ATTR_NO_INLINE
void
rocshmem_ctx_short_get_nbi
(
rocshmem_ctx_t
ctx
,
short
*
dest
,
const
short
*
source
,
size_t
nelems
,
int
pe
);
__device__
ATTR_NO_INLINE
void
rocshmem_short_get_nbi
(
short
*
dest
,
const
short
*
source
,
size_t
nelems
,
int
pe
);
__host__
void
rocshmem_ctx_short_get_nbi
(
rocshmem_ctx_t
ctx
,
short
*
dest
,
const
short
*
source
,
size_t
nelems
,
int
pe
);
__host__
void
rocshmem_short_get_nbi
(
short
*
dest
,
const
short
*
source
,
size_t
nelems
,
int
pe
);
__device__
ATTR_NO_INLINE
void
rocshmem_ctx_int_get_nbi
(
rocshmem_ctx_t
ctx
,
int
*
dest
,
const
int
*
source
,
size_t
nelems
,
int
pe
);
__device__
ATTR_NO_INLINE
void
rocshmem_int_get_nbi
(
int
*
dest
,
const
int
*
source
,
size_t
nelems
,
int
pe
);
__host__
void
rocshmem_ctx_int_get_nbi
(
rocshmem_ctx_t
ctx
,
int
*
dest
,
const
int
*
source
,
size_t
nelems
,
int
pe
);
__host__
void
rocshmem_int_get_nbi
(
int
*
dest
,
const
int
*
source
,
size_t
nelems
,
int
pe
);
__device__
ATTR_NO_INLINE
void
rocshmem_ctx_long_get_nbi
(
rocshmem_ctx_t
ctx
,
long
*
dest
,
const
long
*
source
,
size_t
nelems
,
int
pe
);
__device__
ATTR_NO_INLINE
void
rocshmem_long_get_nbi
(
long
*
dest
,
const
long
*
source
,
size_t
nelems
,
int
pe
);
__host__
void
rocshmem_ctx_long_get_nbi
(
rocshmem_ctx_t
ctx
,
long
*
dest
,
const
long
*
source
,
size_t
nelems
,
int
pe
);
__host__
void
rocshmem_long_get_nbi
(
long
*
dest
,
const
long
*
source
,
size_t
nelems
,
int
pe
);
__device__
ATTR_NO_INLINE
void
rocshmem_ctx_longlong_get_nbi
(
rocshmem_ctx_t
ctx
,
long
long
*
dest
,
const
long
long
*
source
,
size_t
nelems
,
int
pe
);
__device__
ATTR_NO_INLINE
void
rocshmem_longlong_get_nbi
(
long
long
*
dest
,
const
long
long
*
source
,
size_t
nelems
,
int
pe
);
__host__
void
rocshmem_ctx_longlong_get_nbi
(
rocshmem_ctx_t
ctx
,
long
long
*
dest
,
const
long
long
*
source
,
size_t
nelems
,
int
pe
);
__host__
void
rocshmem_longlong_get_nbi
(
long
long
*
dest
,
const
long
long
*
source
,
size_t
nelems
,
int
pe
);
__device__
ATTR_NO_INLINE
void
rocshmem_ctx_uchar_get_nbi
(
rocshmem_ctx_t
ctx
,
unsigned
char
*
dest
,
const
unsigned
char
*
source
,
size_t
nelems
,
int
pe
);
__device__
ATTR_NO_INLINE
void
rocshmem_uchar_get_nbi
(
unsigned
char
*
dest
,
const
unsigned
char
*
source
,
size_t
nelems
,
int
pe
);
__host__
void
rocshmem_ctx_uchar_get_nbi
(
rocshmem_ctx_t
ctx
,
unsigned
char
*
dest
,
const
unsigned
char
*
source
,
size_t
nelems
,
int
pe
);
__host__
void
rocshmem_uchar_get_nbi
(
unsigned
char
*
dest
,
const
unsigned
char
*
source
,
size_t
nelems
,
int
pe
);
__device__
ATTR_NO_INLINE
void
rocshmem_ctx_ushort_get_nbi
(
rocshmem_ctx_t
ctx
,
unsigned
short
*
dest
,
const
unsigned
short
*
source
,
size_t
nelems
,
int
pe
);
__device__
ATTR_NO_INLINE
void
rocshmem_ushort_get_nbi
(
unsigned
short
*
dest
,
const
unsigned
short
*
source
,
size_t
nelems
,
int
pe
);
__host__
void
rocshmem_ctx_ushort_get_nbi
(
rocshmem_ctx_t
ctx
,
unsigned
short
*
dest
,
const
unsigned
short
*
source
,
size_t
nelems
,
int
pe
);
__host__
void
rocshmem_ushort_get_nbi
(
unsigned
short
*
dest
,
const
unsigned
short
*
source
,
size_t
nelems
,
int
pe
);
__device__
ATTR_NO_INLINE
void
rocshmem_ctx_uint_get_nbi
(
rocshmem_ctx_t
ctx
,
unsigned
int
*
dest
,
const
unsigned
int
*
source
,
size_t
nelems
,
int
pe
);
__device__
ATTR_NO_INLINE
void
rocshmem_uint_get_nbi
(
unsigned
int
*
dest
,
const
unsigned
int
*
source
,
size_t
nelems
,
int
pe
);
__host__
void
rocshmem_ctx_uint_get_nbi
(
rocshmem_ctx_t
ctx
,
unsigned
int
*
dest
,
const
unsigned
int
*
source
,
size_t
nelems
,
int
pe
);
__host__
void
rocshmem_uint_get_nbi
(
unsigned
int
*
dest
,
const
unsigned
int
*
source
,
size_t
nelems
,
int
pe
);
__device__
ATTR_NO_INLINE
void
rocshmem_ctx_ulong_get_nbi
(
rocshmem_ctx_t
ctx
,
unsigned
long
*
dest
,
const
unsigned
long
*
source
,
size_t
nelems
,
int
pe
);
__device__
ATTR_NO_INLINE
void
rocshmem_ulong_get_nbi
(
unsigned
long
*
dest
,
const
unsigned
long
*
source
,
size_t
nelems
,
int
pe
);
__host__
void
rocshmem_ctx_ulong_get_nbi
(
rocshmem_ctx_t
ctx
,
unsigned
long
*
dest
,
const
unsigned
long
*
source
,
size_t
nelems
,
int
pe
);
__host__
void
rocshmem_ulong_get_nbi
(
unsigned
long
*
dest
,
const
unsigned
long
*
source
,
size_t
nelems
,
int
pe
);
__device__
ATTR_NO_INLINE
void
rocshmem_ctx_ulonglong_get_nbi
(
rocshmem_ctx_t
ctx
,
unsigned
long
long
*
dest
,
const
unsigned
long
long
*
source
,
size_t
nelems
,
int
pe
);
__device__
ATTR_NO_INLINE
void
rocshmem_ulonglong_get_nbi
(
unsigned
long
long
*
dest
,
const
unsigned
long
long
*
source
,
size_t
nelems
,
int
pe
);
__host__
void
rocshmem_ctx_ulonglong_get_nbi
(
rocshmem_ctx_t
ctx
,
unsigned
long
long
*
dest
,
const
unsigned
long
long
*
source
,
size_t
nelems
,
int
pe
);
__host__
void
rocshmem_ulonglong_get_nbi
(
unsigned
long
long
*
dest
,
const
unsigned
long
long
*
source
,
size_t
nelems
,
int
pe
);
/**
* @brief Reads contiguous data of \p nelems bytes from \p source on \p pe
* to \p dest on the calling PE. The operation is not blocking. The caller will
* return as soon as the request is posted. The caller must call
* rocshmem_quiet() on the same context if completion notification is
* required.
*
* This function can be called from divergent control paths at per-thread
* granularity. However, performance may be improved if the caller can
* coalesce contiguous messages and elect a leader thread to call into the
* ROCSHMEM function.
*
* @param[in] ctx Context with which to perform this operation.
* @param[in] dest Destination address. Must be an address on the symmetric
* heap.
* @param[in] source Source address. Must be an address on the symmetric heap.
* @param[in] nelems Size of the transfer in bytes.
* @param[in] pe PE of the remote process.
*
* @return void.
*/
__device__
ATTR_NO_INLINE
void
rocshmem_ctx_getmem_nbi
(
rocshmem_ctx_t
ctx
,
void
*
dest
,
const
void
*
source
,
size_t
nelems
,
int
pe
);
__device__
ATTR_NO_INLINE
void
rocshmem_getmem_nbi
(
void
*
dest
,
const
void
*
source
,
size_t
nelems
,
int
pe
);
/**
* @brief Reads contiguous data of \p nelems bytes from \p source on \p pe
* to \p dest on the calling PE. The operation is not blocking. The caller will
* return as soon as the request is posted. The caller must call
* rocshmem_quiet() on the same context if completion notification is
* required.
*
* This function can be called from divergent control paths at per-thread
* granularity. However, performance may be improved if the caller can
* coalesce contiguous messages and elect a leader thread to call into the
* ROCSHMEM function.
*
* @param[in] ctx Context with which to perform this operation.
* @param[in] dest Destination address. Must be an address on the symmetric
* heap.
* @param[in] source Source address. Must be an address on the symmetric heap.
* @param[in] nelems Size of the transfer in bytes.
* @param[in] pe PE of the remote process.
*
* @return void.
*/
__host__
void
rocshmem_ctx_getmem_nbi
(
rocshmem_ctx_t
ctx
,
void
*
dest
,
const
void
*
source
,
size_t
nelems
,
int
pe
);
__host__
void
rocshmem_getmem_nbi
(
void
*
dest
,
const
void
*
source
,
size_t
nelems
,
int
pe
);
}
// namespace rocshmem
#endif // LIBRARY_INCLUDE_ROCSHMEM_RMA_HPP
rocshmem_dir/include/rocshmem/rocshmem_RMA_X.hpp
deleted
100644 → 0
View file @
b33659dd
/******************************************************************************
* Copyright (c) Advanced Micro Devices, Inc. All rights reserved.
*
* SPDX-License-Identifier: MIT
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to
* deal in the Software without restriction, including without limitation the
* rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
* sell copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
* IN THE SOFTWARE.
*****************************************************************************/
#ifndef LIBRARY_INCLUDE_ROCSHMEM_RMA_X_HPP
#define LIBRARY_INCLUDE_ROCSHMEM_RMA_X_HPP
namespace
rocshmem
{
/**
* @brief Writes contiguous data of \p nelems elements from \p source on the
* calling PE to \p dest at \p pe. The caller will block until the operation
* completes locally (it is safe to reuse \p source). The caller must
* call into rocshmem_quiet() if remote completion is required.
*
* This function can be called from divergent control paths at per-wave
* granularity. However, all threads in a wave must collectively participate
* in the call using the same arguments.
*
* @param[in] ctx Context with which to perform this operation.
* @param[in] dest Destination address. Must be an address on the symmetric
* heap.
* @param[in] source Source address. Must be an address on the symmetric heap.
* @param[in] nelems Size of the transfer in number of elements.
* @param[in] pe PE of the remote process.
*
* @return void.
*/
__device__
ATTR_NO_INLINE
void
rocshmem_ctx_float_put_wave
(
rocshmem_ctx_t
ctx
,
float
*
dest
,
const
float
*
source
,
size_t
nelems
,
int
pe
);
__device__
ATTR_NO_INLINE
void
rocshmem_float_put_wave
(
float
*
dest
,
const
float
*
source
,
size_t
nelems
,
int
pe
);
__device__
ATTR_NO_INLINE
void
rocshmem_ctx_double_put_wave
(
rocshmem_ctx_t
ctx
,
double
*
dest
,
const
double
*
source
,
size_t
nelems
,
int
pe
);
__device__
ATTR_NO_INLINE
void
rocshmem_double_put_wave
(
double
*
dest
,
const
double
*
source
,
size_t
nelems
,
int
pe
);
__device__
ATTR_NO_INLINE
void
rocshmem_ctx_char_put_wave
(
rocshmem_ctx_t
ctx
,
char
*
dest
,
const
char
*
source
,
size_t
nelems
,
int
pe
);
__device__
ATTR_NO_INLINE
void
rocshmem_char_put_wave
(
char
*
dest
,
const
char
*
source
,
size_t
nelems
,
int
pe
);
__device__
ATTR_NO_INLINE
void
rocshmem_ctx_schar_put_wave
(
rocshmem_ctx_t
ctx
,
signed
char
*
dest
,
const
signed
char
*
source
,
size_t
nelems
,
int
pe
);
__device__
ATTR_NO_INLINE
void
rocshmem_schar_put_wave
(
signed
char
*
dest
,
const
signed
char
*
source
,
size_t
nelems
,
int
pe
);
__device__
ATTR_NO_INLINE
void
rocshmem_ctx_short_put_wave
(
rocshmem_ctx_t
ctx
,
short
*
dest
,
const
short
*
source
,
size_t
nelems
,
int
pe
);
__device__
ATTR_NO_INLINE
void
rocshmem_short_put_wave
(
short
*
dest
,
const
short
*
source
,
size_t
nelems
,
int
pe
);
__device__
ATTR_NO_INLINE
void
rocshmem_ctx_int_put_wave
(
rocshmem_ctx_t
ctx
,
int
*
dest
,
const
int
*
source
,
size_t
nelems
,
int
pe
);
__device__
ATTR_NO_INLINE
void
rocshmem_int_put_wave
(
int
*
dest
,
const
int
*
source
,
size_t
nelems
,
int
pe
);
__device__
ATTR_NO_INLINE
void
rocshmem_ctx_long_put_wave
(
rocshmem_ctx_t
ctx
,
long
*
dest
,
const
long
*
source
,
size_t
nelems
,
int
pe
);
__device__
ATTR_NO_INLINE
void
rocshmem_long_put_wave
(
long
*
dest
,
const
long
*
source
,
size_t
nelems
,
int
pe
);
__device__
ATTR_NO_INLINE
void
rocshmem_ctx_longlong_put_wave
(
rocshmem_ctx_t
ctx
,
long
long
*
dest
,
const
long
long
*
source
,
size_t
nelems
,
int
pe
);
__device__
ATTR_NO_INLINE
void
rocshmem_longlong_put_wave
(
long
long
*
dest
,
const
long
long
*
source
,
size_t
nelems
,
int
pe
);
__device__
ATTR_NO_INLINE
void
rocshmem_ctx_uchar_put_wave
(
rocshmem_ctx_t
ctx
,
unsigned
char
*
dest
,
const
unsigned
char
*
source
,
size_t
nelems
,
int
pe
);
__device__
ATTR_NO_INLINE
void
rocshmem_uchar_put_wave
(
unsigned
char
*
dest
,
const
unsigned
char
*
source
,
size_t
nelems
,
int
pe
);
__device__
ATTR_NO_INLINE
void
rocshmem_ctx_ushort_put_wave
(
rocshmem_ctx_t
ctx
,
unsigned
short
*
dest
,
const
unsigned
short
*
source
,
size_t
nelems
,
int
pe
);
__device__
ATTR_NO_INLINE
void
rocshmem_ushort_put_wave
(
unsigned
short
*
dest
,
const
unsigned
short
*
source
,
size_t
nelems
,
int
pe
);
__device__
ATTR_NO_INLINE
void
rocshmem_ctx_uint_put_wave
(
rocshmem_ctx_t
ctx
,
unsigned
int
*
dest
,
const
unsigned
int
*
source
,
size_t
nelems
,
int
pe
);
__device__
ATTR_NO_INLINE
void
rocshmem_uint_put_wave
(
unsigned
int
*
dest
,
const
unsigned
int
*
source
,
size_t
nelems
,
int
pe
);
__device__
ATTR_NO_INLINE
void
rocshmem_ctx_ulong_put_wave
(
rocshmem_ctx_t
ctx
,
unsigned
long
*
dest
,
const
unsigned
long
*
source
,
size_t
nelems
,
int
pe
);
__device__
ATTR_NO_INLINE
void
rocshmem_ulong_put_wave
(
unsigned
long
*
dest
,
const
unsigned
long
*
source
,
size_t
nelems
,
int
pe
);
__device__
ATTR_NO_INLINE
void
rocshmem_ctx_ulonglong_put_wave
(
rocshmem_ctx_t
ctx
,
unsigned
long
long
*
dest
,
const
unsigned
long
long
*
source
,
size_t
nelems
,
int
pe
);
__device__
ATTR_NO_INLINE
void
rocshmem_ulonglong_put_wave
(
unsigned
long
long
*
dest
,
const
unsigned
long
long
*
source
,
size_t
nelems
,
int
pe
);
/**
* @brief Writes contiguous data of \p nelems elements from \p source on the
* calling PE to \p dest at \p pe. The caller will block until the operation
* completes locally (it is safe to reuse \p source). The caller must
* call into rocshmem_quiet() if remote completion is required.
*
* This function can be called from divergent control paths at per-workgroup
* (WG) granularity. However, all threads in a WG must collectively participate
* in the call using the same arguments.
*
* @param[in] ctx Context with which to perform this operation.
* @param[in] dest Destination address. Must be an address on the symmetric
* heap.
* @param[in] source Source address. Must be an address on the symmetric heap.
* @param[in] nelems Size of the transfer in number of elements.
* @param[in] pe PE of the remote process.
*
* @return void.
*/
__device__
ATTR_NO_INLINE
void
rocshmem_ctx_float_put_wg
(
rocshmem_ctx_t
ctx
,
float
*
dest
,
const
float
*
source
,
size_t
nelems
,
int
pe
);
__device__
ATTR_NO_INLINE
void
rocshmem_float_put_wg
(
float
*
dest
,
const
float
*
source
,
size_t
nelems
,
int
pe
);
__device__
ATTR_NO_INLINE
void
rocshmem_ctx_double_put_wg
(
rocshmem_ctx_t
ctx
,
double
*
dest
,
const
double
*
source
,
size_t
nelems
,
int
pe
);
__device__
ATTR_NO_INLINE
void
rocshmem_double_put_wg
(
double
*
dest
,
const
double
*
source
,
size_t
nelems
,
int
pe
);
__device__
ATTR_NO_INLINE
void
rocshmem_ctx_char_put_wg
(
rocshmem_ctx_t
ctx
,
char
*
dest
,
const
char
*
source
,
size_t
nelems
,
int
pe
);
__device__
ATTR_NO_INLINE
void
rocshmem_char_put_wg
(
char
*
dest
,
const
char
*
source
,
size_t
nelems
,
int
pe
);
__device__
ATTR_NO_INLINE
void
rocshmem_ctx_schar_put_wg
(
rocshmem_ctx_t
ctx
,
signed
char
*
dest
,
const
signed
char
*
source
,
size_t
nelems
,
int
pe
);
__device__
ATTR_NO_INLINE
void
rocshmem_schar_put_wg
(
signed
char
*
dest
,
const
signed
char
*
source
,
size_t
nelems
,
int
pe
);
__device__
ATTR_NO_INLINE
void
rocshmem_ctx_short_put_wg
(
rocshmem_ctx_t
ctx
,
short
*
dest
,
const
short
*
source
,
size_t
nelems
,
int
pe
);
__device__
ATTR_NO_INLINE
void
rocshmem_short_put_wg
(
short
*
dest
,
const
short
*
source
,
size_t
nelems
,
int
pe
);
__device__
ATTR_NO_INLINE
void
rocshmem_ctx_int_put_wg
(
rocshmem_ctx_t
ctx
,
int
*
dest
,
const
int
*
source
,
size_t
nelems
,
int
pe
);
__device__
ATTR_NO_INLINE
void
rocshmem_int_put_wg
(
int
*
dest
,
const
int
*
source
,
size_t
nelems
,
int
pe
);
__device__
ATTR_NO_INLINE
void
rocshmem_ctx_long_put_wg
(
rocshmem_ctx_t
ctx
,
long
*
dest
,
const
long
*
source
,
size_t
nelems
,
int
pe
);
__device__
ATTR_NO_INLINE
void
rocshmem_long_put_wg
(
long
*
dest
,
const
long
*
source
,
size_t
nelems
,
int
pe
);
__device__
ATTR_NO_INLINE
void
rocshmem_ctx_longlong_put_wg
(
rocshmem_ctx_t
ctx
,
long
long
*
dest
,
const
long
long
*
source
,
size_t
nelems
,
int
pe
);
__device__
ATTR_NO_INLINE
void
rocshmem_longlong_put_wg
(
long
long
*
dest
,
const
long
long
*
source
,
size_t
nelems
,
int
pe
);
__device__
ATTR_NO_INLINE
void
rocshmem_ctx_uchar_put_wg
(
rocshmem_ctx_t
ctx
,
unsigned
char
*
dest
,
const
unsigned
char
*
source
,
size_t
nelems
,
int
pe
);
__device__
ATTR_NO_INLINE
void
rocshmem_uchar_put_wg
(
unsigned
char
*
dest
,
const
unsigned
char
*
source
,
size_t
nelems
,
int
pe
);
__device__
ATTR_NO_INLINE
void
rocshmem_ctx_ushort_put_wg
(
rocshmem_ctx_t
ctx
,
unsigned
short
*
dest
,
const
unsigned
short
*
source
,
size_t
nelems
,
int
pe
);
__device__
ATTR_NO_INLINE
void
rocshmem_ushort_put_wg
(
unsigned
short
*
dest
,
const
unsigned
short
*
source
,
size_t
nelems
,
int
pe
);
__device__
ATTR_NO_INLINE
void
rocshmem_ctx_uint_put_wg
(
rocshmem_ctx_t
ctx
,
unsigned
int
*
dest
,
const
unsigned
int
*
source
,
size_t
nelems
,
int
pe
);
__device__
ATTR_NO_INLINE
void
rocshmem_uint_put_wg
(
unsigned
int
*
dest
,
const
unsigned
int
*
source
,
size_t
nelems
,
int
pe
);
__device__
ATTR_NO_INLINE
void
rocshmem_ctx_ulong_put_wg
(
rocshmem_ctx_t
ctx
,
unsigned
long
*
dest
,
const
unsigned
long
*
source
,
size_t
nelems
,
int
pe
);
__device__
ATTR_NO_INLINE
void
rocshmem_ulong_put_wg
(
unsigned
long
*
dest
,
const
unsigned
long
*
source
,
size_t
nelems
,
int
pe
);
__device__
ATTR_NO_INLINE
void
rocshmem_ctx_ulonglong_put_wg
(
rocshmem_ctx_t
ctx
,
unsigned
long
long
*
dest
,
const
unsigned
long
long
*
source
,
size_t
nelems
,
int
pe
);
__device__
ATTR_NO_INLINE
void
rocshmem_ulonglong_put_wg
(
unsigned
long
long
*
dest
,
const
unsigned
long
long
*
source
,
size_t
nelems
,
int
pe
);
/**
* @brief Writes contiguous data of \p nelems bytes from \p source on the
* calling PE to \p dest at \p pe. The caller will block until the operation
* completes locally (it is safe to reuse \p source). The caller must
* call into rocshmem_quiet() if remote completion is required.
*
* This function can be called from divergent control paths at per-wave
* granularity. However, all threads in a wave must participate in the
* call using the same parameters.
*
* @param[in] ctx Context with which to perform this operation.
* @param[in] dest Destination address. Must be an address on the symmetric
* heap.
* @param[in] source Source address. Must be an address on the symmetric heap.
* @param[in] nelems Size of the transfer in bytes.
* @param[in] pe PE of the remote process.
*
* @return void.
*/
__device__
ATTR_NO_INLINE
void
rocshmem_ctx_putmem_wave
(
rocshmem_ctx_t
ctx
,
void
*
dest
,
const
void
*
source
,
size_t
nelems
,
int
pe
);
__device__
ATTR_NO_INLINE
void
rocshmem_putmem_wave
(
void
*
dest
,
const
void
*
source
,
size_t
nelems
,
int
pe
);
/**
* @brief Writes contiguous data of \p nelems bytes from \p source on the
* calling PE to \p dest at \p pe. The caller will block until the operation
* completes locally (it is safe to reuse \p source). The caller must
* call into rocshmem_quiet() if remote completion is required.
*
* This function can be called from divergent control paths at per-workgroup
* (WG) granularity. However, all threads in the workgroup must participate in
* the call using the same parameters.
*
* @param[in] ctx Context with which to perform this operation.
* @param[in] dest Destination address. Must be an address on the symmetric
* heap.
* @param[in] source Source address. Must be an address on the symmetric heap.
* @param[in] nelems Size of the transfer in bytes.
* @param[in] pe PE of the remote process.
*
* @return void.
*/
__device__
ATTR_NO_INLINE
void
rocshmem_ctx_putmem_wg
(
rocshmem_ctx_t
ctx
,
void
*
dest
,
const
void
*
source
,
size_t
nelems
,
int
pe
);
__device__
ATTR_NO_INLINE
void
rocshmem_putmem_wg
(
void
*
dest
,
const
void
*
source
,
size_t
nelems
,
int
pe
);
/**
* @brief Reads contiguous data of \p nelems elements from \p source on \p pe
* to \p dest on the calling PE. The calling wave will block until the
* operation completes (data has been placed in \p dest).
*
* This function can be called from divergent control paths at per-wave
* granularity. However, all threads in the wave must participate in the
* call using the same parameters.
*
* @param[in] ctx Context with which to perform this operation.
* @param[in] dest Destination address. Must be an address on the symmetric
* heap.
* @param[in] source Source address. Must be an address on the symmetric heap.
* @param[in] nelems Size of the transfer in number of elements.
* @param[in] pe PE of the remote process.
*
* @return void.
*/
__device__
ATTR_NO_INLINE
void
rocshmem_ctx_float_get_wave
(
rocshmem_ctx_t
ctx
,
float
*
dest
,
const
float
*
source
,
size_t
nelems
,
int
pe
);
__device__
ATTR_NO_INLINE
void
rocshmem_float_get_wave
(
float
*
dest
,
const
float
*
source
,
size_t
nelems
,
int
pe
);
__device__
ATTR_NO_INLINE
void
rocshmem_ctx_double_get_wave
(
rocshmem_ctx_t
ctx
,
double
*
dest
,
const
double
*
source
,
size_t
nelems
,
int
pe
);
__device__
ATTR_NO_INLINE
void
rocshmem_double_get_wave
(
double
*
dest
,
const
double
*
source
,
size_t
nelems
,
int
pe
);
__device__
ATTR_NO_INLINE
void
rocshmem_ctx_char_get_wave
(
rocshmem_ctx_t
ctx
,
char
*
dest
,
const
char
*
source
,
size_t
nelems
,
int
pe
);
__device__
ATTR_NO_INLINE
void
rocshmem_char_get_wave
(
char
*
dest
,
const
char
*
source
,
size_t
nelems
,
int
pe
);
__device__
ATTR_NO_INLINE
void
rocshmem_ctx_schar_get_wave
(
rocshmem_ctx_t
ctx
,
signed
char
*
dest
,
const
signed
char
*
source
,
size_t
nelems
,
int
pe
);
__device__
ATTR_NO_INLINE
void
rocshmem_schar_get_wave
(
signed
char
*
dest
,
const
signed
char
*
source
,
size_t
nelems
,
int
pe
);
__device__
ATTR_NO_INLINE
void
rocshmem_ctx_short_get_wave
(
rocshmem_ctx_t
ctx
,
short
*
dest
,
const
short
*
source
,
size_t
nelems
,
int
pe
);
__device__
ATTR_NO_INLINE
void
rocshmem_short_get_wave
(
short
*
dest
,
const
short
*
source
,
size_t
nelems
,
int
pe
);
__device__
ATTR_NO_INLINE
void
rocshmem_ctx_int_get_wave
(
rocshmem_ctx_t
ctx
,
int
*
dest
,
const
int
*
source
,
size_t
nelems
,
int
pe
);
__device__
ATTR_NO_INLINE
void
rocshmem_int_get_wave
(
int
*
dest
,
const
int
*
source
,
size_t
nelems
,
int
pe
);
__device__
ATTR_NO_INLINE
void
rocshmem_ctx_long_get_wave
(
rocshmem_ctx_t
ctx
,
long
*
dest
,
const
long
*
source
,
size_t
nelems
,
int
pe
);
__device__
ATTR_NO_INLINE
void
rocshmem_long_get_wave
(
long
*
dest
,
const
long
*
source
,
size_t
nelems
,
int
pe
);
__device__
ATTR_NO_INLINE
void
rocshmem_ctx_longlong_get_wave
(
rocshmem_ctx_t
ctx
,
long
long
*
dest
,
const
long
long
*
source
,
size_t
nelems
,
int
pe
);
__device__
ATTR_NO_INLINE
void
rocshmem_longlong_get_wave
(
long
long
*
dest
,
const
long
long
*
source
,
size_t
nelems
,
int
pe
);
__device__
ATTR_NO_INLINE
void
rocshmem_ctx_uchar_get_wave
(
rocshmem_ctx_t
ctx
,
unsigned
char
*
dest
,
const
unsigned
char
*
source
,
size_t
nelems
,
int
pe
);
__device__
ATTR_NO_INLINE
void
rocshmem_uchar_get_wave
(
unsigned
char
*
dest
,
const
unsigned
char
*
source
,
size_t
nelems
,
int
pe
);
__device__
ATTR_NO_INLINE
void
rocshmem_ctx_ushort_get_wave
(
rocshmem_ctx_t
ctx
,
unsigned
short
*
dest
,
const
unsigned
short
*
source
,
size_t
nelems
,
int
pe
);
__device__
ATTR_NO_INLINE
void
rocshmem_ushort_get_wave
(
unsigned
short
*
dest
,
const
unsigned
short
*
source
,
size_t
nelems
,
int
pe
);
__device__
ATTR_NO_INLINE
void
rocshmem_ctx_uint_get_wave
(
rocshmem_ctx_t
ctx
,
unsigned
int
*
dest
,
const
unsigned
int
*
source
,
size_t
nelems
,
int
pe
);
__device__
ATTR_NO_INLINE
void
rocshmem_uint_get_wave
(
unsigned
int
*
dest
,
const
unsigned
int
*
source
,
size_t
nelems
,
int
pe
);
__device__
ATTR_NO_INLINE
void
rocshmem_ctx_ulong_get_wave
(
rocshmem_ctx_t
ctx
,
unsigned
long
*
dest
,
const
unsigned
long
*
source
,
size_t
nelems
,
int
pe
);
__device__
ATTR_NO_INLINE
void
rocshmem_ulong_get_wave
(
unsigned
long
*
dest
,
const
unsigned
long
*
source
,
size_t
nelems
,
int
pe
);
__device__
ATTR_NO_INLINE
void
rocshmem_ctx_ulonglong_get_wave
(
rocshmem_ctx_t
ctx
,
unsigned
long
long
*
dest
,
const
unsigned
long
long
*
source
,
size_t
nelems
,
int
pe
);
__device__
ATTR_NO_INLINE
void
rocshmem_ulonglong_get_wave
(
unsigned
long
long
*
dest
,
const
unsigned
long
long
*
source
,
size_t
nelems
,
int
pe
);
/**
* @brief Reads contiguous data of \p nelems elements from \p source on \p pe
* to \p dest on the calling PE. The calling work-group will block until the
* operation completes (data has been placed in \p dest).
*
* This function can be called from divergent control paths at per-workgroup
* granularity. However, all threads in the workgroup must participate in
* the call using the same parameters.
*
* @param[in] ctx Context with which to perform this operation.
* @param[in] dest Destination address. Must be an address on the symmetric
* heap.
* @param[in] source Source address. Must be an address on the symmetric heap.
* @param[in] nelems Size of the transfer in number of elements.
* @param[in] pe PE of the remote process.
*
* @return void.
*/
__device__
ATTR_NO_INLINE
void
rocshmem_ctx_float_get_wg
(
rocshmem_ctx_t
ctx
,
float
*
dest
,
const
float
*
source
,
size_t
nelems
,
int
pe
);
__device__
ATTR_NO_INLINE
void
rocshmem_float_get_wg
(
float
*
dest
,
const
float
*
source
,
size_t
nelems
,
int
pe
);
__device__
ATTR_NO_INLINE
void
rocshmem_ctx_double_get_wg
(
rocshmem_ctx_t
ctx
,
double
*
dest
,
const
double
*
source
,
size_t
nelems
,
int
pe
);
__device__
ATTR_NO_INLINE
void
rocshmem_double_get_wg
(
double
*
dest
,
const
double
*
source
,
size_t
nelems
,
int
pe
);
__device__
ATTR_NO_INLINE
void
rocshmem_ctx_char_get_wg
(
rocshmem_ctx_t
ctx
,
char
*
dest
,
const
char
*
source
,
size_t
nelems
,
int
pe
);
__device__
ATTR_NO_INLINE
void
rocshmem_char_get_wg
(
char
*
dest
,
const
char
*
source
,
size_t
nelems
,
int
pe
);
__device__
ATTR_NO_INLINE
void
rocshmem_ctx_schar_get_wg
(
rocshmem_ctx_t
ctx
,
signed
char
*
dest
,
const
signed
char
*
source
,
size_t
nelems
,
int
pe
);
__device__
ATTR_NO_INLINE
void
rocshmem_schar_get_wg
(
signed
char
*
dest
,
const
signed
char
*
source
,
size_t
nelems
,
int
pe
);
__device__
ATTR_NO_INLINE
void
rocshmem_ctx_short_get_wg
(
rocshmem_ctx_t
ctx
,
short
*
dest
,
const
short
*
source
,
size_t
nelems
,
int
pe
);
__device__
ATTR_NO_INLINE
void
rocshmem_short_get_wg
(
short
*
dest
,
const
short
*
source
,
size_t
nelems
,
int
pe
);
__device__
ATTR_NO_INLINE
void
rocshmem_ctx_int_get_wg
(
rocshmem_ctx_t
ctx
,
int
*
dest
,
const
int
*
source
,
size_t
nelems
,
int
pe
);
__device__
ATTR_NO_INLINE
void
rocshmem_int_get_wg
(
int
*
dest
,
const
int
*
source
,
size_t
nelems
,
int
pe
);
__device__
ATTR_NO_INLINE
void
rocshmem_ctx_long_get_wg
(
rocshmem_ctx_t
ctx
,
long
*
dest
,
const
long
*
source
,
size_t
nelems
,
int
pe
);
__device__
ATTR_NO_INLINE
void
rocshmem_long_get_wg
(
long
*
dest
,
const
long
*
source
,
size_t
nelems
,
int
pe
);
__device__
ATTR_NO_INLINE
void
rocshmem_ctx_longlong_get_wg
(
rocshmem_ctx_t
ctx
,
long
long
*
dest
,
const
long
long
*
source
,
size_t
nelems
,
int
pe
);
__device__
ATTR_NO_INLINE
void
rocshmem_longlong_get_wg
(
long
long
*
dest
,
const
long
long
*
source
,
size_t
nelems
,
int
pe
);
__device__
ATTR_NO_INLINE
void
rocshmem_ctx_uchar_get_wg
(
rocshmem_ctx_t
ctx
,
unsigned
char
*
dest
,
const
unsigned
char
*
source
,
size_t
nelems
,
int
pe
);
__device__
ATTR_NO_INLINE
void
rocshmem_uchar_get_wg
(
unsigned
char
*
dest
,
const
unsigned
char
*
source
,
size_t
nelems
,
int
pe
);
__device__
ATTR_NO_INLINE
void
rocshmem_ctx_ushort_get_wg
(
rocshmem_ctx_t
ctx
,
unsigned
short
*
dest
,
const
unsigned
short
*
source
,
size_t
nelems
,
int
pe
);
__device__
ATTR_NO_INLINE
void
rocshmem_ushort_get_wg
(
unsigned
short
*
dest
,
const
unsigned
short
*
source
,
size_t
nelems
,
int
pe
);
__device__
ATTR_NO_INLINE
void
rocshmem_ctx_uint_get_wg
(
rocshmem_ctx_t
ctx
,
unsigned
int
*
dest
,
const
unsigned
int
*
source
,
size_t
nelems
,
int
pe
);
__device__
ATTR_NO_INLINE
void
rocshmem_uint_get_wg
(
unsigned
int
*
dest
,
const
unsigned
int
*
source
,
size_t
nelems
,
int
pe
);
__device__
ATTR_NO_INLINE
void
rocshmem_ctx_ulong_get_wg
(
rocshmem_ctx_t
ctx
,
unsigned
long
*
dest
,
const
unsigned
long
*
source
,
size_t
nelems
,
int
pe
);
__device__
ATTR_NO_INLINE
void
rocshmem_ulong_get_wg
(
unsigned
long
*
dest
,
const
unsigned
long
*
source
,
size_t
nelems
,
int
pe
);
__device__
ATTR_NO_INLINE
void
rocshmem_ctx_ulonglong_get_wg
(
rocshmem_ctx_t
ctx
,
unsigned
long
long
*
dest
,
const
unsigned
long
long
*
source
,
size_t
nelems
,
int
pe
);
__device__
ATTR_NO_INLINE
void
rocshmem_ulonglong_get_wg
(
unsigned
long
long
*
dest
,
const
unsigned
long
long
*
source
,
size_t
nelems
,
int
pe
);
/**
* @brief Reads contiguous data of \p nelems bytes from \p source on \p pe
* to \p dest on the calling PE. The calling wave will block until the
* operation completes (data has been placed in \p dest).
*
* This function can be called from divergent control paths at per-wave
* granularity. However, all threads in the wave must participate in the
* call using the same parameters.
*
* @param[in] ctx Context with which to perform this operation.
* @param[in] dest Destination address. Must be an address on the symmetric
* heap.
* @param[in] source Source address. Must be an address on the symmetric heap.
* @param[in] nelems Size of the transfer in bytes.
* @param[in] pe PE of the remote process.
*
* @return void.
*/
__device__
ATTR_NO_INLINE
void
rocshmem_ctx_getmem_wave
(
rocshmem_ctx_t
ctx
,
void
*
dest
,
const
void
*
source
,
size_t
nelems
,
int
pe
);
__device__
ATTR_NO_INLINE
void
rocshmem_getmem_wave
(
void
*
dest
,
const
void
*
source
,
size_t
nelems
,
int
pe
);
/**
* @brief Reads contiguous data of \p nelems bytes from \p source on \p pe
* to \p dest on the calling PE. The calling work-group will block until the
* operation completes (data has been placed in \p dest).
*
* This function can be called from divergent control paths at per-workgroup
* (WG) granularity. However, all threads in the workgroup must participate
* in the call using the same parameters.
*
* @param[in] ctx Context with which to perform this operation.
* @param[in] dest Destination address. Must be an address on the symmetric
* heap.
* @param[in] source Source address. Must be an address on the symmetric heap.
* @param[in] nelems Size of the transfer in bytes.
* @param[in] pe PE of the remote process.
*
* @return void.
*/
__device__
ATTR_NO_INLINE
void
rocshmem_ctx_getmem_wg
(
rocshmem_ctx_t
ctx
,
void
*
dest
,
const
void
*
source
,
size_t
nelems
,
int
pe
);
__device__
ATTR_NO_INLINE
void
rocshmem_getmem_wg
(
void
*
dest
,
const
void
*
source
,
size_t
nelems
,
int
pe
);
/**
* @brief Writes contiguous data of \p nelems elements from \p source on the
* calling PE to \p dest on \p pe. The operation is not blocking. The caller
* will return as soon as the request is posted. The caller must call
* rocshmem_quiet() on the same context if completion notification is
* required.
*
* This function can be called from divergent control paths at per-wave
* granularity. However, all threads in the wave must call in with the same
* arguments.
*
* @param[in] ctx Context with which to perform this operation.
* @param[in] dest Destination address. Must be an address on the symmetric
* heap.
* @param[in] source Source address. Must be an address on the symmetric heap.
* @param[in] nelems Size of the transfer in number of elements.
* @param[in] pe PE of the remote process.
*
* @return void.
*/
__device__
ATTR_NO_INLINE
void
rocshmem_ctx_float_put_nbi_wave
(
rocshmem_ctx_t
ctx
,
float
*
dest
,
const
float
*
source
,
size_t
nelems
,
int
pe
);
__device__
ATTR_NO_INLINE
void
rocshmem_float_put_nbi_wave
(
float
*
dest
,
const
float
*
source
,
size_t
nelems
,
int
pe
);
__device__
ATTR_NO_INLINE
void
rocshmem_ctx_double_put_nbi_wave
(
rocshmem_ctx_t
ctx
,
double
*
dest
,
const
double
*
source
,
size_t
nelems
,
int
pe
);
__device__
ATTR_NO_INLINE
void
rocshmem_double_put_nbi_wave
(
double
*
dest
,
const
double
*
source
,
size_t
nelems
,
int
pe
);
__device__
ATTR_NO_INLINE
void
rocshmem_ctx_char_put_nbi_wave
(
rocshmem_ctx_t
ctx
,
char
*
dest
,
const
char
*
source
,
size_t
nelems
,
int
pe
);
__device__
ATTR_NO_INLINE
void
rocshmem_char_put_nbi_wave
(
char
*
dest
,
const
char
*
source
,
size_t
nelems
,
int
pe
);
__device__
ATTR_NO_INLINE
void
rocshmem_ctx_schar_put_nbi_wave
(
rocshmem_ctx_t
ctx
,
signed
char
*
dest
,
const
signed
char
*
source
,
size_t
nelems
,
int
pe
);
__device__
ATTR_NO_INLINE
void
rocshmem_schar_put_nbi_wave
(
signed
char
*
dest
,
const
signed
char
*
source
,
size_t
nelems
,
int
pe
);
__device__
ATTR_NO_INLINE
void
rocshmem_ctx_short_put_nbi_wave
(
rocshmem_ctx_t
ctx
,
short
*
dest
,
const
short
*
source
,
size_t
nelems
,
int
pe
);
__device__
ATTR_NO_INLINE
void
rocshmem_short_put_nbi_wave
(
short
*
dest
,
const
short
*
source
,
size_t
nelems
,
int
pe
);
__device__
ATTR_NO_INLINE
void
rocshmem_ctx_int_put_nbi_wave
(
rocshmem_ctx_t
ctx
,
int
*
dest
,
const
int
*
source
,
size_t
nelems
,
int
pe
);
__device__
ATTR_NO_INLINE
void
rocshmem_int_put_nbi_wave
(
int
*
dest
,
const
int
*
source
,
size_t
nelems
,
int
pe
);
__device__
ATTR_NO_INLINE
void
rocshmem_ctx_long_put_nbi_wave
(
rocshmem_ctx_t
ctx
,
long
*
dest
,
const
long
*
source
,
size_t
nelems
,
int
pe
);
__device__
ATTR_NO_INLINE
void
rocshmem_long_put_nbi_wave
(
long
*
dest
,
const
long
*
source
,
size_t
nelems
,
int
pe
);
__device__
ATTR_NO_INLINE
void
rocshmem_ctx_longlong_put_nbi_wave
(
rocshmem_ctx_t
ctx
,
long
long
*
dest
,
const
long
long
*
source
,
size_t
nelems
,
int
pe
);
__device__
ATTR_NO_INLINE
void
rocshmem_longlong_put_nbi_wave
(
long
long
*
dest
,
const
long
long
*
source
,
size_t
nelems
,
int
pe
);
__device__
ATTR_NO_INLINE
void
rocshmem_ctx_uchar_put_nbi_wave
(
rocshmem_ctx_t
ctx
,
unsigned
char
*
dest
,
const
unsigned
char
*
source
,
size_t
nelems
,
int
pe
);
__device__
ATTR_NO_INLINE
void
rocshmem_uchar_put_nbi_wave
(
unsigned
char
*
dest
,
const
unsigned
char
*
source
,
size_t
nelems
,
int
pe
);
__device__
ATTR_NO_INLINE
void
rocshmem_ctx_ushort_put_nbi_wave
(
rocshmem_ctx_t
ctx
,
unsigned
short
*
dest
,
const
unsigned
short
*
source
,
size_t
nelems
,
int
pe
);
__device__
ATTR_NO_INLINE
void
rocshmem_ushort_put_nbi_wave
(
unsigned
short
*
dest
,
const
unsigned
short
*
source
,
size_t
nelems
,
int
pe
);
__device__
ATTR_NO_INLINE
void
rocshmem_ctx_uint_put_nbi_wave
(
rocshmem_ctx_t
ctx
,
unsigned
int
*
dest
,
const
unsigned
int
*
source
,
size_t
nelems
,
int
pe
);
__device__
ATTR_NO_INLINE
void
rocshmem_uint_put_nbi_wave
(
unsigned
int
*
dest
,
const
unsigned
int
*
source
,
size_t
nelems
,
int
pe
);
__device__
ATTR_NO_INLINE
void
rocshmem_ctx_ulong_put_nbi_wave
(
rocshmem_ctx_t
ctx
,
unsigned
long
*
dest
,
const
unsigned
long
*
source
,
size_t
nelems
,
int
pe
);
__device__
ATTR_NO_INLINE
void
rocshmem_ulong_put_nbi_wave
(
unsigned
long
*
dest
,
const
unsigned
long
*
source
,
size_t
nelems
,
int
pe
);
__device__
ATTR_NO_INLINE
void
rocshmem_ctx_ulonglong_put_nbi_wave
(
rocshmem_ctx_t
ctx
,
unsigned
long
long
*
dest
,
const
unsigned
long
long
*
source
,
size_t
nelems
,
int
pe
);
__device__
ATTR_NO_INLINE
void
rocshmem_ulonglong_put_nbi_wave
(
unsigned
long
long
*
dest
,
const
unsigned
long
long
*
source
,
size_t
nelems
,
int
pe
);
/**
* @brief Writes contiguous data of \p nelems elements from \p source on the
* calling PE to \p dest on \p pe. The operation is not blocking. The caller
* will return as soon as the request is posted. The caller must call
* rocshmem_quiet() on the same context if completion notification is
* required.
*
* This function can be called from divergent control paths at per-workgroup
* granularity. However, all threads in the WG must call in with the same
* arguments.
*
* @param[in] ctx Context with which to perform this operation.
* @param[in] dest Destination address. Must be an address on the symmetric
* heap.
* @param[in] source Source address. Must be an address on the symmetric heap.
* @param[in] nelems Size of the transfer in number of elements.
* @param[in] pe PE of the remote process.
*
* @return void.
*/
__device__
ATTR_NO_INLINE
void
rocshmem_ctx_float_put_nbi_wg
(
rocshmem_ctx_t
ctx
,
float
*
dest
,
const
float
*
source
,
size_t
nelems
,
int
pe
);
__device__
ATTR_NO_INLINE
void
rocshmem_float_put_nbi_wg
(
float
*
dest
,
const
float
*
source
,
size_t
nelems
,
int
pe
);
__device__
ATTR_NO_INLINE
void
rocshmem_ctx_double_put_nbi_wg
(
rocshmem_ctx_t
ctx
,
double
*
dest
,
const
double
*
source
,
size_t
nelems
,
int
pe
);
__device__
ATTR_NO_INLINE
void
rocshmem_double_put_nbi_wg
(
double
*
dest
,
const
double
*
source
,
size_t
nelems
,
int
pe
);
__device__
ATTR_NO_INLINE
void
rocshmem_ctx_char_put_nbi_wg
(
rocshmem_ctx_t
ctx
,
char
*
dest
,
const
char
*
source
,
size_t
nelems
,
int
pe
);
__device__
ATTR_NO_INLINE
void
rocshmem_char_put_nbi_wg
(
char
*
dest
,
const
char
*
source
,
size_t
nelems
,
int
pe
);
__device__
ATTR_NO_INLINE
void
rocshmem_ctx_schar_put_nbi_wg
(
rocshmem_ctx_t
ctx
,
signed
char
*
dest
,
const
signed
char
*
source
,
size_t
nelems
,
int
pe
);
__device__
ATTR_NO_INLINE
void
rocshmem_schar_put_nbi_wg
(
signed
char
*
dest
,
const
signed
char
*
source
,
size_t
nelems
,
int
pe
);
__device__
ATTR_NO_INLINE
void
rocshmem_ctx_short_put_nbi_wg
(
rocshmem_ctx_t
ctx
,
short
*
dest
,
const
short
*
source
,
size_t
nelems
,
int
pe
);
__device__
ATTR_NO_INLINE
void
rocshmem_short_put_nbi_wg
(
short
*
dest
,
const
short
*
source
,
size_t
nelems
,
int
pe
);
__device__
ATTR_NO_INLINE
void
rocshmem_ctx_int_put_nbi_wg
(
rocshmem_ctx_t
ctx
,
int
*
dest
,
const
int
*
source
,
size_t
nelems
,
int
pe
);
__device__
ATTR_NO_INLINE
void
rocshmem_int_put_nbi_wg
(
int
*
dest
,
const
int
*
source
,
size_t
nelems
,
int
pe
);
__device__
ATTR_NO_INLINE
void
rocshmem_ctx_long_put_nbi_wg
(
rocshmem_ctx_t
ctx
,
long
*
dest
,
const
long
*
source
,
size_t
nelems
,
int
pe
);
__device__
ATTR_NO_INLINE
void
rocshmem_long_put_nbi_wg
(
long
*
dest
,
const
long
*
source
,
size_t
nelems
,
int
pe
);
__device__
ATTR_NO_INLINE
void
rocshmem_ctx_longlong_put_nbi_wg
(
rocshmem_ctx_t
ctx
,
long
long
*
dest
,
const
long
long
*
source
,
size_t
nelems
,
int
pe
);
__device__
ATTR_NO_INLINE
void
rocshmem_longlong_put_nbi_wg
(
long
long
*
dest
,
const
long
long
*
source
,
size_t
nelems
,
int
pe
);
__device__
ATTR_NO_INLINE
void
rocshmem_ctx_uchar_put_nbi_wg
(
rocshmem_ctx_t
ctx
,
unsigned
char
*
dest
,
const
unsigned
char
*
source
,
size_t
nelems
,
int
pe
);
__device__
ATTR_NO_INLINE
void
rocshmem_uchar_put_nbi_wg
(
unsigned
char
*
dest
,
const
unsigned
char
*
source
,
size_t
nelems
,
int
pe
);
__device__
ATTR_NO_INLINE
void
rocshmem_ctx_ushort_put_nbi_wg
(
rocshmem_ctx_t
ctx
,
unsigned
short
*
dest
,
const
unsigned
short
*
source
,
size_t
nelems
,
int
pe
);
__device__
ATTR_NO_INLINE
void
rocshmem_ushort_put_nbi_wg
(
unsigned
short
*
dest
,
const
unsigned
short
*
source
,
size_t
nelems
,
int
pe
);
__device__
ATTR_NO_INLINE
void
rocshmem_ctx_uint_put_nbi_wg
(
rocshmem_ctx_t
ctx
,
unsigned
int
*
dest
,
const
unsigned
int
*
source
,
size_t
nelems
,
int
pe
);
__device__
ATTR_NO_INLINE
void
rocshmem_uint_put_nbi_wg
(
unsigned
int
*
dest
,
const
unsigned
int
*
source
,
size_t
nelems
,
int
pe
);
__device__
ATTR_NO_INLINE
void
rocshmem_ctx_ulong_put_nbi_wg
(
rocshmem_ctx_t
ctx
,
unsigned
long
*
dest
,
const
unsigned
long
*
source
,
size_t
nelems
,
int
pe
);
__device__
ATTR_NO_INLINE
void
rocshmem_ulong_put_nbi_wg
(
unsigned
long
*
dest
,
const
unsigned
long
*
source
,
size_t
nelems
,
int
pe
);
__device__
ATTR_NO_INLINE
void
rocshmem_ctx_ulonglong_put_nbi_wg
(
rocshmem_ctx_t
ctx
,
unsigned
long
long
*
dest
,
const
unsigned
long
long
*
source
,
size_t
nelems
,
int
pe
);
__device__
ATTR_NO_INLINE
void
rocshmem_ulonglong_put_nbi_wg
(
unsigned
long
long
*
dest
,
const
unsigned
long
long
*
source
,
size_t
nelems
,
int
pe
);
/**
* @brief Writes contiguous data of \p nelems bytes from \p source on the
* calling PE to \p dest on \p pe. The operation is not blocking. The caller
* will return as soon as the request is posted. The caller must call
* rocshmem_quiet() on the same context if completion notification is
* required.
*
* This function can be called from divergent control paths at per-wave
* granularity. However, all threads in a wave must call in with the same
* parameters.
*
* @param[in] ctx Context with which to perform this operation.
* @param[in] dest Destination address. Must be an address on the symmetric
* heap.
* @param[in] source Source address. Must be an address on the symmetric heap.
* @param[in] nelems Size of the transfer in bytes.
* @param[in] pe PE of the remote process.
*
* @return void.
*/
__device__
ATTR_NO_INLINE
void
rocshmem_ctx_putmem_nbi_wave
(
rocshmem_ctx_t
ctx
,
void
*
dest
,
const
void
*
source
,
size_t
nelems
,
int
pe
);
__device__
ATTR_NO_INLINE
void
rocshmem_putmem_nbi_wave
(
void
*
dest
,
const
void
*
source
,
size_t
nelems
,
int
pe
);
/**
* @brief Writes contiguous data of \p nelems bytes from \p source on the
* calling PE to \p dest on \p pe. The operation is not blocking. The caller
* will return as soon as the request is posted. The caller must call
* rocshmem_quiet() on the same context if completion notification is
* required.
*
* This function can be called from divergent control paths at per-workgroup
* granularity. However, all threads in a WG must call in with the same
* parameters.
*
* @param[in] ctx Context with which to perform this operation.
* @param[in] dest Destination address. Must be an address on the symmetric
* heap.
* @param[in] source Source address. Must be an address on the symmetric heap.
* @param[in] nelems Size of the transfer in bytes.
* @param[in] pe PE of the remote process.
*
* @return void.
*/
__device__
ATTR_NO_INLINE
void
rocshmem_ctx_putmem_nbi_wg
(
rocshmem_ctx_t
ctx
,
void
*
dest
,
const
void
*
source
,
size_t
nelems
,
int
pe
);
__device__
ATTR_NO_INLINE
void
rocshmem_putmem_nbi_wg
(
void
*
dest
,
const
void
*
source
,
size_t
nelems
,
int
pe
);
/**
* @brief Reads contiguous data of \p nelems elements from \p source on \p pe
* to \p dest on the calling PE. The operation is not blocking. The caller
* will return as soon as the request is posted. The caller must call
* rocshmem_quiet() on the same context if completion notification is
* required.
*
* This function can be called from divergent control paths at per-wave
* granularity. However, all threads in the wave must call in with the same
* arguments.
*
* @param[in] ctx Context with which to perform this operation.
* @param[in] dest Destination address. Must be an address on the symmetric
* heap.
* @param[in] source Source address. Must be an address on the symmetric heap.
* @param[in] nelems Size of the transfer in number of elements.
* @param[in] pe PE of the remote process.
*
* @return void.
*/
__device__
ATTR_NO_INLINE
void
rocshmem_ctx_float_get_nbi_wave
(
rocshmem_ctx_t
ctx
,
float
*
dest
,
const
float
*
source
,
size_t
nelems
,
int
pe
);
__device__
ATTR_NO_INLINE
void
rocshmem_float_get_nbi_wave
(
float
*
dest
,
const
float
*
source
,
size_t
nelems
,
int
pe
);
__device__
ATTR_NO_INLINE
void
rocshmem_ctx_double_get_nbi_wave
(
rocshmem_ctx_t
ctx
,
double
*
dest
,
const
double
*
source
,
size_t
nelems
,
int
pe
);
__device__
ATTR_NO_INLINE
void
rocshmem_double_get_nbi_wave
(
double
*
dest
,
const
double
*
source
,
size_t
nelems
,
int
pe
);
__device__
ATTR_NO_INLINE
void
rocshmem_ctx_char_get_nbi_wave
(
rocshmem_ctx_t
ctx
,
char
*
dest
,
const
char
*
source
,
size_t
nelems
,
int
pe
);
__device__
ATTR_NO_INLINE
void
rocshmem_char_get_nbi_wave
(
char
*
dest
,
const
char
*
source
,
size_t
nelems
,
int
pe
);
__device__
ATTR_NO_INLINE
void
rocshmem_ctx_schar_get_nbi_wave
(
rocshmem_ctx_t
ctx
,
signed
char
*
dest
,
const
signed
char
*
source
,
size_t
nelems
,
int
pe
);
__device__
ATTR_NO_INLINE
void
rocshmem_schar_get_nbi_wave
(
signed
char
*
dest
,
const
signed
char
*
source
,
size_t
nelems
,
int
pe
);
__device__
ATTR_NO_INLINE
void
rocshmem_ctx_short_get_nbi_wave
(
rocshmem_ctx_t
ctx
,
short
*
dest
,
const
short
*
source
,
size_t
nelems
,
int
pe
);
__device__
ATTR_NO_INLINE
void
rocshmem_short_get_nbi_wave
(
short
*
dest
,
const
short
*
source
,
size_t
nelems
,
int
pe
);
__device__
ATTR_NO_INLINE
void
rocshmem_ctx_int_get_nbi_wave
(
rocshmem_ctx_t
ctx
,
int
*
dest
,
const
int
*
source
,
size_t
nelems
,
int
pe
);
__device__
ATTR_NO_INLINE
void
rocshmem_int_get_nbi_wave
(
int
*
dest
,
const
int
*
source
,
size_t
nelems
,
int
pe
);
__device__
ATTR_NO_INLINE
void
rocshmem_ctx_long_get_nbi_wave
(
rocshmem_ctx_t
ctx
,
long
*
dest
,
const
long
*
source
,
size_t
nelems
,
int
pe
);
__device__
ATTR_NO_INLINE
void
rocshmem_long_get_nbi_wave
(
long
*
dest
,
const
long
*
source
,
size_t
nelems
,
int
pe
);
__device__
ATTR_NO_INLINE
void
rocshmem_ctx_longlong_get_nbi_wave
(
rocshmem_ctx_t
ctx
,
long
long
*
dest
,
const
long
long
*
source
,
size_t
nelems
,
int
pe
);
__device__
ATTR_NO_INLINE
void
rocshmem_longlong_get_nbi_wave
(
long
long
*
dest
,
const
long
long
*
source
,
size_t
nelems
,
int
pe
);
__device__
ATTR_NO_INLINE
void
rocshmem_ctx_uchar_get_nbi_wave
(
rocshmem_ctx_t
ctx
,
unsigned
char
*
dest
,
const
unsigned
char
*
source
,
size_t
nelems
,
int
pe
);
__device__
ATTR_NO_INLINE
void
rocshmem_uchar_get_nbi_wave
(
unsigned
char
*
dest
,
const
unsigned
char
*
source
,
size_t
nelems
,
int
pe
);
__device__
ATTR_NO_INLINE
void
rocshmem_ctx_ushort_get_nbi_wave
(
rocshmem_ctx_t
ctx
,
unsigned
short
*
dest
,
const
unsigned
short
*
source
,
size_t
nelems
,
int
pe
);
__device__
ATTR_NO_INLINE
void
rocshmem_ushort_get_nbi_wave
(
unsigned
short
*
dest
,
const
unsigned
short
*
source
,
size_t
nelems
,
int
pe
);
__device__
ATTR_NO_INLINE
void
rocshmem_ctx_uint_get_nbi_wave
(
rocshmem_ctx_t
ctx
,
unsigned
int
*
dest
,
const
unsigned
int
*
source
,
size_t
nelems
,
int
pe
);
__device__
ATTR_NO_INLINE
void
rocshmem_uint_get_nbi_wave
(
unsigned
int
*
dest
,
const
unsigned
int
*
source
,
size_t
nelems
,
int
pe
);
__device__
ATTR_NO_INLINE
void
rocshmem_ctx_ulong_get_nbi_wave
(
rocshmem_ctx_t
ctx
,
unsigned
long
*
dest
,
const
unsigned
long
*
source
,
size_t
nelems
,
int
pe
);
__device__
ATTR_NO_INLINE
void
rocshmem_ulong_get_nbi_wave
(
unsigned
long
*
dest
,
const
unsigned
long
*
source
,
size_t
nelems
,
int
pe
);
__device__
ATTR_NO_INLINE
void
rocshmem_ctx_ulonglong_get_nbi_wave
(
rocshmem_ctx_t
ctx
,
unsigned
long
long
*
dest
,
const
unsigned
long
long
*
source
,
size_t
nelems
,
int
pe
);
__device__
ATTR_NO_INLINE
void
rocshmem_ulonglong_get_nbi_wave
(
unsigned
long
long
*
dest
,
const
unsigned
long
long
*
source
,
size_t
nelems
,
int
pe
);
/**
* @brief Reads contiguous data of \p nelems elements from \p source on \p pe
* to \p dest on the calling PE. The operation is not blocking. The caller
* will return as soon as the request is posted. The caller must call
* rocshmem_quiet() on the same context if completion notification is
* required.
*
* This function can be called from divergent control paths at per-workgroup
* granularity. However, all threads in the WG must call in with the same
* arguments.
*
* @param[in] ctx Context with which to perform this operation.
* @param[in] dest Destination address. Must be an address on the symmetric
* heap.
* @param[in] source Source address. Must be an address on the symmetric heap.
* @param[in] nelems Size of the transfer in number of elements.
* @param[in] pe PE of the remote process.
*
* @return void.
*/
__device__
ATTR_NO_INLINE
void
rocshmem_ctx_float_get_nbi_wg
(
rocshmem_ctx_t
ctx
,
float
*
dest
,
const
float
*
source
,
size_t
nelems
,
int
pe
);
__device__
ATTR_NO_INLINE
void
rocshmem_float_get_nbi_wg
(
float
*
dest
,
const
float
*
source
,
size_t
nelems
,
int
pe
);
__device__
ATTR_NO_INLINE
void
rocshmem_ctx_double_get_nbi_wg
(
rocshmem_ctx_t
ctx
,
double
*
dest
,
const
double
*
source
,
size_t
nelems
,
int
pe
);
__device__
ATTR_NO_INLINE
void
rocshmem_double_get_nbi_wg
(
double
*
dest
,
const
double
*
source
,
size_t
nelems
,
int
pe
);
__device__
ATTR_NO_INLINE
void
rocshmem_ctx_char_get_nbi_wg
(
rocshmem_ctx_t
ctx
,
char
*
dest
,
const
char
*
source
,
size_t
nelems
,
int
pe
);
__device__
ATTR_NO_INLINE
void
rocshmem_char_get_nbi_wg
(
char
*
dest
,
const
char
*
source
,
size_t
nelems
,
int
pe
);
__device__
ATTR_NO_INLINE
void
rocshmem_ctx_schar_get_nbi_wg
(
rocshmem_ctx_t
ctx
,
signed
char
*
dest
,
const
signed
char
*
source
,
size_t
nelems
,
int
pe
);
__device__
ATTR_NO_INLINE
void
rocshmem_schar_get_nbi_wg
(
signed
char
*
dest
,
const
signed
char
*
source
,
size_t
nelems
,
int
pe
);
__device__
ATTR_NO_INLINE
void
rocshmem_ctx_short_get_nbi_wg
(
rocshmem_ctx_t
ctx
,
short
*
dest
,
const
short
*
source
,
size_t
nelems
,
int
pe
);
__device__
ATTR_NO_INLINE
void
rocshmem_short_get_nbi_wg
(
short
*
dest
,
const
short
*
source
,
size_t
nelems
,
int
pe
);
__device__
ATTR_NO_INLINE
void
rocshmem_ctx_int_get_nbi_wg
(
rocshmem_ctx_t
ctx
,
int
*
dest
,
const
int
*
source
,
size_t
nelems
,
int
pe
);
__device__
ATTR_NO_INLINE
void
rocshmem_int_get_nbi_wg
(
int
*
dest
,
const
int
*
source
,
size_t
nelems
,
int
pe
);
__device__
ATTR_NO_INLINE
void
rocshmem_ctx_long_get_nbi_wg
(
rocshmem_ctx_t
ctx
,
long
*
dest
,
const
long
*
source
,
size_t
nelems
,
int
pe
);
__device__
ATTR_NO_INLINE
void
rocshmem_long_get_nbi_wg
(
long
*
dest
,
const
long
*
source
,
size_t
nelems
,
int
pe
);
__device__
ATTR_NO_INLINE
void
rocshmem_ctx_longlong_get_nbi_wg
(
rocshmem_ctx_t
ctx
,
long
long
*
dest
,
const
long
long
*
source
,
size_t
nelems
,
int
pe
);
__device__
ATTR_NO_INLINE
void
rocshmem_longlong_get_nbi_wg
(
long
long
*
dest
,
const
long
long
*
source
,
size_t
nelems
,
int
pe
);
__device__
ATTR_NO_INLINE
void
rocshmem_ctx_uchar_get_nbi_wg
(
rocshmem_ctx_t
ctx
,
unsigned
char
*
dest
,
const
unsigned
char
*
source
,
size_t
nelems
,
int
pe
);
__device__
ATTR_NO_INLINE
void
rocshmem_uchar_get_nbi_wg
(
unsigned
char
*
dest
,
const
unsigned
char
*
source
,
size_t
nelems
,
int
pe
);
__device__
ATTR_NO_INLINE
void
rocshmem_ctx_ushort_get_nbi_wg
(
rocshmem_ctx_t
ctx
,
unsigned
short
*
dest
,
const
unsigned
short
*
source
,
size_t
nelems
,
int
pe
);
__device__
ATTR_NO_INLINE
void
rocshmem_ushort_get_nbi_wg
(
unsigned
short
*
dest
,
const
unsigned
short
*
source
,
size_t
nelems
,
int
pe
);
__device__
ATTR_NO_INLINE
void
rocshmem_ctx_uint_get_nbi_wg
(
rocshmem_ctx_t
ctx
,
unsigned
int
*
dest
,
const
unsigned
int
*
source
,
size_t
nelems
,
int
pe
);
__device__
ATTR_NO_INLINE
void
rocshmem_uint_get_nbi_wg
(
unsigned
int
*
dest
,
const
unsigned
int
*
source
,
size_t
nelems
,
int
pe
);
__device__
ATTR_NO_INLINE
void
rocshmem_ctx_ulong_get_nbi_wg
(
rocshmem_ctx_t
ctx
,
unsigned
long
*
dest
,
const
unsigned
long
*
source
,
size_t
nelems
,
int
pe
);
__device__
ATTR_NO_INLINE
void
rocshmem_ulong_get_nbi_wg
(
unsigned
long
*
dest
,
const
unsigned
long
*
source
,
size_t
nelems
,
int
pe
);
__device__
ATTR_NO_INLINE
void
rocshmem_ctx_ulonglong_get_nbi_wg
(
rocshmem_ctx_t
ctx
,
unsigned
long
long
*
dest
,
const
unsigned
long
long
*
source
,
size_t
nelems
,
int
pe
);
__device__
ATTR_NO_INLINE
void
rocshmem_ulonglong_get_nbi_wg
(
unsigned
long
long
*
dest
,
const
unsigned
long
long
*
source
,
size_t
nelems
,
int
pe
);
/**
* @brief Reads contiguous data of \p nelems bytes from \p source on \p pe
* to \p dest on the calling PE. The operation is not blocking. The caller
* will return as soon as the request is posted. The caller must call
* rocshmem_quiet() on the same context if completion notification is
* required.
*
* This function can be called from divergent control paths at per-wave
* granularity. However, all threads in the wave must call in with the same
* arguments.
*
* @param[in] ctx Context with which to perform this operation.
* @param[in] dest Destination address. Must be an address on the symmetric
* heap.
* @param[in] source Source address. Must be an address on the symmetric heap.
* @param[in] nelems Size of the transfer in bytes.
* @param[in] pe PE of the remote process.
*
* @return void.
*/
__device__
ATTR_NO_INLINE
void
rocshmem_ctx_getmem_nbi_wave
(
rocshmem_ctx_t
ctx
,
void
*
dest
,
const
void
*
source
,
size_t
nelems
,
int
pe
);
__device__
ATTR_NO_INLINE
void
rocshmem_getmem_nbi_wave
(
void
*
dest
,
const
void
*
source
,
size_t
nelems
,
int
pe
);
/**
* @brief Reads contiguous data of \p nelems bytes from \p source on \p pe
* to \p dest on the calling PE. The operation is not blocking. The caller
* will return as soon as the request is posted. The caller must call
* rocshmem_quiet() on the same context if completion notification is
* required.
*
* This function can be called from divergent control paths at per-workgroup
* granularity. However, all threads in the WG must call in with the same
* arguments.
*
* @param[in] ctx Context with which to perform this operation.
* @param[in] dest Destination address. Must be an address on the symmetric
* heap.
* @param[in] source Source address. Must be an address on the symmetric heap.
* @param[in] nelems Size of the transfer in bytes.
* @param[in] pe PE of the remote process.
*
* @return void.
*/
__device__
ATTR_NO_INLINE
void
rocshmem_ctx_getmem_nbi_wg
(
rocshmem_ctx_t
ctx
,
void
*
dest
,
const
void
*
source
,
size_t
nelems
,
int
pe
);
__device__
ATTR_NO_INLINE
void
rocshmem_getmem_nbi_wg
(
void
*
dest
,
const
void
*
source
,
size_t
nelems
,
int
pe
);
}
// namespace rocshmem
#endif // LIBRARY_INCLUDE_ROCSHMEM_RMA_X_HPP
rocshmem_dir/include/rocshmem/rocshmem_SIG_OP.hpp
deleted
100644 → 0
View file @
b33659dd
/******************************************************************************
* Copyright (c) Advanced Micro Devices, Inc. All rights reserved.
*
* SPDX-License-Identifier: MIT
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to
* deal in the Software without restriction, including without limitation the
* rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
* sell copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
* IN THE SOFTWARE.
*****************************************************************************/
#ifndef LIBRARY_INCLUDE_ROCSHMEM_SIG_OP_HPP
#define LIBRARY_INCLUDE_ROCSHMEM_SIG_OP_HPP
namespace rocshmem {

/*
 * Put-with-signal and signal-fetch device API declarations.
 *
 * Each put_signal routine is declared for untyped memory (putmem) and for
 * thirteen element types, in a context-taking (rocshmem_ctx_*) and a
 * context-free form, under six name suffixes:
 *   (none), _wg, _wave, _nbi, _nbi_wg, _nbi_wave.
 *
 * This header includes nothing itself: __device__, ATTR_NO_INLINE,
 * rocshmem_ctx_t, size_t and uint64_t must already be declared by whatever
 * includes it.
 *
 * Shared parameters (descriptions are inferred from names and types only -
 * NOTE(review): confirm against the implementation):
 *   ctx      - context handle (rocshmem_ctx_* forms only)
 *   dest     - destination buffer
 *   source   - source buffer
 *   nelems   - transfer size; presumably bytes for putmem and an element
 *              count for the typed forms
 *   sig_addr - address of the 64-bit signal word
 *   signal   - 64-bit signal operand
 *   sig_op   - presumably a ROCSHMEM_SIGNAL_OPS value
 *              (ROCSHMEM_SIGNAL_SET / ROCSHMEM_SIGNAL_ADD)
 *   pe       - PE number of the remote process
 *
 * NOTE(review): _wg / _wave presumably denote workgroup- and wave-granularity
 * calls and _nbi the non-blocking forms, mirroring the getmem_nbi_* routines.
 */

// ---- *_put_signal (no suffix) ---------------------------------------------

__device__ ATTR_NO_INLINE void rocshmem_putmem_signal(
    void *dest, const void *source, size_t nelems,
    uint64_t *sig_addr, uint64_t signal, int sig_op, int pe);
__device__ ATTR_NO_INLINE void rocshmem_ctx_putmem_signal(
    rocshmem_ctx_t ctx,
    void *dest, const void *source, size_t nelems,
    uint64_t *sig_addr, uint64_t signal, int sig_op, int pe);

__device__ ATTR_NO_INLINE void rocshmem_ctx_float_put_signal(
    rocshmem_ctx_t ctx,
    float *dest, const float *source, size_t nelems,
    uint64_t *sig_addr, uint64_t signal, int sig_op, int pe);
__device__ ATTR_NO_INLINE void rocshmem_float_put_signal(
    float *dest, const float *source, size_t nelems,
    uint64_t *sig_addr, uint64_t signal, int sig_op, int pe);

__device__ ATTR_NO_INLINE void rocshmem_ctx_double_put_signal(
    rocshmem_ctx_t ctx,
    double *dest, const double *source, size_t nelems,
    uint64_t *sig_addr, uint64_t signal, int sig_op, int pe);
__device__ ATTR_NO_INLINE void rocshmem_double_put_signal(
    double *dest, const double *source, size_t nelems,
    uint64_t *sig_addr, uint64_t signal, int sig_op, int pe);

__device__ ATTR_NO_INLINE void rocshmem_ctx_char_put_signal(
    rocshmem_ctx_t ctx,
    char *dest, const char *source, size_t nelems,
    uint64_t *sig_addr, uint64_t signal, int sig_op, int pe);
__device__ ATTR_NO_INLINE void rocshmem_char_put_signal(
    char *dest, const char *source, size_t nelems,
    uint64_t *sig_addr, uint64_t signal, int sig_op, int pe);

__device__ ATTR_NO_INLINE void rocshmem_ctx_schar_put_signal(
    rocshmem_ctx_t ctx,
    signed char *dest, const signed char *source, size_t nelems,
    uint64_t *sig_addr, uint64_t signal, int sig_op, int pe);
__device__ ATTR_NO_INLINE void rocshmem_schar_put_signal(
    signed char *dest, const signed char *source, size_t nelems,
    uint64_t *sig_addr, uint64_t signal, int sig_op, int pe);

__device__ ATTR_NO_INLINE void rocshmem_ctx_short_put_signal(
    rocshmem_ctx_t ctx,
    short *dest, const short *source, size_t nelems,
    uint64_t *sig_addr, uint64_t signal, int sig_op, int pe);
__device__ ATTR_NO_INLINE void rocshmem_short_put_signal(
    short *dest, const short *source, size_t nelems,
    uint64_t *sig_addr, uint64_t signal, int sig_op, int pe);

__device__ ATTR_NO_INLINE void rocshmem_ctx_int_put_signal(
    rocshmem_ctx_t ctx,
    int *dest, const int *source, size_t nelems,
    uint64_t *sig_addr, uint64_t signal, int sig_op, int pe);
__device__ ATTR_NO_INLINE void rocshmem_int_put_signal(
    int *dest, const int *source, size_t nelems,
    uint64_t *sig_addr, uint64_t signal, int sig_op, int pe);

__device__ ATTR_NO_INLINE void rocshmem_ctx_long_put_signal(
    rocshmem_ctx_t ctx,
    long *dest, const long *source, size_t nelems,
    uint64_t *sig_addr, uint64_t signal, int sig_op, int pe);
__device__ ATTR_NO_INLINE void rocshmem_long_put_signal(
    long *dest, const long *source, size_t nelems,
    uint64_t *sig_addr, uint64_t signal, int sig_op, int pe);

__device__ ATTR_NO_INLINE void rocshmem_ctx_longlong_put_signal(
    rocshmem_ctx_t ctx,
    long long *dest, const long long *source, size_t nelems,
    uint64_t *sig_addr, uint64_t signal, int sig_op, int pe);
__device__ ATTR_NO_INLINE void rocshmem_longlong_put_signal(
    long long *dest, const long long *source, size_t nelems,
    uint64_t *sig_addr, uint64_t signal, int sig_op, int pe);

__device__ ATTR_NO_INLINE void rocshmem_ctx_uchar_put_signal(
    rocshmem_ctx_t ctx,
    unsigned char *dest, const unsigned char *source, size_t nelems,
    uint64_t *sig_addr, uint64_t signal, int sig_op, int pe);
__device__ ATTR_NO_INLINE void rocshmem_uchar_put_signal(
    unsigned char *dest, const unsigned char *source, size_t nelems,
    uint64_t *sig_addr, uint64_t signal, int sig_op, int pe);

__device__ ATTR_NO_INLINE void rocshmem_ctx_ushort_put_signal(
    rocshmem_ctx_t ctx,
    unsigned short *dest, const unsigned short *source, size_t nelems,
    uint64_t *sig_addr, uint64_t signal, int sig_op, int pe);
__device__ ATTR_NO_INLINE void rocshmem_ushort_put_signal(
    unsigned short *dest, const unsigned short *source, size_t nelems,
    uint64_t *sig_addr, uint64_t signal, int sig_op, int pe);

__device__ ATTR_NO_INLINE void rocshmem_ctx_uint_put_signal(
    rocshmem_ctx_t ctx,
    unsigned int *dest, const unsigned int *source, size_t nelems,
    uint64_t *sig_addr, uint64_t signal, int sig_op, int pe);
__device__ ATTR_NO_INLINE void rocshmem_uint_put_signal(
    unsigned int *dest, const unsigned int *source, size_t nelems,
    uint64_t *sig_addr, uint64_t signal, int sig_op, int pe);

__device__ ATTR_NO_INLINE void rocshmem_ctx_ulong_put_signal(
    rocshmem_ctx_t ctx,
    unsigned long *dest, const unsigned long *source, size_t nelems,
    uint64_t *sig_addr, uint64_t signal, int sig_op, int pe);
__device__ ATTR_NO_INLINE void rocshmem_ulong_put_signal(
    unsigned long *dest, const unsigned long *source, size_t nelems,
    uint64_t *sig_addr, uint64_t signal, int sig_op, int pe);

__device__ ATTR_NO_INLINE void rocshmem_ctx_ulonglong_put_signal(
    rocshmem_ctx_t ctx,
    unsigned long long *dest, const unsigned long long *source, size_t nelems,
    uint64_t *sig_addr, uint64_t signal, int sig_op, int pe);
__device__ ATTR_NO_INLINE void rocshmem_ulonglong_put_signal(
    unsigned long long *dest, const unsigned long long *source, size_t nelems,
    uint64_t *sig_addr, uint64_t signal, int sig_op, int pe);

// ---- *_put_signal_wg --------------------------------------------------------

__device__ ATTR_NO_INLINE void rocshmem_putmem_signal_wg(
    void *dest, const void *source, size_t nelems,
    uint64_t *sig_addr, uint64_t signal, int sig_op, int pe);
__device__ ATTR_NO_INLINE void rocshmem_ctx_putmem_signal_wg(
    rocshmem_ctx_t ctx,
    void *dest, const void *source, size_t nelems,
    uint64_t *sig_addr, uint64_t signal, int sig_op, int pe);

__device__ ATTR_NO_INLINE void rocshmem_ctx_float_put_signal_wg(
    rocshmem_ctx_t ctx,
    float *dest, const float *source, size_t nelems,
    uint64_t *sig_addr, uint64_t signal, int sig_op, int pe);
__device__ ATTR_NO_INLINE void rocshmem_float_put_signal_wg(
    float *dest, const float *source, size_t nelems,
    uint64_t *sig_addr, uint64_t signal, int sig_op, int pe);

__device__ ATTR_NO_INLINE void rocshmem_ctx_double_put_signal_wg(
    rocshmem_ctx_t ctx,
    double *dest, const double *source, size_t nelems,
    uint64_t *sig_addr, uint64_t signal, int sig_op, int pe);
__device__ ATTR_NO_INLINE void rocshmem_double_put_signal_wg(
    double *dest, const double *source, size_t nelems,
    uint64_t *sig_addr, uint64_t signal, int sig_op, int pe);

__device__ ATTR_NO_INLINE void rocshmem_ctx_char_put_signal_wg(
    rocshmem_ctx_t ctx,
    char *dest, const char *source, size_t nelems,
    uint64_t *sig_addr, uint64_t signal, int sig_op, int pe);
__device__ ATTR_NO_INLINE void rocshmem_char_put_signal_wg(
    char *dest, const char *source, size_t nelems,
    uint64_t *sig_addr, uint64_t signal, int sig_op, int pe);

__device__ ATTR_NO_INLINE void rocshmem_ctx_schar_put_signal_wg(
    rocshmem_ctx_t ctx,
    signed char *dest, const signed char *source, size_t nelems,
    uint64_t *sig_addr, uint64_t signal, int sig_op, int pe);
__device__ ATTR_NO_INLINE void rocshmem_schar_put_signal_wg(
    signed char *dest, const signed char *source, size_t nelems,
    uint64_t *sig_addr, uint64_t signal, int sig_op, int pe);

__device__ ATTR_NO_INLINE void rocshmem_ctx_short_put_signal_wg(
    rocshmem_ctx_t ctx,
    short *dest, const short *source, size_t nelems,
    uint64_t *sig_addr, uint64_t signal, int sig_op, int pe);
__device__ ATTR_NO_INLINE void rocshmem_short_put_signal_wg(
    short *dest, const short *source, size_t nelems,
    uint64_t *sig_addr, uint64_t signal, int sig_op, int pe);

__device__ ATTR_NO_INLINE void rocshmem_ctx_int_put_signal_wg(
    rocshmem_ctx_t ctx,
    int *dest, const int *source, size_t nelems,
    uint64_t *sig_addr, uint64_t signal, int sig_op, int pe);
__device__ ATTR_NO_INLINE void rocshmem_int_put_signal_wg(
    int *dest, const int *source, size_t nelems,
    uint64_t *sig_addr, uint64_t signal, int sig_op, int pe);

__device__ ATTR_NO_INLINE void rocshmem_ctx_long_put_signal_wg(
    rocshmem_ctx_t ctx,
    long *dest, const long *source, size_t nelems,
    uint64_t *sig_addr, uint64_t signal, int sig_op, int pe);
__device__ ATTR_NO_INLINE void rocshmem_long_put_signal_wg(
    long *dest, const long *source, size_t nelems,
    uint64_t *sig_addr, uint64_t signal, int sig_op, int pe);

__device__ ATTR_NO_INLINE void rocshmem_ctx_longlong_put_signal_wg(
    rocshmem_ctx_t ctx,
    long long *dest, const long long *source, size_t nelems,
    uint64_t *sig_addr, uint64_t signal, int sig_op, int pe);
__device__ ATTR_NO_INLINE void rocshmem_longlong_put_signal_wg(
    long long *dest, const long long *source, size_t nelems,
    uint64_t *sig_addr, uint64_t signal, int sig_op, int pe);

__device__ ATTR_NO_INLINE void rocshmem_ctx_uchar_put_signal_wg(
    rocshmem_ctx_t ctx,
    unsigned char *dest, const unsigned char *source, size_t nelems,
    uint64_t *sig_addr, uint64_t signal, int sig_op, int pe);
__device__ ATTR_NO_INLINE void rocshmem_uchar_put_signal_wg(
    unsigned char *dest, const unsigned char *source, size_t nelems,
    uint64_t *sig_addr, uint64_t signal, int sig_op, int pe);

__device__ ATTR_NO_INLINE void rocshmem_ctx_ushort_put_signal_wg(
    rocshmem_ctx_t ctx,
    unsigned short *dest, const unsigned short *source, size_t nelems,
    uint64_t *sig_addr, uint64_t signal, int sig_op, int pe);
__device__ ATTR_NO_INLINE void rocshmem_ushort_put_signal_wg(
    unsigned short *dest, const unsigned short *source, size_t nelems,
    uint64_t *sig_addr, uint64_t signal, int sig_op, int pe);

__device__ ATTR_NO_INLINE void rocshmem_ctx_uint_put_signal_wg(
    rocshmem_ctx_t ctx,
    unsigned int *dest, const unsigned int *source, size_t nelems,
    uint64_t *sig_addr, uint64_t signal, int sig_op, int pe);
__device__ ATTR_NO_INLINE void rocshmem_uint_put_signal_wg(
    unsigned int *dest, const unsigned int *source, size_t nelems,
    uint64_t *sig_addr, uint64_t signal, int sig_op, int pe);

__device__ ATTR_NO_INLINE void rocshmem_ctx_ulong_put_signal_wg(
    rocshmem_ctx_t ctx,
    unsigned long *dest, const unsigned long *source, size_t nelems,
    uint64_t *sig_addr, uint64_t signal, int sig_op, int pe);
__device__ ATTR_NO_INLINE void rocshmem_ulong_put_signal_wg(
    unsigned long *dest, const unsigned long *source, size_t nelems,
    uint64_t *sig_addr, uint64_t signal, int sig_op, int pe);

__device__ ATTR_NO_INLINE void rocshmem_ctx_ulonglong_put_signal_wg(
    rocshmem_ctx_t ctx,
    unsigned long long *dest, const unsigned long long *source, size_t nelems,
    uint64_t *sig_addr, uint64_t signal, int sig_op, int pe);
__device__ ATTR_NO_INLINE void rocshmem_ulonglong_put_signal_wg(
    unsigned long long *dest, const unsigned long long *source, size_t nelems,
    uint64_t *sig_addr, uint64_t signal, int sig_op, int pe);

// ---- *_put_signal_wave ------------------------------------------------------

__device__ ATTR_NO_INLINE void rocshmem_putmem_signal_wave(
    void *dest, const void *source, size_t nelems,
    uint64_t *sig_addr, uint64_t signal, int sig_op, int pe);
__device__ ATTR_NO_INLINE void rocshmem_ctx_putmem_signal_wave(
    rocshmem_ctx_t ctx,
    void *dest, const void *source, size_t nelems,
    uint64_t *sig_addr, uint64_t signal, int sig_op, int pe);

__device__ ATTR_NO_INLINE void rocshmem_ctx_float_put_signal_wave(
    rocshmem_ctx_t ctx,
    float *dest, const float *source, size_t nelems,
    uint64_t *sig_addr, uint64_t signal, int sig_op, int pe);
__device__ ATTR_NO_INLINE void rocshmem_float_put_signal_wave(
    float *dest, const float *source, size_t nelems,
    uint64_t *sig_addr, uint64_t signal, int sig_op, int pe);

__device__ ATTR_NO_INLINE void rocshmem_ctx_double_put_signal_wave(
    rocshmem_ctx_t ctx,
    double *dest, const double *source, size_t nelems,
    uint64_t *sig_addr, uint64_t signal, int sig_op, int pe);
__device__ ATTR_NO_INLINE void rocshmem_double_put_signal_wave(
    double *dest, const double *source, size_t nelems,
    uint64_t *sig_addr, uint64_t signal, int sig_op, int pe);

__device__ ATTR_NO_INLINE void rocshmem_ctx_char_put_signal_wave(
    rocshmem_ctx_t ctx,
    char *dest, const char *source, size_t nelems,
    uint64_t *sig_addr, uint64_t signal, int sig_op, int pe);
__device__ ATTR_NO_INLINE void rocshmem_char_put_signal_wave(
    char *dest, const char *source, size_t nelems,
    uint64_t *sig_addr, uint64_t signal, int sig_op, int pe);

__device__ ATTR_NO_INLINE void rocshmem_ctx_schar_put_signal_wave(
    rocshmem_ctx_t ctx,
    signed char *dest, const signed char *source, size_t nelems,
    uint64_t *sig_addr, uint64_t signal, int sig_op, int pe);
__device__ ATTR_NO_INLINE void rocshmem_schar_put_signal_wave(
    signed char *dest, const signed char *source, size_t nelems,
    uint64_t *sig_addr, uint64_t signal, int sig_op, int pe);

__device__ ATTR_NO_INLINE void rocshmem_ctx_short_put_signal_wave(
    rocshmem_ctx_t ctx,
    short *dest, const short *source, size_t nelems,
    uint64_t *sig_addr, uint64_t signal, int sig_op, int pe);
__device__ ATTR_NO_INLINE void rocshmem_short_put_signal_wave(
    short *dest, const short *source, size_t nelems,
    uint64_t *sig_addr, uint64_t signal, int sig_op, int pe);

__device__ ATTR_NO_INLINE void rocshmem_ctx_int_put_signal_wave(
    rocshmem_ctx_t ctx,
    int *dest, const int *source, size_t nelems,
    uint64_t *sig_addr, uint64_t signal, int sig_op, int pe);
__device__ ATTR_NO_INLINE void rocshmem_int_put_signal_wave(
    int *dest, const int *source, size_t nelems,
    uint64_t *sig_addr, uint64_t signal, int sig_op, int pe);

__device__ ATTR_NO_INLINE void rocshmem_ctx_long_put_signal_wave(
    rocshmem_ctx_t ctx,
    long *dest, const long *source, size_t nelems,
    uint64_t *sig_addr, uint64_t signal, int sig_op, int pe);
__device__ ATTR_NO_INLINE void rocshmem_long_put_signal_wave(
    long *dest, const long *source, size_t nelems,
    uint64_t *sig_addr, uint64_t signal, int sig_op, int pe);

__device__ ATTR_NO_INLINE void rocshmem_ctx_longlong_put_signal_wave(
    rocshmem_ctx_t ctx,
    long long *dest, const long long *source, size_t nelems,
    uint64_t *sig_addr, uint64_t signal, int sig_op, int pe);
__device__ ATTR_NO_INLINE void rocshmem_longlong_put_signal_wave(
    long long *dest, const long long *source, size_t nelems,
    uint64_t *sig_addr, uint64_t signal, int sig_op, int pe);

__device__ ATTR_NO_INLINE void rocshmem_ctx_uchar_put_signal_wave(
    rocshmem_ctx_t ctx,
    unsigned char *dest, const unsigned char *source, size_t nelems,
    uint64_t *sig_addr, uint64_t signal, int sig_op, int pe);
__device__ ATTR_NO_INLINE void rocshmem_uchar_put_signal_wave(
    unsigned char *dest, const unsigned char *source, size_t nelems,
    uint64_t *sig_addr, uint64_t signal, int sig_op, int pe);

__device__ ATTR_NO_INLINE void rocshmem_ctx_ushort_put_signal_wave(
    rocshmem_ctx_t ctx,
    unsigned short *dest, const unsigned short *source, size_t nelems,
    uint64_t *sig_addr, uint64_t signal, int sig_op, int pe);
__device__ ATTR_NO_INLINE void rocshmem_ushort_put_signal_wave(
    unsigned short *dest, const unsigned short *source, size_t nelems,
    uint64_t *sig_addr, uint64_t signal, int sig_op, int pe);

__device__ ATTR_NO_INLINE void rocshmem_ctx_uint_put_signal_wave(
    rocshmem_ctx_t ctx,
    unsigned int *dest, const unsigned int *source, size_t nelems,
    uint64_t *sig_addr, uint64_t signal, int sig_op, int pe);
__device__ ATTR_NO_INLINE void rocshmem_uint_put_signal_wave(
    unsigned int *dest, const unsigned int *source, size_t nelems,
    uint64_t *sig_addr, uint64_t signal, int sig_op, int pe);

__device__ ATTR_NO_INLINE void rocshmem_ctx_ulong_put_signal_wave(
    rocshmem_ctx_t ctx,
    unsigned long *dest, const unsigned long *source, size_t nelems,
    uint64_t *sig_addr, uint64_t signal, int sig_op, int pe);
__device__ ATTR_NO_INLINE void rocshmem_ulong_put_signal_wave(
    unsigned long *dest, const unsigned long *source, size_t nelems,
    uint64_t *sig_addr, uint64_t signal, int sig_op, int pe);

__device__ ATTR_NO_INLINE void rocshmem_ctx_ulonglong_put_signal_wave(
    rocshmem_ctx_t ctx,
    unsigned long long *dest, const unsigned long long *source, size_t nelems,
    uint64_t *sig_addr, uint64_t signal, int sig_op, int pe);
__device__ ATTR_NO_INLINE void rocshmem_ulonglong_put_signal_wave(
    unsigned long long *dest, const unsigned long long *source, size_t nelems,
    uint64_t *sig_addr, uint64_t signal, int sig_op, int pe);

// ---- *_put_signal_nbi -------------------------------------------------------

__device__ ATTR_NO_INLINE void rocshmem_putmem_signal_nbi(
    void *dest, const void *source, size_t nelems,
    uint64_t *sig_addr, uint64_t signal, int sig_op, int pe);
__device__ ATTR_NO_INLINE void rocshmem_ctx_putmem_signal_nbi(
    rocshmem_ctx_t ctx,
    void *dest, const void *source, size_t nelems,
    uint64_t *sig_addr, uint64_t signal, int sig_op, int pe);

__device__ ATTR_NO_INLINE void rocshmem_ctx_float_put_signal_nbi(
    rocshmem_ctx_t ctx,
    float *dest, const float *source, size_t nelems,
    uint64_t *sig_addr, uint64_t signal, int sig_op, int pe);
__device__ ATTR_NO_INLINE void rocshmem_float_put_signal_nbi(
    float *dest, const float *source, size_t nelems,
    uint64_t *sig_addr, uint64_t signal, int sig_op, int pe);

__device__ ATTR_NO_INLINE void rocshmem_ctx_double_put_signal_nbi(
    rocshmem_ctx_t ctx,
    double *dest, const double *source, size_t nelems,
    uint64_t *sig_addr, uint64_t signal, int sig_op, int pe);
__device__ ATTR_NO_INLINE void rocshmem_double_put_signal_nbi(
    double *dest, const double *source, size_t nelems,
    uint64_t *sig_addr, uint64_t signal, int sig_op, int pe);

__device__ ATTR_NO_INLINE void rocshmem_ctx_char_put_signal_nbi(
    rocshmem_ctx_t ctx,
    char *dest, const char *source, size_t nelems,
    uint64_t *sig_addr, uint64_t signal, int sig_op, int pe);
__device__ ATTR_NO_INLINE void rocshmem_char_put_signal_nbi(
    char *dest, const char *source, size_t nelems,
    uint64_t *sig_addr, uint64_t signal, int sig_op, int pe);

__device__ ATTR_NO_INLINE void rocshmem_ctx_schar_put_signal_nbi(
    rocshmem_ctx_t ctx,
    signed char *dest, const signed char *source, size_t nelems,
    uint64_t *sig_addr, uint64_t signal, int sig_op, int pe);
__device__ ATTR_NO_INLINE void rocshmem_schar_put_signal_nbi(
    signed char *dest, const signed char *source, size_t nelems,
    uint64_t *sig_addr, uint64_t signal, int sig_op, int pe);

__device__ ATTR_NO_INLINE void rocshmem_ctx_short_put_signal_nbi(
    rocshmem_ctx_t ctx,
    short *dest, const short *source, size_t nelems,
    uint64_t *sig_addr, uint64_t signal, int sig_op, int pe);
__device__ ATTR_NO_INLINE void rocshmem_short_put_signal_nbi(
    short *dest, const short *source, size_t nelems,
    uint64_t *sig_addr, uint64_t signal, int sig_op, int pe);

__device__ ATTR_NO_INLINE void rocshmem_ctx_int_put_signal_nbi(
    rocshmem_ctx_t ctx,
    int *dest, const int *source, size_t nelems,
    uint64_t *sig_addr, uint64_t signal, int sig_op, int pe);
__device__ ATTR_NO_INLINE void rocshmem_int_put_signal_nbi(
    int *dest, const int *source, size_t nelems,
    uint64_t *sig_addr, uint64_t signal, int sig_op, int pe);

__device__ ATTR_NO_INLINE void rocshmem_ctx_long_put_signal_nbi(
    rocshmem_ctx_t ctx,
    long *dest, const long *source, size_t nelems,
    uint64_t *sig_addr, uint64_t signal, int sig_op, int pe);
__device__ ATTR_NO_INLINE void rocshmem_long_put_signal_nbi(
    long *dest, const long *source, size_t nelems,
    uint64_t *sig_addr, uint64_t signal, int sig_op, int pe);

__device__ ATTR_NO_INLINE void rocshmem_ctx_longlong_put_signal_nbi(
    rocshmem_ctx_t ctx,
    long long *dest, const long long *source, size_t nelems,
    uint64_t *sig_addr, uint64_t signal, int sig_op, int pe);
__device__ ATTR_NO_INLINE void rocshmem_longlong_put_signal_nbi(
    long long *dest, const long long *source, size_t nelems,
    uint64_t *sig_addr, uint64_t signal, int sig_op, int pe);

__device__ ATTR_NO_INLINE void rocshmem_ctx_uchar_put_signal_nbi(
    rocshmem_ctx_t ctx,
    unsigned char *dest, const unsigned char *source, size_t nelems,
    uint64_t *sig_addr, uint64_t signal, int sig_op, int pe);
__device__ ATTR_NO_INLINE void rocshmem_uchar_put_signal_nbi(
    unsigned char *dest, const unsigned char *source, size_t nelems,
    uint64_t *sig_addr, uint64_t signal, int sig_op, int pe);

__device__ ATTR_NO_INLINE void rocshmem_ctx_ushort_put_signal_nbi(
    rocshmem_ctx_t ctx,
    unsigned short *dest, const unsigned short *source, size_t nelems,
    uint64_t *sig_addr, uint64_t signal, int sig_op, int pe);
__device__ ATTR_NO_INLINE void rocshmem_ushort_put_signal_nbi(
    unsigned short *dest, const unsigned short *source, size_t nelems,
    uint64_t *sig_addr, uint64_t signal, int sig_op, int pe);

__device__ ATTR_NO_INLINE void rocshmem_ctx_uint_put_signal_nbi(
    rocshmem_ctx_t ctx,
    unsigned int *dest, const unsigned int *source, size_t nelems,
    uint64_t *sig_addr, uint64_t signal, int sig_op, int pe);
__device__ ATTR_NO_INLINE void rocshmem_uint_put_signal_nbi(
    unsigned int *dest, const unsigned int *source, size_t nelems,
    uint64_t *sig_addr, uint64_t signal, int sig_op, int pe);

__device__ ATTR_NO_INLINE void rocshmem_ctx_ulong_put_signal_nbi(
    rocshmem_ctx_t ctx,
    unsigned long *dest, const unsigned long *source, size_t nelems,
    uint64_t *sig_addr, uint64_t signal, int sig_op, int pe);
__device__ ATTR_NO_INLINE void rocshmem_ulong_put_signal_nbi(
    unsigned long *dest, const unsigned long *source, size_t nelems,
    uint64_t *sig_addr, uint64_t signal, int sig_op, int pe);

__device__ ATTR_NO_INLINE void rocshmem_ctx_ulonglong_put_signal_nbi(
    rocshmem_ctx_t ctx,
    unsigned long long *dest, const unsigned long long *source, size_t nelems,
    uint64_t *sig_addr, uint64_t signal, int sig_op, int pe);
__device__ ATTR_NO_INLINE void rocshmem_ulonglong_put_signal_nbi(
    unsigned long long *dest, const unsigned long long *source, size_t nelems,
    uint64_t *sig_addr, uint64_t signal, int sig_op, int pe);

// ---- *_put_signal_nbi_wg ----------------------------------------------------

__device__ ATTR_NO_INLINE void rocshmem_putmem_signal_nbi_wg(
    void *dest, const void *source, size_t nelems,
    uint64_t *sig_addr, uint64_t signal, int sig_op, int pe);
__device__ ATTR_NO_INLINE void rocshmem_ctx_putmem_signal_nbi_wg(
    rocshmem_ctx_t ctx,
    void *dest, const void *source, size_t nelems,
    uint64_t *sig_addr, uint64_t signal, int sig_op, int pe);

__device__ ATTR_NO_INLINE void rocshmem_ctx_float_put_signal_nbi_wg(
    rocshmem_ctx_t ctx,
    float *dest, const float *source, size_t nelems,
    uint64_t *sig_addr, uint64_t signal, int sig_op, int pe);
__device__ ATTR_NO_INLINE void rocshmem_float_put_signal_nbi_wg(
    float *dest, const float *source, size_t nelems,
    uint64_t *sig_addr, uint64_t signal, int sig_op, int pe);

__device__ ATTR_NO_INLINE void rocshmem_ctx_double_put_signal_nbi_wg(
    rocshmem_ctx_t ctx,
    double *dest, const double *source, size_t nelems,
    uint64_t *sig_addr, uint64_t signal, int sig_op, int pe);
__device__ ATTR_NO_INLINE void rocshmem_double_put_signal_nbi_wg(
    double *dest, const double *source, size_t nelems,
    uint64_t *sig_addr, uint64_t signal, int sig_op, int pe);

__device__ ATTR_NO_INLINE void rocshmem_ctx_char_put_signal_nbi_wg(
    rocshmem_ctx_t ctx,
    char *dest, const char *source, size_t nelems,
    uint64_t *sig_addr, uint64_t signal, int sig_op, int pe);
__device__ ATTR_NO_INLINE void rocshmem_char_put_signal_nbi_wg(
    char *dest, const char *source, size_t nelems,
    uint64_t *sig_addr, uint64_t signal, int sig_op, int pe);

__device__ ATTR_NO_INLINE void rocshmem_ctx_schar_put_signal_nbi_wg(
    rocshmem_ctx_t ctx,
    signed char *dest, const signed char *source, size_t nelems,
    uint64_t *sig_addr, uint64_t signal, int sig_op, int pe);
__device__ ATTR_NO_INLINE void rocshmem_schar_put_signal_nbi_wg(
    signed char *dest, const signed char *source, size_t nelems,
    uint64_t *sig_addr, uint64_t signal, int sig_op, int pe);

__device__ ATTR_NO_INLINE void rocshmem_ctx_short_put_signal_nbi_wg(
    rocshmem_ctx_t ctx,
    short *dest, const short *source, size_t nelems,
    uint64_t *sig_addr, uint64_t signal, int sig_op, int pe);
__device__ ATTR_NO_INLINE void rocshmem_short_put_signal_nbi_wg(
    short *dest, const short *source, size_t nelems,
    uint64_t *sig_addr, uint64_t signal, int sig_op, int pe);

__device__ ATTR_NO_INLINE void rocshmem_ctx_int_put_signal_nbi_wg(
    rocshmem_ctx_t ctx,
    int *dest, const int *source, size_t nelems,
    uint64_t *sig_addr, uint64_t signal, int sig_op, int pe);
__device__ ATTR_NO_INLINE void rocshmem_int_put_signal_nbi_wg(
    int *dest, const int *source, size_t nelems,
    uint64_t *sig_addr, uint64_t signal, int sig_op, int pe);

__device__ ATTR_NO_INLINE void rocshmem_ctx_long_put_signal_nbi_wg(
    rocshmem_ctx_t ctx,
    long *dest, const long *source, size_t nelems,
    uint64_t *sig_addr, uint64_t signal, int sig_op, int pe);
__device__ ATTR_NO_INLINE void rocshmem_long_put_signal_nbi_wg(
    long *dest, const long *source, size_t nelems,
    uint64_t *sig_addr, uint64_t signal, int sig_op, int pe);

__device__ ATTR_NO_INLINE void rocshmem_ctx_longlong_put_signal_nbi_wg(
    rocshmem_ctx_t ctx,
    long long *dest, const long long *source, size_t nelems,
    uint64_t *sig_addr, uint64_t signal, int sig_op, int pe);
__device__ ATTR_NO_INLINE void rocshmem_longlong_put_signal_nbi_wg(
    long long *dest, const long long *source, size_t nelems,
    uint64_t *sig_addr, uint64_t signal, int sig_op, int pe);

__device__ ATTR_NO_INLINE void rocshmem_ctx_uchar_put_signal_nbi_wg(
    rocshmem_ctx_t ctx,
    unsigned char *dest, const unsigned char *source, size_t nelems,
    uint64_t *sig_addr, uint64_t signal, int sig_op, int pe);
__device__ ATTR_NO_INLINE void rocshmem_uchar_put_signal_nbi_wg(
    unsigned char *dest, const unsigned char *source, size_t nelems,
    uint64_t *sig_addr, uint64_t signal, int sig_op, int pe);

__device__ ATTR_NO_INLINE void rocshmem_ctx_ushort_put_signal_nbi_wg(
    rocshmem_ctx_t ctx,
    unsigned short *dest, const unsigned short *source, size_t nelems,
    uint64_t *sig_addr, uint64_t signal, int sig_op, int pe);
__device__ ATTR_NO_INLINE void rocshmem_ushort_put_signal_nbi_wg(
    unsigned short *dest, const unsigned short *source, size_t nelems,
    uint64_t *sig_addr, uint64_t signal, int sig_op, int pe);

__device__ ATTR_NO_INLINE void rocshmem_ctx_uint_put_signal_nbi_wg(
    rocshmem_ctx_t ctx,
    unsigned int *dest, const unsigned int *source, size_t nelems,
    uint64_t *sig_addr, uint64_t signal, int sig_op, int pe);
__device__ ATTR_NO_INLINE void rocshmem_uint_put_signal_nbi_wg(
    unsigned int *dest, const unsigned int *source, size_t nelems,
    uint64_t *sig_addr, uint64_t signal, int sig_op, int pe);

__device__ ATTR_NO_INLINE void rocshmem_ctx_ulong_put_signal_nbi_wg(
    rocshmem_ctx_t ctx,
    unsigned long *dest, const unsigned long *source, size_t nelems,
    uint64_t *sig_addr, uint64_t signal, int sig_op, int pe);
__device__ ATTR_NO_INLINE void rocshmem_ulong_put_signal_nbi_wg(
    unsigned long *dest, const unsigned long *source, size_t nelems,
    uint64_t *sig_addr, uint64_t signal, int sig_op, int pe);

__device__ ATTR_NO_INLINE void rocshmem_ctx_ulonglong_put_signal_nbi_wg(
    rocshmem_ctx_t ctx,
    unsigned long long *dest, const unsigned long long *source, size_t nelems,
    uint64_t *sig_addr, uint64_t signal, int sig_op, int pe);
__device__ ATTR_NO_INLINE void rocshmem_ulonglong_put_signal_nbi_wg(
    unsigned long long *dest, const unsigned long long *source, size_t nelems,
    uint64_t *sig_addr, uint64_t signal, int sig_op, int pe);

// ---- *_put_signal_nbi_wave --------------------------------------------------

__device__ ATTR_NO_INLINE void rocshmem_putmem_signal_nbi_wave(
    void *dest, const void *source, size_t nelems,
    uint64_t *sig_addr, uint64_t signal, int sig_op, int pe);
__device__ ATTR_NO_INLINE void rocshmem_ctx_putmem_signal_nbi_wave(
    rocshmem_ctx_t ctx,
    void *dest, const void *source, size_t nelems,
    uint64_t *sig_addr, uint64_t signal, int sig_op, int pe);

__device__ ATTR_NO_INLINE void rocshmem_ctx_float_put_signal_nbi_wave(
    rocshmem_ctx_t ctx,
    float *dest, const float *source, size_t nelems,
    uint64_t *sig_addr, uint64_t signal, int sig_op, int pe);
__device__ ATTR_NO_INLINE void rocshmem_float_put_signal_nbi_wave(
    float *dest, const float *source, size_t nelems,
    uint64_t *sig_addr, uint64_t signal, int sig_op, int pe);

__device__ ATTR_NO_INLINE void rocshmem_ctx_double_put_signal_nbi_wave(
    rocshmem_ctx_t ctx,
    double *dest, const double *source, size_t nelems,
    uint64_t *sig_addr, uint64_t signal, int sig_op, int pe);
__device__ ATTR_NO_INLINE void rocshmem_double_put_signal_nbi_wave(
    double *dest, const double *source, size_t nelems,
    uint64_t *sig_addr, uint64_t signal, int sig_op, int pe);

__device__ ATTR_NO_INLINE void rocshmem_ctx_char_put_signal_nbi_wave(
    rocshmem_ctx_t ctx,
    char *dest, const char *source, size_t nelems,
    uint64_t *sig_addr, uint64_t signal, int sig_op, int pe);
__device__ ATTR_NO_INLINE void rocshmem_char_put_signal_nbi_wave(
    char *dest, const char *source, size_t nelems,
    uint64_t *sig_addr, uint64_t signal, int sig_op, int pe);

__device__ ATTR_NO_INLINE void rocshmem_ctx_schar_put_signal_nbi_wave(
    rocshmem_ctx_t ctx,
    signed char *dest, const signed char *source, size_t nelems,
    uint64_t *sig_addr, uint64_t signal, int sig_op, int pe);
__device__ ATTR_NO_INLINE void rocshmem_schar_put_signal_nbi_wave(
    signed char *dest, const signed char *source, size_t nelems,
    uint64_t *sig_addr, uint64_t signal, int sig_op, int pe);

__device__ ATTR_NO_INLINE void rocshmem_ctx_short_put_signal_nbi_wave(
    rocshmem_ctx_t ctx,
    short *dest, const short *source, size_t nelems,
    uint64_t *sig_addr, uint64_t signal, int sig_op, int pe);
__device__ ATTR_NO_INLINE void rocshmem_short_put_signal_nbi_wave(
    short *dest, const short *source, size_t nelems,
    uint64_t *sig_addr, uint64_t signal, int sig_op, int pe);

__device__ ATTR_NO_INLINE void rocshmem_ctx_int_put_signal_nbi_wave(
    rocshmem_ctx_t ctx,
    int *dest, const int *source, size_t nelems,
    uint64_t *sig_addr, uint64_t signal, int sig_op, int pe);
__device__ ATTR_NO_INLINE void rocshmem_int_put_signal_nbi_wave(
    int *dest, const int *source, size_t nelems,
    uint64_t *sig_addr, uint64_t signal, int sig_op, int pe);

__device__ ATTR_NO_INLINE void rocshmem_ctx_long_put_signal_nbi_wave(
    rocshmem_ctx_t ctx,
    long *dest, const long *source, size_t nelems,
    uint64_t *sig_addr, uint64_t signal, int sig_op, int pe);
__device__ ATTR_NO_INLINE void rocshmem_long_put_signal_nbi_wave(
    long *dest, const long *source, size_t nelems,
    uint64_t *sig_addr, uint64_t signal, int sig_op, int pe);

__device__ ATTR_NO_INLINE void rocshmem_ctx_longlong_put_signal_nbi_wave(
    rocshmem_ctx_t ctx,
    long long *dest, const long long *source, size_t nelems,
    uint64_t *sig_addr, uint64_t signal, int sig_op, int pe);
__device__ ATTR_NO_INLINE void rocshmem_longlong_put_signal_nbi_wave(
    long long *dest, const long long *source, size_t nelems,
    uint64_t *sig_addr, uint64_t signal, int sig_op, int pe);

__device__ ATTR_NO_INLINE void rocshmem_ctx_uchar_put_signal_nbi_wave(
    rocshmem_ctx_t ctx,
    unsigned char *dest, const unsigned char *source, size_t nelems,
    uint64_t *sig_addr, uint64_t signal, int sig_op, int pe);
__device__ ATTR_NO_INLINE void rocshmem_uchar_put_signal_nbi_wave(
    unsigned char *dest, const unsigned char *source, size_t nelems,
    uint64_t *sig_addr, uint64_t signal, int sig_op, int pe);

__device__ ATTR_NO_INLINE void rocshmem_ctx_ushort_put_signal_nbi_wave(
    rocshmem_ctx_t ctx,
    unsigned short *dest, const unsigned short *source, size_t nelems,
    uint64_t *sig_addr, uint64_t signal, int sig_op, int pe);
__device__ ATTR_NO_INLINE void rocshmem_ushort_put_signal_nbi_wave(
    unsigned short *dest, const unsigned short *source, size_t nelems,
    uint64_t *sig_addr, uint64_t signal, int sig_op, int pe);

__device__ ATTR_NO_INLINE void rocshmem_ctx_uint_put_signal_nbi_wave(
    rocshmem_ctx_t ctx,
    unsigned int *dest, const unsigned int *source, size_t nelems,
    uint64_t *sig_addr, uint64_t signal, int sig_op, int pe);
__device__ ATTR_NO_INLINE void rocshmem_uint_put_signal_nbi_wave(
    unsigned int *dest, const unsigned int *source, size_t nelems,
    uint64_t *sig_addr, uint64_t signal, int sig_op, int pe);

__device__ ATTR_NO_INLINE void rocshmem_ctx_ulong_put_signal_nbi_wave(
    rocshmem_ctx_t ctx,
    unsigned long *dest, const unsigned long *source, size_t nelems,
    uint64_t *sig_addr, uint64_t signal, int sig_op, int pe);
__device__ ATTR_NO_INLINE void rocshmem_ulong_put_signal_nbi_wave(
    unsigned long *dest, const unsigned long *source, size_t nelems,
    uint64_t *sig_addr, uint64_t signal, int sig_op, int pe);

__device__ ATTR_NO_INLINE void rocshmem_ctx_ulonglong_put_signal_nbi_wave(
    rocshmem_ctx_t ctx,
    unsigned long long *dest, const unsigned long long *source, size_t nelems,
    uint64_t *sig_addr, uint64_t signal, int sig_op, int pe);
__device__ ATTR_NO_INLINE void rocshmem_ulonglong_put_signal_nbi_wave(
    unsigned long long *dest, const unsigned long long *source, size_t nelems,
    uint64_t *sig_addr, uint64_t signal, int sig_op, int pe);

// ---- signal fetch -----------------------------------------------------------
// Each takes the address of a 64-bit signal word and returns a uint64_t.
// NOTE(review): presumably the current value of that word - confirm.

__device__ ATTR_NO_INLINE uint64_t rocshmem_signal_fetch(
    const uint64_t *sig_addr);
__device__ ATTR_NO_INLINE uint64_t rocshmem_signal_fetch_wg(
    const uint64_t *sig_addr);
__device__ ATTR_NO_INLINE uint64_t rocshmem_signal_fetch_wave(
    const uint64_t *sig_addr);

}  // namespace rocshmem
#endif // LIBRARY_INCLUDE_ROCSHMEM_SIG_OP_HPP
rocshmem_dir/include/rocshmem/rocshmem_common.hpp
deleted
100644 → 0
View file @
b33659dd
/******************************************************************************
* Copyright (c) Advanced Micro Devices, Inc. All rights reserved.
*
* SPDX-License-Identifier: MIT
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to
* deal in the Software without restriction, including without limitation the
* rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
* sell copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
* IN THE SOFTWARE.
*****************************************************************************/
#ifndef LIBRARY_INCLUDE_ROCSHMEM_COMMON_HPP
#define LIBRARY_INCLUDE_ROCSHMEM_COMMON_HPP
namespace
rocshmem
{
// ATTR_NO_INLINE expands to __attribute__((noinline)) when USE_FUNC_CALL is
// defined, and to nothing otherwise. It is a preprocessor macro, so the
// surrounding namespace does not scope it.
#ifdef USE_FUNC_CALL
#define ATTR_NO_INLINE __attribute__((noinline))
#else
#define ATTR_NO_INLINE
#endif
/**
 * @brief Two-valued status code: success is 0, error is 1.
 */
enum ROCSHMEM_STATUS {
  ROCSHMEM_SUCCESS = 0,
  ROCSHMEM_ERROR = 1,
};
/**
 * @brief Operation selectors; enumerators take consecutive values from 0.
 */
enum ROCSHMEM_OP {
  ROCSHMEM_SUM,     // 0
  ROCSHMEM_MAX,     // 1
  ROCSHMEM_MIN,     // 2
  ROCSHMEM_PROD,    // 3
  ROCSHMEM_AND,     // 4
  ROCSHMEM_OR,      // 5
  ROCSHMEM_XOR,     // 6
  ROCSHMEM_REPLACE  // 7
};
/**
 * @brief Signal operation selectors: SET is 0, ADD is 1.
 */
enum ROCSHMEM_SIGNAL_OPS {
  ROCSHMEM_SIGNAL_SET,  // 0
  ROCSHMEM_SIGNAL_ADD,  // 1
};
/**
 * @brief Comparison selectors used by rocshmem_wait() operations.
 *
 * Enumerators take consecutive values from 0.
 */
enum rocshmem_cmps {
  ROCSHMEM_CMP_EQ,  // 0
  ROCSHMEM_CMP_NE,  // 1
  ROCSHMEM_CMP_GT,  // 2
  ROCSHMEM_CMP_GE,  // 3
  ROCSHMEM_CMP_LT,  // 4
  ROCSHMEM_CMP_LE,  // 5
};
/**
 * @brief Threading-level selectors; enumerators take consecutive values
 *        from 0.
 */
enum rocshmem_thread_ops {
  ROCSHMEM_THREAD_SINGLE,       // 0
  ROCSHMEM_THREAD_FUNNELED,     // 1
  ROCSHMEM_THREAD_WG_FUNNELED,  // 2
  ROCSHMEM_THREAD_SERIALIZED,   // 3
  ROCSHMEM_THREAD_MULTIPLE      // 4
};
/**
 * @brief Bitwise flags used to mask team configuration parameters.
 *
 * ROCSHMEM_TEAM_DEFAULT_CONFIGS is 0 and ROCSHMEM_TEAM_NUM_CONTEXTS is 1.
 */
enum rocshmem_team_configs {
  ROCSHMEM_TEAM_DEFAULT_CONFIGS,  // 0
  ROCSHMEM_TEAM_NUM_CONTEXTS      // 1
};
/**
 * @brief Team configuration record holding a single integer field.
 */
typedef struct {
  int num_contexts;  // presumably the number of contexts for the team
} rocshmem_team_config_t;
// Size constants for work/sync arrays. Derived sizes are written in terms of
// the barrier size so they follow it automatically.
constexpr size_t ROCSHMEM_REDUCE_MIN_WRKDATA_SIZE = 1024;
constexpr size_t ROCSHMEM_BARRIER_SYNC_SIZE = 256;
constexpr size_t ROCSHMEM_REDUCE_SYNC_SIZE = 256;

// Broadcast internally calls the sync function, which matches the barrier
// implementation, so it reuses the barrier size.
constexpr size_t ROCSHMEM_BCAST_SYNC_SIZE = ROCSHMEM_BARRIER_SYNC_SIZE;

// All-to-all uses one slot more than the barrier; fcollect reuses that size.
constexpr size_t ROCSHMEM_ALLTOALL_SYNC_SIZE = ROCSHMEM_BARRIER_SYNC_SIZE + 1;
constexpr size_t ROCSHMEM_FCOLLECT_SYNC_SIZE = ROCSHMEM_ALLTOALL_SYNC_SIZE;

constexpr size_t ROCSHMEM_SYNC_VALUE = 0;
// Context option values. ZERO is 0; the remaining values are distinct powers
// of two (1, 2, 4, 8).
const int ROCSHMEM_CTX_ZERO = 0;
const int ROCSHMEM_CTX_NOSTORE = 1;
const int ROCSHMEM_CTX_SERIALIZED = 2;
const int ROCSHMEM_CTX_WG_PRIVATE = 4;
const int ROCSHMEM_CTX_SHARED = 8;
/**
 * @brief GPU-side OpenSHMEM context, created from each work-group's
 *        rocshmem_wg_handle_t.
 *
 * The handle is a pair of opaque pointers. Two handles compare equal exactly
 * when both pointers match.
 */
typedef struct rocshmem_ctx {
  void *ctx_opaque;
  void *team_opaque;

  /// True only when both opaque pointers are identical in the two handles.
  __host__ __device__ bool operator==(const struct rocshmem_ctx &other) const {
    if (ctx_opaque != other.ctx_opaque) {
      return false;
    }
    return team_opaque == other.team_opaque;
  }

  /// Exact negation of operator==.
  __host__ __device__ bool operator!=(const struct rocshmem_ctx &other) const {
    return !operator==(other);
  }
} rocshmem_ctx_t;
/**
 * The default SHMEM context.
 *
 * Declared as a device-side variable with C linkage and default symbol
 * visibility; its definition is not in this header.
 */
extern "C" __device__ rocshmem_ctx_t
    __attribute__((visibility("default"))) ROCSHMEM_CTX_DEFAULT;
/**
 * The value of an invalid communication context.
 *
 * Use it to initialise or reset context handles that do not refer to a valid
 * context. Applications that manage handles this way can test validity with
 * an equality comparison against this value.
 */
extern __constant__ rocshmem_ctx_t ROCSHMEM_CTX_INVALID;
/**
 * Internal hook for setting the default context.
 *
 * @param ctx Pointer to the context handle to use.
 */
void set_internal_ctx(rocshmem_ctx_t *ctx);
/// Team handle: a pointer to a 64-bit unsigned value.
typedef uint64_t *rocshmem_team_t;

/// Handle of the world team; defined outside this header.
extern rocshmem_team_t ROCSHMEM_TEAM_WORLD;

/// Handle value that refers to no team (a null pointer).
const rocshmem_team_t ROCSHMEM_TEAM_INVALID = nullptr;
/**
 * @brief Unique ID data type.
 *
 * A fixed-size array of ROCSHMEM_UNIQUE_ID_BYTES bytes that uniquely
 * identifies a process.
 */
#define ROCSHMEM_UNIQUE_ID_BYTES 128
using rocshmem_uniqueid_t = std::array<uint8_t, ROCSHMEM_UNIQUE_ID_BYTES>;
/**
 * @brief Attribute record used for attribute-based initialization.
 */
struct rocshmem_init_attr_t {
  int32_t rank;             // presumably this process's rank -- TODO confirm
  int32_t nranks;           // presumably the total number of ranks
  rocshmem_uniqueid_t uid;  // unique-ID byte array
  void *mpi_comm;           // untyped pointer; presumably an MPI communicator
};
typedef struct rocshmem_init_attr_t rocshmem_init_attr_t;
// Initialization-mode selectors: 0 for an MPI communicator, 1 for a unique ID.
constexpr unsigned int ROCSHMEM_INIT_WITH_MPI_COMM = 0;
constexpr unsigned int ROCSHMEM_INIT_WITH_UNIQUEID = 1;
}
// namespace rocshmem
#endif // LIBRARY_INCLUDE_ROCSHMEM_COMMON_HPP
rocshmem_dir/include/rocshmem/rocshmem_config.h
deleted
100644 → 0
View file @
b33659dd
/******************************************************************************
* Copyright (c) Advanced Micro Devices, Inc. All rights reserved.
*
* SPDX-License-Identifier: MIT
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to
* deal in the Software without restriction, including without limitation the
* rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
* sell copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
* IN THE SOFTWARE.
*****************************************************************************/
/*
 * Build configuration switches.
 *
 * This snapshot defines USE_GDA, USE_HEAP_DEVICE_FINEGRAIN,
 * USE_ALLOC_DLMALLOC, GDA_MLX5 and HAVE_EXTERNAL_MPI. Every other switch
 * appears only as a commented-out "#undef" line, so this header leaves it
 * undefined.
 *
 * NOTE(review): the commented-out "#undef" placeholders look like the output
 * of a configure-time template. If so, change the template or build options
 * rather than editing this file by hand.
 */
/* #undef DEBUG */
/* #undef PROFILE */
/* #undef USE_RO */
/* #undef USE_IPC */
#define USE_GDA
/* #undef USE_THREADS */
/* #undef USE_SHARED_CTX */
/* #undef USE_WF_COAL */
#define USE_HEAP_DEVICE_FINEGRAIN
/* #undef USE_HEAP_DEVICE_UNCACHED */
/* #undef USE_HEAP_DEVICE_COARSEGRAIN */
/* #undef USE_HEAP_MANAGED */
/* #undef USE_HEAP_HOST_HIP */
/* #undef USE_HEAP_HOST */
#define USE_ALLOC_DLMALLOC
/* #undef USE_ALLOC_POW2BINS */
/* #undef USE_FUNC_CALL */
/* #undef USE_SINGLE_NODE */
/* #undef USE_HDP_FLUSH */
/* #undef USE_HDP_FLUSH_HOST_SIDE */
/* #undef GDA_IONIC */
/* #undef GDA_BNXT */
#define GDA_MLX5
#define HAVE_EXTERNAL_MPI
rocshmem_dir/include/rocshmem/rocshmem_debug.hpp
deleted
100644 → 0
View file @
b33659dd
/******************************************************************************
* Copyright (c) Advanced Micro Devices, Inc. All rights reserved.
*
* SPDX-License-Identifier: MIT
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to
* deal in the Software without restriction, including without limitation the
* rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
* sell copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
* IN THE SOFTWARE.
*****************************************************************************/
#ifndef LIBRARY_INCLUDE_DEBUG_HPP
#define LIBRARY_INCLUDE_DEBUG_HPP
namespace rocshmem {

/**
 * @brief Debug helper taking a destination PE, a source work-group and a
 *        CQE index.
 *
 * NOTE(review): only the prototype is visible; presumably it prints a
 * completion-queue entry -- confirm against the implementation.
 */
void debug_print_cq(int dest_pe, int src_wg, int cqe_index);

/**
 * @brief Debug helper taking a destination PE, a source work-group and a
 *        WQE index.
 *
 * NOTE(review): only the prototype is visible; presumably it prints a
 * send-queue entry -- confirm against the implementation.
 */
void debug_print_sq(int dest_pe, int src_wg, int index_wqe);

}  // namespace rocshmem
#endif // LIBRARY_INCLUDE_DEBUG_HPP
rocshmem_dir/include/rocshmem/rocshmem_mpi.hpp
deleted
100644 → 0
View file @
b33659dd
/******************************************************************************
* Copyright (c) Advanced Micro Devices, Inc. All rights reserved.
*
* SPDX-License-Identifier: MIT
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to
* deal in the Software without restriction, including without limitation the
* rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
* sell copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
* IN THE SOFTWARE.
*****************************************************************************/
#ifndef LIBRARY_INCLUDE_ROCSHMEM_MPI_HPP
#define LIBRARY_INCLUDE_ROCSHMEM_MPI_HPP
#if defined(HAVE_EXTERNAL_MPI)
#include <mpi.h>
#endif
#if defined(c_plusplus) || defined(__cplusplus)
extern
"C"
{
#endif
#if !defined(MPI_VERSION)
// Open MPI based values for the constants/handles etc.
// Even though we did not include an external MPI header file
// The includer may have (e.g., a unit test).
// Stand-in MPI handle types for builds without an external MPI header.
// Every handle is an untyped pointer.
typedef void *MPI_Comm;
typedef void *MPI_Win;
typedef void *MPI_Group;
typedef void *MPI_Op;
typedef void *MPI_Datatype;
typedef void *MPI_Request;
typedef void *MPI_Info;
// Stand-in status record, exposed to users as MPI_Status.
// NOTE(review): the field order presumably mirrors Open MPI's public status
// struct -- do not reorder without confirming.
struct ompi_status_public_t {
  int MPI_SOURCE;
  int MPI_TAG;
  int MPI_ERROR;
  int _cancelled;
  size_t _ucount;
};
typedef struct ompi_status_public_t MPI_Status;
// Stand-in scalar MPI constants for builds without an external MPI header.
// Replacement lists that are expressions (a negative literal, a cast) are
// fully parenthesised so they expand as one operand in any context, e.g.
// `sizeof MPI_IN_PLACE`.
#define MPI_Aint uint64_t
#define MPI_UNDEFINED (-32766)
#define MPI_THREAD_MULTIPLE 3
#define MPI_SUCCESS 0
#define MPI_IN_PLACE ((void*)1)
#define MPI_MODE_NOCHECK 1
#define MPI_COMM_TYPE_SHARED 0
// Byte distance addr1 - addr2, converted to MPI_Aint.
#define MPI_Aint_diff(addr1, addr2) ((MPI_Aint) ((char *) (addr1) - (char *) (addr2)))
// Table of untyped pointers, one per predefined MPI object that this header
// exposes through the MPI_* handle macros. Member order is part of the
// layout shared with the definition of ompi_symbols_, so it is kept as is.
struct ompi_internal_symbols_t {
  // Communicators, requests, info and datatype null handles.
  void *ompi_mpi_comm_world;
  void *ompi_mpi_comm_null;
  void *ompi_request_null;
  void *ompi_mpi_info_null;
  void *ompi_mpi_datatype_null;

  // Reduction / accumulate operations.
  void *ompi_mpi_op_max;
  void *ompi_mpi_op_min;
  void *ompi_mpi_op_sum;
  void *ompi_mpi_op_prod;
  void *ompi_mpi_op_band;
  void *ompi_mpi_op_bor;
  void *ompi_mpi_op_bxor;
  void *ompi_mpi_op_replace;
  void *ompi_mpi_op_no_op;

  // Predefined datatypes.
  void *ompi_mpi_char;
  void *ompi_mpi_unsigned_char;
  void *ompi_mpi_signed_char;
  void *ompi_mpi_short;
  void *ompi_mpi_unsigned_short;
  void *ompi_mpi_int;
  void *ompi_mpi_unsigned;
  void *ompi_mpi_long;
  void *ompi_mpi_unsigned_long;
  void *ompi_mpi_long_long_int;
  void *ompi_mpi_unsigned_long_long;
  void *ompi_mpi_float;
  void *ompi_mpi_double;
  void *ompi_mpi_long_double;
};

// The single table instance; defined outside this header.
extern struct ompi_internal_symbols_t ompi_symbols_;
// Stand-in MPI handle macros for builds without an external MPI header.
// Each MPI_* name expands to a static_cast of the matching pointer stored in
// ompi_symbols_ to the corresponding handle type.
//
// NOTE(review): MPI_WIN_NULL names ompi_symbols_.ompi_mpi_win_null, but
// struct ompi_internal_symbols_t as declared in this header has no such
// member, so any use of MPI_WIN_NULL in this fallback branch will fail to
// compile. Adding the member changes the struct layout and must be done
// together with the definition of ompi_symbols_ -- confirm before fixing.
#define OMPI_PREDEFINED_GLOBAL(type, global) (static_cast<type> (global))
#define MPI_COMM_WORLD OMPI_PREDEFINED_GLOBAL(MPI_Comm, ompi_symbols_.ompi_mpi_comm_world)
#define MPI_COMM_NULL OMPI_PREDEFINED_GLOBAL(MPI_Comm, ompi_symbols_.ompi_mpi_comm_null)
#define MPI_REQUEST_NULL OMPI_PREDEFINED_GLOBAL(MPI_Request, ompi_symbols_.ompi_request_null)
#define MPI_WIN_NULL OMPI_PREDEFINED_GLOBAL(MPI_Win, ompi_symbols_.ompi_mpi_win_null)
#define MPI_INFO_NULL OMPI_PREDEFINED_GLOBAL(MPI_Info, ompi_symbols_.ompi_mpi_info_null)
#define MPI_MAX OMPI_PREDEFINED_GLOBAL(MPI_Op, ompi_symbols_.ompi_mpi_op_max)
#define MPI_MIN OMPI_PREDEFINED_GLOBAL(MPI_Op, ompi_symbols_.ompi_mpi_op_min)
#define MPI_SUM OMPI_PREDEFINED_GLOBAL(MPI_Op, ompi_symbols_.ompi_mpi_op_sum)
#define MPI_PROD OMPI_PREDEFINED_GLOBAL(MPI_Op, ompi_symbols_.ompi_mpi_op_prod)
#define MPI_BAND OMPI_PREDEFINED_GLOBAL(MPI_Op, ompi_symbols_.ompi_mpi_op_band)
#define MPI_BOR OMPI_PREDEFINED_GLOBAL(MPI_Op, ompi_symbols_.ompi_mpi_op_bor)
#define MPI_BXOR OMPI_PREDEFINED_GLOBAL(MPI_Op, ompi_symbols_.ompi_mpi_op_bxor)
#define MPI_REPLACE OMPI_PREDEFINED_GLOBAL(MPI_Op, ompi_symbols_.ompi_mpi_op_replace)
#define MPI_NO_OP OMPI_PREDEFINED_GLOBAL(MPI_Op, ompi_symbols_.ompi_mpi_op_no_op)
#define MPI_DATATYPE_NULL OMPI_PREDEFINED_GLOBAL(MPI_Datatype, ompi_symbols_.ompi_mpi_datatype_null)
#define MPI_CHAR OMPI_PREDEFINED_GLOBAL(MPI_Datatype, ompi_symbols_.ompi_mpi_char)
#define MPI_UNSIGNED_CHAR OMPI_PREDEFINED_GLOBAL(MPI_Datatype, ompi_symbols_.ompi_mpi_unsigned_char)
#define MPI_SIGNED_CHAR OMPI_PREDEFINED_GLOBAL(MPI_Datatype, ompi_symbols_.ompi_mpi_signed_char)
#define MPI_SHORT OMPI_PREDEFINED_GLOBAL(MPI_Datatype, ompi_symbols_.ompi_mpi_short)
#define MPI_UNSIGNED_SHORT OMPI_PREDEFINED_GLOBAL(MPI_Datatype, ompi_symbols_.ompi_mpi_unsigned_short)
#define MPI_INT OMPI_PREDEFINED_GLOBAL(MPI_Datatype, ompi_symbols_.ompi_mpi_int)
#define MPI_UNSIGNED OMPI_PREDEFINED_GLOBAL(MPI_Datatype, ompi_symbols_.ompi_mpi_unsigned)
#define MPI_LONG OMPI_PREDEFINED_GLOBAL(MPI_Datatype, ompi_symbols_.ompi_mpi_long)
#define MPI_UNSIGNED_LONG OMPI_PREDEFINED_GLOBAL(MPI_Datatype, ompi_symbols_.ompi_mpi_unsigned_long)
#define MPI_LONG_LONG OMPI_PREDEFINED_GLOBAL(MPI_Datatype, ompi_symbols_.ompi_mpi_long_long_int)
#define MPI_UNSIGNED_LONG_LONG OMPI_PREDEFINED_GLOBAL(MPI_Datatype, ompi_symbols_.ompi_mpi_unsigned_long_long)
#define MPI_FLOAT OMPI_PREDEFINED_GLOBAL(MPI_Datatype, ompi_symbols_.ompi_mpi_float)
#define MPI_DOUBLE OMPI_PREDEFINED_GLOBAL(MPI_Datatype, ompi_symbols_.ompi_mpi_double)
#define MPI_LONG_DOUBLE OMPI_PREDEFINED_GLOBAL(MPI_Datatype, ompi_symbols_.ompi_mpi_long_double)
#endif //!defined(MPI_VERSION)
#if defined(c_plusplus) || defined(__cplusplus)
}
#endif
#endif //LIBRARY_INCLUDE_ROCSHMEM_MPI_HPP
rocshmem_dir/lib/librocshmem.a
deleted
100644 → 0
View file @
b33659dd
File deleted
rocshmem_dir/share/doc/rocshmem/LICENSE.md
deleted
100644 → 0
View file @
b33659dd
MIT License
Copyright (c) Advanced Micro Devices, Inc. All rights reserved.
SPDX-License-Identifier: MIT
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
rocshmem
@
f5a87af2
Subproject commit f5a87af2671b6daaea16ae766ca97db867ef996c
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment