Commit aa3187ef authored by Seth Howell's avatar Seth Howell
Browse files

third-party: Add CPU-assisted IBGDA support



This allows users to use NVSHMEM without setting the driver regkey.
Signed-off-by: default avatarSeth Howell <sethh@nvidia.com>
parent 441833d3
...@@ -146,20 +146,26 @@ template <bool kAlwaysDoPostSend> ...@@ -146,20 +146,26 @@ template <bool kAlwaysDoPostSend>
__device__ static __forceinline__ __device__ static __forceinline__
void ibgda_submit_requests(nvshmemi_ibgda_device_qp_t *qp, uint64_t base_wqe_idx, void ibgda_submit_requests(nvshmemi_ibgda_device_qp_t *qp, uint64_t base_wqe_idx,
uint32_t num_wqes, int message_idx = 0) { uint32_t num_wqes, int message_idx = 0) {
auto state = ibgda_get_state();
nvshmemi_ibgda_device_qp_management_t *mvars = &qp->mvars; nvshmemi_ibgda_device_qp_management_t *mvars = &qp->mvars;
uint64_t new_wqe_idx = base_wqe_idx + num_wqes; uint64_t new_wqe_idx = base_wqe_idx + num_wqes;
// WQE writes must be finished first // WQE writes must be finished first
__threadfence(); __threadfence();
unsigned long long int *ready_idx =
(unsigned long long int *)(state->use_async_postsend ? qp->tx_wq.prod_idx
: &mvars->tx_wq.ready_head);
// Wait for prior WQE slots to be filled first // Wait for prior WQE slots to be filled first
auto *ready_idx = reinterpret_cast<unsigned long long int*>(&mvars->tx_wq.ready_head);
while (atomicCAS(ready_idx, base_wqe_idx, new_wqe_idx) != base_wqe_idx); while (atomicCAS(ready_idx, base_wqe_idx, new_wqe_idx) != base_wqe_idx);
// Always post, not in batch // Always post, not in batch
constexpr int kNumRequestInBatch = 4; if (!state->use_async_postsend) {
if (kAlwaysDoPostSend or (message_idx + 1) % kNumRequestInBatch == 0) constexpr int kNumRequestInBatch = 4;
ibgda_post_send(qp, new_wqe_idx); if (kAlwaysDoPostSend or (message_idx + 1) % kNumRequestInBatch == 0)
ibgda_post_send(qp, new_wqe_idx);
}
} }
__device__ static __forceinline__ void __device__ static __forceinline__ void
...@@ -487,7 +493,8 @@ ibgda_poll_cq(nvshmemi_ibgda_device_cq_t *cq, uint64_t idx) { ...@@ -487,7 +493,8 @@ ibgda_poll_cq(nvshmemi_ibgda_device_cq_t *cq, uint64_t idx) {
__device__ static __forceinline__ void __device__ static __forceinline__ void
nvshmemi_ibgda_quiet(int dst_pe, int qp_id) { nvshmemi_ibgda_quiet(int dst_pe, int qp_id) {
auto qp = ibgda_get_rc(dst_pe, qp_id); auto qp = ibgda_get_rc(dst_pe, qp_id);
uint64_t prod_idx = ld_na_relaxed(qp->tx_wq.prod_idx); auto state = ibgda_get_state();
uint64_t prod_idx = state->use_async_postsend ? ld_na_relaxed(qp->tx_wq.prod_idx) : ld_na_relaxed(&qp->mvars.tx_wq.ready_head);
ibgda_poll_cq(qp->tx_wq.cq, prod_idx); ibgda_poll_cq(qp->tx_wq.cq, prod_idx);
} }
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment