nvshmem.patch 21.4 KB
Newer Older
Shangyan Zhou's avatar
Shangyan Zhou committed
1
From 9e6cc27cceb3130784e4ea7b61ea3171156365fd Mon Sep 17 00:00:00 2001
Chenggang Zhao's avatar
Chenggang Zhao committed
2
3
From: Shangyan Zhou <sy.zhou@deepseek.com>
Date: Fri, 20 Dec 2024 10:57:12 +0800
Shangyan Zhou's avatar
Shangyan Zhou committed
4
Subject: [PATCH 1/4] Change QP creating order.
Chenggang Zhao's avatar
Chenggang Zhao committed
5
6
7
8
9
10

---
 src/modules/transport/ibgda/ibgda.cpp | 13 ++++++++-----
 1 file changed, 8 insertions(+), 5 deletions(-)

diff --git a/src/modules/transport/ibgda/ibgda.cpp b/src/modules/transport/ibgda/ibgda.cpp
Shangyan Zhou's avatar
Shangyan Zhou committed
11
index ef325cd..286132e 100644
Chenggang Zhao's avatar
Chenggang Zhao committed
12
13
--- a/src/modules/transport/ibgda/ibgda.cpp
+++ b/src/modules/transport/ibgda/ibgda.cpp
Shangyan Zhou's avatar
Shangyan Zhou committed
14
@@ -2936,17 +2936,20 @@ int nvshmemt_ibgda_connect_endpoints(nvshmem_transport_t t, int *selected_dev_id
Chenggang Zhao's avatar
Chenggang Zhao committed
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
         INFO(ibgda_state->log_level, "Creating %d RC QPs", device->rc.num_eps_per_pe);
         for (int i = 0; i < num_rc_eps; ++i) {
             // Do not create loopback to self
-            if (i / device->rc.num_eps_per_pe == mype) {
+            int dst_pe = (i + 1 + mype) % n_pes;
+            int offset = i / n_pes;
+            int mapped_i = dst_pe * device->rc.num_eps_per_pe + offset;
+            if (dst_pe == mype) {
                 continue;
             }
-            status = ibgda_create_qp(&device->rc.eps[i], device, portid, i,
+            status = ibgda_create_qp(&device->rc.eps[mapped_i], device, portid, mapped_i,
                                      NVSHMEMI_IBGDA_DEVICE_QP_TYPE_RC);
             NVSHMEMI_NZ_ERROR_JMP(status, NVSHMEMX_ERROR_INTERNAL, out,
-                                  "ibgda_create_dci failed on RC #%d.", i);
+                                  "ibgda_create_dci failed on RC #%d.", mapped_i);

-            status = ibgda_get_rc_handle(&local_rc_handles[i], device->rc.eps[i], device);
+            status = ibgda_get_rc_handle(&local_rc_handles[mapped_i], device->rc.eps[mapped_i], device);
             NVSHMEMI_NZ_ERROR_JMP(status, NVSHMEMX_ERROR_INTERNAL, out,
-                                  "ibgda_get_rc_handle failed on RC #%d.", i);
+                                  "ibgda_get_rc_handle failed on RC #%d.", mapped_i);
         }

         if (num_rc_eps) {
--
2.25.1


Shangyan Zhou's avatar
Shangyan Zhou committed
44
From b11d41e4f3727f2f6ccc00a8c852e59e2ee33c8a Mon Sep 17 00:00:00 2001
Chenggang Zhao's avatar
Chenggang Zhao committed
45
46
From: Shangyan Zhou <sy.zhou@deepseek.com>
Date: Fri, 10 Jan 2025 11:53:38 +0800
Shangyan Zhou's avatar
Shangyan Zhou committed
47
Subject: [PATCH 2/4] Add recv queue and recv cq for rc qps.
Chenggang Zhao's avatar
Chenggang Zhao committed
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67

Let the ibgda rc qps use regular recv queue.

Add recv queue to ibgda dev qp.

IBGDA create recv cq

Setup recv cq.

fix recv queue.

Remove some useless idx.

Longer recv queue.
---
 .../nvshmem_common_ibgda.h                    | 19 +++++-
 src/modules/transport/ibgda/ibgda.cpp         | 65 ++++++++++++++++---
 2 files changed, 71 insertions(+), 13 deletions(-)

diff --git a/src/include/device_host_transport/nvshmem_common_ibgda.h b/src/include/device_host_transport/nvshmem_common_ibgda.h
Shangyan Zhou's avatar
Shangyan Zhou committed
68
index 8b8a263..1be3dec 100644
Chenggang Zhao's avatar
Chenggang Zhao committed
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
--- a/src/include/device_host_transport/nvshmem_common_ibgda.h
+++ b/src/include/device_host_transport/nvshmem_common_ibgda.h
@@ -168,14 +168,17 @@ typedef struct {
         uint64_t get_head;    // last wqe idx + 1 with a "fetch" operation (g, get, amo_fetch)
         uint64_t get_tail;    // last wqe idx + 1 polled with cst; get_tail > get_head is possible
     } tx_wq;
+    struct {
+        uint64_t resv_head;   // last reserved wqe idx + 1
+    } rx_wq;
     struct {
         uint64_t head;
         uint64_t tail;
     } ibuf;
     char padding[NVSHMEMI_IBGDA_QP_MANAGEMENT_PADDING];
 } __attribute__((__aligned__(8))) nvshmemi_ibgda_device_qp_management_v1;
-static_assert(sizeof(nvshmemi_ibgda_device_qp_management_v1) == 96,
-              "ibgda_device_qp_management_v1 must be 96 bytes.");
+static_assert(sizeof(nvshmemi_ibgda_device_qp_management_v1) == 104,
+              "ibgda_device_qp_management_v1 must be 104 bytes.");

 typedef nvshmemi_ibgda_device_qp_management_v1 nvshmemi_ibgda_device_qp_management_t;

@@ -199,9 +202,19 @@ typedef struct nvshmemi_ibgda_device_qp {
         // May point to mvars.prod_idx or internal prod_idx
         uint64_t *prod_idx;
     } tx_wq;
+    struct {
+        uint16_t nwqes;
+        uint64_t tail;
+        void *wqe;
+        __be32 *dbrec;
+        void *bf;
+        nvshmemi_ibgda_device_cq_t *cq;
+        // May point to mvars.prod_idx or internal prod_idx
+        uint64_t *prod_idx;
+    } rx_wq;
     nvshmemi_ibgda_device_qp_management_v1 mvars;  // management variables
 } nvshmemi_ibgda_device_qp_v1;
-static_assert(sizeof(nvshmemi_ibgda_device_qp_v1) == 184, "ibgda_device_qp_v1 must be 184 bytes.");
+static_assert(sizeof(nvshmemi_ibgda_device_qp_v1) == 248, "ibgda_device_qp_v1 must be 248 bytes.");

 typedef nvshmemi_ibgda_device_qp_v1 nvshmemi_ibgda_device_qp_t;

diff --git a/src/modules/transport/ibgda/ibgda.cpp b/src/modules/transport/ibgda/ibgda.cpp
Shangyan Zhou's avatar
Shangyan Zhou committed
113
index 286132e..e0b2d5c 100644
Chenggang Zhao's avatar
Chenggang Zhao committed
114
115
--- a/src/modules/transport/ibgda/ibgda.cpp
+++ b/src/modules/transport/ibgda/ibgda.cpp
Shangyan Zhou's avatar
Shangyan Zhou committed
116
@@ -198,6 +198,7 @@ struct ibgda_ep {
Chenggang Zhao's avatar
Chenggang Zhao committed
117
118
119
120
121
122
123
     off_t dbr_offset;

     struct ibgda_cq *send_cq;
+    struct ibgda_cq *recv_cq;
     struct ibv_ah *ah;

     uint32_t user_index;
Shangyan Zhou's avatar
Shangyan Zhou committed
124
@@ -1538,7 +1539,8 @@ static int ibgda_create_cq_shared_objects(nvshmemt_ibgda_state_t *ibgda_state,
Chenggang Zhao's avatar
Chenggang Zhao committed
125
126
127
128
129
130
131
132
133

     struct ibv_context *context = device->context;

-    unsigned int num_cqs = device->dci.num_eps + device->rc.num_eps_per_pe * n_pes;
+    // Each RC qp has one send CQ and one recv CQ.
+    unsigned int num_cqs = device->dci.num_eps + device->rc.num_eps_per_pe * n_pes * 2;

     assert(ibgda_qp_depth > 0);
     size_t num_cqe = IBGDA_ROUND_UP_POW2_OR_0(ibgda_qp_depth);
Shangyan Zhou's avatar
Shangyan Zhou committed
134
@@ -1701,7 +1703,8 @@ static int ibgda_create_qp_shared_objects(nvshmemt_ibgda_state_t *ibgda_state,
Chenggang Zhao's avatar
Chenggang Zhao committed
135
136
137
138
139
140
141
142
143
     }

     // Allocate and map WQ buffer for all QPs.
-    wq_buf_size_per_qp = num_wqebb * MLX5_SEND_WQE_BB;  // num_wqebb is always a power of 2
+    // Todo: reduce the size of wq buffer.
+    wq_buf_size_per_qp = num_wqebb * MLX5_SEND_WQE_BB * 2;  // num_wqebb is always a power of 2
     wq_buf_size = wq_buf_size_per_qp * num_eps;
     status = ibgda_nic_control_alloc(&wq_mobject, wq_buf_size, IBGDA_GPAGE_SIZE);
     NVSHMEMI_NZ_ERROR_JMP(status, NVSHMEMX_ERROR_INTERNAL, out, "cannot allocate wq buf.\n");
Shangyan Zhou's avatar
Shangyan Zhou committed
144
@@ -1882,8 +1885,11 @@ static int ibgda_create_qp(struct ibgda_ep **ep_ptr, struct ibgda_device *device
Chenggang Zhao's avatar
Chenggang Zhao committed
145
146
147
148
149
150
151
152
153
154
155
     int cqe_version = 0;

     struct ibgda_cq *send_cq = NULL;
+    struct ibgda_cq *recv_cq = NULL;

     size_t num_wqebb = IBGDA_ROUND_UP_POW2_OR_0(ibgda_qp_depth);
+    size_t num_recv_wqe = ibgda_qp_depth;
+    size_t recv_wqe_size = 16;

     int status = 0;

Shangyan Zhou's avatar
Shangyan Zhou committed
156
@@ -1911,6 +1917,11 @@ static int ibgda_create_qp(struct ibgda_ep **ep_ptr, struct ibgda_device *device
Chenggang Zhao's avatar
Chenggang Zhao committed
157
158
159
160
161
162
163
164
165
166
167
     status = ibgda_create_cq(&send_cq, device);
     NVSHMEMI_NZ_ERROR_JMP(status, NVSHMEMX_ERROR_INTERNAL, out, "ibgda_create_cq failed.\n");

+    if (qp_type == NVSHMEMI_IBGDA_DEVICE_QP_TYPE_RC) {
+        status = ibgda_create_cq(&recv_cq, device);
+        NVSHMEMI_NZ_ERROR_JMP(status, NVSHMEMX_ERROR_INTERNAL, out, "ibgda_create_cq failed.\n");
+    }
+
     ep = (struct ibgda_ep *)calloc(1, sizeof(struct ibgda_ep));
     NVSHMEMI_NULL_ERROR_JMP(ep, status, NVSHMEMX_ERROR_OUT_OF_MEMORY, out,
                             "Unable to allocate mem for ep.\n");
Shangyan Zhou's avatar
Shangyan Zhou committed
168
@@ -1939,12 +1950,9 @@ static int ibgda_create_qp(struct ibgda_ep **ep_ptr, struct ibgda_device *device
Chenggang Zhao's avatar
Chenggang Zhao committed
169
170
171
172
173
174
175
176
177
178
179
180
181
     DEVX_SET(qpc, qp_context, pm_state, MLX5_QPC_PM_STATE_MIGRATED);
     DEVX_SET(qpc, qp_context, pd, device->qp_shared_object.pdn);
     DEVX_SET(qpc, qp_context, uar_page, uar_mobject->uar->page_id);  // BF register
-    DEVX_SET(qpc, qp_context, rq_type, IBGDA_SRQ_TYPE_VALUE);        // Shared Receive Queue
-    DEVX_SET(qpc, qp_context, srqn_rmpn_xrqn, device->qp_shared_object.srqn);
     DEVX_SET(qpc, qp_context, cqn_snd, send_cq->cqn);
-    DEVX_SET(qpc, qp_context, cqn_rcv, device->qp_shared_object.rcqn);
+    DEVX_SET(qpc, qp_context, cqn_rcv, qp_type == NVSHMEMI_IBGDA_DEVICE_QP_TYPE_RC ? recv_cq->cqn : device->qp_shared_object.rcqn);
     DEVX_SET(qpc, qp_context, log_sq_size, IBGDA_ILOG2_OR0(num_wqebb));
-    DEVX_SET(qpc, qp_context, log_rq_size, 0);
     DEVX_SET(qpc, qp_context, cs_req, 0);                                     // Disable CS Request
     DEVX_SET(qpc, qp_context, cs_res, 0);                                     // Disable CS Response
     DEVX_SET(qpc, qp_context, dbr_umem_valid, IBGDA_MLX5_UMEM_VALID_ENABLE);  // Enable dbr_umem_id
Shangyan Zhou's avatar
Shangyan Zhou committed
182
@@ -1953,6 +1961,15 @@ static int ibgda_create_qp(struct ibgda_ep **ep_ptr, struct ibgda_device *device
Chenggang Zhao's avatar
Chenggang Zhao committed
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
     DEVX_SET(qpc, qp_context, dbr_umem_id, dbr_umem->umem_id);  // DBR buffer
     DEVX_SET(qpc, qp_context, user_index, qp_idx);
     DEVX_SET(qpc, qp_context, page_offset, 0);
+    if (qp_type == NVSHMEMI_IBGDA_DEVICE_QP_TYPE_RC){
+        DEVX_SET(qpc, qp_context, rq_type, 0);        // Regular recv queue
+        DEVX_SET(qpc, qp_context, log_rq_size, IBGDA_ILOG2(num_recv_wqe)); // 4 wqe
+        DEVX_SET(qpc, qp_context, log_rq_stride, IBGDA_ILOG2(recv_wqe_size) - 4); // max recv wqe size = 16B
+    } else {
+        DEVX_SET(qpc, qp_context, rq_type, IBGDA_SRQ_TYPE_VALUE);        // Shared Receive Queue, DC must use this.
+        DEVX_SET(qpc, qp_context, srqn_rmpn_xrqn, device->qp_shared_object.srqn);
+        DEVX_SET(qpc, qp_context, log_rq_size, 0);
+    }

     ep->devx_qp = mlx5dv_devx_obj_create(context, cmd_in, sizeof(cmd_in), cmd_out, sizeof(cmd_out));
     NVSHMEMI_NULL_ERROR_JMP(ep->devx_qp, status, NVSHMEMX_ERROR_INTERNAL, out,
Shangyan Zhou's avatar
Shangyan Zhou committed
198
@@ -1962,9 +1979,9 @@ static int ibgda_create_qp(struct ibgda_ep **ep_ptr, struct ibgda_device *device
Chenggang Zhao's avatar
Chenggang Zhao committed
199
200
201
202
203
204
205
206
207
208
209
     ep->portid = portid;

     ep->sq_cnt = num_wqebb;
-    ep->sq_buf_offset = 0;
+    ep->sq_buf_offset = num_recv_wqe * recv_wqe_size;

-    ep->rq_cnt = 0;
+    ep->rq_cnt = num_recv_wqe;
     ep->rq_buf_offset = 0;

     ep->wq_mobject = device->qp_shared_object.wq_mobject;
Shangyan Zhou's avatar
Shangyan Zhou committed
210
@@ -1978,6 +1995,7 @@ static int ibgda_create_qp(struct ibgda_ep **ep_ptr, struct ibgda_device *device
Chenggang Zhao's avatar
Chenggang Zhao committed
211
212
213
214
215
216
217
     ep->uar_mobject = uar_mobject;

     ep->send_cq = send_cq;
+    ep->recv_cq = recv_cq;

     ep->qp_type = qp_type;

Shangyan Zhou's avatar
Shangyan Zhou committed
218
@@ -1989,6 +2007,7 @@ out:
Chenggang Zhao's avatar
Chenggang Zhao committed
219
220
221
222
223
224
225
     if (status) {
         if (uar_mobject) ibgda_unmap_and_free_qp_uar(uar_mobject);
         if (send_cq) ibgda_destroy_cq(send_cq);
+        if (recv_cq) ibgda_destroy_cq(recv_cq);
         if (ep) free(ep);
     }

Shangyan Zhou's avatar
Shangyan Zhou committed
226
@@ -2287,6 +2306,10 @@ static int ibgda_destroy_ep(struct ibgda_ep *ep) {
Chenggang Zhao's avatar
Chenggang Zhao committed
227
228
229
230
231
232
233
234
235
236
         ibgda_destroy_cq(ep->send_cq);
     }

+    if (ep->recv_cq) {
+        ibgda_destroy_cq(ep->recv_cq);
+    }
+
     if (ep->ah) {
         ftable.destroy_ah(ep->ah);
     }
Shangyan Zhou's avatar
Shangyan Zhou committed
237
@@ -2318,7 +2341,7 @@ static void ibgda_get_device_qp(nvshmemi_ibgda_device_qp_t *dev_qp, struct ibgda
Chenggang Zhao's avatar
Chenggang Zhao committed
238
239
240
241
242
243
244
245
     dev_qp->qpn = ep->qpn;

     assert(ep->wq_mobject->has_gpu_mapping);
-    dev_qp->tx_wq.wqe = (void *)((uintptr_t)ep->wq_mobject->aligned.gpu_ptr + ep->wq_offset);
+    dev_qp->tx_wq.wqe = (void *)((uintptr_t)ep->wq_mobject->aligned.gpu_ptr + ep->wq_offset + ep->sq_buf_offset);

     if (ibgda_nic_handler == IBGDA_NIC_HANDLER_GPU) {
         assert(ep->dbr_mobject->has_gpu_mapping);
Shangyan Zhou's avatar
Shangyan Zhou committed
246
@@ -2330,6 +2353,12 @@ static void ibgda_get_device_qp(nvshmemi_ibgda_device_qp_t *dev_qp, struct ibgda
Chenggang Zhao's avatar
Chenggang Zhao committed
247
248
249
250
251
252
253
254
255
256
257
258
     }

     dev_qp->tx_wq.nwqes = ep->sq_cnt;
+    if (ep->qp_type == NVSHMEMI_IBGDA_DEVICE_QP_TYPE_RC) {
+        dev_qp->rx_wq.nwqes = ep->rq_cnt;
+        dev_qp->rx_wq.wqe = (void *)((uintptr_t)ep->wq_mobject->aligned.gpu_ptr + ep->wq_offset + ep->rq_buf_offset);
+        dev_qp->rx_wq.dbrec = (__be32 *)((uintptr_t)ep->dbr_mobject->aligned.gpu_ptr + ep->dbr_offset);
+        dev_qp->rx_wq.bf = (void *)ep->uar_mobject->aligned.gpu_ptr;
+    }

     ibuf_dci_start = (uintptr_t)device->qp_shared_object.internal_buf.mem_object->aligned.gpu_ptr;
     ibuf_rc_start = ibuf_dci_start + (size_per_dci * device->dci.num_eps);
Shangyan Zhou's avatar
Shangyan Zhou committed
259
@@ -2379,6 +2408,9 @@ static int ibgda_setup_gpu_state(nvshmem_transport_t t) {
Chenggang Zhao's avatar
Chenggang Zhao committed
260
261
262
263
264
265
266
267
268
     nvshmemi_ibgda_device_cq_t *cq_d = NULL;
     nvshmemi_ibgda_device_cq_t *cq_h = NULL;

+    nvshmemi_ibgda_device_cq_t *recv_cq_d = NULL;
+    nvshmemi_ibgda_device_cq_t *recv_cq_h = NULL;
+
     uint8_t *qp_group_switches_d = NULL;

     const size_t mvars_offset = offsetof(nvshmemi_ibgda_device_qp_t, mvars);
Shangyan Zhou's avatar
Shangyan Zhou committed
269
@@ -2386,6 +2418,7 @@ static int ibgda_setup_gpu_state(nvshmem_transport_t t) {
Chenggang Zhao's avatar
Chenggang Zhao committed
270
271
272
273
274
275
276
     const size_t cons_t_offset = offsetof(nvshmemi_ibgda_device_qp_management_t, tx_wq.cons_idx);
     const size_t wqe_h_offset = offsetof(nvshmemi_ibgda_device_qp_management_t, tx_wq.resv_head);
     const size_t wqe_t_offset = offsetof(nvshmemi_ibgda_device_qp_management_t, tx_wq.ready_head);
+    const size_t rx_resv_head_offset = offsetof(nvshmemi_ibgda_device_qp_management_t, rx_wq.resv_head);

     nvshmemi_ibgda_device_qp_map_type_t rc_map_type = NVSHMEMI_IBGDA_DEVICE_QP_MAP_TYPE_INVALID;
     nvshmemi_ibgda_device_qp_map_type_t dc_map_type = NVSHMEMI_IBGDA_DEVICE_QP_MAP_TYPE_INVALID;
Shangyan Zhou's avatar
Shangyan Zhou committed
277
@@ -2421,7 +2454,7 @@ static int ibgda_setup_gpu_state(nvshmem_transport_t t) {
Chenggang Zhao's avatar
Chenggang Zhao committed
278
279
280
281
282
283
284
         num_dct_handles += device->dct.num_eps * n_pes;
         num_dci_handles += device->dci.num_eps;
         num_rc_handles += device->rc.num_eps_per_pe * n_pes;
-        num_cq_handles += device->dci.num_eps + (device->rc.num_eps_per_pe * (n_pes - 1));
+        num_cq_handles += device->dci.num_eps + (device->rc.num_eps_per_pe * (n_pes - 1) * 2);
         num_shared_dci_handles += device->dci.num_shared_eps;
     }
Shangyan Zhou's avatar
Shangyan Zhou committed
285
286
     assert(num_dci_handles - num_shared_dci_handles >= 0);
@@ -2456,6 +2489,10 @@ static int ibgda_setup_gpu_state(nvshmem_transport_t t) {
Chenggang Zhao's avatar
Chenggang Zhao committed
287
288
289
290
291
292
293
294
295
296
     for (int i = 0; i < num_cq_handles; i++) {
         nvshmemi_init_ibgda_device_cq(cq_h[i]);
     }
+
+    recv_cq_h = (nvshmemi_ibgda_device_cq_t *)calloc(1, sizeof(*recv_cq_h));
+    NVSHMEMI_NULL_ERROR_JMP(recv_cq_h, status, NVSHMEMX_ERROR_OUT_OF_MEMORY, out, "recv_cq calloc err.");
+    nvshmemi_init_ibgda_device_cq(recv_cq_h[0]);
     /* allocate host memory for dct, rc, cq, dci end */

     /* allocate device memory for dct, rc, cq, dci start */
Shangyan Zhou's avatar
Shangyan Zhou committed
297
@@ -2559,6 +2596,14 @@ static int ibgda_setup_gpu_state(nvshmem_transport_t t) {
Chenggang Zhao's avatar
Chenggang Zhao committed
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
                 }

                 ++cq_idx;
+
+                rc_h[arr_idx].rx_wq.cq = &cq_d[cq_idx];
+
+                ibgda_get_device_cq(&cq_h[cq_idx], device->rc.eps[i]->recv_cq);
+                cq_h[cq_idx].resv_head = (uint64_t *)(base_mvars_d_addr + rx_resv_head_offset);
+                cq_h[cq_idx].qpn = rc_h[arr_idx].qpn;
+                cq_h[cq_idx].qp_type = rc_h[arr_idx].qp_type;
+                ++cq_idx;
             }
         }
     }
--
2.25.1


Shangyan Zhou's avatar
Shangyan Zhou committed
316
From af479f9f23103d4a1579fae38676d6b3022df887 Mon Sep 17 00:00:00 2001
Chenggang Zhao's avatar
Chenggang Zhao committed
317
318
From: Shangyan Zhou <sy.zhou@deepseek.com>
Date: Sat, 8 Feb 2025 18:02:39 +0800
Shangyan Zhou's avatar
Shangyan Zhou committed
319
Subject: [PATCH 3/4] Maintain recv queue's cons_idx.
Chenggang Zhao's avatar
Chenggang Zhao committed
320
321
322
323
324
325
326

---
 src/include/device_host_transport/nvshmem_common_ibgda.h | 5 +++--
 src/modules/transport/ibgda/ibgda.cpp                    | 6 ++++--
 2 files changed, 7 insertions(+), 4 deletions(-)

diff --git a/src/include/device_host_transport/nvshmem_common_ibgda.h b/src/include/device_host_transport/nvshmem_common_ibgda.h
Shangyan Zhou's avatar
Shangyan Zhou committed
327
index 1be3dec..ea1e284 100644
Chenggang Zhao's avatar
Chenggang Zhao committed
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
--- a/src/include/device_host_transport/nvshmem_common_ibgda.h
+++ b/src/include/device_host_transport/nvshmem_common_ibgda.h
@@ -170,6 +170,7 @@ typedef struct {
     } tx_wq;
     struct {
         uint64_t resv_head;   // last reserved wqe idx + 1
+        uint64_t cons_idx;    // polled wqe idx + 1 (consumer index + 1)
     } rx_wq;
     struct {
         uint64_t head;
@@ -177,7 +178,7 @@ typedef struct {
     } ibuf;
     char padding[NVSHMEMI_IBGDA_QP_MANAGEMENT_PADDING];
 } __attribute__((__aligned__(8))) nvshmemi_ibgda_device_qp_management_v1;
-static_assert(sizeof(nvshmemi_ibgda_device_qp_management_v1) == 104,
李金梁's avatar
李金梁 committed
343
-              "ibgda_device_qp_management_v1 must be 104 bytes.");
Chenggang Zhao's avatar
Chenggang Zhao committed
344
+static_assert(sizeof(nvshmemi_ibgda_device_qp_management_v1) == 112,
李金梁's avatar
李金梁 committed
345
+              "ibgda_device_qp_management_v1 must be 112 bytes.");
Chenggang Zhao's avatar
Chenggang Zhao committed
346
347
348
349
350
351
352

 typedef nvshmemi_ibgda_device_qp_management_v1 nvshmemi_ibgda_device_qp_management_t;
@@ -214,7 +215,7 @@ typedef struct nvshmemi_ibgda_device_qp {
     } rx_wq;
     nvshmemi_ibgda_device_qp_management_v1 mvars;  // management variables
 } nvshmemi_ibgda_device_qp_v1;
-static_assert(sizeof(nvshmemi_ibgda_device_qp_v1) == 248, "ibgda_device_qp_v1 must be 248 bytes.");
GreatHato's avatar
GreatHato committed
353
+static_assert(sizeof(nvshmemi_ibgda_device_qp_v1) == 256, "ibgda_device_qp_v1 must be 256 bytes.");
Chenggang Zhao's avatar
Chenggang Zhao committed
354
355
356
357

 typedef nvshmemi_ibgda_device_qp_v1 nvshmemi_ibgda_device_qp_t;

diff --git a/src/modules/transport/ibgda/ibgda.cpp b/src/modules/transport/ibgda/ibgda.cpp
Shangyan Zhou's avatar
Shangyan Zhou committed
358
index e0b2d5c..bc339c5 100644
Chenggang Zhao's avatar
Chenggang Zhao committed
359
360
--- a/src/modules/transport/ibgda/ibgda.cpp
+++ b/src/modules/transport/ibgda/ibgda.cpp
Shangyan Zhou's avatar
Shangyan Zhou committed
361
@@ -1067,7 +1067,7 @@ static inline void ibgda_nic_control_free(struct ibgda_mem_object *mobject) {
Chenggang Zhao's avatar
Chenggang Zhao committed
362
363
364
365
366
367
368
369
         ibgda_host_mem_free(mobject);
 }

-static int ibgda_create_cq(struct ibgda_cq **pgcq, struct ibgda_device *device) {
+static int ibgda_create_cq(struct ibgda_cq **pgcq, struct ibgda_device *device, int cc = 1) {
     int status = 0;

     struct ibgda_cq *gcq = NULL;
Shangyan Zhou's avatar
Shangyan Zhou committed
370
@@ -1118,7 +1118,7 @@ static int ibgda_create_cq(struct ibgda_cq **pgcq, struct ibgda_device *device)
Chenggang Zhao's avatar
Chenggang Zhao committed
371
372
373
374
375
376
377
378
     cq_context = DEVX_ADDR_OF(create_cq_in, cmd_in, cq_context);
     DEVX_SET(cqc, cq_context, dbr_umem_valid, IBGDA_MLX5_UMEM_VALID_ENABLE);
     DEVX_SET(cqc, cq_context, cqe_sz, MLX5_CQE_SIZE_64B);
-    DEVX_SET(cqc, cq_context, cc, 0x1);  // Use collapsed CQ
+    DEVX_SET(cqc, cq_context, cc, cc);  // Use collapsed CQ
     DEVX_SET(cqc, cq_context, oi, 0x1);  // Allow overrun
     DEVX_SET(cqc, cq_context, dbr_umem_id, dbr_umem->umem_id);
     DEVX_SET(cqc, cq_context, log_cq_size, IBGDA_ILOG2_OR0(num_cqe));
Shangyan Zhou's avatar
Shangyan Zhou committed
379
@@ -2419,6 +2419,7 @@ static int ibgda_setup_gpu_state(nvshmem_transport_t t) {
Chenggang Zhao's avatar
Chenggang Zhao committed
380
381
382
383
384
385
386
     const size_t wqe_h_offset = offsetof(nvshmemi_ibgda_device_qp_management_t, tx_wq.resv_head);
     const size_t wqe_t_offset = offsetof(nvshmemi_ibgda_device_qp_management_t, tx_wq.ready_head);
     const size_t rx_resv_head_offset = offsetof(nvshmemi_ibgda_device_qp_management_t, rx_wq.resv_head);
+    const size_t rx_cons_offset = offsetof(nvshmemi_ibgda_device_qp_management_t, rx_wq.cons_idx);

     nvshmemi_ibgda_device_qp_map_type_t rc_map_type = NVSHMEMI_IBGDA_DEVICE_QP_MAP_TYPE_INVALID;
     nvshmemi_ibgda_device_qp_map_type_t dc_map_type = NVSHMEMI_IBGDA_DEVICE_QP_MAP_TYPE_INVALID;
Shangyan Zhou's avatar
Shangyan Zhou committed
387
@@ -2601,6 +2602,7 @@ static int ibgda_setup_gpu_state(nvshmem_transport_t t) {
Chenggang Zhao's avatar
Chenggang Zhao committed
388
389
390
391
392
393
394
395
396
397
398

                 ibgda_get_device_cq(&cq_h[cq_idx], device->rc.eps[i]->recv_cq);
                 cq_h[cq_idx].resv_head = (uint64_t *)(base_mvars_d_addr + rx_resv_head_offset);
+                cq_h[cq_idx].cons_idx = (uint64_t *)(base_mvars_d_addr + rx_cons_offset);
                 cq_h[cq_idx].qpn = rc_h[arr_idx].qpn;
                 cq_h[cq_idx].qp_type = rc_h[arr_idx].qp_type;
                 ++cq_idx;
--
2.25.1


Shangyan Zhou's avatar
Shangyan Zhou committed
399
From e0ba3fa21b4b633b481c6684c3ad04f2670c8df4 Mon Sep 17 00:00:00 2001
Chenggang Zhao's avatar
Chenggang Zhao committed
400
401
From: Shangyan Zhou <sy.zhou@deepseek.com>
Date: Tue, 11 Feb 2025 11:00:57 +0800
Shangyan Zhou's avatar
Shangyan Zhou committed
402
Subject: [PATCH 4/4] Init rx_wq counters.
Chenggang Zhao's avatar
Chenggang Zhao committed
403
404
405
406
407
408

---
 src/include/device_host_transport/nvshmem_common_ibgda.h | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/src/include/device_host_transport/nvshmem_common_ibgda.h b/src/include/device_host_transport/nvshmem_common_ibgda.h
Shangyan Zhou's avatar
Shangyan Zhou committed
409
index ea1e284..e6640d6 100644
Chenggang Zhao's avatar
Chenggang Zhao committed
410
411
412
413
414
415
416
417
418
419
420
421
422
423
--- a/src/include/device_host_transport/nvshmem_common_ibgda.h
+++ b/src/include/device_host_transport/nvshmem_common_ibgda.h
@@ -46,6 +46,8 @@
         qp_man.tx_wq.cons_idx = NVSHMEMI_IBGDA_ULSCALAR_INVALID;                    \
         qp_man.tx_wq.get_head = NVSHMEMI_IBGDA_ULSCALAR_INVALID;                    \
         qp_man.tx_wq.get_tail = NVSHMEMI_IBGDA_ULSCALAR_INVALID;                    \
+        qp_man.rx_wq.resv_head = NVSHMEMI_IBGDA_ULSCALAR_INVALID;                    \
+        qp_man.rx_wq.cons_idx = NVSHMEMI_IBGDA_ULSCALAR_INVALID;                    \
         qp_man.ibuf.head = NVSHMEMI_IBGDA_ULSCALAR_INVALID;                         \
         qp_man.ibuf.tail = NVSHMEMI_IBGDA_ULSCALAR_INVALID;                         \
     } while (0);
--
2.25.1

424
425
426
427
428
429
430
431
432
433
434
435
436
437
diff --git a/src/modules/transport/common/transport_ib_common.cpp b/src/modules/transport/common/transport_ib_common.cpp
index c89f408..f99018a 100644
--- a/src/modules/transport/common/transport_ib_common.cpp
+++ b/src/modules/transport/common/transport_ib_common.cpp
@@ -26,6 +26,9 @@ int nvshmemt_ib_common_nv_peer_mem_available() {
     if (access("/sys/kernel/mm/memory_peers/nvidia-peermem/version", F_OK) == 0) {
         return NVSHMEMX_SUCCESS;
     }
+    if (access("/sys/module/nvidia_peermem/version", F_OK) == 0) {
+        return NVSHMEMX_SUCCESS;
+    }
 
     return NVSHMEMX_ERROR_INTERNAL;
 }
youkaichao's avatar
youkaichao committed
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474


From 099f608fcd9a1d34c866ad75d0af5d02d2020374 Mon Sep 17 00:00:00 2001
From: Kaichao You <youkaichao@gmail.com>
Date: Tue, 10 Jun 2025 00:35:03 -0700
Subject: [PATCH] remove gdrcopy dependency

---
 src/modules/transport/ibgda/ibgda.cpp | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/src/modules/transport/ibgda/ibgda.cpp b/src/modules/transport/ibgda/ibgda.cpp
index ef325cd..16ee09c 100644
--- a/src/modules/transport/ibgda/ibgda.cpp
+++ b/src/modules/transport/ibgda/ibgda.cpp
@@ -406,6 +406,7 @@ static size_t ibgda_get_host_page_size() {
     return host_page_size;
 }

+#ifdef NVSHMEM_USE_GDRCOPY
 int nvshmemt_ibgda_progress(nvshmem_transport_t t) {
     nvshmemt_ibgda_state_t *ibgda_state = (nvshmemt_ibgda_state_t *)t->state;
     int n_devs_selected = ibgda_state->n_devs_selected;
@@ -459,6 +460,11 @@ int nvshmemt_ibgda_progress(nvshmem_transport_t t) {
     }
     return 0;
 }
+#else
+int nvshmemt_ibgda_progress(nvshmem_transport_t t) {
+    return NVSHMEMX_ERROR_NOT_SUPPORTED;
+}
+#endif

 int nvshmemt_ibgda_show_info(struct nvshmem_transport *transport, int style) {
     NVSHMEMI_ERROR_PRINT("ibgda show info not implemented");
--
2.34.1