Reduce NVSHMEM gpu memory usage and disable MNNVL. (#190)

Co-authored-by: Shangyan Zhou <sy.zhou@deepseek.com>

Reduce NVSHMEM gpu memory usage and disable MNNVL. (#190)
Co-authored-by: Shangyan Zhou <sy.zhou@deepseek.com>
df4debe3 · Shangyan Zhou · GitHub · d8dd185c · df4debe3
Unverified Commit df4debe3 authored Jun 06, 2025 by Shangyan Zhou Committed by GitHub Jun 06, 2025
Show whitespace changes
Inline Side-by-side

Showing with 9 additions and 0 deletions

deep_ep/buffer.py deep_ep/buffer.py +9 -0

No files found.
--- a/deep_ep/buffer.py
+++ b/deep_ep/buffer.py
@@ -79,9 +79,18 @@ class Buffer:
            os.environ['NVSHMEM_IBGDA_NUM_RC_PER_PE'] = f'{num_qps_per_rank}'
            # Make sure QP depth is always larger than the number of in-flight WRs, so that we can skip WQ slot check
            os.environ['NVSHMEM_QP_DEPTH'] = '1024'
+
+            # Reduce GPU memory usage
+            # 6 default teams + 1 extra team
+            os.environ['NVSHMEM_MAX_TEAMS'] = '7'
+            # Disable NVLink SHArP
+            os.environ['NVSHMEM_DISABLE_NVLS'] = '1'
            # NOTES: NVSHMEM initialization requires at least 256 MiB
            os.environ['NVSHMEM_CUMEM_GRANULARITY'] = f'{2 ** 29}'

+            # Disable multi-node NVLink detection
+            os.environ['NVSHMEM_DISABLE_MNNVL'] = '1'
+
            # Synchronize using the root ID
            nvshmem_unique_ids = [None, ] * self.group_size
            if (low_latency_mode and self.rank == 0) or (not low_latency_mode and self.runtime.get_rdma_rank() == 0):