Unverified Commit df4debe3 authored by Shangyan Zhou, committed by GitHub
Browse files

Reduce NVSHMEM gpu memory usage and disable MNNVL. (#190)


Co-authored-by: Shangyan Zhou <sy.zhou@deepseek.com>
parent d8dd185c
...@@ -79,9 +79,18 @@ class Buffer: ...@@ -79,9 +79,18 @@ class Buffer:
os.environ['NVSHMEM_IBGDA_NUM_RC_PER_PE'] = f'{num_qps_per_rank}' os.environ['NVSHMEM_IBGDA_NUM_RC_PER_PE'] = f'{num_qps_per_rank}'
# Make sure QP depth is always larger than the number of on-flight WRs, so that we can skip WQ slot check # Make sure QP depth is always larger than the number of on-flight WRs, so that we can skip WQ slot check
os.environ['NVSHMEM_QP_DEPTH'] = '1024' os.environ['NVSHMEM_QP_DEPTH'] = '1024'
# Reduce gpu memory usage
# 6 default teams + 1 extra team
os.environ['NVSHMEM_MAX_TEAMS'] = '7'
# Disable NVLink SHArP
os.environ['NVSHMEM_DISABLE_NVLS'] = '1'
# NOTES: NVSHMEM initialization requires at least 256 MiB # NOTES: NVSHMEM initialization requires at least 256 MiB
os.environ['NVSHMEM_CUMEM_GRANULARITY'] = f'{2 ** 29}' os.environ['NVSHMEM_CUMEM_GRANULARITY'] = f'{2 ** 29}'
# Disable multi-node NVLink detection
os.environ['NVSHMEM_DISABLE_MNNVL'] = '1'
# Synchronize using the root ID # Synchronize using the root ID
nvshmem_unique_ids = [None, ] * self.group_size nvshmem_unique_ids = [None, ] * self.group_size
if (low_latency_mode and self.rank == 0) or (not low_latency_mode and self.runtime.get_rdma_rank() == 0): if (low_latency_mode and self.rank == 0) or (not low_latency_mode and self.runtime.get_rdma_rank() == 0):
......
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment