1. 修复使用函数获取 num_nvl_bytes、num_rdma_bytes 变量的 bug。

2. 修改测试脚本，降低显存占用：使用量从 17G 降至 8G。

Signed-off-by: lijian <lijian6@sugon.com>

1. 修复使用函数获取 num_nvl_bytes、num_rdma_bytes 变量的 bug。
2. 修改测试脚本，降低显存占用：使用量从 17G 降至 8G。

Signed-off-by: lijian <lijian6@sugon.com>
0b14d3b2 · lijian6 · 447238de · 0b14d3b2 · 0b14d3b2 · 0b14d3b2
Commit 0b14d3b2 authored Oct 24, 2025 by lijian6
Hide whitespace changes
Inline Side-by-side

Showing with 27 additions and 11 deletions

1.sh 1.sh +3 -2

2.sh 2.sh +3 -2

deep_ep/buffer.py deep_ep/buffer.py +5 -3

setup.py setup.py +4 -1

tests/test_internode.py tests/test_internode.py +12 -3

No files found.
--- a/1.sh
+++ b/1.sh
@@ -6,8 +6,9 @@ export UCX_TLS=rc,rocm
 export OMPI_MCA_rmaps_base_mapping_policy="slot:numa"
 export ROCSHMEM_MAX_NUM_CONTEXTS=32 
 export UCX_ROCM_IPC_SIGPOOL_MAX_ELEMS=16384
-export UCX_NET_DEVICES=mlx5_2:1,mlx5_4:1,mlx5_6:1,mlx5_8:1 
+export UCX_NET_DEVICES=mlx5_2:1,mlx5_3:1,mlx5_4:1,mlx5_5:1,mlx5_6:1,mlx5_7:1,mlx5_8:1,mlx5_9:1
 export HIP_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
-export ROCSHMEM_HEAP_SIZE=10737418240
+# export ROCSHMEM_HEAP_SIZE=536870912 805306368 10737418240
+export ROCSHMEM_HEAP_SIZE=536870912
 export PYTHONPATH=/public/home/lishen/Tmp/DeepEP:$PYTHONPATH
 torchrun --nproc-per-node=1 --nnodes=2 --node-rank=0 --master-addr="10.16.1.37" --master-port=1234 tests/test_internode.py
--- a/2.sh
+++ b/2.sh
@@ -6,8 +6,9 @@ export UCX_TLS=rc,rocm
 export OMPI_MCA_rmaps_base_mapping_policy="slot:numa"
 export ROCSHMEM_MAX_NUM_CONTEXTS=32 
 export UCX_ROCM_IPC_SIGPOOL_MAX_ELEMS=16384
-export UCX_NET_DEVICES=mlx5_2:1,mlx5_4:1,mlx5_6:1,mlx5_8:1 
+export UCX_NET_DEVICES=mlx5_2:1,mlx5_3:1,mlx5_4:1,mlx5_5:1,mlx5_6:1,mlx5_7:1,mlx5_8:1,mlx5_9:1
 export HIP_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
-export ROCSHMEM_HEAP_SIZE=10737418240
+# export ROCSHMEM_HEAP_SIZE=536870912 805306368 10737418240
+export ROCSHMEM_HEAP_SIZE=536870912
 export PYTHONPATH=/public/home/lishen/Tmp/DeepEP:$PYTHONPATH
 torchrun --nproc-per-node=1 --nnodes=2 --node-rank=1 --master-addr="10.16.1.37" --master-port=1234 tests/test_internode.py
--- a/deep_ep/buffer.py
+++ b/deep_ep/buffer.py
@@ -27,7 +27,7 @@ class Buffer:
        runtime: the C++ runtime.
    """
-    num_sms: int = 20
+    num_sms: int = 24
    def __init__(
        self,
@@ -253,7 +253,8 @@ class Buffer:
            2: Config(Buffer.num_sms, 24, 256, 6, 128),
            4: Config(Buffer.num_sms, 6, 256, 6, 128),
            8: Config(Buffer.num_sms, 6, 256, 6, 128),
-            16: Config(Buffer.num_sms, 36, 288, 20, 128),
+            # 16: Config(Buffer.num_sms, 36, 288, 20, 128),
+            16: Config(Buffer.num_sms, 8, 512, 16, 128),
            24: Config(Buffer.num_sms, 8, 288, 32, 128),
            32: Config(Buffer.num_sms, 32, 288, 32, 128),
            64: Config(Buffer.num_sms, 20, 288, 28, 128),
@@ -281,7 +282,8 @@ class Buffer:
            2: Config(Buffer.num_sms, 10, 256, 6, 128),
            4: Config(Buffer.num_sms, 9, 256, 6, 128),
            8: Config(Buffer.num_sms, 4, 256, 6, 128),
-            16: Config(Buffer.num_sms, 4, 288, 12, 128),
+            # 16: Config(Buffer.num_sms, 4, 288, 12, 128),
+            16: Config(Buffer.num_sms, 8, 512, 16, 128),
            24: Config(Buffer.num_sms, 1, 288, 8, 128),
            32: Config(Buffer.num_sms, 1, 288, 8, 128),
            64: Config(Buffer.num_sms, 1, 288, 20, 128),

--- a/setup.py
+++ b/setup.py
@@ -2,6 +2,9 @@ import os
 import subprocess
 import setuptools
 from torch.utils.cpp_extension import BuildExtension, CUDAExtension
+from datetime import datetime
+date_tag = datetime.now().strftime("%Y%m%d")
 if __name__ == '__main__':
    try:
@@ -12,7 +15,7 @@ if __name__ == '__main__':
    setuptools.setup(
        name='deep_ep',
-        version='1.0.0' + revision,
+        version='1.0.0' + revision + '.' + date_tag,
        packages=setuptools.find_packages(include=['deep_ep']),
        include_package_data=True,
        package_data={"deep_ep": ["deep_ep_cpp.cpython-310-x86_64-linux-gnu.so"]},

--- a/tests/test_internode.py
+++ b/tests/test_internode.py
@@ -193,7 +193,6 @@ def test_main(args: argparse.Namespace, num_sms: int,
    if skip_benchmark:
        return hash_value
-    # print("benchmark start:")
    # Tune dispatch performance
    best_dispatch_results = None
    fp8_factor = (1 + 4 / 128) / 2
@@ -253,6 +252,10 @@ def test_main(args: argparse.Namespace, num_sms: int,
        print('', flush=True)
    return hash_value
+def get_hidden_bytes(args: argparse.Namespace) -> int:
+    x = torch.ones((args.num_tokens, args.hidden), dtype=torch.bfloat16)
+    t = x[0] if isinstance(x, tuple) else x
+    return t.size(1) * max(t.element_size(), 2)
 # noinspection PyUnboundLocalVariable,PyShadowingNames
 def test_loop(local_rank: int, num_local_ranks: int, args: argparse.Namespace):
@@ -261,10 +264,16 @@ def test_loop(local_rank: int, num_local_ranks: int, args: argparse.Namespace):
    if args.test_ll_compatibility:
        ll_num_tokens, ll_hidden, ll_num_experts, ll_num_topk = 16, 5120, 256, 9
-    num_sms = 30
+    num_sms = 24
    num_qps_per_rank = max(num_sms, ll_num_experts // num_ranks if args.test_ll_compatibility else 0)
-    buffer = deep_ep.Buffer(group, int(2e9), int(1e9), low_latency_mode=args.test_ll_compatibility,
+    hidden_bytes = get_hidden_bytes(args)
+    num_nvl_bytes, num_rdma_bytes = 0, 0
+    for config in (deep_ep.Buffer.get_dispatch_config(group.size()), deep_ep.Buffer.get_combine_config(group.size())):
+        num_nvl_bytes = max(config.get_nvl_buffer_size_hint(hidden_bytes, group.size()), num_nvl_bytes)
+        num_rdma_bytes = max(config.get_rdma_buffer_size_hint(hidden_bytes, group.size()), num_rdma_bytes)
+    buffer = deep_ep.Buffer(group, num_nvl_bytes, num_rdma_bytes, low_latency_mode=args.test_ll_compatibility,
                            num_qps_per_rank=num_qps_per_rank, explicitly_destroy=True, use_default_stream_as_comm_stream=False)
    assert num_local_ranks == 8 and num_ranks > 8