feat(test): Add some test opt.

Signed-off-by: lijian <lijian6@sugon.com>

feat(test): Add some test opt.
Signed-off-by: lijian <lijian6@sugon.com>
340c3f01 · lijian6 · 4fb1dabc · 340c3f01 · 340c3f01 · 340c3f01
Commit 340c3f01 authored Jan 29, 2026 by lijian6
7 changed files
--- a/1.sh
+++ b/1.sh
-pgrep -f /usr/bin/python | xargs kill -9

-export OMPI_MCA_pml=ucx
-export OMPI_MCA_osc=ucx
-export OMPI_MCA_coll_hcoll_enable=0
-export UCX_TLS=rc,rocm
-# export ROCSHMEM_UNIQUEID_WITH_MPI=1
-export OMPI_MCA_rmaps_base_mapping_policy="slot:numa"
+# rocSHMEM
 export ROCSHMEM_GDA_NUM_QPS_DEFAULT_CTX=288
 export ROCSHMEM_MAX_NUM_CONTEXTS=48
-export UCX_ROCM_IPC_SIGPOOL_MAX_ELEMS=16384
-export UCX_NET_DEVICES=mlx5_2:1,mlx5_3:1,mlx5_4:1,mlx5_5:1,mlx5_6:1,mlx5_7:1,mlx5_8:1,mlx5_9:1
 export ROCSHMEM_ALLOWED_IBV_DEVICES=mlx5_2,mlx5_3,mlx5_4,mlx5_5,mlx5_6,mlx5_7,mlx5_8,mlx5_9
-export HIP_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
-# export ROCSHMEM_HEAP_SIZE=536870912 805306368 10737418240
 export ROCSHMEM_HEAP_SIZE=10737418240
+export ROCSHMEM_TOPO_FILE_FORCE=tests/topo.config
+
+# duSHMEM
+export LD_LIBRARY_PATH=/opt/dtk/dushmem/lib:$LD_LIBRARY_PATH
+export DEEP_EP_DEVICE_TO_HCA_MAPPING=0:mlx5_2:1,1:mlx5_3:1,2:mlx5_4:1,3:mlx5_5:1,4:mlx5_6:1,5:mlx5_7:1,6:mlx5_8:1,7:mlx5_9:1
+export NVSHMEM_SYMMETRIC_SIZE=10737418240
+
+# common
+export HIP_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
+export PYTHONPATH=$(pwd)
+
+# test
 # torchrun --nproc-per-node=1 --nnodes=2 --node-rank=0 --master-addr="10.16.1.37" --master-port=1234 tests/test_internode.py
 # torchrun --nproc-per-node=1 --nnodes=2 --node-rank=0 --master-addr="10.16.1.37" --master-port=1234 tests/test_low_latency.py
-# torchrun --nproc-per-node=1 --nnodes=2 --node-rank=0 --master-addr="10.16.1.37" --master-port=1234 tests/test_low_latency_new.py --pressure-test
-torchrun --nproc-per-node=1 --nnodes=2 --node-rank=0 --master-addr="10.16.1.37" --master-port=1234 tests/test_internode.py --test-ll-compatibility
+torchrun --nproc-per-node=1 --nnodes=2 --node-rank=0 --master-addr="10.16.1.37" --master-port=1234 tests/test_low_latency_new.py --pressure-test
+# torchrun --nproc-per-node=1 --nnodes=2 --node-rank=0 --master-addr="10.16.1.37" --master-port=1234 tests/test_internode.py --test-ll-compatibility
--- a/2.sh
+++ b/2.sh
-pgrep -f /usr/bin/python | xargs kill -9 

-export OMPI_MCA_pml=ucx
-export OMPI_MCA_osc=ucx
-export OMPI_MCA_coll_hcoll_enable=0
-export UCX_TLS=rc,rocm
-# export ROCSHMEM_UNIQUEID_WITH_MPI=1
-export OMPI_MCA_rmaps_base_mapping_policy="slot:numa"
+# rocSHMEM
 export ROCSHMEM_GDA_NUM_QPS_DEFAULT_CTX=288
 export ROCSHMEM_MAX_NUM_CONTEXTS=48
-export UCX_ROCM_IPC_SIGPOOL_MAX_ELEMS=16384
-export UCX_NET_DEVICES=mlx5_2:1,mlx5_3:1,mlx5_4:1,mlx5_5:1,mlx5_6:1,mlx5_7:1,mlx5_8:1,mlx5_9:1
 export ROCSHMEM_ALLOWED_IBV_DEVICES=mlx5_2,mlx5_3,mlx5_4,mlx5_5,mlx5_6,mlx5_7,mlx5_8,mlx5_9
-export HIP_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
-# export ROCSHMEM_HEAP_SIZE=536870912 805306368 10737418240
 export ROCSHMEM_HEAP_SIZE=10737418240
+export ROCSHMEM_TOPO_FILE_FORCE=tests/topo.config
+
+# duSHMEM
+export LD_LIBRARY_PATH=/opt/dtk/dushmem/lib:$LD_LIBRARY_PATH
+export DEEP_EP_DEVICE_TO_HCA_MAPPING=0:mlx5_2:1,1:mlx5_3:1,2:mlx5_4:1,3:mlx5_5:1,4:mlx5_6:1,5:mlx5_7:1,6:mlx5_8:1,7:mlx5_9:1
+export NVSHMEM_SYMMETRIC_SIZE=10737418240
+
+# common
+export HIP_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
+export PYTHONPATH=$(pwd)
+
+# test
 # torchrun --nproc-per-node=1 --nnodes=2 --node-rank=1 --master-addr="10.16.1.37" --master-port=1234 tests/test_internode.py
 # torchrun --nproc-per-node=1 --nnodes=2 --node-rank=1 --master-addr="10.16.1.37" --master-port=1234 tests/test_low_latency.py
-# torchrun --nproc-per-node=1 --nnodes=2 --node-rank=1 --master-addr="10.16.1.37" --master-port=1234 tests/test_low_latency_new.py --pressure-test
-torchrun --nproc-per-node=1 --nnodes=2 --node-rank=1 --master-addr="10.16.1.37" --master-port=1234 tests/test_internode.py --test-ll-compatibility
+torchrun --nproc-per-node=1 --nnodes=2 --node-rank=1 --master-addr="10.16.1.37" --master-port=1234 tests/test_low_latency_new.py --pressure-test
+# torchrun --nproc-per-node=1 --nnodes=2 --node-rank=1 --master-addr="10.16.1.37" --master-port=1234 tests/test_internode.py --test-ll-compatibility
--- a/deep_ep/deepep_InterConfig.json
+++ b/deep_ep/deepep_InterConfig.json
+{
+  "normal_dispatch": {
+    "num_sms": 48,
+    "num_max_nvl_chunked_send_tokens": 30,
+    "num_max_nvl_chunked_recv_tokens": 512,
+    "num_max_rdma_chunked_send_tokens": 32,
+    "num_max_rdma_chunked_recv_tokens": 128
+  },
+  "normal_combine": {
+    "num_sms": 48,
+    "num_max_nvl_chunked_send_tokens": 2,
+    "num_max_nvl_chunked_recv_tokens": 512,
+    "num_max_rdma_chunked_send_tokens": 32,
+    "num_max_rdma_chunked_recv_tokens": 128
+  }
+}
--- a/deep_ep/deepep_IntraConfig.json
+++ b/deep_ep/deepep_IntraConfig.json
+{
+  "normal_dispatch": {
+    "num_sms": 64,
+    "num_max_nvl_chunked_send_tokens": 4,
+    "num_max_nvl_chunked_recv_tokens": 256
+  },
+  "normal_combine": {
+    "num_sms": 64,
+    "num_max_nvl_chunked_send_tokens": 4,
+    "num_max_nvl_chunked_recv_tokens": 256
+  }
+}
--- a/tests/test_internode.py
+++ b/tests/test_internode.py
@@ -265,7 +265,7 @@ def test_loop(local_rank: int, num_local_ranks: int, args: argparse.Namespace):
        ll_num_tokens, ll_hidden, ll_num_experts, ll_num_topk = 16, 5120, 256, 9
        num_rdma_bytes_ll = deep_ep.Buffer.get_low_latency_rdma_size_hint(ll_num_tokens, ll_hidden, num_ranks, ll_num_experts)

-    num_sms = 30
+    num_sms = 48
    num_qps_per_rank = max(num_sms, ll_num_experts // num_ranks if args.test_ll_compatibility else 0)

    hidden_bytes = get_hidden_bytes(args)

--- a/tests/test_intranode.py
+++ b/tests/test_intranode.py
@@ -25,8 +25,8 @@ def test_main(args: argparse.Namespace, num_sms: int, local_rank: int, num_ranks
    # Random data
    x = torch.ones((num_tokens, hidden), dtype=torch.bfloat16, device='cuda') * rank
    x_pure_rand = torch.randn((num_tokens, hidden), dtype=torch.bfloat16, device='cuda')
-    x_e4m3 = None # per_token_cast_to_fp8(x) if deep_ep.Buffer.is_sm90_compiled() else None
-    x_e4m3 = None # (x_e4m3[0], x_e4m3[1].T.contiguous().T) if x_e4m3 is not None else None
+    x_e4m3 = per_token_cast_to_fp8(x)
+    x_e4m3 = (x_e4m3[0], x_e4m3[1].T.contiguous().T) if x_e4m3 is not None else None
    scores = torch.randn((num_tokens, num_experts), dtype=torch.float32, device='cuda').abs() + 1
    topk_idx = torch.topk(scores, num_topk, dim=-1, largest=True, sorted=False)[1]
    # topk_idx = topk_idx.to(deep_ep.topk_idx_t)

--- a/tests/topo.config
+++ b/tests/topo.config
+0000:9f:00.0 mlx5_2 2
+0000:56:00.0 mlx5_3 3
+0000:5d:00.0 mlx5_4 4
+0000:05:00.0 mlx5_5 5
+0000:e5:00.0 mlx5_6 6
+0000:c1:00.0 mlx5_7 7
+0000:ca:00.0 mlx5_8 8
+0000:b1:00.0 mlx5_9 9
\ No newline at end of file