Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
DeepEP
Commits
340c3f01
Commit
340c3f01
authored
Jan 29, 2026
by
lijian6
Browse files
feat(test):Add some test opt.
Signed-off-by:
lijian
<
lijian6@sugon.com
>
parent
4fb1dabc
Changes
7
Show whitespace changes
Inline
Side-by-side
Showing
7 changed files
with
69 additions
and
29 deletions
+69
-29
1.sh
1.sh
+15
-13
2.sh
2.sh
+15
-13
deep_ep/deepep_InterConfig.json
deep_ep/deepep_InterConfig.json
+16
-0
deep_ep/deepep_IntraConfig.json
deep_ep/deepep_IntraConfig.json
+12
-0
tests/test_internode.py
tests/test_internode.py
+1
-1
tests/test_intranode.py
tests/test_intranode.py
+2
-2
tests/topo.config
tests/topo.config
+8
-0
No files found.
1.sh
View file @
340c3f01
pgrep
-f
/usr/bin/python | xargs
kill
-9
export
OMPI_MCA_pml
=
ucx
# rocSHMEM
export
OMPI_MCA_osc
=
ucx
export
OMPI_MCA_coll_hcoll_enable
=
0
export
UCX_TLS
=
rc,rocm
# export ROCSHMEM_UNIQUEID_WITH_MPI=1
export
OMPI_MCA_rmaps_base_mapping_policy
=
"slot:numa"
export
ROCSHMEM_GDA_NUM_QPS_DEFAULT_CTX
=
288
export
ROCSHMEM_GDA_NUM_QPS_DEFAULT_CTX
=
288
export
ROCSHMEM_MAX_NUM_CONTEXTS
=
48
export
ROCSHMEM_MAX_NUM_CONTEXTS
=
48
export
UCX_ROCM_IPC_SIGPOOL_MAX_ELEMS
=
16384
export
UCX_NET_DEVICES
=
mlx5_2:1,mlx5_3:1,mlx5_4:1,mlx5_5:1,mlx5_6:1,mlx5_7:1,mlx5_8:1,mlx5_9:1
export
ROCSHMEM_ALLOWED_IBV_DEVICES
=
mlx5_2,mlx5_3,mlx5_4,mlx5_5,mlx5_6,mlx5_7,mlx5_8,mlx5_9
export
ROCSHMEM_ALLOWED_IBV_DEVICES
=
mlx5_2,mlx5_3,mlx5_4,mlx5_5,mlx5_6,mlx5_7,mlx5_8,mlx5_9
export
HIP_VISIBLE_DEVICES
=
0,1,2,3,4,5,6,7
# export ROCSHMEM_HEAP_SIZE=536870912 805306368 10737418240
export
ROCSHMEM_HEAP_SIZE
=
10737418240
export
ROCSHMEM_HEAP_SIZE
=
10737418240
export
ROCSHMEM_TOPO_FILE_FORCE
=
tests/topo.config
# duSHMEM
export
LD_LIBRARY_PATH
=
/opt/dtk/dushmem/lib:
$LD_LIBRARY_PATH
export
DEEP_EP_DEVICE_TO_HCA_MAPPING
=
0:mlx5_2:1,1:mlx5_3:1,2:mlx5_4:1,3:mlx5_5:1,4:mlx5_6:1,5:mlx5_7:1,6:mlx5_8:1,7:mlx5_9:1
export
NVSHMEM_SYMMETRIC_SIZE
=
10737418240
# common
export
HIP_VISIBLE_DEVICES
=
0,1,2,3,4,5,6,7
export
PYTHONPATH
=
$(
pwd
)
# test
# torchrun --nproc-per-node=1 --nnodes=2 --node-rank=0 --master-addr="10.16.1.37" --master-port=1234 tests/test_internode.py
# torchrun --nproc-per-node=1 --nnodes=2 --node-rank=0 --master-addr="10.16.1.37" --master-port=1234 tests/test_internode.py
# torchrun --nproc-per-node=1 --nnodes=2 --node-rank=0 --master-addr="10.16.1.37" --master-port=1234 tests/test_low_latency.py
# torchrun --nproc-per-node=1 --nnodes=2 --node-rank=0 --master-addr="10.16.1.37" --master-port=1234 tests/test_low_latency.py
#
torchrun --nproc-per-node=1 --nnodes=2 --node-rank=0 --master-addr="10.16.1.37" --master-port=1234 tests/test_low_latency_new.py --pressure-test
torchrun
--nproc-per-node
=
1
--nnodes
=
2
--node-rank
=
0
--master-addr
=
"10.16.1.37"
--master-port
=
1234 tests/test_low_latency_new.py
--pressure-test
torchrun
--nproc-per-node
=
1
--nnodes
=
2
--node-rank
=
0
--master-addr
=
"10.16.1.37"
--master-port
=
1234 tests/test_internode.py
--test-ll-compatibility
#
torchrun --nproc-per-node=1 --nnodes=2 --node-rank=0 --master-addr="10.16.1.37" --master-port=1234 tests/test_internode.py --test-ll-compatibility
2.sh
View file @
340c3f01
pgrep
-f
/usr/bin/python | xargs
kill
-9
export
OMPI_MCA_pml
=
ucx
# rocSHMEM
export
OMPI_MCA_osc
=
ucx
export
OMPI_MCA_coll_hcoll_enable
=
0
export
UCX_TLS
=
rc,rocm
# export ROCSHMEM_UNIQUEID_WITH_MPI=1
export
OMPI_MCA_rmaps_base_mapping_policy
=
"slot:numa"
export
ROCSHMEM_GDA_NUM_QPS_DEFAULT_CTX
=
288
export
ROCSHMEM_GDA_NUM_QPS_DEFAULT_CTX
=
288
export
ROCSHMEM_MAX_NUM_CONTEXTS
=
48
export
ROCSHMEM_MAX_NUM_CONTEXTS
=
48
export
UCX_ROCM_IPC_SIGPOOL_MAX_ELEMS
=
16384
export
UCX_NET_DEVICES
=
mlx5_2:1,mlx5_3:1,mlx5_4:1,mlx5_5:1,mlx5_6:1,mlx5_7:1,mlx5_8:1,mlx5_9:1
export
ROCSHMEM_ALLOWED_IBV_DEVICES
=
mlx5_2,mlx5_3,mlx5_4,mlx5_5,mlx5_6,mlx5_7,mlx5_8,mlx5_9
export
ROCSHMEM_ALLOWED_IBV_DEVICES
=
mlx5_2,mlx5_3,mlx5_4,mlx5_5,mlx5_6,mlx5_7,mlx5_8,mlx5_9
export
HIP_VISIBLE_DEVICES
=
0,1,2,3,4,5,6,7
# export ROCSHMEM_HEAP_SIZE=536870912 805306368 10737418240
export
ROCSHMEM_HEAP_SIZE
=
10737418240
export
ROCSHMEM_HEAP_SIZE
=
10737418240
export
ROCSHMEM_TOPO_FILE_FORCE
=
tests/topo.config
# duSHMEM
export
LD_LIBRARY_PATH
=
/opt/dtk/dushmem/lib:
$LD_LIBRARY_PATH
export
DEEP_EP_DEVICE_TO_HCA_MAPPING
=
0:mlx5_2:1,1:mlx5_3:1,2:mlx5_4:1,3:mlx5_5:1,4:mlx5_6:1,5:mlx5_7:1,6:mlx5_8:1,7:mlx5_9:1
export
NVSHMEM_SYMMETRIC_SIZE
=
10737418240
# common
export
HIP_VISIBLE_DEVICES
=
0,1,2,3,4,5,6,7
export
PYTHONPATH
=
$(
pwd
)
# test
# torchrun --nproc-per-node=1 --nnodes=2 --node-rank=1 --master-addr="10.16.1.37" --master-port=1234 tests/test_internode.py
# torchrun --nproc-per-node=1 --nnodes=2 --node-rank=1 --master-addr="10.16.1.37" --master-port=1234 tests/test_internode.py
# torchrun --nproc-per-node=1 --nnodes=2 --node-rank=1 --master-addr="10.16.1.37" --master-port=1234 tests/test_low_latency.py
# torchrun --nproc-per-node=1 --nnodes=2 --node-rank=1 --master-addr="10.16.1.37" --master-port=1234 tests/test_low_latency.py
#
torchrun --nproc-per-node=1 --nnodes=2 --node-rank=1 --master-addr="10.16.1.37" --master-port=1234 tests/test_low_latency_new.py --pressure-test
torchrun
--nproc-per-node
=
1
--nnodes
=
2
--node-rank
=
1
--master-addr
=
"10.16.1.37"
--master-port
=
1234 tests/test_low_latency_new.py
--pressure-test
torchrun
--nproc-per-node
=
1
--nnodes
=
2
--node-rank
=
1
--master-addr
=
"10.16.1.37"
--master-port
=
1234 tests/test_internode.py
--test-ll-compatibility
#
torchrun --nproc-per-node=1 --nnodes=2 --node-rank=1 --master-addr="10.16.1.37" --master-port=1234 tests/test_internode.py --test-ll-compatibility
deep_ep/deepep_InterConfig.json
0 → 100644
View file @
340c3f01
{
"normal_dispatch"
:
{
"num_sms"
:
48
,
"num_max_nvl_chunked_send_tokens"
:
30
,
"num_max_nvl_chunked_recv_tokens"
:
512
,
"num_max_rdma_chunked_send_tokens"
:
32
,
"num_max_rdma_chunked_recv_tokens"
:
128
},
"normal_combine"
:
{
"num_sms"
:
48
,
"num_max_nvl_chunked_send_tokens"
:
2
,
"num_max_nvl_chunked_recv_tokens"
:
512
,
"num_max_rdma_chunked_send_tokens"
:
32
,
"num_max_rdma_chunked_recv_tokens"
:
128
}
}
deep_ep/deepep_IntraConfig.json
0 → 100644
View file @
340c3f01
{
"normal_dispatch"
:
{
"num_sms"
:
64
,
"num_max_nvl_chunked_send_tokens"
:
4
,
"num_max_nvl_chunked_recv_tokens"
:
256
},
"normal_combine"
:
{
"num_sms"
:
64
,
"num_max_nvl_chunked_send_tokens"
:
4
,
"num_max_nvl_chunked_recv_tokens"
:
256
}
}
tests/test_internode.py
View file @
340c3f01
...
@@ -265,7 +265,7 @@ def test_loop(local_rank: int, num_local_ranks: int, args: argparse.Namespace):
...
@@ -265,7 +265,7 @@ def test_loop(local_rank: int, num_local_ranks: int, args: argparse.Namespace):
ll_num_tokens
,
ll_hidden
,
ll_num_experts
,
ll_num_topk
=
16
,
5120
,
256
,
9
ll_num_tokens
,
ll_hidden
,
ll_num_experts
,
ll_num_topk
=
16
,
5120
,
256
,
9
num_rdma_bytes_ll
=
deep_ep
.
Buffer
.
get_low_latency_rdma_size_hint
(
ll_num_tokens
,
ll_hidden
,
num_ranks
,
ll_num_experts
)
num_rdma_bytes_ll
=
deep_ep
.
Buffer
.
get_low_latency_rdma_size_hint
(
ll_num_tokens
,
ll_hidden
,
num_ranks
,
ll_num_experts
)
num_sms
=
30
num_sms
=
48
num_qps_per_rank
=
max
(
num_sms
,
ll_num_experts
//
num_ranks
if
args
.
test_ll_compatibility
else
0
)
num_qps_per_rank
=
max
(
num_sms
,
ll_num_experts
//
num_ranks
if
args
.
test_ll_compatibility
else
0
)
hidden_bytes
=
get_hidden_bytes
(
args
)
hidden_bytes
=
get_hidden_bytes
(
args
)
...
...
tests/test_intranode.py
View file @
340c3f01
...
@@ -25,8 +25,8 @@ def test_main(args: argparse.Namespace, num_sms: int, local_rank: int, num_ranks
...
@@ -25,8 +25,8 @@ def test_main(args: argparse.Namespace, num_sms: int, local_rank: int, num_ranks
# Random data
# Random data
x
=
torch
.
ones
((
num_tokens
,
hidden
),
dtype
=
torch
.
bfloat16
,
device
=
'cuda'
)
*
rank
x
=
torch
.
ones
((
num_tokens
,
hidden
),
dtype
=
torch
.
bfloat16
,
device
=
'cuda'
)
*
rank
x_pure_rand
=
torch
.
randn
((
num_tokens
,
hidden
),
dtype
=
torch
.
bfloat16
,
device
=
'cuda'
)
x_pure_rand
=
torch
.
randn
((
num_tokens
,
hidden
),
dtype
=
torch
.
bfloat16
,
device
=
'cuda'
)
x_e4m3
=
None
#
per_token_cast_to_fp8(x)
if deep_ep.Buffer.is_sm90_compiled() else None
x_e4m3
=
per_token_cast_to_fp8
(
x
)
x_e4m3
=
None
#
(x_e4m3[0], x_e4m3[1].T.contiguous().T) if x_e4m3 is not None else None
x_e4m3
=
(
x_e4m3
[
0
],
x_e4m3
[
1
].
T
.
contiguous
().
T
)
if
x_e4m3
is
not
None
else
None
scores
=
torch
.
randn
((
num_tokens
,
num_experts
),
dtype
=
torch
.
float32
,
device
=
'cuda'
).
abs
()
+
1
scores
=
torch
.
randn
((
num_tokens
,
num_experts
),
dtype
=
torch
.
float32
,
device
=
'cuda'
).
abs
()
+
1
topk_idx
=
torch
.
topk
(
scores
,
num_topk
,
dim
=-
1
,
largest
=
True
,
sorted
=
False
)[
1
]
topk_idx
=
torch
.
topk
(
scores
,
num_topk
,
dim
=-
1
,
largest
=
True
,
sorted
=
False
)[
1
]
# topk_idx = topk_idx.to(deep_ep.topk_idx_t)
# topk_idx = topk_idx.to(deep_ep.topk_idx_t)
...
...
tests/topo.config
0 → 100644
View file @
340c3f01
0000
:
9
f
:
00
.
0
mlx5_2
2
0000
:
56
:
00
.
0
mlx5_3
3
0000
:
5
d
:
00
.
0
mlx5_4
4
0000
:
05
:
00
.
0
mlx5_5
5
0000
:
e5
:
00
.
0
mlx5_6
6
0000
:
c1
:
00
.
0
mlx5_7
7
0000
:
ca
:
00
.
0
mlx5_8
8
0000
:
b1
:
00
.
0
mlx5_9
9
\ No newline at end of file
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment