Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
7e065eba
Unverified
Commit
7e065eba
authored
Dec 21, 2025
by
Lucas Wilkinson
Committed by
GitHub
Dec 22, 2025
Browse files
[CI] Fix "2 Node Tests (4 GPUs in total)" (#31090)
Signed-off-by:
Lucas Wilkinson
<
lwilkins@redhat.com
>
parent
9d701e90
Changes
4
Show whitespace changes
Inline
Side-by-side
Showing
4 changed files
with
51 additions
and
27 deletions
+51
-27
.buildkite/test-amd.yaml
.buildkite/test-amd.yaml
+2
-2
.buildkite/test-pipeline.yaml
.buildkite/test-pipeline.yaml
+2
-2
.buildkite/test_areas/distributed.yaml
.buildkite/test_areas/distributed.yaml
+1
-1
examples/offline_inference/data_parallel.py
examples/offline_inference/data_parallel.py
+46
-22
No files found.
.buildkite/test-amd.yaml
View file @
7e065eba
...
@@ -1254,13 +1254,13 @@ steps:
...
@@ -1254,13 +1254,13 @@ steps:
-
# the following commands are for the first node, with ip 192.168.10.10 (ray environment already set up)
-
# the following commands are for the first node, with ip 192.168.10.10 (ray environment already set up)
-
VLLM_TEST_SAME_HOST=0 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_same_node.py | grep 'Same node test passed'
-
VLLM_TEST_SAME_HOST=0 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_same_node.py | grep 'Same node test passed'
-
NUM_NODES=2 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_node_count.py | grep 'Node count test passed'
-
NUM_NODES=2 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_node_count.py | grep 'Node count test passed'
-
python3 ../examples/offline_inference/data_parallel.py -dp=2 -tp=1 --nnodes=2 --node-rank=0 --master-addr=192.168.10.10 --master-port=12345 --enforce-eager --trust-remote-code
-
python3 ../examples/offline_inference/data_parallel.py -dp=2 -tp=1 --dp-num-nodes=2 --dp-node-rank=0 --dp-master-addr=192.168.10.10 --dp-master-port=12345 --enforce-eager --trust-remote-code
-
VLLM_MULTI_NODE=1 pytest -v -s distributed/test_multi_node_assignment.py
-
VLLM_MULTI_NODE=1 pytest -v -s distributed/test_multi_node_assignment.py
-
VLLM_MULTI_NODE=1 pytest -v -s distributed/test_pipeline_parallel.py
-
VLLM_MULTI_NODE=1 pytest -v -s distributed/test_pipeline_parallel.py
-
# the following commands are for the second node, with ip 192.168.10.11 (ray environment already set up)
-
# the following commands are for the second node, with ip 192.168.10.11 (ray environment already set up)
-
VLLM_TEST_SAME_HOST=0 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_same_node.py | grep 'Same node test passed'
-
VLLM_TEST_SAME_HOST=0 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_same_node.py | grep 'Same node test passed'
-
NUM_NODES=2 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_node_count.py | grep 'Node count test passed'
-
NUM_NODES=2 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_node_count.py | grep 'Node count test passed'
-
python3 ../examples/offline_inference/data_parallel.py -dp=2 -tp=1 --nnodes=2 --node-rank=1 --master-addr=192.168.10.10 --master-port=12345 --enforce-eager --trust-remote-code
-
python3 ../examples/offline_inference/data_parallel.py -dp=2 -tp=1 --dp-num-nodes=2 --dp-node-rank=1 --dp-master-addr=192.168.10.10 --dp-master-port=12345 --enforce-eager --trust-remote-code
-
label
:
Distributed Tests (2 GPUs)
# 68min
-
label
:
Distributed Tests (2 GPUs)
# 68min
timeout_in_minutes
:
90
timeout_in_minutes
:
90
...
...
.buildkite/test-pipeline.yaml
View file @
7e065eba
...
@@ -1109,13 +1109,13 @@ steps:
...
@@ -1109,13 +1109,13 @@ steps:
-
# the following commands are for the first node, with ip 192.168.10.10 (ray environment already set up)
-
# the following commands are for the first node, with ip 192.168.10.10 (ray environment already set up)
-
VLLM_TEST_SAME_HOST=0 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_same_node.py | grep 'Same node test passed'
-
VLLM_TEST_SAME_HOST=0 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_same_node.py | grep 'Same node test passed'
-
NUM_NODES=2 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_node_count.py | grep 'Node count test passed'
-
NUM_NODES=2 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_node_count.py | grep 'Node count test passed'
-
python3 ../examples/offline_inference/data_parallel.py -dp=2 -tp=1 --nnodes=2 --node-rank=0 --master-addr=192.168.10.10 --master-port=12345 --enforce-eager --trust-remote-code
-
python3 ../examples/offline_inference/data_parallel.py -dp=2 -tp=1 --dp-num-nodes=2 --dp-node-rank=0 --dp-master-addr=192.168.10.10 --dp-master-port=12345 --enforce-eager --trust-remote-code
-
VLLM_MULTI_NODE=1 pytest -v -s distributed/test_multi_node_assignment.py
-
VLLM_MULTI_NODE=1 pytest -v -s distributed/test_multi_node_assignment.py
-
VLLM_MULTI_NODE=1 pytest -v -s distributed/test_pipeline_parallel.py
-
VLLM_MULTI_NODE=1 pytest -v -s distributed/test_pipeline_parallel.py
-
# the following commands are for the second node, with ip 192.168.10.11 (ray environment already set up)
-
# the following commands are for the second node, with ip 192.168.10.11 (ray environment already set up)
-
VLLM_TEST_SAME_HOST=0 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_same_node.py | grep 'Same node test passed'
-
VLLM_TEST_SAME_HOST=0 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_same_node.py | grep 'Same node test passed'
-
NUM_NODES=2 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_node_count.py | grep 'Node count test passed'
-
NUM_NODES=2 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_node_count.py | grep 'Node count test passed'
-
python3 ../examples/offline_inference/data_parallel.py -dp=2 -tp=1 --nnodes=2 --node-rank=1 --master-addr=192.168.10.10 --master-port=12345 --enforce-eager --trust-remote-code
-
python3 ../examples/offline_inference/data_parallel.py -dp=2 -tp=1 --dp-num-nodes=2 --dp-node-rank=1 --dp-master-addr=192.168.10.10 --dp-master-port=12345 --enforce-eager --trust-remote-code
-
label
:
Distributed Tests (2 GPUs)
# 68min
-
label
:
Distributed Tests (2 GPUs)
# 68min
timeout_in_minutes
:
90
timeout_in_minutes
:
90
...
...
.buildkite/test_areas/distributed.yaml
View file @
7e065eba
...
@@ -171,7 +171,7 @@ steps:
...
@@ -171,7 +171,7 @@ steps:
-
tests/distributed/
-
tests/distributed/
-
tests/examples/offline_inference/data_parallel.py
-
tests/examples/offline_inference/data_parallel.py
commands
:
commands
:
-
./.buildkite/scripts/run-multi-node-test.sh /vllm-workspace/tests 2 2 public.ecr.aws/q9t5s3a7/vllm-ci-postmerge-repo:0bec63fa317e1fbd62e19b0fc31c43c81bf89077 "VLLM_TEST_SAME_HOST=0 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_same_node.py | grep 'Same node test passed' && NUM_NODES=2 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_node_count.py | grep 'Node count test passed' && python3 ../examples/offline_inference/data_parallel.py -dp=2 -tp=1 --nnodes=2 --node-rank=0 --master-addr=192.168.10.10 --master-port=12345 --enforce-eager --trust-remote-code && VLLM_MULTI_NODE=1 pytest -v -s distributed/test_multi_node_assignment.py && VLLM_MULTI_NODE=1 pytest -v -s distributed/test_pipeline_parallel.py" "VLLM_TEST_SAME_HOST=0 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_same_node.py | grep 'Same node test passed' && NUM_NODES=2 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_node_count.py | grep 'Node count test passed' && python3 ../examples/offline_inference/data_parallel.py -dp=2 -tp=1 --nnodes=2 --node-rank=1 --master-addr=192.168.10.10 --master-port=12345 --enforce-eager --trust-remote-code"
-
./.buildkite/scripts/run-multi-node-test.sh /vllm-workspace/tests 2 2 public.ecr.aws/q9t5s3a7/vllm-ci-postmerge-repo:0bec63fa317e1fbd62e19b0fc31c43c81bf89077 "VLLM_TEST_SAME_HOST=0 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_same_node.py | grep 'Same node test passed' && NUM_NODES=2 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_node_count.py | grep 'Node count test passed' && python3 ../examples/offline_inference/data_parallel.py -dp=2 -tp=1 --dp-num-nodes=2 --dp-node-rank=0 --dp-master-addr=192.168.10.10 --dp-master-port=12345 --enforce-eager --trust-remote-code && VLLM_MULTI_NODE=1 pytest -v -s distributed/test_multi_node_assignment.py && VLLM_MULTI_NODE=1 pytest -v -s distributed/test_pipeline_parallel.py" "VLLM_TEST_SAME_HOST=0 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_same_node.py | grep 'Same node test passed' && NUM_NODES=2 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_node_count.py | grep 'Node count test passed' && python3 ../examples/offline_inference/data_parallel.py -dp=2 -tp=1 --dp-num-nodes=2 --dp-node-rank=1 --dp-master-addr=192.168.10.10 --dp-master-port=12345 --enforce-eager --trust-remote-code"
-
label
:
Distributed NixlConnector PD accuracy (4 GPUs)
-
label
:
Distributed NixlConnector PD accuracy (4 GPUs)
timeout_in_minutes
:
30
timeout_in_minutes
:
30
...
...
examples/offline_inference/data_parallel.py
View file @
7e065eba
...
@@ -14,19 +14,19 @@ Multi-node:
...
@@ -14,19 +14,19 @@ Multi-node:
--model="ibm-research/PowerMoE-3b"
\
--model="ibm-research/PowerMoE-3b"
\
-dp=2
\
-dp=2
\
-tp=2
\
-tp=2
\
--nnodes=2
\
--dp-num-nodes=2
\
--node-rank=0
\
--dp-node-rank=0
\
--master-addr=10.99.48.128
\
--dp-master-addr=10.99.48.128
\
--master-port=13345
--dp-master-port=13345
Node 1:
Node 1:
python examples/offline_inference/data_parallel.py
\
python examples/offline_inference/data_parallel.py
\
--model="ibm-research/PowerMoE-3b"
\
--model="ibm-research/PowerMoE-3b"
\
-dp=2
\
-dp=2
\
-tp=2
\
-tp=2
\
--nnodes=2
\
--dp-num-nodes=2
\
--node-rank=1
\
--dp-node-rank=1
\
--master-addr=10.99.48.128
\
--dp-master-addr=10.99.48.128
\
--master-port=13345
--dp-master-port=13345
"""
"""
import
os
import
os
...
@@ -48,7 +48,31 @@ def create_parser():
...
@@ -48,7 +48,31 @@ def create_parser():
enable_expert_parallel
=
True
,
enable_expert_parallel
=
True
,
)
)
# Add timeout (not in EngineArgs)
# Add DP-specific args (separate from engine args to avoid conflicts)
parser
.
add_argument
(
"--dp-num-nodes"
,
type
=
int
,
default
=
1
,
help
=
"Total number of nodes for data parallel."
,
)
parser
.
add_argument
(
"--dp-node-rank"
,
type
=
int
,
default
=
0
,
help
=
"Rank of the current node for data parallel."
,
)
parser
.
add_argument
(
"--dp-master-addr"
,
type
=
str
,
default
=
""
,
help
=
"Master node IP address for DP coordination."
,
)
parser
.
add_argument
(
"--dp-master-port"
,
type
=
int
,
default
=
0
,
help
=
"Master node port for DP coordination."
,
)
parser
.
add_argument
(
parser
.
add_argument
(
"--timeout"
,
"--timeout"
,
type
=
int
,
type
=
int
,
...
@@ -132,26 +156,26 @@ if __name__ == "__main__":
...
@@ -132,26 +156,26 @@ if __name__ == "__main__":
parser
=
create_parser
()
parser
=
create_parser
()
args
=
vars
(
parser
.
parse_args
())
args
=
vars
(
parser
.
parse_args
())
# Extract DP-specific args
# Extract DP-specific args (pop to remove from engine_args)
dp_size
=
args
.
pop
(
"data_parallel_size"
)
dp_size
=
args
.
pop
(
"data_parallel_size"
)
n
nodes
=
args
.
get
(
"n
nodes"
,
1
)
dp_num_
nodes
=
args
.
pop
(
"dp_num_
nodes"
)
node_rank
=
args
.
get
(
"
node_rank"
,
0
)
dp_
node_rank
=
args
.
pop
(
"dp_
node_rank"
)
master_addr
=
args
.
get
(
"
master_addr"
,
""
)
dp_
master_addr
=
args
.
pop
(
"dp_
master_addr"
)
master_port
=
args
.
get
(
"
master_port"
,
0
)
dp_
master_port
=
args
.
pop
(
"dp_
master_port"
)
timeout
=
args
.
pop
(
"timeout"
)
timeout
=
args
.
pop
(
"timeout"
)
# Remaining args are engine args
# Remaining args are engine args
engine_args
=
args
engine_args
=
args
if
n
nodes
==
1
:
if
dp_num_
nodes
==
1
:
dp_master_ip
=
"127.0.0.1"
dp_master_ip
=
"127.0.0.1"
dp_master_port
=
get_open_port
()
dp_master_port
_val
=
get_open_port
()
else
:
else
:
dp_master_ip
=
master_addr
dp_master_ip
=
dp_
master_addr
dp_master_port
=
master_port
dp_master_port
_val
=
dp_
master_port
assert
dp_size
%
n
nodes
==
0
,
"dp_size should be divisible by
n
nodes"
assert
dp_size
%
dp_num_
nodes
==
0
,
"dp_size should be divisible by
dp_num_
nodes"
dp_per_node
=
dp_size
//
n
nodes
dp_per_node
=
dp_size
//
dp_num_
nodes
from
multiprocessing
import
Process
from
multiprocessing
import
Process
...
@@ -162,7 +186,7 @@ if __name__ == "__main__":
...
@@ -162,7 +186,7 @@ if __name__ == "__main__":
procs
=
[]
procs
=
[]
for
local_dp_rank
,
global_dp_rank
in
enumerate
(
for
local_dp_rank
,
global_dp_rank
in
enumerate
(
range
(
node_rank
*
dp_per_node
,
(
node_rank
+
1
)
*
dp_per_node
)
range
(
dp_
node_rank
*
dp_per_node
,
(
dp_
node_rank
+
1
)
*
dp_per_node
)
):
):
proc
=
Process
(
proc
=
Process
(
target
=
main
,
target
=
main
,
...
@@ -171,7 +195,7 @@ if __name__ == "__main__":
...
@@ -171,7 +195,7 @@ if __name__ == "__main__":
local_dp_rank
,
local_dp_rank
,
global_dp_rank
,
global_dp_rank
,
dp_master_ip
,
dp_master_ip
,
dp_master_port
,
dp_master_port
_val
,
engine_args
,
engine_args
,
),
),
)
)
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment