Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
ae0770fa
Unverified
Commit
ae0770fa
authored
Dec 20, 2025
by
Lucas Wilkinson
Committed by
GitHub
Dec 20, 2025
Browse files
[CI] Fix H200 Distributed test (#31054)
Signed-off-by:
Lucas Wilkinson
<
lwilkins@redhat.com
>
parent
ee52d990
Changes
4
Show whitespace changes
Inline
Side-by-side
Showing
4 changed files
with
51 additions
and
129 deletions
+51
-129
.buildkite/test-amd.yaml
.buildkite/test-amd.yaml
+3
-3
.buildkite/test-pipeline.yaml
.buildkite/test-pipeline.yaml
+3
-3
.buildkite/test_areas/distributed.yaml
.buildkite/test_areas/distributed.yaml
+2
-2
examples/offline_inference/data_parallel.py
examples/offline_inference/data_parallel.py
+43
-121
No files found.
.buildkite/test-amd.yaml
View file @
ae0770fa
...
...
@@ -1254,13 +1254,13 @@ steps:
-
# the following commands are for the first node, with ip 192.168.10.10 (ray environment already set up)
-
VLLM_TEST_SAME_HOST=0 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_same_node.py | grep 'Same node test passed'
-
NUM_NODES=2 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_node_count.py | grep 'Node count test passed'
-
python3 ../examples/offline_inference/data_parallel.py
-
-dp
-size
=2
-
-tp
-size
=1 --node
-size
=2 --node-rank=0 --master-addr=192.168.10.10 --master-port=12345 --enforce-eager --trust-remote-code
-
python3 ../examples/offline_inference/data_parallel.py -dp=2 -tp=1 --
n
node
s
=2 --node-rank=0 --master-addr=192.168.10.10 --master-port=12345 --enforce-eager --trust-remote-code
-
VLLM_MULTI_NODE=1 pytest -v -s distributed/test_multi_node_assignment.py
-
VLLM_MULTI_NODE=1 pytest -v -s distributed/test_pipeline_parallel.py
-
# the following commands are for the second node, with ip 192.168.10.11 (ray environment already set up)
-
VLLM_TEST_SAME_HOST=0 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_same_node.py | grep 'Same node test passed'
-
NUM_NODES=2 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_node_count.py | grep 'Node count test passed'
-
python3 ../examples/offline_inference/data_parallel.py
-
-dp
-size
=2
-
-tp
-size
=1 --node
-size
=2 --node-rank=1 --master-addr=192.168.10.10 --master-port=12345 --enforce-eager --trust-remote-code
-
python3 ../examples/offline_inference/data_parallel.py -dp=2 -tp=1 --
n
node
s
=2 --node-rank=1 --master-addr=192.168.10.10 --master-port=12345 --enforce-eager --trust-remote-code
-
label
:
Distributed Tests (2 GPUs)
# 68min
timeout_in_minutes
:
90
...
...
@@ -1508,7 +1508,7 @@ steps:
-
"
VLLM_TEST_CLEAN_GPU_MEMORY=1
pytest
-v
-s
tests/compile/distributed/test_fusions_e2e.py
-k
'not
Llama-4'"
-
VLLM_TEST_CLEAN_GPU_MEMORY=1 pytest -v -s tests/distributed/test_sequence_parallel.py
-
pytest -v -s tests/distributed/test_context_parallel.py
-
HIP_VISIBLE_DEVICES=0,1 VLLM_USE_DEEP_GEMM=1 VLLM_LOGGING_LEVEL=DEBUG python3 examples/offline_inference/data_parallel.py --model
Qwen/Qwen1.5-MoE-A2.7B
-
-tp
-size
=1
-
-dp
-size
=2 --max-model-len
2048 --all2all-backend
deepep_high_throughput
-
HIP_VISIBLE_DEVICES=0,1 VLLM_USE_DEEP_GEMM=1 VLLM_LOGGING_LEVEL=DEBUG python3 examples/offline_inference/data_parallel.py --model
=
Qwen/Qwen1.5-MoE-A2.7B -tp=1 -dp=2 --max-model-len
=
2048 --all2all-backend
=
deepep_high_throughput
-
pytest -v -s tests/v1/distributed/test_dbo.py
##### B200 test #####
...
...
.buildkite/test-pipeline.yaml
View file @
ae0770fa
...
...
@@ -1109,13 +1109,13 @@ steps:
-
# the following commands are for the first node, with ip 192.168.10.10 (ray environment already set up)
-
VLLM_TEST_SAME_HOST=0 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_same_node.py | grep 'Same node test passed'
-
NUM_NODES=2 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_node_count.py | grep 'Node count test passed'
-
python3 ../examples/offline_inference/data_parallel.py
-
-dp
-size
=2
-
-tp
-size
=1 --node
-size
=2 --node-rank=0 --master-addr=192.168.10.10 --master-port=12345 --enforce-eager --trust-remote-code
-
python3 ../examples/offline_inference/data_parallel.py -dp=2 -tp=1 --
n
node
s
=2 --node-rank=0 --master-addr=192.168.10.10 --master-port=12345 --enforce-eager --trust-remote-code
-
VLLM_MULTI_NODE=1 pytest -v -s distributed/test_multi_node_assignment.py
-
VLLM_MULTI_NODE=1 pytest -v -s distributed/test_pipeline_parallel.py
-
# the following commands are for the second node, with ip 192.168.10.11 (ray environment already set up)
-
VLLM_TEST_SAME_HOST=0 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_same_node.py | grep 'Same node test passed'
-
NUM_NODES=2 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_node_count.py | grep 'Node count test passed'
-
python3 ../examples/offline_inference/data_parallel.py
-
-dp
-size
=2
-
-tp
-size
=1 --node
-size
=2 --node-rank=1 --master-addr=192.168.10.10 --master-port=12345 --enforce-eager --trust-remote-code
-
python3 ../examples/offline_inference/data_parallel.py -dp=2 -tp=1 --
n
node
s
=2 --node-rank=1 --master-addr=192.168.10.10 --master-port=12345 --enforce-eager --trust-remote-code
-
label
:
Distributed Tests (2 GPUs)
# 68min
timeout_in_minutes
:
90
...
...
@@ -1334,7 +1334,7 @@ steps:
-
"
VLLM_TEST_CLEAN_GPU_MEMORY=1
pytest
-v
-s
tests/compile/distributed/test_fusions_e2e.py
-k
'not
Llama-4'"
-
VLLM_TEST_CLEAN_GPU_MEMORY=1 pytest -v -s tests/distributed/test_sequence_parallel.py
-
pytest -v -s tests/distributed/test_context_parallel.py
-
CUDA_VISIBLE_DEVICES=1,2 VLLM_USE_DEEP_GEMM=1 VLLM_LOGGING_LEVEL=DEBUG python3 examples/offline_inference/data_parallel.py --model
Qwen/Qwen1.5-MoE-A2.7B
-
-tp
-size
=1
-
-dp
-size
=2 --max-model-len
2048 --all2all-backend
deepep_high_throughput
-
CUDA_VISIBLE_DEVICES=1,2 VLLM_USE_DEEP_GEMM=1 VLLM_LOGGING_LEVEL=DEBUG python3 examples/offline_inference/data_parallel.py --model
=
Qwen/Qwen1.5-MoE-A2.7B -tp=1 -dp=2 --max-model-len
=
2048 --all2all-backend
=
deepep_high_throughput
-
pytest -v -s tests/v1/distributed/test_dbo.py
##### B200 test #####
...
...
.buildkite/test_areas/distributed.yaml
View file @
ae0770fa
...
...
@@ -145,7 +145,7 @@ steps:
-
VLLM_TEST_CLEAN_GPU_MEMORY=1 pytest -v -s tests/compile/distributed/test_fusions_e2e.py -k 'not Llama-4'
-
VLLM_TEST_CLEAN_GPU_MEMORY=1 pytest -v -s tests/distributed/test_sequence_parallel.py
-
pytest -v -s tests/distributed/test_context_parallel.py
-
CUDA_VISIBLE_DEVICES=1,2 VLLM_USE_DEEP_GEMM=1 VLLM_LOGGING_LEVEL=DEBUG python3 examples/offline_inference/data_parallel.py --model
Qwen/Qwen1.5-MoE-A2.7B
-
-tp
-size
=1
-
-dp
-size
=2 --max-model-len
2048 --all2all-backend
deepep_high_throughput
-
CUDA_VISIBLE_DEVICES=1,2 VLLM_USE_DEEP_GEMM=1 VLLM_LOGGING_LEVEL=DEBUG python3 examples/offline_inference/data_parallel.py --model
=
Qwen/Qwen1.5-MoE-A2.7B -tp=1 -dp=2 --max-model-len
=
2048 --all2all-backend
=
deepep_high_throughput
-
pytest -v -s tests/v1/distributed/test_dbo.py
-
label
:
Distributed Tests (2 GPUs)(B200)
...
...
@@ -171,7 +171,7 @@ steps:
-
tests/distributed/
-
tests/examples/offline_inference/data_parallel.py
commands
:
-
./.buildkite/scripts/run-multi-node-test.sh /vllm-workspace/tests 2 2 public.ecr.aws/q9t5s3a7/vllm-ci-postmerge-repo:0bec63fa317e1fbd62e19b0fc31c43c81bf89077 "VLLM_TEST_SAME_HOST=0 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_same_node.py | grep 'Same node test passed' && NUM_NODES=2 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_node_count.py | grep 'Node count test passed' && python3 ../examples/offline_inference/data_parallel.py
-
-dp
-size
=2
-
-tp
-size
=1 --node
-size
=2 --node-rank=0 --master-addr=192.168.10.10 --master-port=12345 --enforce-eager --trust-remote-code && VLLM_MULTI_NODE=1 pytest -v -s distributed/test_multi_node_assignment.py && VLLM_MULTI_NODE=1 pytest -v -s distributed/test_pipeline_parallel.py" "VLLM_TEST_SAME_HOST=0 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_same_node.py | grep 'Same node test passed' && NUM_NODES=2 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_node_count.py | grep 'Node count test passed' && python3 ../examples/offline_inference/data_parallel.py
-
-dp
-size
=2
-
-tp
-size
=1 --node
-size
=2 --node-rank=1 --master-addr=192.168.10.10 --master-port=12345 --enforce-eager --trust-remote-code"
-
./.buildkite/scripts/run-multi-node-test.sh /vllm-workspace/tests 2 2 public.ecr.aws/q9t5s3a7/vllm-ci-postmerge-repo:0bec63fa317e1fbd62e19b0fc31c43c81bf89077 "VLLM_TEST_SAME_HOST=0 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_same_node.py | grep 'Same node test passed' && NUM_NODES=2 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_node_count.py | grep 'Node count test passed' && python3 ../examples/offline_inference/data_parallel.py -dp=2 -tp=1 --
n
node
s
=2 --node-rank=0 --master-addr=192.168.10.10 --master-port=12345 --enforce-eager --trust-remote-code && VLLM_MULTI_NODE=1 pytest -v -s distributed/test_multi_node_assignment.py && VLLM_MULTI_NODE=1 pytest -v -s distributed/test_pipeline_parallel.py" "VLLM_TEST_SAME_HOST=0 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_same_node.py | grep 'Same node test passed' && NUM_NODES=2 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_node_count.py | grep 'Node count test passed' && python3 ../examples/offline_inference/data_parallel.py -dp=2 -tp=1 --
n
node
s
=2 --node-rank=1 --master-addr=192.168.10.10 --master-port=12345 --enforce-eager --trust-remote-code"
-
label
:
Distributed NixlConnector PD accuracy (4 GPUs)
timeout_in_minutes
:
30
...
...
examples/offline_inference/data_parallel.py
View file @
ae0770fa
...
...
@@ -5,25 +5,25 @@ Usage:
Single node:
python examples/offline_inference/data_parallel.py
\
--model="ibm-research/PowerMoE-3b"
\
-
-dp
-size
=2
\
-
-tp
-size
=2
-dp=2
\
-tp=2
Multi-node:
Node 0 (assume the node has ip of 10.99.48.128):
python examples/offline_inference/data_parallel.py
\
--model="ibm-research/PowerMoE-3b"
\
-
-dp
-size
=2
\
-
-tp
-size
=2
\
--node
-size
=2
\
-dp=2
\
-tp=2
\
--
n
node
s
=2
\
--node-rank=0
\
--master-addr=10.99.48.128
\
--master-port=13345
Node 1:
python examples/offline_inference/data_parallel.py
\
--model="ibm-research/PowerMoE-3b"
\
-
-dp
-size
=2
\
-
-tp
-size
=2
\
--node
-size
=2
\
-dp=2
\
-tp=2
\
--
n
node
s
=2
\
--node-rank=1
\
--master-addr=10.99.48.128
\
--master-port=13345
...
...
@@ -32,103 +32,40 @@ Multi-node:
import
os
from
time
import
sleep
from
vllm
import
LLM
,
SamplingParams
from
vllm
import
LLM
,
EngineArgs
,
SamplingParams
from
vllm.platforms
import
current_platform
from
vllm.utils.argparse_utils
import
FlexibleArgumentParser
from
vllm.utils.network_utils
import
get_open_port
def
parse_args
():
import
argparse
def
create_parser
():
parser
=
FlexibleArgumentParser
(
description
=
"Data Parallel Inference"
)
parser
=
argparse
.
ArgumentParser
(
description
=
"Data Parallel Inference"
)
parser
.
add_argument
(
"--model"
,
type
=
str
,
default
=
"ibm-research/PowerMoE-3b"
,
help
=
"Model name or path"
,
)
parser
.
add_argument
(
"--dp-size"
,
type
=
int
,
default
=
2
,
help
=
"Data parallel size"
)
parser
.
add_argument
(
"--tp-size"
,
type
=
int
,
default
=
2
,
help
=
"Tensor parallel size"
)
parser
.
add_argument
(
"--node-size"
,
type
=
int
,
default
=
1
,
help
=
"Total number of nodes"
)
parser
.
add_argument
(
"--node-rank"
,
type
=
int
,
default
=
0
,
help
=
"Rank of the current node"
)
parser
.
add_argument
(
"--master-addr"
,
type
=
str
,
default
=
""
,
help
=
"Master node IP address"
)
parser
.
add_argument
(
"--master-port"
,
type
=
int
,
default
=
0
,
help
=
"Master node port"
)
parser
.
add_argument
(
"--enforce-eager"
,
action
=
"store_true"
,
help
=
"Enforce eager mode execution."
)
parser
.
add_argument
(
"--trust-remote-code"
,
action
=
"store_true"
,
help
=
"Trust remote code."
)
parser
.
add_argument
(
"--max-num-seqs"
,
type
=
int
,
default
=
64
,
help
=
(
"Maximum number of sequences to be processed in a single iteration."
),
)
parser
.
add_argument
(
"--max-model-len"
,
type
=
int
,
help
=
(
"Maximum number of tokens to be processed in a single iteration."
),
# Add all engine args
EngineArgs
.
add_cli_args
(
parser
)
parser
.
set_defaults
(
model
=
"ibm-research/PowerMoE-3b"
,
enable_expert_parallel
=
True
,
)
# Add timeout (not in EngineArgs)
parser
.
add_argument
(
"--timeout"
,
type
=
int
,
default
=
300
,
help
=
(
"Number of seconds before unresponsive process is killed."
),
)
parser
.
add_argument
(
"--gpu-memory-utilization"
,
type
=
float
,
default
=
0.8
,
help
=
(
"Fraction of GPU memory vLLM is allowed to allocate (0.0, 1.0]."
),
)
parser
.
add_argument
(
"--enable-dbo"
,
action
=
"store_true"
,
help
=
(
"Enable microbatched execution"
),
)
parser
.
add_argument
(
"--compilation-config"
,
type
=
int
,
help
=
(
"Compilation optimization (O) mode 0-3."
),
)
parser
.
add_argument
(
"--quantization"
,
type
=
str
,
)
parser
.
add_argument
(
"--disable-expert-parallel"
,
dest
=
"enable_expert_parallel"
,
action
=
"store_false"
,
help
=
"Disable expert parallel (default: enabled)."
,
help
=
"Number of seconds before unresponsive process is killed."
,
)
parser
.
set_defaults
(
enable_expert_parallel
=
True
)
return
parser
.
parse_args
()
return
parser
def
main
(
model
,
dp_size
,
local_dp_rank
,
global_dp_rank
,
dp_master_ip
,
dp_master_port
,
GPUs_per_dp_rank
,
enforce_eager
,
enable_expert_parallel
,
trust_remote_code
,
max_num_seqs
,
max_model_len
,
compilation_config
,
gpu_memory_utilization
,
enable_dbo
,
quantization
,
engine_args
,
):
os
.
environ
[
"VLLM_DP_RANK"
]
=
str
(
global_dp_rank
)
os
.
environ
[
"VLLM_DP_RANK_LOCAL"
]
=
str
(
local_dp_rank
)
...
...
@@ -173,19 +110,7 @@ def main(
)
# Create an LLM.
llm
=
LLM
(
model
=
model
,
tensor_parallel_size
=
GPUs_per_dp_rank
,
enforce_eager
=
enforce_eager
,
enable_expert_parallel
=
enable_expert_parallel
,
trust_remote_code
=
trust_remote_code
,
max_num_seqs
=
max_num_seqs
,
max_model_len
=
max_model_len
,
gpu_memory_utilization
=
gpu_memory_utilization
,
enable_dbo
=
enable_dbo
,
quantization
=
quantization
,
compilation_config
=
compilation_config
,
)
llm
=
LLM
(
**
engine_args
)
outputs
=
llm
.
generate
(
prompts
,
sampling_params
)
# Print the outputs.
for
i
,
output
in
enumerate
(
outputs
):
...
...
@@ -204,22 +129,29 @@ def main(
if
__name__
==
"__main__"
:
args
=
parse_args
()
parser
=
create_parser
()
args
=
vars
(
parser
.
parse_args
())
# Extract DP-specific args
dp_size
=
args
.
pop
(
"data_parallel_size"
)
nnodes
=
args
.
get
(
"nnodes"
,
1
)
node_rank
=
args
.
get
(
"node_rank"
,
0
)
master_addr
=
args
.
get
(
"master_addr"
,
""
)
master_port
=
args
.
get
(
"master_port"
,
0
)
timeout
=
args
.
pop
(
"timeout"
)
dp_size
=
args
.
dp_size
tp_size
=
args
.
tp_size
node_size
=
args
.
node_size
node_rank
=
args
.
node_rank
# Remaining args are engine args
engine_args
=
args
if
node
_size
==
1
:
if
n
node
s
==
1
:
dp_master_ip
=
"127.0.0.1"
dp_master_port
=
get_open_port
()
else
:
dp_master_ip
=
args
.
master_addr
dp_master_port
=
args
.
master_port
dp_master_ip
=
master_addr
dp_master_port
=
master_port
assert
dp_size
%
node
_size
==
0
,
"dp_size should be divisible by node
_size
"
dp_per_node
=
dp_size
//
node
_size
assert
dp_size
%
n
node
s
==
0
,
"dp_size should be divisible by
n
node
s
"
dp_per_node
=
dp_size
//
n
node
s
from
multiprocessing
import
Process
...
...
@@ -235,29 +167,19 @@ if __name__ == "__main__":
proc
=
Process
(
target
=
main
,
args
=
(
args
.
model
,
dp_size
,
local_dp_rank
,
global_dp_rank
,
dp_master_ip
,
dp_master_port
,
tp_size
,
args
.
enforce_eager
,
args
.
enable_expert_parallel
,
args
.
trust_remote_code
,
args
.
max_num_seqs
,
args
.
max_model_len
,
args
.
compilation_config
,
args
.
gpu_memory_utilization
,
args
.
enable_dbo
,
args
.
quantization
,
engine_args
,
),
)
proc
.
start
()
procs
.
append
(
proc
)
exit_code
=
0
for
proc
in
procs
:
proc
.
join
(
timeout
=
args
.
timeout
)
proc
.
join
(
timeout
=
timeout
)
if
proc
.
exitcode
is
None
:
print
(
f
"Killing process
{
proc
.
pid
}
that didn't stop within 5 minutes."
)
proc
.
kill
()
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment