Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
74fe80ee
Unverified
Commit
74fe80ee
authored
Mar 13, 2026
by
Kevin H. Luu
Committed by
GitHub
Mar 14, 2026
Browse files
[CI] Split Distributed Tests (4 GPUs) into 3 parallel jobs (#37015)
Co-authored-by:
Claude Opus 4.6
<
noreply@anthropic.com
>
parent
bcfdadb1
Changes
1
Show whitespace changes
Inline
Side-by-side
Showing
1 changed file
with
40 additions
and
20 deletions
+40
-20
.buildkite/test_areas/distributed.yaml
.buildkite/test_areas/distributed.yaml
+40
-20
No files found.
.buildkite/test_areas/distributed.yaml
View file @
74fe80ee
...
@@ -50,24 +50,18 @@ steps:
...
@@ -50,24 +50,18 @@ steps:
-
CUDA_VISIBLE_DEVICES=0,1 pytest -v -s v1/shutdown
-
CUDA_VISIBLE_DEVICES=0,1 pytest -v -s v1/shutdown
-
pytest -v -s v1/worker/test_worker_memory_snapshot.py
-
pytest -v -s v1/worker/test_worker_memory_snapshot.py
-
label
:
Distributed T
est
s (4 GPUs)
-
label
:
Distributed T
orchrun + Example
s (4 GPUs)
timeout_in_minutes
:
5
0
timeout_in_minutes
:
3
0
working_dir
:
"
/vllm-workspace/tests"
working_dir
:
"
/vllm-workspace/tests"
num_devices
:
4
num_devices
:
4
source_file_dependencies
:
source_file_dependencies
:
-
vllm/distributed/
-
vllm/distributed/
-
tests/distributed/test_utils
-
tests/distributed/test_torchrun_example.py
-
tests/distributed/test_pynccl
-
tests/distributed/test_torchrun_example_moe.py
-
tests/distributed/test_events
-
tests/compile/fullgraph/test_basic_correctness.py
-
examples/offline_inference/rlhf.py
-
examples/offline_inference/rlhf.py
-
examples/offline_inference/rlhf_colocate.py
-
examples/offline_inference/rlhf_colocate.py
-
examples/offline_inference/new_weight_syncing/
-
examples/offline_inference/new_weight_syncing/
-
tests/examples/offline_inference/data_parallel.py
-
tests/examples/offline_inference/data_parallel.py
-
tests/v1/distributed
-
tests/v1/engine/test_engine_core_client.py
-
tests/distributed/test_symm_mem_allreduce.py
-
tests/distributed/test_multiproc_executor.py
commands
:
commands
:
# https://github.com/NVIDIA/nccl/issues/1838
# https://github.com/NVIDIA/nccl/issues/1838
-
export NCCL_CUMEM_HOST_ENABLE=0
-
export NCCL_CUMEM_HOST_ENABLE=0
...
@@ -85,6 +79,27 @@ steps:
...
@@ -85,6 +79,27 @@ steps:
-
TP_SIZE=2 DP_SIZE=2 ENABLE_EP=1 torchrun --nproc-per-node=4 distributed/test_torchrun_example_moe.py
-
TP_SIZE=2 DP_SIZE=2 ENABLE_EP=1 torchrun --nproc-per-node=4 distributed/test_torchrun_example_moe.py
# test with internal dp
# test with internal dp
-
python3 ../examples/offline_inference/data_parallel.py --enforce-eager
-
python3 ../examples/offline_inference/data_parallel.py --enforce-eager
# OLD rlhf examples
-
cd ../examples/offline_inference
-
VLLM_ALLOW_INSECURE_SERIALIZATION=1 python3 rlhf.py
-
VLLM_ALLOW_INSECURE_SERIALIZATION=1 RAY_DEDUP_LOGS=0 python3 rlhf_colocate.py
# NEW rlhf examples
-
cd new_weight_syncing
-
VLLM_ALLOW_INSECURE_SERIALIZATION=1 python3 rlhf_nccl.py
-
VLLM_ALLOW_INSECURE_SERIALIZATION=1 python3 rlhf_ipc.py
-
label
:
Distributed DP Tests (4 GPUs)
timeout_in_minutes
:
30
working_dir
:
"
/vllm-workspace/tests"
num_devices
:
4
source_file_dependencies
:
-
vllm/distributed/
-
tests/v1/distributed
-
tests/v1/engine/test_engine_core_client.py
-
tests/distributed/test_utils
commands
:
# https://github.com/NVIDIA/nccl/issues/1838
-
export NCCL_CUMEM_HOST_ENABLE=0
-
TP_SIZE=2 DP_SIZE=2 pytest -v -s v1/distributed/test_async_llm_dp.py
-
TP_SIZE=2 DP_SIZE=2 pytest -v -s v1/distributed/test_async_llm_dp.py
-
TP_SIZE=2 DP_SIZE=2 pytest -v -s v1/distributed/test_eagle_dp.py
-
TP_SIZE=2 DP_SIZE=2 pytest -v -s v1/distributed/test_eagle_dp.py
-
TP_SIZE=2 DP_SIZE=2 pytest -v -s v1/distributed/test_external_lb_dp.py
-
TP_SIZE=2 DP_SIZE=2 pytest -v -s v1/distributed/test_external_lb_dp.py
...
@@ -92,22 +107,27 @@ steps:
...
@@ -92,22 +107,27 @@ steps:
-
TP_SIZE=1 DP_SIZE=4 pytest -v -s v1/distributed/test_hybrid_lb_dp.py
-
TP_SIZE=1 DP_SIZE=4 pytest -v -s v1/distributed/test_hybrid_lb_dp.py
-
pytest -v -s v1/engine/test_engine_core_client.py::test_kv_cache_events_dp
-
pytest -v -s v1/engine/test_engine_core_client.py::test_kv_cache_events_dp
-
pytest -v -s distributed/test_utils.py
-
pytest -v -s distributed/test_utils.py
-
label
:
Distributed Compile + Comm (4 GPUs)
timeout_in_minutes
:
30
working_dir
:
"
/vllm-workspace/tests"
num_devices
:
4
source_file_dependencies
:
-
vllm/distributed/
-
tests/distributed/test_pynccl
-
tests/distributed/test_events
-
tests/compile/fullgraph/test_basic_correctness.py
-
tests/distributed/test_symm_mem_allreduce.py
-
tests/distributed/test_multiproc_executor.py
commands
:
# https://github.com/NVIDIA/nccl/issues/1838
-
export NCCL_CUMEM_HOST_ENABLE=0
-
pytest -v -s compile/fullgraph/test_basic_correctness.py
-
pytest -v -s compile/fullgraph/test_basic_correctness.py
-
pytest -v -s distributed/test_pynccl.py
-
pytest -v -s distributed/test_pynccl.py
-
pytest -v -s distributed/test_events.py
-
pytest -v -s distributed/test_events.py
-
pytest -v -s distributed/test_symm_mem_allreduce.py
-
pytest -v -s distributed/test_symm_mem_allreduce.py
# test multi-node TP with multiproc executor (simulated on single node)
# test multi-node TP with multiproc executor (simulated on single node)
-
pytest -v -s distributed/test_multiproc_executor.py::test_multiproc_executor_multi_node
-
pytest -v -s distributed/test_multiproc_executor.py::test_multiproc_executor_multi_node
# TODO: create a dedicated test section for multi-GPU example tests
# when we have multiple distributed example tests
# OLD rlhf examples
-
cd ../examples/offline_inference
-
VLLM_ALLOW_INSECURE_SERIALIZATION=1 python3 rlhf.py
-
VLLM_ALLOW_INSECURE_SERIALIZATION=1 RAY_DEDUP_LOGS=0 python3 rlhf_colocate.py
# NEW rlhf examples
-
cd new_weight_syncing
-
VLLM_ALLOW_INSECURE_SERIALIZATION=1 python3 rlhf_nccl.py
-
VLLM_ALLOW_INSECURE_SERIALIZATION=1 python3 rlhf_ipc.py
-
label
:
Distributed Tests (8 GPUs)(H100)
-
label
:
Distributed Tests (8 GPUs)(H100)
timeout_in_minutes
:
10
timeout_in_minutes
:
10
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment