Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
8d75f22e
Commit
8d75f22e
authored
Dec 13, 2025
by
zhuwenwen
Browse files
Merge tag 'v0.13.0rc1' into v0.13.0rc1-ori
parents
ce888aa4
7d80c73d
Changes
706
Hide whitespace changes
Inline
Side-by-side
Showing
20 changed files
with
1178 additions
and
12 deletions
+1178
-12
.buildkite/test-pipeline.yaml
.buildkite/test-pipeline.yaml
+40
-12
.buildkite/test_areas/attention.yaml
.buildkite/test_areas/attention.yaml
+21
-0
.buildkite/test_areas/basic_correctness.yaml
.buildkite/test_areas/basic_correctness.yaml
+16
-0
.buildkite/test_areas/benchmarks.yaml
.buildkite/test_areas/benchmarks.yaml
+19
-0
.buildkite/test_areas/compile.yaml
.buildkite/test_areas/compile.yaml
+57
-0
.buildkite/test_areas/cuda.yaml
.buildkite/test_areas/cuda.yaml
+22
-0
.buildkite/test_areas/distributed.yaml
.buildkite/test_areas/distributed.yaml
+199
-0
.buildkite/test_areas/e2e_integration.yaml
.buildkite/test_areas/e2e_integration.yaml
+59
-0
.buildkite/test_areas/engine.yaml
.buildkite/test_areas/engine.yaml
+26
-0
.buildkite/test_areas/entrypoints.yaml
.buildkite/test_areas/entrypoints.yaml
+68
-0
.buildkite/test_areas/expert_parallelism.yaml
.buildkite/test_areas/expert_parallelism.yaml
+23
-0
.buildkite/test_areas/kernels.yaml
.buildkite/test_areas/kernels.yaml
+117
-0
.buildkite/test_areas/lm_eval.yaml
.buildkite/test_areas/lm_eval.yaml
+46
-0
.buildkite/test_areas/lora.yaml
.buildkite/test_areas/lora.yaml
+31
-0
.buildkite/test_areas/misc.yaml
.buildkite/test_areas/misc.yaml
+163
-0
.buildkite/test_areas/model_executor.yaml
.buildkite/test_areas/model_executor.yaml
+17
-0
.buildkite/test_areas/models_basic.yaml
.buildkite/test_areas/models_basic.yaml
+62
-0
.buildkite/test_areas/models_distributed.yaml
.buildkite/test_areas/models_distributed.yaml
+22
-0
.buildkite/test_areas/models_language.yaml
.buildkite/test_areas/models_language.yaml
+91
-0
.buildkite/test_areas/models_multimodal.yaml
.buildkite/test_areas/models_multimodal.yaml
+79
-0
No files found.
.buildkite/test-pipeline.yaml
View file @
8d75f22e
...
@@ -350,7 +350,8 @@ steps:
...
@@ -350,7 +350,8 @@ steps:
timeout_in_minutes
:
25
timeout_in_minutes
:
25
gpu
:
h100
gpu
:
h100
source_file_dependencies
:
source_file_dependencies
:
-
vllm/
-
vllm/v1/attention
-
vllm/model_executor/layers
-
tests/v1/determinism/
-
tests/v1/determinism/
commands
:
commands
:
-
export VLLM_WORKER_MULTIPROC_METHOD=spawn
-
export VLLM_WORKER_MULTIPROC_METHOD=spawn
...
@@ -387,23 +388,28 @@ steps:
...
@@ -387,23 +388,28 @@ steps:
working_dir
:
"
/vllm-workspace/examples"
working_dir
:
"
/vllm-workspace/examples"
source_file_dependencies
:
source_file_dependencies
:
-
vllm/entrypoints
-
vllm/entrypoints
-
vllm/multimodal
-
examples/
-
examples/
commands
:
commands
:
-
pip install tensorizer
# for tensorizer test
-
pip install tensorizer
# for tensorizer test
# for basic
-
python3 offline_inference/basic/chat.py
-
python3 offline_inference/basic/generate.py --model facebook/opt-125m
-
python3 offline_inference/basic/generate.py --model facebook/opt-125m
-
python3 offline_inference/basic/generate.py --model meta-llama/Llama-2-13b-chat-hf --cpu-offload-gb
10
-
python3 offline_inference/basic/generate.py --model meta-llama/Llama-2-13b-chat-hf --cpu-offload-gb
10
-
python3 offline_inference/basic/chat.py
-
python3 offline_inference/basic/classify.py
-
python3 offline_inference/prefix_caching.py
-
python3 offline_inference/basic/embed.py
-
python3 offline_inference/llm_engine_example.py
-
python3 offline_inference/basic/score.py
# for multi-modal models
-
python3 offline_inference/audio_language.py --seed
0
-
python3 offline_inference/audio_language.py --seed
0
-
python3 offline_inference/vision_language.py --seed
0
-
python3 offline_inference/vision_language.py --seed
0
-
python3 offline_inference/vision_language_pooling.py --seed
0
-
python3 offline_inference/vision_language_multi_image.py --seed
0
-
python3 offline_inference/vision_language_multi_image.py --seed
0
-
python3 others/tensorize_vllm_model.py --model facebook/opt-125m serialize --serialized-directory /tmp/ --suffix v1 && python3 others/tensorize_vllm_model.py --model facebook/opt-125m deserialize --path-to-tensors /tmp/vllm/facebook/opt-125m/v1/model.tensors
-
python3 offline_inference/encoder_decoder_multimodal.py --model-type whisper --seed
0
-
python3 offline_inference/encoder_decoder_multimodal.py --model-type whisper --seed
0
-
python3 offline_inference/basic/classify.py
# for pooling models
-
python3 offline_inference/basic/embed.py
-
python3 pooling/pooling/vision_language_pooling.py --seed
0
-
python3 offline_inference/basic/score.py
# for features demo
-
python3 offline_inference/prefix_caching.py
-
python3 offline_inference/llm_engine_example.py
-
python3 others/tensorize_vllm_model.py --model facebook/opt-125m serialize --serialized-directory /tmp/ --suffix v1 && python3 others/tensorize_vllm_model.py --model facebook/opt-125m deserialize --path-to-tensors /tmp/vllm/facebook/opt-125m/v1/model.tensors
-
python3 offline_inference/spec_decode.py --test --method eagle --num_spec_tokens 3 --dataset-name hf --dataset-path philschmid/mt-bench --num-prompts 80 --temp 0 --top-p 1.0 --top-k -1 --tp 1 --enable-chunked-prefill --max-model-len
2048
-
python3 offline_inference/spec_decode.py --test --method eagle --num_spec_tokens 3 --dataset-name hf --dataset-path philschmid/mt-bench --num-prompts 80 --temp 0 --top-p 1.0 --top-k -1 --tp 1 --enable-chunked-prefill --max-model-len
2048
# https://github.com/vllm-project/vllm/pull/26682 uses slightly more memory in PyTorch 2.9+ causing this test to OOM in 1xL4 GPU
# https://github.com/vllm-project/vllm/pull/26682 uses slightly more memory in PyTorch 2.9+ causing this test to OOM in 1xL4 GPU
-
python3 offline_inference/spec_decode.py --test --method eagle3 --num_spec_tokens 3 --dataset-name hf --dataset-path philschmid/mt-bench --num-prompts 80 --temp 0 --top-p 1.0 --top-k -1 --tp 1 --enable-chunked-prefill --max-model-len
1536
-
python3 offline_inference/spec_decode.py --test --method eagle3 --num_spec_tokens 3 --dataset-name hf --dataset-path philschmid/mt-bench --num-prompts 80 --temp 0 --top-p 1.0 --top-k -1 --tp 1 --enable-chunked-prefill --max-model-len
1536
...
@@ -462,7 +468,9 @@ steps:
...
@@ -462,7 +468,9 @@ steps:
# tests covered elsewhere.
# tests covered elsewhere.
# Use `find` to launch multiple instances of pytest so that
# Use `find` to launch multiple instances of pytest so that
# they do not suffer from https://github.com/vllm-project/vllm/issues/28965
# they do not suffer from https://github.com/vllm-project/vllm/issues/28965
-
"
find
compile/
-maxdepth
1
-name
'test_*.py'
-exec
pytest
-s
-v
{}
\\\\
;"
# However, find does not normally propagate error codes, so we combine it with xargs
# (using -0 for proper path handling)
-
"
find
compile/
-maxdepth
1
-name
'test_*.py'
-print0
|
xargs
-0
-n1
-I{}
pytest
-s
-v
'{}'"
-
label
:
PyTorch Fullgraph Smoke Test
# 15min
-
label
:
PyTorch Fullgraph Smoke Test
# 15min
timeout_in_minutes
:
30
timeout_in_minutes
:
30
...
@@ -476,7 +484,9 @@ steps:
...
@@ -476,7 +484,9 @@ steps:
# as it is a heavy test that is covered in other steps.
# as it is a heavy test that is covered in other steps.
# Use `find` to launch multiple instances of pytest so that
# Use `find` to launch multiple instances of pytest so that
# they do not suffer from https://github.com/vllm-project/vllm/issues/28965
# they do not suffer from https://github.com/vllm-project/vllm/issues/28965
-
"
find
compile/fullgraph/
-name
'test_*.py'
-not
-name
'test_full_graph.py'
-exec
pytest
-s
-v
{}
\\\\
;"
# However, find does not normally propagate error codes, so we combine it with xargs
# (using -0 for proper path handling)
-
"
find
compile/fullgraph
-maxdepth
1
-name
'test_*.py'
-not
-name
'test_full_graph.py'
-print0
|
xargs
-0
-n1
-I{}
pytest
-s
-v
'{}'"
-
label
:
PyTorch Fullgraph Test
# 27min
-
label
:
PyTorch Fullgraph Test
# 27min
timeout_in_minutes
:
40
timeout_in_minutes
:
40
...
@@ -1369,4 +1379,22 @@ steps:
...
@@ -1369,4 +1379,22 @@ steps:
num_gpus
:
2
num_gpus
:
2
working_dir
:
"
/vllm-workspace"
working_dir
:
"
/vllm-workspace"
commands
:
commands
:
-
bash .buildkite/scripts/scheduled_integration_test/qwen30b_a3b_fp8_block_ep_eplb.sh 0.8 200 8020 2 1
-
bash .buildkite/scripts/scheduled_integration_test/qwen30b_a3b_fp8_block_ep_eplb.sh 0.8 200 8020 2
1
\ No newline at end of file
-
label
:
DeepSeek V2-Lite Async EPLB Accuracy
timeout_in_minutes
:
60
gpu
:
h100
optional
:
true
num_gpus
:
4
working_dir
:
"
/vllm-workspace"
commands
:
-
bash .buildkite/scripts/scheduled_integration_test/deepseek_v2_lite_ep_async_eplb.sh 0.25 1319
8030
-
label
:
Qwen3-Next-80B-A3B-Instruct MTP Async EPLB Accuracy
timeout_in_minutes
:
60
gpu
:
h100
optional
:
true
num_gpus
:
4
working_dir
:
"
/vllm-workspace"
commands
:
-
bash .buildkite/scripts/scheduled_integration_test/qwen3_next_mtp_async_eplb.sh 0.8 1319
8040
.buildkite/test_areas/attention.yaml
0 → 100644
View file @
8d75f22e
group
:
Attention
depends_on
:
-
image-build
steps
:
-
label
:
V1 attention (H100)
timeout_in_minutes
:
30
gpu
:
h100
source_file_dependencies
:
-
vllm/v1/attention
-
tests/v1/attention
commands
:
-
pytest -v -s v1/attention
-
label
:
V1 attention (B200)
timeout_in_minutes
:
30
gpu
:
b200
source_file_dependencies
:
-
vllm/v1/attention
-
tests/v1/attention
commands
:
-
VLLM_DISABLE_FLASHINFER_PREFILL=1 pytest -v -s v1/attention
# TODO: FI prefill is bugged and causes incorrectness, fix this
.buildkite/test_areas/basic_correctness.yaml
0 → 100644
View file @
8d75f22e
group
:
Basic Correctness
depends_on
:
-
image-build
steps
:
-
label
:
Basic Correctness
timeout_in_minutes
:
30
source_file_dependencies
:
-
vllm/
-
tests/basic_correctness/test_basic_correctness
-
tests/basic_correctness/test_cpu_offload
-
tests/basic_correctness/test_cumem.py
commands
:
-
export VLLM_WORKER_MULTIPROC_METHOD=spawn
-
pytest -v -s basic_correctness/test_cumem.py
-
pytest -v -s basic_correctness/test_basic_correctness.py
-
pytest -v -s basic_correctness/test_cpu_offload.py
.buildkite/test_areas/benchmarks.yaml
0 → 100644
View file @
8d75f22e
group
:
Benchmarks
depends_on
:
-
image-build
steps
:
-
label
:
Benchmarks
timeout_in_minutes
:
20
working_dir
:
"
/vllm-workspace/.buildkite"
source_file_dependencies
:
-
benchmarks/
commands
:
-
bash scripts/run-benchmarks.sh
-
label
:
Benchmarks CLI Test
timeout_in_minutes
:
20
source_file_dependencies
:
-
vllm/
-
tests/benchmarks/
commands
:
-
pytest -v -s benchmarks/
.buildkite/test_areas/compile.yaml
0 → 100644
View file @
8d75f22e
group
:
Compile
depends_on
:
-
image-build
steps
:
-
label
:
Fusion and Compile Tests (B200)
timeout_in_minutes
:
40
working_dir
:
"
/vllm-workspace/"
gpu
:
b200
source_file_dependencies
:
-
csrc/quantization/fp4/
-
vllm/model_executor/layers/quantization/utils/flashinfer_utils.py
-
vllm/v1/attention/backends/flashinfer.py
-
vllm/v1/worker/
-
vllm/v1/cudagraph_dispatcher.py
-
vllm/compilation/
# can affect pattern matching
-
vllm/model_executor/layers/layernorm.py
-
vllm/model_executor/layers/activation.py
-
vllm/model_executor/layers/quantization/input_quant_fp8.py
-
tests/compile/test_fusion_attn.py
-
tests/compile/test_silu_mul_quant_fusion.py
-
tests/compile/distributed/test_fusion_all_reduce.py
-
tests/compile/distributed/test_fusions_e2e.py
-
tests/compile/fullgraph/test_full_graph.py
commands
:
-
nvidia-smi
-
pytest -v -s tests/compile/test_fusion_attn.py
-
pytest -v -s tests/compile/test_silu_mul_quant_fusion.py
# this runner has 2 GPUs available even though num_gpus=2 is not set
-
pytest -v -s tests/compile/distributed/test_fusion_all_reduce.py
# Limit to Inductor partition, no custom ops, and allreduce & attn fusion to reduce running time
# Wrap with quotes to escape yaml
-
"
pytest
-v
-s
tests/compile/distributed/test_fusions_e2e.py::test_tp2_attn_quant_allreduce_rmsnorm
-k
'True
and
not
+quant_fp8
and
not
+rms_norm'"
# test_fp8_kv_scale_compile requires FlashAttention (not supported on default L4/L40)
-
pytest -v -s tests/compile/fullgraph/test_full_graph.py::test_fp8_kv_scale_compile
-
label
:
Fusion E2E (2 GPUs)(B200)
timeout_in_minutes
:
40
working_dir
:
"
/vllm-workspace/"
gpu
:
b200
optional
:
true
num_gpus
:
2
source_file_dependencies
:
-
csrc/quantization/fp4/
-
vllm/model_executor/layers/quantization/utils/flashinfer_utils.py
-
vllm/v1/attention/backends/flashinfer.py
-
vllm/compilation/
# can affect pattern matching
-
vllm/model_executor/layers/layernorm.py
-
vllm/model_executor/layers/activation.py
-
vllm/model_executor/layers/quantization/input_quant_fp8.py
-
tests/compile/distributed/test_fusions_e2e.py
commands
:
-
nvidia-smi
# Run all e2e fusion tests
-
pytest -v -s tests/compile/distributed/test_fusions_e2e.py
.buildkite/test_areas/cuda.yaml
0 → 100644
View file @
8d75f22e
group
:
CUDA
depends_on
:
-
image-build
steps
:
-
label
:
Platform Tests (CUDA)
timeout_in_minutes
:
15
source_file_dependencies
:
-
vllm/
-
tests/cuda
commands
:
-
pytest -v -s cuda/test_cuda_context.py
-
label
:
Cudagraph
timeout_in_minutes
:
20
source_file_dependencies
:
-
tests/v1/cudagraph
-
vllm/v1/cudagraph_dispatcher.py
-
vllm/config/compilation.py
-
vllm/compilation
commands
:
-
pytest -v -s v1/cudagraph/test_cudagraph_dispatch.py
-
pytest -v -s v1/cudagraph/test_cudagraph_mode.py
\ No newline at end of file
.buildkite/test_areas/distributed.yaml
0 → 100644
View file @
8d75f22e
group
:
Distributed
depends_on
:
-
image-build
steps
:
-
label
:
Distributed Comm Ops
timeout_in_minutes
:
20
working_dir
:
"
/vllm-workspace/tests"
num_gpus
:
2
source_file_dependencies
:
-
vllm/distributed
-
tests/distributed
commands
:
-
pytest -v -s distributed/test_comm_ops.py
-
pytest -v -s distributed/test_shm_broadcast.py
-
pytest -v -s distributed/test_shm_buffer.py
-
pytest -v -s distributed/test_shm_storage.py
-
label
:
Distributed (2 GPUs)
timeout_in_minutes
:
90
working_dir
:
"
/vllm-workspace/tests"
num_gpus
:
2
source_file_dependencies
:
-
vllm/compilation/
-
vllm/distributed/
-
vllm/engine/
-
vllm/executor/
-
vllm/worker/worker_base.py
-
vllm/v1/engine/
-
vllm/v1/worker/
-
tests/compile/fullgraph/test_basic_correctness.py
-
tests/compile/test_wrapper.py
-
tests/distributed/
-
tests/entrypoints/llm/test_collective_rpc.py
-
tests/v1/distributed
-
tests/v1/entrypoints/openai/test_multi_api_servers.py
-
tests/v1/shutdown
-
tests/v1/worker/test_worker_memory_snapshot.py
commands
:
# https://github.com/NVIDIA/nccl/issues/1838
-
export NCCL_CUMEM_HOST_ENABLE=0
-
TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/distributed/test_async_llm_dp.py
-
TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/distributed/test_eagle_dp.py
-
TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/distributed/test_external_lb_dp.py
-
DP_SIZE=2 pytest -v -s v1/entrypoints/openai/test_multi_api_servers.py
-
pytest -v -s entrypoints/llm/test_collective_rpc.py
-
pytest -v -s ./compile/fullgraph/test_basic_correctness.py
-
pytest -v -s ./compile/test_wrapper.py
-
VLLM_TEST_SAME_HOST=1 torchrun --nproc-per-node=4 distributed/test_same_node.py | grep 'Same node test passed'
-
VLLM_TEST_SAME_HOST=1 VLLM_TEST_WITH_DEFAULT_DEVICE_SET=1 torchrun --nproc-per-node=4 distributed/test_same_node.py | grep 'Same node test passed'
-
pytest -v -s distributed/test_sequence_parallel.py
-
CUDA_VISIBLE_DEVICES=0,1 pytest -v -s v1/shutdown
-
pytest -v -s v1/worker/test_worker_memory_snapshot.py
-
label
:
Distributed Tests (4 GPUs)
timeout_in_minutes
:
50
working_dir
:
"
/vllm-workspace/tests"
num_gpus
:
4
source_file_dependencies
:
-
vllm/distributed/
-
tests/distributed/test_utils
-
tests/distributed/test_pynccl
-
tests/distributed/test_events
-
tests/compile/fullgraph/test_basic_correctness.py
-
examples/offline_inference/rlhf.py
-
examples/offline_inference/rlhf_colocate.py
-
tests/examples/offline_inference/data_parallel.py
-
tests/v1/distributed
-
tests/v1/engine/test_engine_core_client.py
-
tests/distributed/test_symm_mem_allreduce.py
commands
:
# https://github.com/NVIDIA/nccl/issues/1838
-
export NCCL_CUMEM_HOST_ENABLE=0
# test with torchrun tp=2 and external_dp=2
-
torchrun --nproc-per-node=4 distributed/test_torchrun_example.py
# test with torchrun tp=2 and pp=2
-
PP_SIZE=2 torchrun --nproc-per-node=4 distributed/test_torchrun_example.py
# test with torchrun tp=4 and dp=1
-
TP_SIZE=4 torchrun --nproc-per-node=4 distributed/test_torchrun_example_moe.py
# test with torchrun tp=2, pp=2 and dp=1
-
PP_SIZE=2 TP_SIZE=2 torchrun --nproc-per-node=4 distributed/test_torchrun_example_moe.py
# test with torchrun tp=1 and dp=4 with ep
-
DP_SIZE=4 ENABLE_EP=1 torchrun --nproc-per-node=4 distributed/test_torchrun_example_moe.py
# test with torchrun tp=2 and dp=2 with ep
-
TP_SIZE=2 DP_SIZE=2 ENABLE_EP=1 torchrun --nproc-per-node=4 distributed/test_torchrun_example_moe.py
# test with internal dp
-
python3 ../examples/offline_inference/data_parallel.py --enforce-eager
-
TP_SIZE=2 DP_SIZE=2 pytest -v -s v1/distributed/test_async_llm_dp.py
-
TP_SIZE=2 DP_SIZE=2 pytest -v -s v1/distributed/test_eagle_dp.py
-
TP_SIZE=2 DP_SIZE=2 pytest -v -s v1/distributed/test_external_lb_dp.py
-
TP_SIZE=1 DP_SIZE=4 pytest -v -s v1/distributed/test_internal_lb_dp.py
-
TP_SIZE=1 DP_SIZE=4 pytest -v -s v1/distributed/test_hybrid_lb_dp.py
-
pytest -v -s v1/engine/test_engine_core_client.py::test_kv_cache_events_dp
-
pytest -v -s distributed/test_utils.py
-
pytest -v -s compile/fullgraph/test_basic_correctness.py
-
pytest -v -s distributed/test_pynccl.py
-
pytest -v -s distributed/test_events.py
-
pytest -v -s distributed/test_symm_mem_allreduce.py
# TODO: create a dedicated test section for multi-GPU example tests
# when we have multiple distributed example tests
-
cd ../examples/offline_inference
-
VLLM_ALLOW_INSECURE_SERIALIZATION=1 python3 rlhf.py
-
VLLM_ALLOW_INSECURE_SERIALIZATION=1 RAY_DEDUP_LOGS=0 python3 rlhf_colocate.py
-
label
:
Distributed Tests (8 GPUs)(H100)
timeout_in_minutes
:
10
gpu
:
h100
num_gpus
:
8
working_dir
:
"
/vllm-workspace/tests"
source_file_dependencies
:
-
examples/offline_inference/torchrun_dp_example.py
-
vllm/config/parallel.py
-
vllm/distributed/
-
vllm/v1/engine/llm_engine.py
-
vllm/v1/executor/uniproc_executor.py
-
vllm/v1/worker/gpu_worker.py
commands
:
# https://github.com/NVIDIA/nccl/issues/1838
-
export NCCL_CUMEM_HOST_ENABLE=0
# test with torchrun tp=2 and dp=4 with ep
-
torchrun --nproc-per-node=8 ../examples/offline_inference/torchrun_dp_example.py --tp-size=2 --pp-size=1 --dp-size=4 --enable-ep
-
label
:
Distributed Tests (4 GPUs)(A100)
gpu
:
a100
optional
:
true
num_gpus
:
4
source_file_dependencies
:
-
vllm/
commands
:
# NOTE: don't test llama model here, it seems hf implementation is buggy
# see https://github.com/vllm-project/vllm/pull/5689 for details
-
pytest -v -s distributed/test_custom_all_reduce.py
-
torchrun --nproc_per_node=2 distributed/test_ca_buffer_sharing.py
-
TARGET_TEST_SUITE=A100 pytest basic_correctness/ -v -s -m 'distributed(num_gpus=2)'
-
pytest -v -s -x lora/test_mixtral.py
-
label
:
Distributed Tests (2 GPUs)(H200)
gpu
:
h200
optional
:
true
working_dir
:
"
/vllm-workspace/"
num_gpus
:
2
commands
:
-
VLLM_TEST_CLEAN_GPU_MEMORY=1 pytest -v -s tests/compile/distributed/test_async_tp.py
-
pytest -v -s tests/compile/distributed/test_sequence_parallelism.py
-
pytest -v -s tests/compile/distributed/test_fusion_all_reduce.py
-
VLLM_TEST_CLEAN_GPU_MEMORY=1 pytest -v -s tests/compile/distributed/test_fusions_e2e.py -k 'not Llama-4'
-
VLLM_TEST_CLEAN_GPU_MEMORY=1 pytest -v -s tests/distributed/test_sequence_parallel.py
-
pytest -v -s tests/distributed/test_context_parallel.py
-
CUDA_VISIBLE_DEVICES=1,2 VLLM_ALL2ALL_BACKEND=deepep_high_throughput VLLM_USE_DEEP_GEMM=1 VLLM_LOGGING_LEVEL=DEBUG python3 examples/offline_inference/data_parallel.py --model Qwen/Qwen1.5-MoE-A2.7B --tp-size=1 --dp-size=2 --max-model-len
2048
-
pytest -v -s tests/v1/distributed/test_dbo.py
-
label
:
Distributed Tests (2 GPUs)(B200)
gpu
:
b200
optional
:
true
working_dir
:
"
/vllm-workspace/"
num_gpus
:
2
commands
:
-
pytest -v -s tests/distributed/test_context_parallel.py
-
pytest -v -s tests/distributed/test_nccl_symm_mem_allreduce.py
-
pytest -v -s tests/v1/distributed/test_dbo.py
-
label
:
2 Node Test (4 GPUs)
timeout_in_minutes
:
30
working_dir
:
"
/vllm-workspace/tests"
num_gpus
:
2
num_nodes
:
2
source_file_dependencies
:
-
vllm/distributed/
-
vllm/engine/
-
vllm/executor/
-
vllm/model_executor/models/
-
tests/distributed/
-
tests/examples/offline_inference/data_parallel.py
commands
:
-
./.buildkite/scripts/run-multi-node-test.sh /vllm-workspace/tests 2 2 public.ecr.aws/q9t5s3a7/vllm-ci-postmerge-repo:0bec63fa317e1fbd62e19b0fc31c43c81bf89077 "VLLM_TEST_SAME_HOST=0 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_same_node.py | grep 'Same node test passed' && NUM_NODES=2 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_node_count.py | grep 'Node count test passed' && python3 ../examples/offline_inference/data_parallel.py --dp-size=2 --tp-size=1 --node-size=2 --node-rank=0 --master-addr=192.168.10.10 --master-port=12345 --enforce-eager --trust-remote-code && VLLM_MULTI_NODE=1 pytest -v -s distributed/test_multi_node_assignment.py && VLLM_MULTI_NODE=1 pytest -v -s distributed/test_pipeline_parallel.py" "VLLM_TEST_SAME_HOST=0 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_same_node.py | grep 'Same node test passed' && NUM_NODES=2 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_node_count.py | grep 'Node count test passed' && python3 ../examples/offline_inference/data_parallel.py --dp-size=2 --tp-size=1 --node-size=2 --node-rank=1 --master-addr=192.168.10.10 --master-port=12345 --enforce-eager --trust-remote-code"
-
label
:
Distributed NixlConnector PD accuracy (4 GPUs)
timeout_in_minutes
:
30
working_dir
:
"
/vllm-workspace/tests"
num_gpus
:
4
source_file_dependencies
:
-
vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py
-
tests/v1/kv_connector/nixl_integration/
commands
:
-
uv pip install --system -r /vllm-workspace/requirements/kv_connectors.txt
-
bash v1/kv_connector/nixl_integration/tp_config_sweep_accuracy_test.sh
-
label
:
Pipeline + Context Parallelism (4 GPUs))
timeout_in_minutes
:
60
working_dir
:
"
/vllm-workspace/tests"
num_gpus
:
4
source_file_dependencies
:
-
vllm/distributed/
-
vllm/engine/
-
vllm/executor/
-
vllm/model_executor/models/
-
tests/distributed/
commands
:
-
pytest -v -s distributed/test_pp_cudagraph.py
-
pytest -v -s distributed/test_pipeline_parallel.py
\ No newline at end of file
.buildkite/test_areas/e2e_integration.yaml
0 → 100644
View file @
8d75f22e
group
:
E2E Integration
depends_on
:
-
image-build
steps
:
-
label
:
DeepSeek V2-Lite Accuracy
timeout_in_minutes
:
60
gpu
:
h100
optional
:
true
num_gpus
:
4
working_dir
:
"
/vllm-workspace"
commands
:
-
bash .buildkite/scripts/scheduled_integration_test/deepseek_v2_lite_ep_eplb.sh 0.25 200
8010
-
label
:
Qwen3-30B-A3B-FP8-block Accuracy
timeout_in_minutes
:
60
gpu
:
h100
optional
:
true
num_gpus
:
4
working_dir
:
"
/vllm-workspace"
commands
:
-
bash .buildkite/scripts/scheduled_integration_test/qwen30b_a3b_fp8_block_ep_eplb.sh 0.8 200
8020
-
label
:
Qwen3-30B-A3B-FP8-block Accuracy (B200)
timeout_in_minutes
:
60
gpu
:
b200
optional
:
true
num_gpus
:
2
working_dir
:
"
/vllm-workspace"
commands
:
-
bash .buildkite/scripts/scheduled_integration_test/qwen30b_a3b_fp8_block_ep_eplb.sh 0.8 200 8020 2
1
-
label
:
Prime-RL Integration (2 GPUs)
timeout_in_minutes
:
30
optional
:
true
num_gpus
:
2
working_dir
:
"
/vllm-workspace"
source_file_dependencies
:
-
vllm/
-
.buildkite/scripts/run-prime-rl-test.sh
commands
:
-
bash .buildkite/scripts/run-prime-rl-test.sh
-
label
:
DeepSeek V2-Lite Async EPLB Accuracy
timeout_in_minutes
:
60
gpu
:
h100
optional
:
true
num_gpus
:
4
working_dir
:
"
/vllm-workspace"
commands
:
-
bash .buildkite/scripts/scheduled_integration_test/deepseek_v2_lite_ep_async_eplb.sh 0.25 1319
8030
-
label
:
Qwen3-Next-80B-A3B-Instruct MTP Async EPLB Accuracy
timeout_in_minutes
:
60
gpu
:
h100
optional
:
true
num_gpus
:
4
working_dir
:
"
/vllm-workspace"
commands
:
-
bash .buildkite/scripts/scheduled_integration_test/qwen3_next_mtp_async_eplb.sh 0.8 1319
8040
.buildkite/test_areas/engine.yaml
0 → 100644
View file @
8d75f22e
group
:
Engine
depends_on
:
-
image-build
steps
:
-
label
:
Engine
timeout_in_minutes
:
15
source_file_dependencies
:
-
vllm/
-
tests/engine
-
tests/test_sequence
-
tests/test_config
-
tests/test_logger
-
tests/test_vllm_port
commands
:
-
pytest -v -s engine test_sequence.py test_config.py test_logger.py test_vllm_port.py
-
label
:
V1 e2e + engine
timeout_in_minutes
:
45
source_file_dependencies
:
-
vllm/
-
tests/v1
commands
:
# TODO: accuracy does not match, whether setting
# VLLM_USE_FLASHINFER_SAMPLER or not on H100.
-
pytest -v -s v1/e2e
-
pytest -v -s v1/engine
.buildkite/test_areas/entrypoints.yaml
0 → 100644
View file @
8d75f22e
group
:
Entrypoints
depends_on
:
-
image-build
steps
:
-
label
:
Entrypoints Unit Tests
timeout_in_minutes
:
10
working_dir
:
"
/vllm-workspace/tests"
source_file_dependencies
:
-
vllm/entrypoints
-
tests/entrypoints/
commands
:
-
pytest -v -s entrypoints/openai/tool_parsers
-
pytest -v -s entrypoints/ --ignore=entrypoints/llm --ignore=entrypoints/openai --ignore=entrypoints/offline_mode --ignore=entrypoints/test_chat_utils.py --ignore=entrypoints/pooling
-
label
:
Entrypoints Integration (LLM)
timeout_in_minutes
:
40
working_dir
:
"
/vllm-workspace/tests"
source_file_dependencies
:
-
vllm/
-
tests/entrypoints/llm
-
tests/entrypoints/offline_mode
commands
:
-
export VLLM_WORKER_MULTIPROC_METHOD=spawn
-
pytest -v -s entrypoints/llm --ignore=entrypoints/llm/test_generate.py --ignore=entrypoints/llm/test_collective_rpc.py
-
pytest -v -s entrypoints/llm/test_generate.py
# it needs a clean process
-
pytest -v -s entrypoints/offline_mode
# Needs to avoid interference with other tests
-
label
:
Entrypoints Integration (API Server)
timeout_in_minutes
:
130
working_dir
:
"
/vllm-workspace/tests"
source_file_dependencies
:
-
vllm/
-
tests/entrypoints/openai
-
tests/entrypoints/test_chat_utils
commands
:
-
export VLLM_WORKER_MULTIPROC_METHOD=spawn
-
PYTHONPATH=/vllm-workspace pytest -v -s entrypoints/openai/test_collective_rpc.py
# PYTHONPATH is needed to import custom Worker extension
-
pytest -v -s entrypoints/openai --ignore=entrypoints/openai/test_chat_with_tool_reasoning.py --ignore=entrypoints/openai/test_oot_registration.py --ignore=entrypoints/openai/test_tensorizer_entrypoint.py --ignore=entrypoints/openai/correctness/ --ignore=entrypoints/openai/test_collective_rpc.py --ignore=entrypoints/openai/tool_parsers/
-
pytest -v -s entrypoints/test_chat_utils.py
-
label
:
Entrypoints Integration (Pooling)
timeout_in_minutes
:
50
working_dir
:
"
/vllm-workspace/tests"
source_file_dependencies
:
-
vllm/
-
tests/entrypoints/pooling
commands
:
-
export VLLM_WORKER_MULTIPROC_METHOD=spawn
-
pytest -v -s entrypoints/pooling
-
label
:
Entrypoints V1
timeout_in_minutes
:
50
source_file_dependencies
:
-
vllm/
-
tests/v1
commands
:
-
pytest -v -s v1/entrypoints
-
label
:
OpenAI API Correctness
timeout_in_minutes
:
30
source_file_dependencies
:
-
csrc/
-
vllm/entrypoints/openai/
-
vllm/model_executor/models/whisper.py
commands
:
# LMEval+Transcription WER check
-
pytest -s entrypoints/openai/correctness/
.buildkite/test_areas/expert_parallelism.yaml
0 → 100644
View file @
8d75f22e
group
:
Expert Parallelism
depends_on
:
-
image-build
steps
:
-
label
:
EPLB Algorithm
timeout_in_minutes
:
15
working_dir
:
"
/vllm-workspace/tests"
source_file_dependencies
:
-
vllm/distributed/eplb
-
tests/distributed/test_eplb_algo.py
commands
:
-
pytest -v -s distributed/test_eplb_algo.py
-
label
:
EPLB Execution
timeout_in_minutes
:
20
working_dir
:
"
/vllm-workspace/tests"
num_gpus
:
4
source_file_dependencies
:
-
vllm/distributed/eplb
-
tests/distributed/test_eplb_execute.py
commands
:
-
pytest -v -s distributed/test_eplb_execute.py
-
pytest -v -s distributed/test_eplb_spec_decode.py
\ No newline at end of file
.buildkite/test_areas/kernels.yaml
0 → 100644
View file @
8d75f22e
group
:
Kernels
depends_on
:
-
image-build
steps
:
-
label
:
Kernels Core Operation Test
timeout_in_minutes
:
75
source_file_dependencies
:
-
csrc/
-
tests/kernels/core
-
tests/kernels/test_top_k_per_row.py
commands
:
-
pytest -v -s kernels/core kernels/test_top_k_per_row.py
-
label
:
Kernels Attention Test %N
timeout_in_minutes
:
35
source_file_dependencies
:
-
csrc/attention/
-
vllm/attention
-
vllm/v1/attention
-
tests/kernels/attention
commands
:
-
pytest -v -s kernels/attention --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT
parallelism
:
2
-
label
:
Kernels Quantization Test %N
timeout_in_minutes
:
90
source_file_dependencies
:
-
csrc/quantization/
-
vllm/model_executor/layers/quantization
-
tests/kernels/quantization
commands
:
-
pytest -v -s kernels/quantization --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT
parallelism
:
2
-
label
:
Kernels MoE Test %N
timeout_in_minutes
:
60
source_file_dependencies
:
-
csrc/quantization/cutlass_w8a8/moe/
-
csrc/moe/
-
tests/kernels/moe
-
vllm/model_executor/layers/fused_moe/
-
vllm/distributed/device_communicators/
-
vllm/envs.py
-
vllm/config
commands
:
-
pytest -v -s kernels/moe --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT
parallelism
:
2
-
label
:
Kernels Mamba Test
timeout_in_minutes
:
45
source_file_dependencies
:
-
csrc/mamba/
-
tests/kernels/mamba
-
vllm/model_executor/layers/mamba/ops
commands
:
-
pytest -v -s kernels/mamba
-
label
:
Kernels DeepGEMM Test (H100)
timeout_in_minutes
:
45
gpu
:
h100
num_gpus
:
1
source_file_dependencies
:
-
tools/install_deepgemm.sh
-
vllm/utils/deep_gemm.py
-
vllm/model_executor/layers/fused_moe
-
vllm/model_executor/layers/quantization
-
tests/kernels/quantization/test_block_fp8.py
-
tests/kernels/moe/test_deepgemm.py
-
tests/kernels/moe/test_batched_deepgemm.py
-
tests/kernels/attention/test_deepgemm_attention.py
commands
:
-
pytest -v -s kernels/quantization/test_block_fp8.py -k deep_gemm
-
pytest -v -s kernels/moe/test_deepgemm.py
-
pytest -v -s kernels/moe/test_batched_deepgemm.py
-
pytest -v -s kernels/attention/test_deepgemm_attention.py
-
label
:
Kernels (B200)
timeout_in_minutes
:
30
working_dir
:
"
/vllm-workspace/"
gpu
:
b200
# optional: true
source_file_dependencies
:
-
csrc/quantization/fp4/
-
csrc/attention/mla/
-
csrc/quantization/cutlass_w8a8/moe/
-
vllm/model_executor/layers/fused_moe/cutlass_moe.py
-
vllm/model_executor/layers/fused_moe/flashinfer_cutlass_moe.py
-
vllm/model_executor/layers/fused_moe/flashinfer_cutlass_prepare_finalize.py
-
vllm/model_executor/layers/quantization/utils/flashinfer_utils.py
-
vllm/v1/attention/backends/flashinfer.py
-
vllm/v1/attention/backends/mla/cutlass_mla.py
-
vllm/v1/attention/backends/mla/flashinfer_mla.py
-
vllm/platforms/cuda.py
-
vllm/attention/selector.py
commands
:
-
nvidia-smi
-
python3 examples/offline_inference/basic/chat.py
# Attention
# num_heads2 broken by https://github.com/flashinfer-ai/flashinfer/issues/1353
-
pytest -v -s tests/kernels/attention/test_attention_selector.py
-
pytest -v -s tests/kernels/attention/test_flashinfer.py -k 'not num_heads2'
-
pytest -v -s tests/kernels/attention/test_flashinfer_trtllm_attention.py
-
pytest -v -s tests/kernels/attention/test_cutlass_mla_decode.py
-
pytest -v -s tests/kernels/attention/test_flashinfer_mla_decode.py
# Quantization
-
pytest -v -s tests/kernels/quantization/test_cutlass_scaled_mm.py -k 'fp8'
-
pytest -v -s tests/kernels/quantization/test_nvfp4_quant.py
-
pytest -v -s tests/kernels/quantization/test_silu_mul_nvfp4_quant.py
-
pytest -v -s tests/kernels/quantization/test_nvfp4_scaled_mm.py
-
pytest -v -s tests/kernels/quantization/test_flashinfer_scaled_mm.py
-
pytest -v -s tests/kernels/quantization/test_flashinfer_nvfp4_scaled_mm.py
-
pytest -v -s tests/kernels/quantization/test_nvfp4_qutlass.py
-
pytest -v -s tests/kernels/quantization/test_mxfp4_qutlass.py
-
pytest -v -s tests/kernels/moe/test_nvfp4_moe.py
-
pytest -v -s tests/kernels/moe/test_ocp_mx_moe.py
-
pytest -v -s tests/kernels/moe/test_flashinfer.py
-
pytest -v -s tests/kernels/moe/test_cutedsl_moe.py
\ No newline at end of file
.buildkite/test_areas/lm_eval.yaml
0 → 100644
View file @
8d75f22e
group
:
LM Eval
depends_on
:
-
image-build
steps
:
-
label
:
LM Eval Small Models
timeout_in_minutes
:
75
source_file_dependencies
:
-
csrc/
-
vllm/model_executor/layers/quantization
autorun_on_main
:
true
commands
:
-
pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=configs/models-small.txt --tp-size=1
-
label
:
LM Eval Large Models (4 GPUs)(A100)
gpu
:
a100
optional
:
true
num_gpus
:
4
working_dir
:
"
/vllm-workspace/.buildkite/lm-eval-harness"
source_file_dependencies
:
-
csrc/
-
vllm/model_executor/layers/quantization
commands
:
-
export VLLM_WORKER_MULTIPROC_METHOD=spawn
-
pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-large.txt --tp-size=4
-
label
:
LM Eval Large Models (4 GPUs)(H100)
gpu
:
h100
optional
:
true
num_gpus
:
4
working_dir
:
"
/vllm-workspace/.buildkite/lm-eval-harness"
source_file_dependencies
:
-
csrc/
-
vllm/model_executor/layers/quantization
commands
:
-
export VLLM_USE_DEEP_GEMM=0
# We found Triton is faster than DeepGEMM for H100
-
pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-large-hopper.txt --tp-size=4
-
label
:
LM Eval Small Models (B200)
timeout_in_minutes
:
120
gpu
:
b200
optional
:
true
source_file_dependencies
:
-
csrc/
-
vllm/model_executor/layers/quantization
commands
:
-
pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=configs/models-blackwell.txt --tp-size=1
.buildkite/test_areas/lora.yaml
0 → 100644
View file @
8d75f22e
group
:
LoRA
depends_on
:
-
image-build
steps
:
-
label
:
LoRA %N
timeout_in_minutes
:
30
source_file_dependencies
:
-
vllm/lora
-
tests/lora
commands
:
-
pytest -v -s lora --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT --ignore=lora/test_chatglm3_tp.py --ignore=lora/test_llama_tp.py --ignore=lora/test_llm_with_multi_loras.py --ignore=lora/test_olmoe_tp.py --ignore=lora/test_deepseekv2_tp.py --ignore=lora/test_gptoss_tp.py --ignore=lora/test_qwen3moe_tp.py
parallelism
:
4
-
label
:
LoRA TP (Distributed)
timeout_in_minutes
:
30
num_gpus
:
4
source_file_dependencies
:
-
vllm/lora
-
tests/lora
commands
:
# FIXME: find out which code initializes CUDA before running the test.
# Until that is fixed, we need to use spawn to run these tests.
-
export VLLM_WORKER_MULTIPROC_METHOD=spawn
# There is some Tensor Parallelism related processing logic in LoRA that
# requires multi-GPU testing for validation.
-
pytest -v -s -x lora/test_chatglm3_tp.py
-
pytest -v -s -x lora/test_llama_tp.py
-
pytest -v -s -x lora/test_llm_with_multi_loras.py
-
pytest -v -s -x lora/test_olmoe_tp.py
-
pytest -v -s -x lora/test_gptoss_tp.py
\ No newline at end of file
.buildkite/test_areas/misc.yaml
0 → 100644
View file @
8d75f22e
group
:
Miscellaneous
depends_on
:
-
image-build
steps
:
-
label
:
V1 Others
timeout_in_minutes
:
60
source_file_dependencies
:
-
vllm/
-
tests/v1
commands
:
-
uv pip install --system -r /vllm-workspace/requirements/kv_connectors.txt
# split the test to avoid interference
-
pytest -v -s -m 'not cpu_test' v1/core
-
pytest -v -s v1/executor
-
pytest -v -s v1/kv_offload
-
pytest -v -s v1/sample
-
pytest -v -s v1/logits_processors
-
pytest -v -s v1/worker
-
pytest -v -s v1/spec_decode
-
pytest -v -s -m 'not cpu_test' v1/kv_connector/unit
-
pytest -v -s -m 'not cpu_test' v1/metrics
-
pytest -v -s v1/test_oracle.py
-
pytest -v -s v1/test_request.py
-
pytest -v -s v1/test_outputs.py
# Integration test for streaming correctness (requires special branch).
-
pip install -U git+https://github.com/robertgshaw2-redhat/lm-evaluation-harness.git@streaming-api
-
pytest -v -s entrypoints/openai/correctness/test_lmeval.py::test_lm_eval_accuracy_v1_engine
-
label
:
V1 Others (CPU)
depends_on
:
~
source_file_dependencies
:
-
vllm/
-
tests/v1
no_gpu
:
true
commands
:
# split the test to avoid interference
-
pytest -v -s -m 'cpu_test' v1/core
-
pytest -v -s v1/structured_output
-
pytest -v -s v1/test_serial_utils.py
-
pytest -v -s -m 'cpu_test' v1/kv_connector/unit
-
pytest -v -s -m 'cpu_test' v1/metrics
-
label
:
Regression
timeout_in_minutes
:
20
source_file_dependencies
:
-
vllm/
-
tests/test_regression
commands
:
-
pip install modelscope
-
pytest -v -s test_regression.py
working_dir
:
"
/vllm-workspace/tests"
# optional
-
label
:
Examples
timeout_in_minutes
:
45
working_dir
:
"
/vllm-workspace/examples"
source_file_dependencies
:
-
vllm/entrypoints
-
vllm/multimodal
-
examples/
commands
:
-
pip install tensorizer
# for tensorizer test
-
python3 offline_inference/basic/chat.py
# for basic
-
python3 offline_inference/basic/generate.py --model facebook/opt-125m
-
python3 offline_inference/basic/generate.py --model meta-llama/Llama-2-13b-chat-hf --cpu-offload-gb
10
-
python3 offline_inference/basic/classify.py
-
python3 offline_inference/basic/embed.py
-
python3 offline_inference/basic/score.py
# for multi-modal models
-
python3 offline_inference/audio_language.py --seed
0
-
python3 offline_inference/vision_language.py --seed
0
-
python3 offline_inference/vision_language_multi_image.py --seed
0
-
python3 offline_inference/encoder_decoder_multimodal.py --model-type whisper --seed
0
# for pooling models
-
python3 pooling/pooling/vision_language_pooling.py --seed
0
# for features demo
-
python3 offline_inference/prefix_caching.py
-
python3 offline_inference/llm_engine_example.py
-
python3 others/tensorize_vllm_model.py --model facebook/opt-125m serialize --serialized-directory /tmp/ --suffix v1 && python3 others/tensorize_vllm_model.py --model facebook/opt-125m deserialize --path-to-tensors /tmp/vllm/facebook/opt-125m/v1/model.tensors
-
python3 offline_inference/spec_decode.py --test --method eagle --num_spec_tokens 3 --dataset-name hf --dataset-path philschmid/mt-bench --num-prompts 80 --temp 0 --top-p 1.0 --top-k -1 --tp 1 --enable-chunked-prefill --max-model-len
2048
# https://github.com/vllm-project/vllm/pull/26682 uses slightly more memory in PyTorch 2.9+, causing this test to OOM on a 1xL4 GPU
-
python3 offline_inference/spec_decode.py --test --method eagle3 --num_spec_tokens 3 --dataset-name hf --dataset-path philschmid/mt-bench --num-prompts 80 --temp 0 --top-p 1.0 --top-k -1 --tp 1 --enable-chunked-prefill --max-model-len
1536
-
label
:
Metrics, Tracing (2 GPUs)
timeout_in_minutes
:
20
num_gpus
:
2
source_file_dependencies
:
-
vllm/
-
tests/v1/tracing
commands
:
-
"
pip
install
\
'opentelemetry-sdk>=1.26.0'
\
'opentelemetry-api>=1.26.0'
\
'opentelemetry-exporter-otlp>=1.26.0'
\
'opentelemetry-semantic-conventions-ai>=0.4.1'"
-
pytest -v -s v1/tracing
-
label
:
Python-only Installation
depends_on
:
~
timeout_in_minutes
:
20
source_file_dependencies
:
-
tests/standalone_tests/python_only_compile.sh
-
setup.py
commands
:
-
bash standalone_tests/python_only_compile.sh
-
label
:
Async Engine, Inputs, Utils, Worker
timeout_in_minutes
:
50
source_file_dependencies
:
-
vllm/
-
tests/multimodal
-
tests/utils_
commands
:
-
pytest -v -s -m 'not cpu_test' multimodal
-
pytest -v -s utils_
-
label
:
Async Engine, Inputs, Utils, Worker, Config (CPU)
depends_on
:
~
timeout_in_minutes
:
20
source_file_dependencies
:
-
vllm/
-
tests/test_inputs.py
-
tests/test_outputs.py
-
tests/multimodal
-
tests/standalone_tests/lazy_imports.py
-
tests/tokenizers_
-
tests/transformers_utils
-
tests/config
no_gpu
:
true
commands
:
-
python3 standalone_tests/lazy_imports.py
-
pytest -v -s test_inputs.py
-
pytest -v -s test_outputs.py
-
pytest -v -s -m 'cpu_test' multimodal
-
pytest -v -s tokenizers_
-
pytest -v -s transformers_utils
-
pytest -v -s config
-
label
:
GPT-OSS Eval (B200)
timeout_in_minutes
:
60
working_dir
:
"
/vllm-workspace/"
gpu
:
b200
optional
:
true
source_file_dependencies
:
-
tests/evals/gpt_oss
-
vllm/model_executor/models/gpt_oss.py
-
vllm/model_executor/layers/quantization/mxfp4.py
-
vllm/v1/attention/backends/flashinfer.py
commands
:
-
uv pip install --system 'gpt-oss[eval]==0.0.5'
-
pytest -s -v tests/evals/gpt_oss/test_gpqa_correctness.py --model openai/gpt-oss-20b --metric
0.58
-
label
:
Batch Invariance (H100)
timeout_in_minutes
:
25
gpu
:
h100
source_file_dependencies
:
-
vllm/v1/attention
-
vllm/model_executor/layers
-
tests/v1/determinism/
commands
:
-
export VLLM_WORKER_MULTIPROC_METHOD=spawn
-
pip install pytest-timeout pytest-forked
-
pytest -v -s v1/determinism/test_batch_invariance.py
-
pytest -v -s v1/determinism/test_rms_norm_batch_invariant.py
\ No newline at end of file
.buildkite/test_areas/model_executor.yaml
0 → 100644
View file @
8d75f22e
group
:
Model Executor
depends_on
:
-
image-build
steps
:
-
label
:
Model Executor
timeout_in_minutes
:
35
source_file_dependencies
:
-
vllm/engine/arg_utils.py
-
vllm/config/model.py
-
vllm/model_executor
-
tests/model_executor
-
tests/entrypoints/openai/test_tensorizer_entrypoint.py
commands
:
-
apt-get update && apt-get install -y curl libsodium23
-
export VLLM_WORKER_MULTIPROC_METHOD=spawn
-
pytest -v -s model_executor
-
pytest -v -s entrypoints/openai/test_tensorizer_entrypoint.py
.buildkite/test_areas/models_basic.yaml
0 → 100644
View file @
8d75f22e
group
:
Models - Basic
depends_on
:
-
image-build
steps
:
-
label
:
Basic Models Tests (Initialization)
timeout_in_minutes
:
45
mirror_hardwares
:
[
amdexperimental
]
torch_nightly
:
true
source_file_dependencies
:
-
vllm/
-
tests/models/test_initialization.py
commands
:
# Run a subset of model initialization tests
-
pytest -v -s models/test_initialization.py::test_can_initialize_small_subset
-
label
:
Basic Models Tests (Extra Initialization) %N
timeout_in_minutes
:
45
mirror_hardwares
:
[
amdexperimental
]
torch_nightly
:
true
source_file_dependencies
:
-
vllm/model_executor/models/
-
tests/models/test_initialization.py
commands
:
# Only when vLLM model source is modified - test initialization of a large
# subset of supported models (the complement of the small subset in the above
# test). Also run if the model initialization test file is modified.
-
pytest -v -s models/test_initialization.py -k 'not test_can_initialize_small_subset' --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT --shard-id=$$BUILDKITE_PARALLEL_JOB
parallelism
:
2
-
label
:
Basic Models Tests (Other)
timeout_in_minutes
:
45
source_file_dependencies
:
-
vllm/
-
tests/models/test_transformers.py
-
tests/models/test_registry.py
commands
:
-
pytest -v -s models/test_transformers.py models/test_registry.py
-
label
:
Basic Models Test (Other CPU)
# 5min
timeout_in_minutes
:
10
source_file_dependencies
:
-
vllm/
-
tests/models/test_utils.py
-
tests/models/test_vision.py
no_gpu
:
true
commands
:
-
pytest -v -s models/test_utils.py models/test_vision.py
-
label
:
Transformers Nightly Models
working_dir
:
"
/vllm-workspace/"
optional
:
true
soft_fail
:
true
commands
:
-
pip install --upgrade git+https://github.com/huggingface/transformers
-
pytest -v -s tests/models/test_initialization.py
-
pytest -v -s tests/models/test_transformers.py
-
pytest -v -s tests/models/multimodal/processing/
-
pytest -v -s tests/models/multimodal/test_mapping.py
-
python3 examples/offline_inference/basic/chat.py
-
python3 examples/offline_inference/vision_language.py --model-type qwen2_5_vl
# Whisper needs spawn method to avoid deadlock
-
VLLM_WORKER_MULTIPROC_METHOD=spawn python3 examples/offline_inference/audio_language.py --model-type whisper
.buildkite/test_areas/models_distributed.yaml
0 → 100644
View file @
8d75f22e
group
:
Models - Distributed
depends_on
:
-
image-build
steps
:
-
label
:
Distributed Model Tests (2 GPUs)
timeout_in_minutes
:
50
working_dir
:
"
/vllm-workspace/tests"
num_gpus
:
2
source_file_dependencies
:
-
vllm/model_executor/model_loader/sharded_state_loader.py
-
vllm/model_executor/models/
-
tests/basic_correctness/
-
tests/model_executor/model_loader/test_sharded_state_loader.py
-
tests/models/
commands
:
-
TARGET_TEST_SUITE=L4 pytest basic_correctness/ -v -s -m 'distributed(num_gpus=2)'
-
CUDA_VISIBLE_DEVICES=0,1 pytest -v -s model_executor/model_loader/test_sharded_state_loader.py
# Avoid importing model tests that cause CUDA reinitialization error
-
pytest models/test_transformers.py -v -s -m 'distributed(num_gpus=2)'
-
pytest models/language -v -s -m 'distributed(num_gpus=2)'
-
pytest models/multimodal -v -s -m 'distributed(num_gpus=2)' --ignore models/multimodal/generation/test_whisper.py
-
VLLM_WORKER_MULTIPROC_METHOD=spawn pytest models/multimodal/generation/test_whisper.py -v -s -m 'distributed(num_gpus=2)'
.buildkite/test_areas/models_language.yaml
0 → 100644
View file @
8d75f22e
group
:
Models - Language
depends_on
:
-
image-build
steps
:
-
label
:
Language Models Tests (Standard)
timeout_in_minutes
:
25
mirror_hardwares
:
[
amdexperimental
]
torch_nightly
:
true
source_file_dependencies
:
-
vllm/
-
tests/models/language
commands
:
# Test standard language models, excluding a subset of slow tests
-
pip freeze | grep -E 'torch'
-
pytest -v -s models/language -m 'core_model and (not slow_test)'
-
label
:
Language Models Tests (Extra Standard) %N
timeout_in_minutes
:
45
mirror_hardwares
:
[
amdexperimental
]
torch_nightly
:
true
source_file_dependencies
:
-
vllm/model_executor/models/
-
tests/models/language/pooling/test_embedding.py
-
tests/models/language/generation/test_common.py
-
tests/models/language/pooling/test_classification.py
commands
:
# Shard the slow subset of the standard language model tests. Only run when model
# source is modified, or when the specified test files are modified.
-
pip freeze | grep -E 'torch'
-
pytest -v -s models/language -m 'core_model and slow_test' --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT --shard-id=$$BUILDKITE_PARALLEL_JOB
parallelism
:
2
-
label
:
Language Models Tests (Hybrid) %N
timeout_in_minutes
:
75
mirror_hardwares
:
[
amdexperimental
]
torch_nightly
:
true
source_file_dependencies
:
-
vllm/
-
tests/models/language/generation
commands
:
# Install fast path packages for testing against transformers
# Note: also needed to run plamo2 model in vLLM
-
uv pip install --system --no-build-isolation 'git+https://github.com/state-spaces/mamba@v2.2.5'
-
uv pip install --system --no-build-isolation 'git+https://github.com/Dao-AILab/causal-conv1d@v1.5.2'
# Shard hybrid language model tests
-
pytest -v -s models/language/generation -m hybrid_model --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT --shard-id=$$BUILDKITE_PARALLEL_JOB
parallelism
:
2
-
label
:
Language Models Test (Extended Generation)
# 80min
timeout_in_minutes
:
110
mirror_hardwares
:
[
amdexperimental
]
optional
:
true
source_file_dependencies
:
-
vllm/
-
tests/models/language/generation
commands
:
# Install fast path packages for testing against transformers
# Note: also needed to run plamo2 model in vLLM
-
uv pip install --system --no-build-isolation 'git+https://github.com/state-spaces/mamba@v2.2.5'
-
uv pip install --system --no-build-isolation 'git+https://github.com/Dao-AILab/causal-conv1d@v1.5.2'
-
pytest -v -s models/language/generation -m '(not core_model) and (not hybrid_model)'
-
label
:
Language Models Test (PPL)
timeout_in_minutes
:
110
mirror_hardwares
:
[
amdexperimental
]
optional
:
true
source_file_dependencies
:
-
vllm/
-
tests/models/language/generation_ppl_test
commands
:
-
pytest -v -s models/language/generation_ppl_test
-
label
:
Language Models Test (Extended Pooling)
# 36min
timeout_in_minutes
:
50
mirror_hardwares
:
[
amdexperimental
]
optional
:
true
source_file_dependencies
:
-
vllm/
-
tests/models/language/pooling
commands
:
-
pytest -v -s models/language/pooling -m 'not core_model'
-
label
:
Language Models Test (MTEB)
timeout_in_minutes
:
110
mirror_hardwares
:
[
amdexperimental
]
optional
:
true
source_file_dependencies
:
-
vllm/
-
tests/models/language/pooling_mteb_test
commands
:
-
pytest -v -s models/language/pooling_mteb_test
.buildkite/test_areas/models_multimodal.yaml
0 → 100644
View file @
8d75f22e
group
:
Models - Multimodal
depends_on
:
-
image-build
steps
:
-
label
:
Multi-Modal Models (Standard)
# 60min
timeout_in_minutes
:
80
source_file_dependencies
:
-
vllm/
-
tests/models/multimodal
commands
:
-
pip install git+https://github.com/TIGER-AI-Lab/Mantis.git
-
pip freeze | grep -E 'torch'
-
pytest -v -s models/multimodal -m core_model --ignore models/multimodal/generation/test_whisper.py --ignore models/multimodal/processing
-
cd .. && VLLM_WORKER_MULTIPROC_METHOD=spawn pytest -v -s tests/models/multimodal/generation/test_whisper.py -m core_model
# Otherwise, mp_method="spawn" doesn't work
-
label
:
Multi-Modal Processor Test (CPU)
timeout_in_minutes
:
60
source_file_dependencies
:
-
vllm/
-
tests/models/multimodal
no_gpu
:
true
commands
:
-
pip install git+https://github.com/TIGER-AI-Lab/Mantis.git
-
pytest -v -s models/multimodal/processing --ignore models/multimodal/processing/test_tensor_schema.py
-
label
:
Multi-Modal Processor
# 44min
timeout_in_minutes
:
60
source_file_dependencies
:
-
vllm/
-
tests/models/multimodal
commands
:
-
pip install git+https://github.com/TIGER-AI-Lab/Mantis.git
-
pytest -v -s models/multimodal/processing/test_tensor_schema.py
-
label
:
Multi-Modal Accuracy Eval (Small Models)
# 50min
timeout_in_minutes
:
70
working_dir
:
"
/vllm-workspace/.buildkite/lm-eval-harness"
source_file_dependencies
:
-
vllm/multimodal/
-
vllm/inputs/
-
vllm/v1/core/
commands
:
-
pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-mm-small.txt --tp-size=1
-
label
:
Multi-Modal Models (Extended)
1
optional
:
true
source_file_dependencies
:
-
vllm/
-
tests/models/multimodal
commands
:
-
pip install git+https://github.com/TIGER-AI-Lab/Mantis.git
-
pytest -v -s models/multimodal -m 'not core_model' --ignore models/multimodal/generation/test_common.py --ignore models/multimodal/processing
-
label
:
Multi-Modal Models (Extended)
2
optional
:
true
source_file_dependencies
:
-
vllm/
-
tests/models/multimodal
commands
:
-
pip install git+https://github.com/TIGER-AI-Lab/Mantis.git
-
pytest -v -s models/multimodal/generation/test_common.py -m 'split(group=0) and not core_model'
-
label
:
Multi-Modal Models (Extended)
3
optional
:
true
source_file_dependencies
:
-
vllm/
-
tests/models/multimodal
commands
:
-
pip install git+https://github.com/TIGER-AI-Lab/Mantis.git
-
pytest -v -s models/multimodal/generation/test_common.py -m 'split(group=1) and not core_model'
# This test is used only during the PR development phase to test individual models and should never run on main.
-
label
:
Custom Models
optional
:
true
commands
:
-
echo 'Testing custom models...'
# PR authors can temporarily add commands below to test individual models
# e.g. pytest -v -s models/encoder_decoder/vision_language/test_mllama.py
# *To avoid merge conflicts, remember to REMOVE (not just comment out) them before merging the PR*
Prev
1
2
3
4
5
6
…
36
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment