Unverified Commit db14f61f authored by Kevin H. Luu's avatar Kevin H. Luu Committed by GitHub
Browse files

[ci] Refactor CI file structure (#29343)

parent 78c75033
name: vllm_ci
job_dirs:
- ".buildkite/test_areas"
- ".buildkite/image_build"
run_all_patterns:
- "docker/Dockerfile"
- "CMakeLists.txt"
- "requirements/common.txt"
- "requirements/cuda.txt"
- "requirements/build.txt"
- "requirements/test.txt"
- "setup.py"
- "csrc/"
- "cmake/"
run_all_exclude_patterns:
- "docker/Dockerfile."
- "csrc/cpu/"
- "csrc/rocm/"
- "cmake/hipify.py"
- "cmake/cpu_extension.cmake"
registries: public.ecr.aws/q9t5s3a7
repositories:
main: "vllm-ci-postmerge-repo"
premerge: "vllm-ci-test-repo"
#!/bin/bash
set -e
if [[ $# -lt 8 ]]; then
echo "Usage: $0 <registry> <repo> <commit> <branch> <vllm_use_precompiled> <vllm_merge_base_commit> <cache_from> <cache_to>"
exit 1
fi
REGISTRY=$1
REPO=$2
BUILDKITE_COMMIT=$3
BRANCH=$4
VLLM_USE_PRECOMPILED=$5
VLLM_MERGE_BASE_COMMIT=$6
CACHE_FROM=$7
CACHE_TO=$8
# authenticate with AWS ECR
aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin $REGISTRY
aws ecr get-login-password --region us-east-1 | docker login --username AWS --password-stdin 936637512419.dkr.ecr.us-east-1.amazonaws.com
# docker buildx
docker buildx create --name vllm-builder --driver docker-container --use
docker buildx inspect --bootstrap
docker buildx ls
# skip build if image already exists
if [[ -z $(docker manifest inspect $REGISTRY/$REPO:$BUILDKITE_COMMIT) ]]; then
echo "Image not found, proceeding with build..."
else
echo "Image found"
exit 0
fi
if [[ "${VLLM_USE_PRECOMPILED:-0}" == "1" ]]; then
merge_base_commit_build_args="--build-arg VLLM_MERGE_BASE_COMMIT=${VLLM_MERGE_BASE_COMMIT}"
else
merge_base_commit_build_args=""
fi
# build
docker buildx build --file docker/Dockerfile \
--build-arg max_jobs=16 \
--build-arg buildkite_commit=$BUILDKITE_COMMIT \
--build-arg USE_SCCACHE=1 \
--build-arg TORCH_CUDA_ARCH_LIST="8.0 8.9 9.0 10.0" \
--build-arg FI_TORCH_CUDA_ARCH_LIST="8.0 8.9 9.0a 10.0a" \
--build-arg VLLM_USE_PRECOMPILED="${VLLM_USE_PRECOMPILED:-0}" \
${merge_base_commit_build_args} \
--cache-from type=registry,ref=${CACHE_FROM},mode=max \
--cache-to type=registry,ref=${CACHE_TO},mode=max \
--tag ${REGISTRY}/${REPO}:${BUILDKITE_COMMIT} \
$( [[ "${BRANCH}" == "main" ]] && echo "--tag ${REGISTRY}/${REPO}:latest" ) \
--push \
--target test \
--progress plain .
group: Abuild
steps:
- label: ":docker: Build image"
key: image-build
depends_on: []
commands:
- .buildkite/image_build/image_build.sh $REGISTRY $REPO $BUILDKITE_COMMIT $BRANCH $VLLM_USE_PRECOMPILED $VLLM_MERGE_BASE_COMMIT $CACHE_FROM $CACHE_TO
retry:
automatic:
- exit_status: -1 # Agent was lost
limit: 2
- exit_status: -10 # Agent was lost
limit: 2
- label: ":docker: Build CPU image"
key: image-build-cpu
depends_on: []
commands:
- .buildkite/image_build/image_build_cpu.sh $REGISTRY $REPO $BUILDKITE_COMMIT
env:
DOCKER_BUILDKIT: "1"
retry:
automatic:
- exit_status: -1 # Agent was lost
limit: 2
- exit_status: -10 # Agent was lost
limit: 2
- label: ":docker: Build HPU image"
soft_fail: true
depends_on: []
key: image-build-hpu
commands:
- .buildkite/image_build/image_build_hpu.sh $REGISTRY $REPO $BUILDKITE_COMMIT
env:
DOCKER_BUILDKIT: "1"
retry:
automatic:
- exit_status: -1 # Agent was lost
limit: 2
- exit_status: -10 # Agent was lost
limit: 2
- label: ":docker: Build CPU arm64 image"
key: cpu-arm64-image-build
depends_on: []
optional: true
commands:
- .buildkite/image_build/image_build_cpu_arm64.sh $REGISTRY $REPO $BUILDKITE_COMMIT
env:
DOCKER_BUILDKIT: "1"
retry:
automatic:
- exit_status: -1 # Agent was lost
limit: 2
- exit_status: -10 # Agent was lost
limit: 2
#!/bin/bash
set -e
if [[ $# -lt 3 ]]; then
echo "Usage: $0 <registry> <repo> <commit>"
exit 1
fi
REGISTRY=$1
REPO=$2
BUILDKITE_COMMIT=$3
# authenticate with AWS ECR
aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin $REGISTRY
# skip build if image already exists
if [[ -z $(docker manifest inspect $REGISTRY/$REPO:$BUILDKITE_COMMIT-cpu) ]]; then
echo "Image not found, proceeding with build..."
else
echo "Image found"
exit 0
fi
# build
docker build --file docker/Dockerfile.cpu \
--build-arg max_jobs=16 \
--build-arg buildkite_commit=$BUILDKITE_COMMIT \
--build-arg VLLM_CPU_AVX512BF16=true \
--build-arg VLLM_CPU_AVX512VNNI=true \
--build-arg VLLM_CPU_AMXBF16=true \
--tag $REGISTRY/$REPO:$BUILDKITE_COMMIT-cpu \
--target vllm-test \
--progress plain .
# push
docker push $REGISTRY/$REPO:$BUILDKITE_COMMIT-cpu
#!/bin/bash
set -e
if [[ $# -lt 3 ]]; then
echo "Usage: $0 <registry> <repo> <commit>"
exit 1
fi
REGISTRY=$1
REPO=$2
BUILDKITE_COMMIT=$3
# authenticate with AWS ECR
aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin $REGISTRY
# skip build if image already exists
if [[ -z $(docker manifest inspect $REGISTRY/$REPO:$BUILDKITE_COMMIT-cpu) ]]; then
echo "Image not found, proceeding with build..."
else
echo "Image found"
exit 0
fi
# build
docker build --file docker/Dockerfile.cpu \
--build-arg max_jobs=16 \
--build-arg buildkite_commit=$BUILDKITE_COMMIT \
--tag $REGISTRY/$REPO:$BUILDKITE_COMMIT-cpu \
--target vllm-test \
--progress plain .
# push
docker push $REGISTRY/$REPO:$BUILDKITE_COMMIT-cpu
#!/bin/bash
set -e
if [[ $# -lt 3 ]]; then
echo "Usage: $0 <registry> <repo> <commit>"
exit 1
fi
REGISTRY=$1
REPO=$2
BUILDKITE_COMMIT=$3
# authenticate with AWS ECR
aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin $REGISTRY
# skip build if image already exists
if [[ -z $(docker manifest inspect $REGISTRY/$REPO:$BUILDKITE_COMMIT-hpu) ]]; then
echo "Image not found, proceeding with build..."
else
echo "Image found"
exit 0
fi
# build
docker build \
--file tests/pytorch_ci_hud_benchmark/Dockerfile.hpu \
--build-arg max_jobs=16 \
--build-arg buildkite_commit=$BUILDKITE_COMMIT \
--tag $REGISTRY/$REPO:$BUILDKITE_COMMIT-hpu \
--progress plain \
https://github.com/vllm-project/vllm-gaudi.git
# push
docker push $REGISTRY/$REPO:$BUILDKITE_COMMIT-hpu
group: Attention
depends_on:
- image-build
steps:
- label: V1 attention (H100)
timeout_in_minutes: 30
gpu: h100
source_file_dependencies:
- vllm/v1/attention
- tests/v1/attention
commands:
- pytest -v -s v1/attention
- label: V1 attention (B200)
timeout_in_minutes: 30
gpu: b200
source_file_dependencies:
- vllm/v1/attention
- tests/v1/attention
commands:
- VLLM_DISABLE_FLASHINFER_PREFILL=1 pytest -v -s v1/attention # TODO: FI prefill is bugged and causes incorrectness, fix this
group: Basic Correctness
depends_on:
- image-build
steps:
- label: Basic Correctness
timeout_in_minutes: 30
source_file_dependencies:
- vllm/
- tests/basic_correctness/test_basic_correctness
- tests/basic_correctness/test_cpu_offload
- tests/basic_correctness/test_cumem.py
commands:
- export VLLM_WORKER_MULTIPROC_METHOD=spawn
- pytest -v -s basic_correctness/test_cumem.py
- pytest -v -s basic_correctness/test_basic_correctness.py
- pytest -v -s basic_correctness/test_cpu_offload.py
group: Benchmarks
depends_on:
- image-build
steps:
- label: Benchmarks
timeout_in_minutes: 20
working_dir: "/vllm-workspace/.buildkite"
source_file_dependencies:
- benchmarks/
commands:
- bash scripts/run-benchmarks.sh
- label: Benchmarks CLI Test
timeout_in_minutes: 20
source_file_dependencies:
- vllm/
- tests/benchmarks/
commands:
- pytest -v -s benchmarks/
group: Compile
depends_on:
- image-build
steps:
- label: Fusion and Compile Tests (B200)
timeout_in_minutes: 40
working_dir: "/vllm-workspace/"
gpu: b200
source_file_dependencies:
- csrc/quantization/fp4/
- vllm/model_executor/layers/quantization/utils/flashinfer_utils.py
- vllm/v1/attention/backends/flashinfer.py
- vllm/v1/worker/
- vllm/v1/cudagraph_dispatcher.py
- vllm/compilation/
# can affect pattern matching
- vllm/model_executor/layers/layernorm.py
- vllm/model_executor/layers/activation.py
- vllm/model_executor/layers/quantization/input_quant_fp8.py
- tests/compile/test_fusion_attn.py
- tests/compile/test_silu_mul_quant_fusion.py
- tests/compile/distributed/test_fusion_all_reduce.py
- tests/compile/distributed/test_fusions_e2e.py
- tests/compile/fullgraph/test_full_graph.py
commands:
- nvidia-smi
- pytest -v -s tests/compile/test_fusion_attn.py
- pytest -v -s tests/compile/test_silu_mul_quant_fusion.py
# this runner has 2 GPUs available even though num_gpus=2 is not set
- pytest -v -s tests/compile/distributed/test_fusion_all_reduce.py
# Limit to Inductor partition, no custom ops, and allreduce & attn fusion to reduce running time
# Wrap with quotes to escape yaml
- "pytest -v -s tests/compile/distributed/test_fusions_e2e.py::test_tp2_attn_quant_allreduce_rmsnorm -k 'True and not +quant_fp8 and not +rms_norm'"
# test_fp8_kv_scale_compile requires FlashAttention (not supported on default L4/L40)
- pytest -v -s tests/compile/fullgraph/test_full_graph.py::test_fp8_kv_scale_compile
- label: Fusion E2E (2 GPUs)(B200)
timeout_in_minutes: 40
working_dir: "/vllm-workspace/"
gpu: b200
optional: true
num_gpus: 2
source_file_dependencies:
- csrc/quantization/fp4/
- vllm/model_executor/layers/quantization/utils/flashinfer_utils.py
- vllm/v1/attention/backends/flashinfer.py
- vllm/compilation/
# can affect pattern matching
- vllm/model_executor/layers/layernorm.py
- vllm/model_executor/layers/activation.py
- vllm/model_executor/layers/quantization/input_quant_fp8.py
- tests/compile/distributed/test_fusions_e2e.py
commands:
- nvidia-smi
# Run all e2e fusion tests
- pytest -v -s tests/compile/distributed/test_fusions_e2e.py
group: CUDA
depends_on:
- image-build
steps:
- label: Platform Tests (CUDA)
timeout_in_minutes: 15
source_file_dependencies:
- vllm/
- tests/cuda
commands:
- pytest -v -s cuda/test_cuda_context.py
- label: Cudagraph
timeout_in_minutes: 20
source_file_dependencies:
- tests/v1/cudagraph
- vllm/v1/cudagraph_dispatcher.py
- vllm/config/compilation.py
- vllm/compilation
commands:
- pytest -v -s v1/cudagraph/test_cudagraph_dispatch.py
- pytest -v -s v1/cudagraph/test_cudagraph_mode.py
\ No newline at end of file
group: Distributed
depends_on:
- image-build
steps:
- label: Distributed Comm Ops
timeout_in_minutes: 20
working_dir: "/vllm-workspace/tests"
num_gpus: 2
source_file_dependencies:
- vllm/distributed
- tests/distributed
commands:
- pytest -v -s distributed/test_comm_ops.py
- pytest -v -s distributed/test_shm_broadcast.py
- pytest -v -s distributed/test_shm_buffer.py
- pytest -v -s distributed/test_shm_storage.py
- label: Distributed (2 GPUs)
timeout_in_minutes: 90
working_dir: "/vllm-workspace/tests"
num_gpus: 2
source_file_dependencies:
- vllm/compilation/
- vllm/distributed/
- vllm/engine/
- vllm/executor/
- vllm/worker/worker_base.py
- vllm/v1/engine/
- vllm/v1/worker/
- tests/compile/fullgraph/test_basic_correctness.py
- tests/compile/test_wrapper.py
- tests/distributed/
- tests/entrypoints/llm/test_collective_rpc.py
- tests/v1/distributed
- tests/v1/entrypoints/openai/test_multi_api_servers.py
- tests/v1/shutdown
- tests/v1/worker/test_worker_memory_snapshot.py
commands:
# https://github.com/NVIDIA/nccl/issues/1838
- export NCCL_CUMEM_HOST_ENABLE=0
- TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/distributed/test_async_llm_dp.py
- TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/distributed/test_eagle_dp.py
- TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/distributed/test_external_lb_dp.py
- DP_SIZE=2 pytest -v -s v1/entrypoints/openai/test_multi_api_servers.py
- pytest -v -s entrypoints/llm/test_collective_rpc.py
- pytest -v -s ./compile/fullgraph/test_basic_correctness.py
- pytest -v -s ./compile/test_wrapper.py
- VLLM_TEST_SAME_HOST=1 torchrun --nproc-per-node=4 distributed/test_same_node.py | grep 'Same node test passed'
- VLLM_TEST_SAME_HOST=1 VLLM_TEST_WITH_DEFAULT_DEVICE_SET=1 torchrun --nproc-per-node=4 distributed/test_same_node.py | grep 'Same node test passed'
- pytest -v -s distributed/test_sequence_parallel.py
- CUDA_VISIBLE_DEVICES=0,1 pytest -v -s v1/shutdown
- pytest -v -s v1/worker/test_worker_memory_snapshot.py
- label: Distributed Tests (4 GPUs)
timeout_in_minutes: 50
working_dir: "/vllm-workspace/tests"
num_gpus: 4
source_file_dependencies:
- vllm/distributed/
- tests/distributed/test_utils
- tests/distributed/test_pynccl
- tests/distributed/test_events
- tests/compile/fullgraph/test_basic_correctness.py
- examples/offline_inference/rlhf.py
- examples/offline_inference/rlhf_colocate.py
- tests/examples/offline_inference/data_parallel.py
- tests/v1/distributed
- tests/v1/engine/test_engine_core_client.py
- tests/distributed/test_symm_mem_allreduce.py
commands:
# https://github.com/NVIDIA/nccl/issues/1838
- export NCCL_CUMEM_HOST_ENABLE=0
# test with torchrun tp=2 and external_dp=2
- torchrun --nproc-per-node=4 distributed/test_torchrun_example.py
# test with torchrun tp=2 and pp=2
- PP_SIZE=2 torchrun --nproc-per-node=4 distributed/test_torchrun_example.py
# test with torchrun tp=4 and dp=1
- TP_SIZE=4 torchrun --nproc-per-node=4 distributed/test_torchrun_example_moe.py
# test with torchrun tp=2, pp=2 and dp=1
- PP_SIZE=2 TP_SIZE=2 torchrun --nproc-per-node=4 distributed/test_torchrun_example_moe.py
# test with torchrun tp=1 and dp=4 with ep
- DP_SIZE=4 ENABLE_EP=1 torchrun --nproc-per-node=4 distributed/test_torchrun_example_moe.py
# test with torchrun tp=2 and dp=2 with ep
- TP_SIZE=2 DP_SIZE=2 ENABLE_EP=1 torchrun --nproc-per-node=4 distributed/test_torchrun_example_moe.py
# test with internal dp
- python3 ../examples/offline_inference/data_parallel.py --enforce-eager
- TP_SIZE=2 DP_SIZE=2 pytest -v -s v1/distributed/test_async_llm_dp.py
- TP_SIZE=2 DP_SIZE=2 pytest -v -s v1/distributed/test_eagle_dp.py
- TP_SIZE=2 DP_SIZE=2 pytest -v -s v1/distributed/test_external_lb_dp.py
- TP_SIZE=1 DP_SIZE=4 pytest -v -s v1/distributed/test_internal_lb_dp.py
- TP_SIZE=1 DP_SIZE=4 pytest -v -s v1/distributed/test_hybrid_lb_dp.py
- pytest -v -s v1/engine/test_engine_core_client.py::test_kv_cache_events_dp
- pytest -v -s distributed/test_utils.py
- pytest -v -s compile/fullgraph/test_basic_correctness.py
- pytest -v -s distributed/test_pynccl.py
- pytest -v -s distributed/test_events.py
- pytest -v -s distributed/test_symm_mem_allreduce.py
# TODO: create a dedicated test section for multi-GPU example tests
# when we have multiple distributed example tests
- cd ../examples/offline_inference
- VLLM_ALLOW_INSECURE_SERIALIZATION=1 python3 rlhf.py
- VLLM_ALLOW_INSECURE_SERIALIZATION=1 RAY_DEDUP_LOGS=0 python3 rlhf_colocate.py
- label: Distributed Tests (8 GPUs)(H100)
timeout_in_minutes: 10
gpu: h100
num_gpus: 8
working_dir: "/vllm-workspace/tests"
source_file_dependencies:
- examples/offline_inference/torchrun_dp_example.py
- vllm/config/parallel.py
- vllm/distributed/
- vllm/v1/engine/llm_engine.py
- vllm/v1/executor/uniproc_executor.py
- vllm/v1/worker/gpu_worker.py
commands:
# https://github.com/NVIDIA/nccl/issues/1838
- export NCCL_CUMEM_HOST_ENABLE=0
# test with torchrun tp=2 and dp=4 with ep
- torchrun --nproc-per-node=8 ../examples/offline_inference/torchrun_dp_example.py --tp-size=2 --pp-size=1 --dp-size=4 --enable-ep
- label: Distributed Tests (4 GPUs)(A100)
gpu: a100
optional: true
num_gpus: 4
source_file_dependencies:
- vllm/
commands:
# NOTE: don't test llama model here, it seems hf implementation is buggy
# see https://github.com/vllm-project/vllm/pull/5689 for details
- pytest -v -s distributed/test_custom_all_reduce.py
- torchrun --nproc_per_node=2 distributed/test_ca_buffer_sharing.py
- TARGET_TEST_SUITE=A100 pytest basic_correctness/ -v -s -m 'distributed(num_gpus=2)'
- pytest -v -s -x lora/test_mixtral.py
- label: Distributed Tests (2 GPUs)(H200)
gpu: h200
optional: true
working_dir: "/vllm-workspace/"
num_gpus: 2
commands:
- VLLM_TEST_CLEAN_GPU_MEMORY=1 pytest -v -s tests/compile/distributed/test_async_tp.py
- pytest -v -s tests/compile/distributed/test_sequence_parallelism.py
- pytest -v -s tests/compile/distributed/test_fusion_all_reduce.py
- VLLM_TEST_CLEAN_GPU_MEMORY=1 pytest -v -s tests/compile/distributed/test_fusions_e2e.py -k 'not Llama-4'
- VLLM_TEST_CLEAN_GPU_MEMORY=1 pytest -v -s tests/distributed/test_sequence_parallel.py
- pytest -v -s tests/distributed/test_context_parallel.py
- CUDA_VISIBLE_DEVICES=1,2 VLLM_ALL2ALL_BACKEND=deepep_high_throughput VLLM_USE_DEEP_GEMM=1 VLLM_LOGGING_LEVEL=DEBUG python3 examples/offline_inference/data_parallel.py --model Qwen/Qwen1.5-MoE-A2.7B --tp-size=1 --dp-size=2 --max-model-len 2048
- pytest -v -s tests/v1/distributed/test_dbo.py
- label: Distributed Tests (2 GPUs)(B200)
gpu: b200
optional: true
working_dir: "/vllm-workspace/"
num_gpus: 2
commands:
- pytest -v -s tests/distributed/test_context_parallel.py
- pytest -v -s tests/distributed/test_nccl_symm_mem_allreduce.py
- pytest -v -s tests/v1/distributed/test_dbo.py
- label: 2 Node Test (4 GPUs)
timeout_in_minutes: 30
working_dir: "/vllm-workspace/tests"
num_gpus: 2
num_nodes: 2
source_file_dependencies:
- vllm/distributed/
- vllm/engine/
- vllm/executor/
- vllm/model_executor/models/
- tests/distributed/
- tests/examples/offline_inference/data_parallel.py
commands:
- ./.buildkite/scripts/run-multi-node-test.sh /vllm-workspace/tests 2 2 public.ecr.aws/q9t5s3a7/vllm-ci-postmerge-repo:0bec63fa317e1fbd62e19b0fc31c43c81bf89077 "VLLM_TEST_SAME_HOST=0 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_same_node.py | grep 'Same node test passed' && NUM_NODES=2 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_node_count.py | grep 'Node count test passed' && python3 ../examples/offline_inference/data_parallel.py --dp-size=2 --tp-size=1 --node-size=2 --node-rank=0 --master-addr=192.168.10.10 --master-port=12345 --enforce-eager --trust-remote-code && VLLM_MULTI_NODE=1 pytest -v -s distributed/test_multi_node_assignment.py && VLLM_MULTI_NODE=1 pytest -v -s distributed/test_pipeline_parallel.py" "VLLM_TEST_SAME_HOST=0 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_same_node.py | grep 'Same node test passed' && NUM_NODES=2 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_node_count.py | grep 'Node count test passed' && python3 ../examples/offline_inference/data_parallel.py --dp-size=2 --tp-size=1 --node-size=2 --node-rank=1 --master-addr=192.168.10.10 --master-port=12345 --enforce-eager --trust-remote-code"
- label: Distributed NixlConnector PD accuracy (4 GPUs)
timeout_in_minutes: 30
working_dir: "/vllm-workspace/tests"
num_gpus: 4
source_file_dependencies:
- vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py
- tests/v1/kv_connector/nixl_integration/
commands:
- uv pip install --system -r /vllm-workspace/requirements/kv_connectors.txt
- bash v1/kv_connector/nixl_integration/tp_config_sweep_accuracy_test.sh
- label: Pipeline + Context Parallelism (4 GPUs))
timeout_in_minutes: 60
working_dir: "/vllm-workspace/tests"
num_gpus: 4
source_file_dependencies:
- vllm/distributed/
- vllm/engine/
- vllm/executor/
- vllm/model_executor/models/
- tests/distributed/
commands:
- pytest -v -s distributed/test_pp_cudagraph.py
- pytest -v -s distributed/test_pipeline_parallel.py
\ No newline at end of file
group: E2E Integration
depends_on:
- image-build
steps:
- label: DeepSeek V2-Lite Accuracy
timeout_in_minutes: 60
gpu: h100
optional: true
num_gpus: 4
working_dir: "/vllm-workspace"
commands:
- bash .buildkite/scripts/scheduled_integration_test/deepseek_v2_lite_ep_eplb.sh 0.25 200 8010
- label: Qwen3-30B-A3B-FP8-block Accuracy
timeout_in_minutes: 60
gpu: h100
optional: true
num_gpus: 4
working_dir: "/vllm-workspace"
commands:
- bash .buildkite/scripts/scheduled_integration_test/qwen30b_a3b_fp8_block_ep_eplb.sh 0.8 200 8020
- label: Qwen3-30B-A3B-FP8-block Accuracy (B200)
timeout_in_minutes: 60
gpu: b200
optional: true
num_gpus: 2
working_dir: "/vllm-workspace"
commands:
- bash .buildkite/scripts/scheduled_integration_test/qwen30b_a3b_fp8_block_ep_eplb.sh 0.8 200 8020 2 1
- label: Prime-RL Integration (2 GPUs)
timeout_in_minutes: 30
optional: true
num_gpus: 2
working_dir: "/vllm-workspace"
source_file_dependencies:
- vllm/
- .buildkite/scripts/run-prime-rl-test.sh
commands:
- bash .buildkite/scripts/run-prime-rl-test.sh
- label: DeepSeek V2-Lite Async EPLB Accuracy
timeout_in_minutes: 60
gpu: h100
optional: true
num_gpus: 4
working_dir: "/vllm-workspace"
commands:
- bash .buildkite/scripts/scheduled_integration_test/deepseek_v2_lite_ep_async_eplb.sh 0.25 1319 8030
- label: Qwen3-Next-80B-A3B-Instruct MTP Async EPLB Accuracy
timeout_in_minutes: 60
gpu: h100
optional: true
num_gpus: 4
working_dir: "/vllm-workspace"
commands:
- bash .buildkite/scripts/scheduled_integration_test/qwen3_next_mtp_async_eplb.sh 0.8 1319 8040
group: Engine
depends_on:
- image-build
steps:
- label: Engine
timeout_in_minutes: 15
source_file_dependencies:
- vllm/
- tests/engine
- tests/test_sequence
- tests/test_config
- tests/test_logger
- tests/test_vllm_port
commands:
- pytest -v -s engine test_sequence.py test_config.py test_logger.py test_vllm_port.py
- label: V1 e2e + engine
timeout_in_minutes: 45
source_file_dependencies:
- vllm/
- tests/v1
commands:
# TODO: accuracy does not match, whether setting
# VLLM_USE_FLASHINFER_SAMPLER or not on H100.
- pytest -v -s v1/e2e
- pytest -v -s v1/engine
group: Entrypoints
depends_on:
- image-build
steps:
- label: Entrypoints Unit Tests
timeout_in_minutes: 10
working_dir: "/vllm-workspace/tests"
source_file_dependencies:
- vllm/entrypoints
- tests/entrypoints/
commands:
- pytest -v -s entrypoints/openai/tool_parsers
- pytest -v -s entrypoints/ --ignore=entrypoints/llm --ignore=entrypoints/openai --ignore=entrypoints/offline_mode --ignore=entrypoints/test_chat_utils.py --ignore=entrypoints/pooling
- label: Entrypoints Integration (LLM)
timeout_in_minutes: 40
working_dir: "/vllm-workspace/tests"
source_file_dependencies:
- vllm/
- tests/entrypoints/llm
- tests/entrypoints/offline_mode
commands:
- export VLLM_WORKER_MULTIPROC_METHOD=spawn
- pytest -v -s entrypoints/llm --ignore=entrypoints/llm/test_generate.py --ignore=entrypoints/llm/test_collective_rpc.py
- pytest -v -s entrypoints/llm/test_generate.py # it needs a clean process
- pytest -v -s entrypoints/offline_mode # Needs to avoid interference with other tests
- label: Entrypoints Integration (API Server)
timeout_in_minutes: 130
working_dir: "/vllm-workspace/tests"
source_file_dependencies:
- vllm/
- tests/entrypoints/openai
- tests/entrypoints/test_chat_utils
commands:
- export VLLM_WORKER_MULTIPROC_METHOD=spawn
- PYTHONPATH=/vllm-workspace pytest -v -s entrypoints/openai/test_collective_rpc.py # PYTHONPATH is needed to import custom Worker extension
- pytest -v -s entrypoints/openai --ignore=entrypoints/openai/test_chat_with_tool_reasoning.py --ignore=entrypoints/openai/test_oot_registration.py --ignore=entrypoints/openai/test_tensorizer_entrypoint.py --ignore=entrypoints/openai/correctness/ --ignore=entrypoints/openai/test_collective_rpc.py --ignore=entrypoints/openai/tool_parsers/
- pytest -v -s entrypoints/test_chat_utils.py
- label: Entrypoints Integration (Pooling)
timeout_in_minutes: 50
working_dir: "/vllm-workspace/tests"
source_file_dependencies:
- vllm/
- tests/entrypoints/pooling
commands:
- export VLLM_WORKER_MULTIPROC_METHOD=spawn
- pytest -v -s entrypoints/pooling
- label: Entrypoints V1
timeout_in_minutes: 50
source_file_dependencies:
- vllm/
- tests/v1
commands:
- pytest -v -s v1/entrypoints
- label: OpenAI API Correctness
timeout_in_minutes: 30
source_file_dependencies:
- csrc/
- vllm/entrypoints/openai/
- vllm/model_executor/models/whisper.py
commands: # LMEval+Transcription WER check
- pytest -s entrypoints/openai/correctness/
group: Expert Parallelism
depends_on:
- image-build
steps:
- label: EPLB Algorithm
timeout_in_minutes: 15
working_dir: "/vllm-workspace/tests"
source_file_dependencies:
- vllm/distributed/eplb
- tests/distributed/test_eplb_algo.py
commands:
- pytest -v -s distributed/test_eplb_algo.py
- label: EPLB Execution
timeout_in_minutes: 20
working_dir: "/vllm-workspace/tests"
num_gpus: 4
source_file_dependencies:
- vllm/distributed/eplb
- tests/distributed/test_eplb_execute.py
commands:
- pytest -v -s distributed/test_eplb_execute.py
- pytest -v -s distributed/test_eplb_spec_decode.py
\ No newline at end of file
group: Kernels
depends_on:
- image-build
steps:
- label: Kernels Core Operation Test
timeout_in_minutes: 75
source_file_dependencies:
- csrc/
- tests/kernels/core
- tests/kernels/test_top_k_per_row.py
commands:
- pytest -v -s kernels/core kernels/test_top_k_per_row.py
- label: Kernels Attention Test %N
timeout_in_minutes: 35
source_file_dependencies:
- csrc/attention/
- vllm/attention
- vllm/v1/attention
- tests/kernels/attention
commands:
- pytest -v -s kernels/attention --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT
parallelism: 2
- label: Kernels Quantization Test %N
timeout_in_minutes: 90
source_file_dependencies:
- csrc/quantization/
- vllm/model_executor/layers/quantization
- tests/kernels/quantization
commands:
- pytest -v -s kernels/quantization --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT
parallelism: 2
- label: Kernels MoE Test %N
timeout_in_minutes: 60
source_file_dependencies:
- csrc/quantization/cutlass_w8a8/moe/
- csrc/moe/
- tests/kernels/moe
- vllm/model_executor/layers/fused_moe/
- vllm/distributed/device_communicators/
- vllm/envs.py
- vllm/config
commands:
- pytest -v -s kernels/moe --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT
parallelism: 2
- label: Kernels Mamba Test
timeout_in_minutes: 45
source_file_dependencies:
- csrc/mamba/
- tests/kernels/mamba
- vllm/model_executor/layers/mamba/ops
commands:
- pytest -v -s kernels/mamba
- label: Kernels DeepGEMM Test (H100)
timeout_in_minutes: 45
gpu: h100
num_gpus: 1
source_file_dependencies:
- tools/install_deepgemm.sh
- vllm/utils/deep_gemm.py
- vllm/model_executor/layers/fused_moe
- vllm/model_executor/layers/quantization
- tests/kernels/quantization/test_block_fp8.py
- tests/kernels/moe/test_deepgemm.py
- tests/kernels/moe/test_batched_deepgemm.py
- tests/kernels/attention/test_deepgemm_attention.py
commands:
- pytest -v -s kernels/quantization/test_block_fp8.py -k deep_gemm
- pytest -v -s kernels/moe/test_deepgemm.py
- pytest -v -s kernels/moe/test_batched_deepgemm.py
- pytest -v -s kernels/attention/test_deepgemm_attention.py
- label: Kernels (B200)
timeout_in_minutes: 30
working_dir: "/vllm-workspace/"
gpu: b200
# optional: true
source_file_dependencies:
- csrc/quantization/fp4/
- csrc/attention/mla/
- csrc/quantization/cutlass_w8a8/moe/
- vllm/model_executor/layers/fused_moe/cutlass_moe.py
- vllm/model_executor/layers/fused_moe/flashinfer_cutlass_moe.py
- vllm/model_executor/layers/fused_moe/flashinfer_cutlass_prepare_finalize.py
- vllm/model_executor/layers/quantization/utils/flashinfer_utils.py
- vllm/v1/attention/backends/flashinfer.py
- vllm/v1/attention/backends/mla/cutlass_mla.py
- vllm/v1/attention/backends/mla/flashinfer_mla.py
- vllm/platforms/cuda.py
- vllm/attention/selector.py
commands:
- nvidia-smi
- python3 examples/offline_inference/basic/chat.py
# Attention
# num_heads2 broken by https://github.com/flashinfer-ai/flashinfer/issues/1353
- pytest -v -s tests/kernels/attention/test_attention_selector.py
- pytest -v -s tests/kernels/attention/test_flashinfer.py -k 'not num_heads2'
- pytest -v -s tests/kernels/attention/test_flashinfer_trtllm_attention.py
- pytest -v -s tests/kernels/attention/test_cutlass_mla_decode.py
- pytest -v -s tests/kernels/attention/test_flashinfer_mla_decode.py
# Quantization
- pytest -v -s tests/kernels/quantization/test_cutlass_scaled_mm.py -k 'fp8'
- pytest -v -s tests/kernels/quantization/test_nvfp4_quant.py
- pytest -v -s tests/kernels/quantization/test_silu_mul_nvfp4_quant.py
- pytest -v -s tests/kernels/quantization/test_nvfp4_scaled_mm.py
- pytest -v -s tests/kernels/quantization/test_flashinfer_scaled_mm.py
- pytest -v -s tests/kernels/quantization/test_flashinfer_nvfp4_scaled_mm.py
- pytest -v -s tests/kernels/quantization/test_nvfp4_qutlass.py
- pytest -v -s tests/kernels/quantization/test_mxfp4_qutlass.py
- pytest -v -s tests/kernels/moe/test_nvfp4_moe.py
- pytest -v -s tests/kernels/moe/test_ocp_mx_moe.py
- pytest -v -s tests/kernels/moe/test_flashinfer.py
- pytest -v -s tests/kernels/moe/test_cutedsl_moe.py
\ No newline at end of file
group: LM Eval
depends_on:
- image-build
steps:
- label: LM Eval Small Models
timeout_in_minutes: 75
source_file_dependencies:
- csrc/
- vllm/model_executor/layers/quantization
autorun_on_main: true
commands:
- pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=configs/models-small.txt --tp-size=1
- label: LM Eval Large Models (4 GPUs)(A100)
gpu: a100
optional: true
num_gpus: 4
working_dir: "/vllm-workspace/.buildkite/lm-eval-harness"
source_file_dependencies:
- csrc/
- vllm/model_executor/layers/quantization
commands:
- export VLLM_WORKER_MULTIPROC_METHOD=spawn
- pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-large.txt --tp-size=4
- label: LM Eval Large Models (4 GPUs)(H100)
gpu: h100
optional: true
num_gpus: 4
working_dir: "/vllm-workspace/.buildkite/lm-eval-harness"
source_file_dependencies:
- csrc/
- vllm/model_executor/layers/quantization
commands:
- export VLLM_USE_DEEP_GEMM=0 # We found Triton is faster than DeepGEMM for H100
- pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-large-hopper.txt --tp-size=4
- label: LM Eval Small Models (B200)
timeout_in_minutes: 120
gpu: b200
optional: true
source_file_dependencies:
- csrc/
- vllm/model_executor/layers/quantization
commands:
- pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=configs/models-blackwell.txt --tp-size=1
group: LoRA
depends_on:
- image-build
steps:
- label: LoRA %N
timeout_in_minutes: 30
source_file_dependencies:
- vllm/lora
- tests/lora
commands:
- pytest -v -s lora --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT --ignore=lora/test_chatglm3_tp.py --ignore=lora/test_llama_tp.py --ignore=lora/test_llm_with_multi_loras.py --ignore=lora/test_olmoe_tp.py --ignore=lora/test_deepseekv2_tp.py --ignore=lora/test_gptoss_tp.py --ignore=lora/test_qwen3moe_tp.py
parallelism: 4
- label: LoRA TP (Distributed)
timeout_in_minutes: 30
num_gpus: 4
source_file_dependencies:
- vllm/lora
- tests/lora
commands:
# FIXIT: find out which code initialize cuda before running the test
# before the fix, we need to use spawn to test it
- export VLLM_WORKER_MULTIPROC_METHOD=spawn
# There is some Tensor Parallelism related processing logic in LoRA that
# requires multi-GPU testing for validation.
- pytest -v -s -x lora/test_chatglm3_tp.py
- pytest -v -s -x lora/test_llama_tp.py
- pytest -v -s -x lora/test_llm_with_multi_loras.py
- pytest -v -s -x lora/test_olmoe_tp.py
- pytest -v -s -x lora/test_gptoss_tp.py
\ No newline at end of file
group: Miscellaneous
depends_on:
- image-build
steps:
- label: V1 Others
timeout_in_minutes: 60
source_file_dependencies:
- vllm/
- tests/v1
commands:
- uv pip install --system -r /vllm-workspace/requirements/kv_connectors.txt
# split the test to avoid interference
- pytest -v -s -m 'not cpu_test' v1/core
- pytest -v -s v1/executor
- pytest -v -s v1/kv_offload
- pytest -v -s v1/sample
- pytest -v -s v1/logits_processors
- pytest -v -s v1/worker
- pytest -v -s v1/spec_decode
- pytest -v -s -m 'not cpu_test' v1/kv_connector/unit
- pytest -v -s -m 'not cpu_test' v1/metrics
- pytest -v -s v1/test_oracle.py
- pytest -v -s v1/test_request.py
- pytest -v -s v1/test_outputs.py
# Integration test for streaming correctness (requires special branch).
- pip install -U git+https://github.com/robertgshaw2-redhat/lm-evaluation-harness.git@streaming-api
- pytest -v -s entrypoints/openai/correctness/test_lmeval.py::test_lm_eval_accuracy_v1_engine
- label: V1 Others (CPU)
depends_on: ~
source_file_dependencies:
- vllm/
- tests/v1
no_gpu: true
commands:
# split the test to avoid interference
- pytest -v -s -m 'cpu_test' v1/core
- pytest -v -s v1/structured_output
- pytest -v -s v1/test_serial_utils.py
- pytest -v -s -m 'cpu_test' v1/kv_connector/unit
- pytest -v -s -m 'cpu_test' v1/metrics
- label: Regression
timeout_in_minutes: 20
source_file_dependencies:
- vllm/
- tests/test_regression
commands:
- pip install modelscope
- pytest -v -s test_regression.py
working_dir: "/vllm-workspace/tests" # optional
- label: Examples
timeout_in_minutes: 45
working_dir: "/vllm-workspace/examples"
source_file_dependencies:
- vllm/entrypoints
- vllm/multimodal
- examples/
commands:
- pip install tensorizer # for tensorizer test
- python3 offline_inference/basic/chat.py # for basic
- python3 offline_inference/basic/generate.py --model facebook/opt-125m
- python3 offline_inference/basic/generate.py --model meta-llama/Llama-2-13b-chat-hf --cpu-offload-gb 10
- python3 offline_inference/basic/classify.py
- python3 offline_inference/basic/embed.py
- python3 offline_inference/basic/score.py
# for multi-modal models
- python3 offline_inference/audio_language.py --seed 0
- python3 offline_inference/vision_language.py --seed 0
- python3 offline_inference/vision_language_multi_image.py --seed 0
- python3 offline_inference/encoder_decoder_multimodal.py --model-type whisper --seed 0
# for pooling models
- python3 pooling/pooling/vision_language_pooling.py --seed 0
# for features demo
- python3 offline_inference/prefix_caching.py
- python3 offline_inference/llm_engine_example.py
- python3 others/tensorize_vllm_model.py --model facebook/opt-125m serialize --serialized-directory /tmp/ --suffix v1 && python3 others/tensorize_vllm_model.py --model facebook/opt-125m deserialize --path-to-tensors /tmp/vllm/facebook/opt-125m/v1/model.tensors
- python3 offline_inference/spec_decode.py --test --method eagle --num_spec_tokens 3 --dataset-name hf --dataset-path philschmid/mt-bench --num-prompts 80 --temp 0 --top-p 1.0 --top-k -1 --tp 1 --enable-chunked-prefill --max-model-len 2048
# https://github.com/vllm-project/vllm/pull/26682 uses slightly more memory in PyTorch 2.9+ causing this test to OOM in 1xL4 GPU
- python3 offline_inference/spec_decode.py --test --method eagle3 --num_spec_tokens 3 --dataset-name hf --dataset-path philschmid/mt-bench --num-prompts 80 --temp 0 --top-p 1.0 --top-k -1 --tp 1 --enable-chunked-prefill --max-model-len 1536
- label: Metrics, Tracing (2 GPUs)
timeout_in_minutes: 20
num_gpus: 2
source_file_dependencies:
- vllm/
- tests/v1/tracing
commands:
- "pip install \
'opentelemetry-sdk>=1.26.0' \
'opentelemetry-api>=1.26.0' \
'opentelemetry-exporter-otlp>=1.26.0' \
'opentelemetry-semantic-conventions-ai>=0.4.1'"
- pytest -v -s v1/tracing
- label: Python-only Installation
depends_on: ~
timeout_in_minutes: 20
source_file_dependencies:
- tests/standalone_tests/python_only_compile.sh
- setup.py
commands:
- bash standalone_tests/python_only_compile.sh
- label: Async Engine, Inputs, Utils, Worker
timeout_in_minutes: 50
source_file_dependencies:
- vllm/
- tests/multimodal
- tests/utils_
commands:
- pytest -v -s -m 'not cpu_test' multimodal
- pytest -v -s utils_
- label: Async Engine, Inputs, Utils, Worker, Config (CPU)
depends_on: ~
timeout_in_minutes: 20
source_file_dependencies:
- vllm/
- tests/test_inputs.py
- tests/test_outputs.py
- tests/multimodal
- tests/standalone_tests/lazy_imports.py
- tests/tokenizers_
- tests/transformers_utils
- tests/config
no_gpu: true
commands:
- python3 standalone_tests/lazy_imports.py
- pytest -v -s test_inputs.py
- pytest -v -s test_outputs.py
- pytest -v -s -m 'cpu_test' multimodal
- pytest -v -s tokenizers_
- pytest -v -s transformers_utils
- pytest -v -s config
- label: GPT-OSS Eval (B200)
timeout_in_minutes: 60
working_dir: "/vllm-workspace/"
gpu: b200
optional: true
source_file_dependencies:
- tests/evals/gpt_oss
- vllm/model_executor/models/gpt_oss.py
- vllm/model_executor/layers/quantization/mxfp4.py
- vllm/v1/attention/backends/flashinfer.py
commands:
- uv pip install --system 'gpt-oss[eval]==0.0.5'
- pytest -s -v tests/evals/gpt_oss/test_gpqa_correctness.py --model openai/gpt-oss-20b --metric 0.58
- label: Batch Invariance (H100)
timeout_in_minutes: 25
gpu: h100
source_file_dependencies:
- vllm/v1/attention
- vllm/model_executor/layers
- tests/v1/determinism/
commands:
- export VLLM_WORKER_MULTIPROC_METHOD=spawn
- pip install pytest-timeout pytest-forked
- pytest -v -s v1/determinism/test_batch_invariance.py
- pytest -v -s v1/determinism/test_rms_norm_batch_invariant.py
\ No newline at end of file
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment