Commit 0da93439 authored by zhuwenwen's avatar zhuwenwen
Browse files

Merge tag 'v0.18.1rc0' into v0.18.1rc0-ori

parents 25f2f756 298e5108
...@@ -16,6 +16,23 @@ RAY_BASE_URL="https://raw.githubusercontent.com/ray-project/ray/master/python" ...@@ -16,6 +16,23 @@ RAY_BASE_URL="https://raw.githubusercontent.com/ray-project/ray/master/python"
WORK_DIR=$(mktemp -d) WORK_DIR=$(mktemp -d)
trap 'rm -rf "$WORK_DIR"' EXIT trap 'rm -rf "$WORK_DIR"' EXIT
# ── Detect PyTorch index URL ─────────────────────────────────────────────
if python3 -c "import torch; assert torch.version.hip" 2>/dev/null; then
ROCM_VER=$(python3 -c "import torch; print(torch.version.hip.rsplit('.', 1)[0])")
CANDIDATE_URL="https://download.pytorch.org/whl/rocm${ROCM_VER}"
if curl -fsSL --head "${CANDIDATE_URL}/" >/dev/null 2>&1; then
TORCH_INDEX_URL="${CANDIDATE_URL}"
else
echo ">>> WARNING: ROCm ${ROCM_VER} wheel index not found at ${CANDIDATE_URL}"
echo ">>> Falling back to default PyPI (resolution may be incomplete)"
TORCH_INDEX_URL=""
fi
else
TORCH_INDEX_URL="https://download.pytorch.org/whl/cu129"
fi
echo ">>> Using PyTorch index: ${TORCH_INDEX_URL:-PyPI default}"
# Fetch all Ray requirement files used in the LLM depset pipeline # Fetch all Ray requirement files used in the LLM depset pipeline
echo ">>> Fetching Ray requirement files" echo ">>> Fetching Ray requirement files"
RAY_FILES=( RAY_FILES=(
...@@ -116,6 +133,11 @@ echo "============================================================" ...@@ -116,6 +133,11 @@ echo "============================================================"
echo ">>> Resolving: Can Ray generate compatible lock files?" echo ">>> Resolving: Can Ray generate compatible lock files?"
echo "============================================================" echo "============================================================"
EXTRA_INDEX_ARGS=()
if [[ -n "${TORCH_INDEX_URL}" ]]; then
EXTRA_INDEX_ARGS+=(--extra-index-url "${TORCH_INDEX_URL}")
fi
set +e set +e
uv pip compile \ uv pip compile \
"${WORK_DIR}/requirements.txt" \ "${WORK_DIR}/requirements.txt" \
...@@ -126,7 +148,7 @@ uv pip compile \ ...@@ -126,7 +148,7 @@ uv pip compile \
-c "${WORK_DIR}/vllm-constraints.txt" \ -c "${WORK_DIR}/vllm-constraints.txt" \
--python-version 3.12 \ --python-version 3.12 \
--python-platform x86_64-manylinux_2_31 \ --python-platform x86_64-manylinux_2_31 \
--extra-index-url https://download.pytorch.org/whl/cu129 \ "${EXTRA_INDEX_ARGS[@]}" \
--index-strategy unsafe-best-match \ --index-strategy unsafe-best-match \
--unsafe-package setuptools \ --unsafe-package setuptools \
--unsafe-package ray \ --unsafe-package ray \
......
...@@ -333,15 +333,18 @@ apply_rocm_test_overrides() { ...@@ -333,15 +333,18 @@ apply_rocm_test_overrides() {
# --- Entrypoint ignores --- # --- Entrypoint ignores ---
if [[ $cmds == *" entrypoints/openai "* ]]; then if [[ $cmds == *" entrypoints/openai "* ]]; then
cmds=${cmds//" entrypoints/openai "/" entrypoints/openai \ cmds=${cmds//" entrypoints/openai "/" entrypoints/openai \
--ignore=entrypoints/openai/test_audio.py \ --ignore=entrypoints/openai/chat_completion/test_audio.py \
--ignore=entrypoints/openai/test_shutdown.py \ --ignore=entrypoints/openai/completion/test_shutdown.py \
--ignore=entrypoints/openai/test_completion.py \ --ignore=entrypoints/openai/test_completion.py \
--ignore=entrypoints/openai/test_models.py \ --ignore=entrypoints/openai/models/test_models.py \
--ignore=entrypoints/openai/test_lora_adapters.py \
--ignore=entrypoints/openai/test_return_tokens_as_ids.py \ --ignore=entrypoints/openai/test_return_tokens_as_ids.py \
--ignore=entrypoints/openai/test_root_path.py \ --ignore=entrypoints/openai/chat_completion/test_root_path.py \
--ignore=entrypoints/openai/test_tokenization.py \ --ignore=entrypoints/openai/completion/test_prompt_validation.py "}
--ignore=entrypoints/openai/test_prompt_validation.py "} fi
if [[ $cmds == *" entrypoints/serve"* ]]; then
cmds="${cmds} \
--ignore=entrypoints/serve/lora/test_lora_adapters.py"
fi fi
if [[ $cmds == *" entrypoints/llm "* ]]; then if [[ $cmds == *" entrypoints/llm "* ]]; then
......
...@@ -127,7 +127,7 @@ run_and_track_test() { ...@@ -127,7 +127,7 @@ run_and_track_test() {
# --- Actual Test Execution --- # --- Actual Test Execution ---
run_and_track_test 1 "test_struct_output_generate.py" \ run_and_track_test 1 "test_struct_output_generate.py" \
"python3 -m pytest -s -v /workspace/vllm/tests/v1/entrypoints/llm/test_struct_output_generate.py -k \"not test_structured_output_with_reasoning_matrices\"" "python3 -m pytest -s -v /workspace/vllm/tests/entrypoints/llm/test_struct_output_generate.py -k \"not test_structured_output_with_reasoning_matrices\""
run_and_track_test 2 "test_moe_pallas.py" \ run_and_track_test 2 "test_moe_pallas.py" \
"python3 -m pytest -s -v /workspace/vllm/tests/tpu/test_moe_pallas.py" "python3 -m pytest -s -v /workspace/vllm/tests/tpu/test_moe_pallas.py"
run_and_track_test 3 "test_lora.py" \ run_and_track_test 3 "test_lora.py" \
......
...@@ -33,23 +33,22 @@ docker run \ ...@@ -33,23 +33,22 @@ docker run \
bash -c ' bash -c '
set -e set -e
echo $ZE_AFFINITY_MASK echo $ZE_AFFINITY_MASK
pip install tblib==3.1.0
python3 examples/basic/offline_inference/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager python3 examples/basic/offline_inference/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager
python3 examples/basic/offline_inference/generate.py --model facebook/opt-125m --block-size 64 -O3 -cc.cudagraph_mode=NONE python3 examples/basic/offline_inference/generate.py --model facebook/opt-125m --block-size 64 -O3 -cc.cudagraph_mode=NONE
python3 examples/basic/offline_inference/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager -tp 2 --distributed-executor-backend ray python3 examples/basic/offline_inference/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager -tp 2 --distributed-executor-backend ray
python3 examples/basic/offline_inference/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager -tp 2 --distributed-executor-backend mp python3 examples/basic/offline_inference/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager -tp 2 --distributed-executor-backend mp
python3 examples/basic/offline_inference/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager --attention-backend=TRITON_ATTN python3 examples/basic/offline_inference/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager --attention-backend=TRITON_ATTN
python3 examples/basic/offline_inference/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager --quantization fp8 python3 examples/basic/offline_inference/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager --quantization fp8
python3 examples/basic/offline_inference/generate.py --model superjob/Qwen3-4B-Instruct-2507-GPTQ-Int4 --block-size 64 --enforce-eager python3 examples/basic/offline_inference/generate.py --model superjob/Qwen3-4B-Instruct-2507-GPTQ-Int4 --block-size 64 --enforce-eager --max-model-len 8192
python3 examples/basic/offline_inference/generate.py --model ibm-research/PowerMoE-3b --block-size 64 --enforce-eager -tp 2 python3 examples/basic/offline_inference/generate.py --model ibm-research/PowerMoE-3b --block-size 64 --enforce-eager -tp 2
python3 examples/basic/offline_inference/generate.py --model ibm-research/PowerMoE-3b --block-size 64 --enforce-eager -tp 2 --enable-expert-parallel python3 examples/basic/offline_inference/generate.py --model ibm-research/PowerMoE-3b --block-size 64 --enforce-eager -tp 2 --enable-expert-parallel
cd tests cd tests
pytest -v -s v1/core --ignore=v1/core/test_reset_prefix_cache_e2e.py --ignore=v1/core/test_scheduler_e2e.py pytest -v -s v1/core --ignore=v1/core/test_reset_prefix_cache_e2e.py --ignore=v1/core/test_scheduler_e2e.py
pytest -v -s v1/engine pytest -v -s v1/engine
pytest -v -s v1/sample --ignore=v1/sample/test_logprobs.py --ignore=v1/sample/test_logprobs_e2e.py pytest -v -s v1/sample --ignore=v1/sample/test_logprobs.py --ignore=v1/sample/test_logprobs_e2e.py
pytest -v -s v1/worker --ignore=v1/worker/test_gpu_model_runner.py pytest -v -s v1/worker --ignore=v1/worker/test_gpu_model_runner.py --ignore=v1/worker/test_worker_memory_snapshot.py
pytest -v -s v1/structured_output pytest -v -s v1/structured_output
pytest -v -s v1/spec_decode --ignore=v1/spec_decode/test_max_len.py --ignore=v1/spec_decode/test_tree_attention.py --ignore=v1/spec_decode/test_speculators_eagle3.py --ignore=v1/spec_decode/test_acceptance_length.py pytest -v -s v1/spec_decode --ignore=v1/spec_decode/test_max_len.py --ignore=v1/spec_decode/test_tree_attention.py --ignore=v1/spec_decode/test_speculators_eagle3.py --ignore=v1/spec_decode/test_acceptance_length.py
pytest -v -s v1/kv_connector/unit --ignore=v1/kv_connector/unit/test_multi_connector.py --ignore=v1/kv_connector/unit/test_nixl_connector.py --ignore=v1/kv_connector/unit/test_example_connector.py --ignore=v1/kv_connector/unit/test_lmcache_integration.py pytest -v -s v1/kv_connector/unit --ignore=v1/kv_connector/unit/test_multi_connector.py --ignore=v1/kv_connector/unit/test_nixl_connector.py --ignore=v1/kv_connector/unit/test_example_connector.py --ignore=v1/kv_connector/unit/test_lmcache_integration.py -k "not (test_register_kv_caches and FLASH_ATTN and True)"
pytest -v -s v1/test_serial_utils.py pytest -v -s v1/test_serial_utils.py
' '
#!/usr/bin/env bash #!/usr/bin/env bash
set -euxo pipefail set -euxo pipefail
# Nightly e2e test for prefetch offloading with a MoE model. # Nightly e2e test for prefetch offloading with a MoE model.
# Runs DeepSeek-V2-Lite with prefetch offloading of MoE expert weights # Runs DeepSeek-V2-Lite with prefetch offloading of MoE expert weights
# and validates GSM8K accuracy matches baseline (no offloading). # and validates GSM8K accuracy matches baseline (no offloading).
# #
# args: [THRESHOLD] [NUM_QUESTIONS] [START_PORT] # args: [THRESHOLD] [NUM_QUESTIONS] [START_PORT]
#
# Environment variables:
# ATTENTION_BACKEND - attention backend to use (e.g., FLASH_ATTN,
# ROCM_ATTN, FLASHINFER). If unset, uses vllm default.
THRESHOLD=${1:-0.25} THRESHOLD=${1:-0.25}
NUM_Q=${2:-1319} NUM_Q=${2:-1319}
PORT=${3:-8030} PORT=${3:-8030}
...@@ -22,6 +25,14 @@ wait_for_server() { ...@@ -22,6 +25,14 @@ wait_for_server() {
MODEL="deepseek-ai/DeepSeek-V2-Lite" MODEL="deepseek-ai/DeepSeek-V2-Lite"
# ── Build optional vllm serve flags ─────────────────────────────────────
EXTRA_ARGS=()
if [[ -n "${ATTENTION_BACKEND:-}" ]]; then
echo "Using attention backend: ${ATTENTION_BACKEND}"
EXTRA_ARGS+=(--attention-backend "${ATTENTION_BACKEND}")
fi
cleanup() { cleanup() {
if [[ -n "${SERVER_PID:-}" ]] && kill -0 "${SERVER_PID}" 2>/dev/null; then if [[ -n "${SERVER_PID:-}" ]] && kill -0 "${SERVER_PID}" 2>/dev/null; then
kill "${SERVER_PID}" 2>/dev/null || true kill "${SERVER_PID}" 2>/dev/null || true
...@@ -40,7 +51,8 @@ vllm serve "$MODEL" \ ...@@ -40,7 +51,8 @@ vllm serve "$MODEL" \
--offload-num-in-group 2 \ --offload-num-in-group 2 \
--offload-prefetch-step 1 \ --offload-prefetch-step 1 \
--offload-params w13_weight w2_weight \ --offload-params w13_weight w2_weight \
--port "$PORT" & --port "$PORT" \
${EXTRA_ARGS+"${EXTRA_ARGS[@]}"} &
SERVER_PID=$! SERVER_PID=$!
wait_for_server "$PORT" wait_for_server "$PORT"
......
This diff is collapsed.
...@@ -59,7 +59,7 @@ steps: ...@@ -59,7 +59,7 @@ steps:
- export VLLM_TEST_CLEAN_GPU_MEMORY=1 - export VLLM_TEST_CLEAN_GPU_MEMORY=1
- pytest -s -v tests/compile/passes/distributed - pytest -s -v tests/compile/passes/distributed
- label: Fusion and Compile Unit Tests (B200) - label: Fusion and Compile Unit Tests (2xB200)
timeout_in_minutes: 20 timeout_in_minutes: 20
working_dir: "/vllm-workspace/" working_dir: "/vllm-workspace/"
device: b200 device: b200
......
...@@ -15,36 +15,66 @@ steps: ...@@ -15,36 +15,66 @@ steps:
- pytest -v -s distributed/test_shm_buffer.py - pytest -v -s distributed/test_shm_buffer.py
- pytest -v -s distributed/test_shm_storage.py - pytest -v -s distributed/test_shm_storage.py
- label: Distributed (2 GPUs) - label: Distributed DP Tests (2 GPUs)
timeout_in_minutes: 60 timeout_in_minutes: 20
working_dir: "/vllm-workspace/tests" working_dir: "/vllm-workspace/tests"
num_devices: 2 num_devices: 2
source_file_dependencies: source_file_dependencies:
- vllm/compilation/
- vllm/distributed/ - vllm/distributed/
- vllm/engine/ - vllm/engine/
- vllm/executor/ - vllm/executor/
- vllm/worker/worker_base.py - vllm/worker/worker_base.py
- vllm/v1/engine/ - vllm/v1/engine/
- vllm/v1/worker/ - vllm/v1/worker/
- tests/compile/fullgraph/test_basic_correctness.py
- tests/compile/test_wrapper.py
- tests/distributed/
- tests/entrypoints/llm/test_collective_rpc.py
- tests/v1/distributed - tests/v1/distributed
- tests/v1/entrypoints/openai/test_multi_api_servers.py - tests/entrypoints/openai/test_multi_api_servers.py
- tests/v1/shutdown
- tests/v1/worker/test_worker_memory_snapshot.py
commands: commands:
# https://github.com/NVIDIA/nccl/issues/1838 # https://github.com/NVIDIA/nccl/issues/1838
- export NCCL_CUMEM_HOST_ENABLE=0 - export NCCL_CUMEM_HOST_ENABLE=0
- TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/distributed/test_async_llm_dp.py - TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/distributed/test_async_llm_dp.py
- TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/distributed/test_eagle_dp.py - TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/distributed/test_eagle_dp.py
- TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/distributed/test_external_lb_dp.py - TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/distributed/test_external_lb_dp.py
- DP_SIZE=2 pytest -v -s v1/entrypoints/openai/test_multi_api_servers.py - DP_SIZE=2 pytest -v -s entrypoints/openai/test_multi_api_servers.py
- label: Distributed Compile + RPC Tests (2 GPUs)
timeout_in_minutes: 20
working_dir: "/vllm-workspace/tests"
num_devices: 2
source_file_dependencies:
- vllm/compilation/
- vllm/distributed/
- vllm/engine/
- vllm/executor/
- vllm/worker/worker_base.py
- vllm/v1/engine/
- vllm/v1/worker/
- tests/compile/fullgraph/test_basic_correctness.py
- tests/compile/test_wrapper.py
- tests/entrypoints/llm/test_collective_rpc.py
commands:
# https://github.com/NVIDIA/nccl/issues/1838
- export NCCL_CUMEM_HOST_ENABLE=0
- pytest -v -s entrypoints/llm/test_collective_rpc.py - pytest -v -s entrypoints/llm/test_collective_rpc.py
- pytest -v -s ./compile/fullgraph/test_basic_correctness.py - pytest -v -s ./compile/fullgraph/test_basic_correctness.py
- pytest -v -s ./compile/test_wrapper.py - pytest -v -s ./compile/test_wrapper.py
- label: Distributed Torchrun + Shutdown Tests (2 GPUs)
timeout_in_minutes: 20
working_dir: "/vllm-workspace/tests"
num_devices: 2
source_file_dependencies:
- vllm/distributed/
- vllm/engine/
- vllm/executor/
- vllm/worker/worker_base.py
- vllm/v1/engine/
- vllm/v1/worker/
- tests/distributed/
- tests/v1/shutdown
- tests/v1/worker/test_worker_memory_snapshot.py
commands:
# https://github.com/NVIDIA/nccl/issues/1838
- export NCCL_CUMEM_HOST_ENABLE=0
- VLLM_TEST_SAME_HOST=1 torchrun --nproc-per-node=4 distributed/test_same_node.py | grep 'Same node test passed' - VLLM_TEST_SAME_HOST=1 torchrun --nproc-per-node=4 distributed/test_same_node.py | grep 'Same node test passed'
- VLLM_TEST_SAME_HOST=1 VLLM_TEST_WITH_DEFAULT_DEVICE_SET=1 torchrun --nproc-per-node=4 distributed/test_same_node.py | grep 'Same node test passed' - VLLM_TEST_SAME_HOST=1 VLLM_TEST_WITH_DEFAULT_DEVICE_SET=1 torchrun --nproc-per-node=4 distributed/test_same_node.py | grep 'Same node test passed'
- CUDA_VISIBLE_DEVICES=0,1 pytest -v -s v1/shutdown - CUDA_VISIBLE_DEVICES=0,1 pytest -v -s v1/shutdown
...@@ -52,41 +82,35 @@ steps: ...@@ -52,41 +82,35 @@ steps:
- label: Distributed Torchrun + Examples (4 GPUs) - label: Distributed Torchrun + Examples (4 GPUs)
timeout_in_minutes: 30 timeout_in_minutes: 30
working_dir: "/vllm-workspace/tests" working_dir: "/vllm-workspace"
num_devices: 4 num_devices: 4
source_file_dependencies: source_file_dependencies:
- vllm/distributed/ - vllm/distributed/
- tests/distributed/test_torchrun_example.py - tests/distributed/test_torchrun_example.py
- tests/distributed/test_torchrun_example_moe.py - tests/distributed/test_torchrun_example_moe.py
- examples/offline_inference/rlhf.py
- examples/offline_inference/rlhf_colocate.py - examples/offline_inference/rlhf_colocate.py
- examples/offline_inference/new_weight_syncing/ - examples/rl/
- tests/examples/offline_inference/data_parallel.py - tests/examples/offline_inference/data_parallel.py
commands: commands:
# https://github.com/NVIDIA/nccl/issues/1838 # https://github.com/NVIDIA/nccl/issues/1838
- export NCCL_CUMEM_HOST_ENABLE=0 - export NCCL_CUMEM_HOST_ENABLE=0
# test with torchrun tp=2 and external_dp=2 # test with torchrun tp=2 and external_dp=2
- torchrun --nproc-per-node=4 distributed/test_torchrun_example.py - torchrun --nproc-per-node=4 tests/distributed/test_torchrun_example.py
# test with torchrun tp=2 and pp=2 # test with torchrun tp=2 and pp=2
- PP_SIZE=2 torchrun --nproc-per-node=4 distributed/test_torchrun_example.py - PP_SIZE=2 torchrun --nproc-per-node=4 tests/distributed/test_torchrun_example.py
# test with torchrun tp=4 and dp=1 # test with torchrun tp=4 and dp=1
- TP_SIZE=4 torchrun --nproc-per-node=4 distributed/test_torchrun_example_moe.py - TP_SIZE=4 torchrun --nproc-per-node=4 tests/distributed/test_torchrun_example_moe.py
# test with torchrun tp=2, pp=2 and dp=1 # test with torchrun tp=2, pp=2 and dp=1
- PP_SIZE=2 TP_SIZE=2 torchrun --nproc-per-node=4 distributed/test_torchrun_example_moe.py - PP_SIZE=2 TP_SIZE=2 torchrun --nproc-per-node=4 tests/distributed/test_torchrun_example_moe.py
# test with torchrun tp=1 and dp=4 with ep # test with torchrun tp=1 and dp=4 with ep
- DP_SIZE=4 ENABLE_EP=1 torchrun --nproc-per-node=4 distributed/test_torchrun_example_moe.py - DP_SIZE=4 ENABLE_EP=1 torchrun --nproc-per-node=4 tests/distributed/test_torchrun_example_moe.py
# test with torchrun tp=2 and dp=2 with ep # test with torchrun tp=2 and dp=2 with ep
- TP_SIZE=2 DP_SIZE=2 ENABLE_EP=1 torchrun --nproc-per-node=4 distributed/test_torchrun_example_moe.py - TP_SIZE=2 DP_SIZE=2 ENABLE_EP=1 torchrun --nproc-per-node=4 tests/distributed/test_torchrun_example_moe.py
# test with internal dp # test with internal dp
- python3 ../examples/offline_inference/data_parallel.py --enforce-eager - python3 examples/offline_inference/data_parallel.py --enforce-eager
# OLD rlhf examples # rlhf examples
- cd ../examples/offline_inference - VLLM_ALLOW_INSECURE_SERIALIZATION=1 python3 examples/rl/rlhf_nccl.py
- VLLM_ALLOW_INSECURE_SERIALIZATION=1 python3 rlhf.py - VLLM_ALLOW_INSECURE_SERIALIZATION=1 python3 examples/rl/rlhf_ipc.py
- VLLM_ALLOW_INSECURE_SERIALIZATION=1 RAY_DEDUP_LOGS=0 python3 rlhf_colocate.py
# NEW rlhf examples
- cd new_weight_syncing
- VLLM_ALLOW_INSECURE_SERIALIZATION=1 python3 rlhf_nccl.py
- VLLM_ALLOW_INSECURE_SERIALIZATION=1 python3 rlhf_ipc.py
- label: Distributed DP Tests (4 GPUs) - label: Distributed DP Tests (4 GPUs)
timeout_in_minutes: 30 timeout_in_minutes: 30
...@@ -169,7 +193,7 @@ steps: ...@@ -169,7 +193,7 @@ steps:
num_devices: 2 num_devices: 2
commands: commands:
- pytest -v -s tests/distributed/test_context_parallel.py - pytest -v -s tests/distributed/test_context_parallel.py
- VLLM_ALLOW_INSECURE_SERIALIZATION=1 python3 examples/offline_inference/new_weight_syncing/rlhf_async_new_apis.py - VLLM_ALLOW_INSECURE_SERIALIZATION=1 python3 examples/rl/rlhf_async_new_apis.py
- VLLM_USE_DEEP_GEMM=1 VLLM_LOGGING_LEVEL=DEBUG python3 examples/offline_inference/data_parallel.py --model=Qwen/Qwen1.5-MoE-A2.7B -tp=1 -dp=2 --max-model-len=2048 --all2all-backend=deepep_high_throughput - VLLM_USE_DEEP_GEMM=1 VLLM_LOGGING_LEVEL=DEBUG python3 examples/offline_inference/data_parallel.py --model=Qwen/Qwen1.5-MoE-A2.7B -tp=1 -dp=2 --max-model-len=2048 --all2all-backend=deepep_high_throughput
- pytest -v -s tests/v1/distributed/test_dbo.py - pytest -v -s tests/v1/distributed/test_dbo.py
......
...@@ -70,3 +70,15 @@ steps: ...@@ -70,3 +70,15 @@ steps:
device: mi325_4 device: mi325_4
depends_on: depends_on:
- image-build-amd - image-build-amd
- label: V1 e2e (4xH100)
timeout_in_minutes: 60
device: h100
num_devices: 4
optional: true
source_file_dependencies:
- vllm/v1/attention/backends/utils.py
- vllm/v1/worker/gpu_model_runner.py
- tests/v1/e2e/test_hybrid_chunked_prefill.py
commands:
- pytest -v -s v1/e2e/test_hybrid_chunked_prefill.py
...@@ -10,7 +10,7 @@ steps: ...@@ -10,7 +10,7 @@ steps:
- tests/entrypoints/ - tests/entrypoints/
commands: commands:
- pytest -v -s entrypoints/openai/tool_parsers - pytest -v -s entrypoints/openai/tool_parsers
- pytest -v -s entrypoints/ --ignore=entrypoints/llm --ignore=entrypoints/rpc --ignore=entrypoints/sleep --ignore=entrypoints/instrumentator --ignore=entrypoints/openai --ignore=entrypoints/offline_mode --ignore=entrypoints/test_chat_utils.py --ignore=entrypoints/pooling - pytest -v -s entrypoints/ --ignore=entrypoints/llm --ignore=entrypoints/rpc --ignore=entrypoints/sleep --ignore=entrypoints/serve/instrumentator --ignore=entrypoints/openai --ignore=entrypoints/offline_mode --ignore=entrypoints/test_chat_utils.py --ignore=entrypoints/pooling
- label: Entrypoints Integration (LLM) - label: Entrypoints Integration (LLM)
timeout_in_minutes: 40 timeout_in_minutes: 40
...@@ -34,7 +34,7 @@ steps: ...@@ -34,7 +34,7 @@ steps:
- tests/entrypoints/test_chat_utils - tests/entrypoints/test_chat_utils
commands: commands:
- export VLLM_WORKER_MULTIPROC_METHOD=spawn - export VLLM_WORKER_MULTIPROC_METHOD=spawn
- pytest -v -s entrypoints/openai --ignore=entrypoints/openai/chat_completion/test_chat_with_tool_reasoning.py --ignore=entrypoints/openai/test_oot_registration.py --ignore=entrypoints/openai/test_tensorizer_entrypoint.py --ignore=entrypoints/openai/correctness/ --ignore=entrypoints/openai/tool_parsers/ --ignore=entrypoints/openai/responses - pytest -v -s entrypoints/openai --ignore=entrypoints/openai/chat_completion/test_chat_with_tool_reasoning.py --ignore=entrypoints/openai/chat_completion/test_oot_registration.py --ignore=entrypoints/openai/completion/test_tensorizer_entrypoint.py --ignore=entrypoints/openai/correctness/ --ignore=entrypoints/openai/tool_parsers/ --ignore=entrypoints/openai/responses --ignore=entrypoints/openai/test_multi_api_servers.py
- pytest -v -s entrypoints/test_chat_utils.py - pytest -v -s entrypoints/test_chat_utils.py
mirror: mirror:
amd: amd:
...@@ -48,11 +48,11 @@ steps: ...@@ -48,11 +48,11 @@ steps:
source_file_dependencies: source_file_dependencies:
- vllm/ - vllm/
- tests/entrypoints/rpc - tests/entrypoints/rpc
- tests/entrypoints/instrumentator - tests/entrypoints/serve/instrumentator
- tests/tool_use - tests/tool_use
commands: commands:
- export VLLM_WORKER_MULTIPROC_METHOD=spawn - export VLLM_WORKER_MULTIPROC_METHOD=spawn
- pytest -v -s entrypoints/instrumentator - pytest -v -s entrypoints/serve/instrumentator
- PYTHONPATH=/vllm-workspace pytest -v -s entrypoints/rpc - PYTHONPATH=/vllm-workspace pytest -v -s entrypoints/rpc
- pytest -v -s tool_use - pytest -v -s tool_use
...@@ -75,19 +75,6 @@ steps: ...@@ -75,19 +75,6 @@ steps:
commands: commands:
- pytest -v -s entrypoints/openai/responses - pytest -v -s entrypoints/openai/responses
- label: Entrypoints V1
timeout_in_minutes: 50
source_file_dependencies:
- vllm/
- tests/v1
commands:
- pytest -v -s v1/entrypoints
mirror:
amd:
device: mi325_1
depends_on:
- image-build-amd
- label: OpenAI API Correctness - label: OpenAI API Correctness
timeout_in_minutes: 30 timeout_in_minutes: 30
source_file_dependencies: source_file_dependencies:
......
...@@ -24,8 +24,7 @@ steps: ...@@ -24,8 +24,7 @@ steps:
- label: Elastic EP Scaling Test - label: Elastic EP Scaling Test
timeout_in_minutes: 20 timeout_in_minutes: 20
device: b200 device: h100
optional: true
working_dir: "/vllm-workspace/tests" working_dir: "/vllm-workspace/tests"
num_devices: 4 num_devices: 4
source_file_dependencies: source_file_dependencies:
......
...@@ -35,7 +35,7 @@ steps: ...@@ -35,7 +35,7 @@ steps:
parallelism: 2 parallelism: 2
- label: Kernels MoE Test %N - label: Kernels MoE Test %N
timeout_in_minutes: 60 timeout_in_minutes: 25
source_file_dependencies: source_file_dependencies:
- csrc/quantization/cutlass_w8a8/moe/ - csrc/quantization/cutlass_w8a8/moe/
- csrc/moe/ - csrc/moe/
...@@ -47,7 +47,7 @@ steps: ...@@ -47,7 +47,7 @@ steps:
commands: commands:
- pytest -v -s kernels/moe --ignore=kernels/moe/test_modular_oai_triton_moe.py --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT - pytest -v -s kernels/moe --ignore=kernels/moe/test_modular_oai_triton_moe.py --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT
- pytest -v -s kernels/moe/test_modular_oai_triton_moe.py --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT - pytest -v -s kernels/moe/test_modular_oai_triton_moe.py --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT
parallelism: 2 parallelism: 5
- label: Kernels Mamba Test - label: Kernels Mamba Test
timeout_in_minutes: 45 timeout_in_minutes: 45
......
...@@ -59,7 +59,7 @@ steps: ...@@ -59,7 +59,7 @@ steps:
- vllm/model_executor/models/qwen3_next_mtp.py - vllm/model_executor/models/qwen3_next_mtp.py
- vllm/model_executor/layers/fla/ops/ - vllm/model_executor/layers/fla/ops/
commands: commands:
- pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=configs/models-qwen35-blackwell.txt - pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=configs/models-qwen35-blackwell.txt
- label: LM Eval Large Models (H200) - label: LM Eval Large Models (H200)
timeout_in_minutes: 60 timeout_in_minutes: 60
......
...@@ -8,7 +8,7 @@ steps: ...@@ -8,7 +8,7 @@ steps:
- vllm/lora - vllm/lora
- tests/lora - tests/lora
commands: commands:
- pytest -v -s lora --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT --ignore=lora/test_chatglm3_tp.py --ignore=lora/test_llama_tp.py --ignore=lora/test_llm_with_multi_loras.py --ignore=lora/test_olmoe_tp.py --ignore=lora/test_deepseekv2_tp.py --ignore=lora/test_gptoss_tp.py --ignore=lora/test_qwen3moe_tp.py - pytest -v -s lora --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT --ignore=lora/test_chatglm3_tp.py --ignore=lora/test_llama_tp.py --ignore=lora/test_llm_with_multi_loras.py --ignore=lora/test_olmoe_tp.py --ignore=lora/test_deepseekv2_tp.py --ignore=lora/test_gptoss_tp.py --ignore=lora/test_qwen3moe_tp.py --ignore=lora/test_qwen35_densemoel_lora.py
parallelism: 4 parallelism: 4
...@@ -30,4 +30,5 @@ steps: ...@@ -30,4 +30,5 @@ steps:
- pytest -v -s -x lora/test_llama_tp.py - pytest -v -s -x lora/test_llama_tp.py
- pytest -v -s -x lora/test_llm_with_multi_loras.py - pytest -v -s -x lora/test_llm_with_multi_loras.py
- pytest -v -s -x lora/test_olmoe_tp.py - pytest -v -s -x lora/test_olmoe_tp.py
- pytest -v -s -x lora/test_gptoss_tp.py - pytest -v -s -x lora/test_gptoss_tp.py
\ No newline at end of file - pytest -v -s -x lora/test_qwen35_densemoel_lora.py
\ No newline at end of file
...@@ -9,9 +9,9 @@ steps: ...@@ -9,9 +9,9 @@ steps:
- vllm/config/model.py - vllm/config/model.py
- vllm/model_executor - vllm/model_executor
- tests/model_executor - tests/model_executor
- tests/entrypoints/openai/test_tensorizer_entrypoint.py - tests/entrypoints/openai/completion/test_tensorizer_entrypoint.py
commands: commands:
- apt-get update && apt-get install -y curl libsodium23 - apt-get update && apt-get install -y curl libsodium23
- export VLLM_WORKER_MULTIPROC_METHOD=spawn - export VLLM_WORKER_MULTIPROC_METHOD=spawn
- pytest -v -s model_executor - pytest -v -s model_executor
- pytest -v -s entrypoints/openai/test_tensorizer_entrypoint.py - pytest -v -s entrypoints/openai/completion/test_tensorizer_entrypoint.py
...@@ -11,7 +11,7 @@ steps: ...@@ -11,7 +11,7 @@ steps:
- vllm/v1/attention/ - vllm/v1/attention/
- tests/v1/engine/test_llm_engine.py - tests/v1/engine/test_llm_engine.py
- tests/v1/e2e/ - tests/v1/e2e/
- tests/v1/entrypoints/llm/test_struct_output_generate.py - tests/entrypoints/llm/test_struct_output_generate.py
commands: commands:
- set -x - set -x
- export VLLM_USE_V2_MODEL_RUNNER=1 - export VLLM_USE_V2_MODEL_RUNNER=1
...@@ -22,7 +22,7 @@ steps: ...@@ -22,7 +22,7 @@ steps:
- pytest -v -s v1/e2e/general/test_context_length.py - pytest -v -s v1/e2e/general/test_context_length.py
- pytest -v -s v1/e2e/general/test_min_tokens.py - pytest -v -s v1/e2e/general/test_min_tokens.py
# Temporary hack filter to exclude ngram spec decoding based tests. # Temporary hack filter to exclude ngram spec decoding based tests.
- pytest -v -s v1/entrypoints/llm/test_struct_output_generate.py -k "xgrammar and not speculative_config6 and not speculative_config7 and not speculative_config8 and not speculative_config0" - pytest -v -s entrypoints/llm/test_struct_output_generate.py -k "xgrammar and not speculative_config6 and not speculative_config7 and not speculative_config8 and not speculative_config0"
- label: Model Runner V2 Examples - label: Model Runner V2 Examples
timeout_in_minutes: 45 timeout_in_minutes: 45
......
...@@ -62,7 +62,7 @@ steps: ...@@ -62,7 +62,7 @@ steps:
depends_on: depends_on:
- image-build-amd - image-build-amd
- label: Multi-Modal Processor Test (CPU) - label: Multi-Modal Processor (CPU)
depends_on: depends_on:
- image-build-cpu - image-build-cpu
timeout_in_minutes: 60 timeout_in_minutes: 60
...@@ -95,34 +95,44 @@ steps: ...@@ -95,34 +95,44 @@ steps:
commands: commands:
- pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-mm-small.txt --tp-size=1 - pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-mm-small.txt --tp-size=1
- label: Multi-Modal Models (Extended) 1 - label: Multi-Modal Models (Extended Generation 1)
optional: true optional: true
source_file_dependencies: source_file_dependencies:
- vllm/ - vllm/
- tests/models/multimodal - tests/models/multimodal/generation
- tests/models/multimodal/test_mapping.py
commands: commands:
- pip install git+https://github.com/TIGER-AI-Lab/Mantis.git - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git
- pytest -v -s models/multimodal -m 'not core_model' --ignore models/multimodal/generation/test_common.py --ignore models/multimodal/processing - pytest -v -s models/multimodal/generation -m 'not core_model' --ignore models/multimodal/generation/test_common.py
- pytest -v -s models/multimodal/test_mapping.py
mirror: mirror:
amd: amd:
device: mi325_1 device: mi325_1
depends_on: depends_on:
- image-build-amd - image-build-amd
- label: Multi-Modal Models (Extended) 2 - label: Multi-Modal Models (Extended Generation 2)
optional: true optional: true
source_file_dependencies: source_file_dependencies:
- vllm/ - vllm/
- tests/models/multimodal - tests/models/multimodal/generation
commands: commands:
- pip install git+https://github.com/TIGER-AI-Lab/Mantis.git - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git
- pytest -v -s models/multimodal/generation/test_common.py -m 'split(group=0) and not core_model' - pytest -v -s models/multimodal/generation/test_common.py -m 'split(group=0) and not core_model'
- label: Multi-Modal Models (Extended) 3 - label: Multi-Modal Models (Extended Generation 3)
optional: true optional: true
source_file_dependencies: source_file_dependencies:
- vllm/ - vllm/
- tests/models/multimodal - tests/models/multimodal/generation
commands: commands:
- pip install git+https://github.com/TIGER-AI-Lab/Mantis.git - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git
- pytest -v -s models/multimodal/generation/test_common.py -m 'split(group=1) and not core_model' - pytest -v -s models/multimodal/generation/test_common.py -m 'split(group=1) and not core_model'
- label: Multi-Modal Models (Extended Pooling)
optional: true
source_file_dependencies:
- vllm/
- tests/models/multimodal/pooling
commands:
- pytest -v -s models/multimodal/pooling -m 'not core_model'
...@@ -36,6 +36,6 @@ steps: ...@@ -36,6 +36,6 @@ steps:
- pytest -v -s plugins_tests/test_scheduler_plugins.py - pytest -v -s plugins_tests/test_scheduler_plugins.py
- pip install -e ./plugins/vllm_add_dummy_model - pip install -e ./plugins/vllm_add_dummy_model
- pytest -v -s distributed/test_distributed_oot.py - pytest -v -s distributed/test_distributed_oot.py
- pytest -v -s entrypoints/openai/test_oot_registration.py # it needs a clean process - pytest -v -s entrypoints/openai/chat_completion/test_oot_registration.py # it needs a clean process
- pytest -v -s models/test_oot_registration.py # it needs a clean process - pytest -v -s models/test_oot_registration.py # it needs a clean process
- pytest -v -s plugins/lora_resolvers # unit tests for in-tree lora resolver plugins - pytest -v -s plugins/lora_resolvers # unit tests for in-tree lora resolver plugins
...@@ -35,7 +35,7 @@ steps: ...@@ -35,7 +35,7 @@ steps:
# as it is a heavy test that is covered in other steps. # as it is a heavy test that is covered in other steps.
# Use `find` to launch multiple instances of pytest so that # Use `find` to launch multiple instances of pytest so that
# they do not suffer from https://github.com/vllm-project/vllm/issues/28965 # they do not suffer from https://github.com/vllm-project/vllm/issues/28965
- "find compile/fullgraph/ -name 'test_*.py' -not -name 'test_full_graph.py' -exec pytest -s -v {} \\;" - "find compile/fullgraph/ -name 'test_*.py' -not -name 'test_full_graph.py' -print0 | xargs -0 -n1 -I{} pytest -s -v '{}'"
- label: PyTorch Fullgraph - label: PyTorch Fullgraph
timeout_in_minutes: 30 timeout_in_minutes: 30
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment