Commit 7e63ef82 authored by zhuwenwen's avatar zhuwenwen
Browse files

Merge tag 'v0.14.0' into v0.14.0-dev

parents 8cbcac5d b17039bc
...@@ -61,7 +61,7 @@ echo "Results will be stored in: $RESULTS_DIR" ...@@ -61,7 +61,7 @@ echo "Results will be stored in: $RESULTS_DIR"
echo "--- Installing Python dependencies ---" echo "--- Installing Python dependencies ---"
python3 -m pip install --progress-bar off git+https://github.com/thuml/depyf.git \ python3 -m pip install --progress-bar off git+https://github.com/thuml/depyf.git \
&& python3 -m pip install --progress-bar off pytest pytest-asyncio tpu-info \ && python3 -m pip install --progress-bar off pytest pytest-asyncio tpu-info \
&& python3 -m pip install --progress-bar off "lm-eval @ git+https://github.com/EleutherAI/lm-evaluation-harness.git@206b7722158f58c35b7ffcd53b035fdbdda5126d" \ && python3 -m pip install --progress-bar off "lm-eval[api]>=0.4.9.2" \
&& python3 -m pip install --progress-bar off hf-transfer tblib==3.1.0 && python3 -m pip install --progress-bar off hf-transfer tblib==3.1.0
echo "--- Python dependencies installed ---" echo "--- Python dependencies installed ---"
......
...@@ -39,7 +39,7 @@ docker run \ ...@@ -39,7 +39,7 @@ docker run \
python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager -tp 2 --distributed-executor-backend ray python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager -tp 2 --distributed-executor-backend ray
python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager -tp 2 --distributed-executor-backend mp python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager -tp 2 --distributed-executor-backend mp
python3 examples/offline_inference/basic/generate.py --model Intel/Qwen2.5-0.5B-W4A16-G128-AutoRound-LLMC-TEST-ONLY --enforce-eager python3 examples/offline_inference/basic/generate.py --model Intel/Qwen2.5-0.5B-W4A16-G128-AutoRound-LLMC-TEST-ONLY --enforce-eager
VLLM_ATTENTION_BACKEND=TRITON_ATTN python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager --attention-backend=TRITON_ATTN
cd tests cd tests
pytest -v -s v1/core pytest -v -s v1/core
pytest -v -s v1/engine pytest -v -s v1/engine
......
...@@ -2,6 +2,17 @@ ...@@ -2,6 +2,17 @@
set -euox pipefail set -euox pipefail
# To detect ROCm
# Check multiple indicators:
if [ -e /dev/kfd ] || \
[ -d /opt/rocm ] || \
command -v rocm-smi &> /dev/null || \
[ -n "${ROCM_HOME:-}" ]; then
IS_ROCM=1
else
IS_ROCM=0
fi
if [[ $# -lt 4 ]]; then if [[ $# -lt 4 ]]; then
echo "Usage: .buildkite/scripts/run-multi-node-test.sh WORKING_DIR NUM_NODES NUM_GPUS DOCKER_IMAGE COMMAND1 COMMAND2 ... COMMANDN" echo "Usage: .buildkite/scripts/run-multi-node-test.sh WORKING_DIR NUM_NODES NUM_GPUS DOCKER_IMAGE COMMAND1 COMMAND2 ... COMMANDN"
exit 1 exit 1
...@@ -26,13 +37,18 @@ for command in "${COMMANDS[@]}"; do ...@@ -26,13 +37,18 @@ for command in "${COMMANDS[@]}"; do
echo "$command" echo "$command"
done done
start_network() { start_network() {
docker network create --subnet=192.168.10.0/24 docker-net docker network create --subnet=192.168.10.0/24 docker-net
} }
start_nodes() { start_nodes() {
for node in $(seq 0 $(($NUM_NODES-1))); do for node in $(seq 0 $(($NUM_NODES-1))); do
GPU_DEVICES='"device=' if [ "$IS_ROCM" -eq 1 ]; then
GPU_DEVICES='--device /dev/kfd --device /dev/dri -e HIP_VISIBLE_DEVICES='
else
GPU_DEVICES='--gpus "device='
fi
for node_gpu in $(seq 0 $(($NUM_GPUS - 1))); do for node_gpu in $(seq 0 $(($NUM_GPUS - 1))); do
DEVICE_NUM=$(($node * $NUM_GPUS + $node_gpu)) DEVICE_NUM=$(($node * $NUM_GPUS + $node_gpu))
GPU_DEVICES+=$(($DEVICE_NUM)) GPU_DEVICES+=$(($DEVICE_NUM))
...@@ -40,7 +56,9 @@ start_nodes() { ...@@ -40,7 +56,9 @@ start_nodes() {
GPU_DEVICES+=',' GPU_DEVICES+=','
fi fi
done done
GPU_DEVICES+='"' if [ "$IS_ROCM" -eq 0 ]; then
GPU_DEVICES+='"'
fi
# start the container in detached mode # start the container in detached mode
# things to note: # things to note:
...@@ -49,7 +67,7 @@ start_nodes() { ...@@ -49,7 +67,7 @@ start_nodes() {
# 3. map the huggingface cache directory to the container # 3. map the huggingface cache directory to the container
# 3. assign ip addresses to the containers (head node: 192.168.10.10, worker nodes: # 3. assign ip addresses to the containers (head node: 192.168.10.10, worker nodes:
# starting from 192.168.10.11) # starting from 192.168.10.11)
docker run -d --gpus "$GPU_DEVICES" --shm-size=10.24gb -e HF_TOKEN \ docker run -d $GPU_DEVICES --shm-size=10.24gb -e HF_TOKEN \
-v ~/.cache/huggingface:/root/.cache/huggingface --name "node$node" \ -v ~/.cache/huggingface:/root/.cache/huggingface --name "node$node" \
--network docker-net --ip 192.168.10.$((10 + $node)) --rm "$DOCKER_IMAGE" \ --network docker-net --ip 192.168.10.$((10 + $node)) --rm "$DOCKER_IMAGE" \
/bin/bash -c "tail -f /dev/null" /bin/bash -c "tail -f /dev/null"
......
...@@ -44,10 +44,10 @@ trap cleanup EXIT ...@@ -44,10 +44,10 @@ trap cleanup EXIT
for BACK in "${BACKENDS[@]}"; do for BACK in "${BACKENDS[@]}"; do
VLLM_DEEP_GEMM_WARMUP=skip \ VLLM_DEEP_GEMM_WARMUP=skip \
VLLM_ALL2ALL_BACKEND=$BACK \
vllm serve "$MODEL" \ vllm serve "$MODEL" \
--enforce-eager \ --enforce-eager \
--enable-eplb \ --enable-eplb \
--all2all-backend $BACK \
--eplb-config '{"window_size":10, "step_interval":100, "num_redundant_experts":0, "log_balancedness":true}' \ --eplb-config '{"window_size":10, "step_interval":100, "num_redundant_experts":0, "log_balancedness":true}' \
--tensor-parallel-size ${TENSOR_PARALLEL_SIZE} \ --tensor-parallel-size ${TENSOR_PARALLEL_SIZE} \
--data-parallel-size ${DATA_PARALLEL_SIZE} \ --data-parallel-size ${DATA_PARALLEL_SIZE} \
......
...@@ -43,12 +43,12 @@ trap cleanup EXIT ...@@ -43,12 +43,12 @@ trap cleanup EXIT
for BACK in "${BACKENDS[@]}"; do for BACK in "${BACKENDS[@]}"; do
VLLM_DEEP_GEMM_WARMUP=skip \ VLLM_DEEP_GEMM_WARMUP=skip \
VLLM_ALL2ALL_BACKEND=$BACK \
vllm serve "$MODEL" \ vllm serve "$MODEL" \
--enforce-eager \ --enforce-eager \
--tensor-parallel-size 4 \ --tensor-parallel-size 4 \
--enable-expert-parallel \ --enable-expert-parallel \
--enable-eplb \ --enable-eplb \
--all2all-backend $BACK \
--eplb-config '{"window_size":200,"step_interval":600,"use_async":true}' \ --eplb-config '{"window_size":200,"step_interval":600,"use_async":true}' \
--speculative-config '{"method":"qwen3_next_mtp","num_speculative_tokens":1}' \ --speculative-config '{"method":"qwen3_next_mtp","num_speculative_tokens":1}' \
--trust-remote-code \ --trust-remote-code \
......
...@@ -102,6 +102,7 @@ if [[ "$version" != *"dev"* ]]; then ...@@ -102,6 +102,7 @@ if [[ "$version" != *"dev"* ]]; then
echo "Re-generating indices for /$pure_version/" echo "Re-generating indices for /$pure_version/"
rm -rf "$INDICES_OUTPUT_DIR/*" rm -rf "$INDICES_OUTPUT_DIR/*"
mkdir -p "$INDICES_OUTPUT_DIR" mkdir -p "$INDICES_OUTPUT_DIR"
$PYTHON .buildkite/scripts/generate-nightly-index.py --version "$pure_version" --current-objects "$obj_json" --output-dir "$INDICES_OUTPUT_DIR" --comment "version $pure_version" $alias_arg # wheel-dir is overridden to be the commit directory, so that the indices point to the correct wheel path
$PYTHON .buildkite/scripts/generate-nightly-index.py --version "$pure_version" --wheel-dir "$SUBPATH" --current-objects "$obj_json" --output-dir "$INDICES_OUTPUT_DIR" --comment "version $pure_version" $alias_arg
aws s3 cp --recursive "$INDICES_OUTPUT_DIR/" "s3://$BUCKET/$pure_version/" aws s3 cp --recursive "$INDICES_OUTPUT_DIR/" "s3://$BUCKET/$pure_version/"
fi fi
#!/usr/bin/env bash
set -e
BUCKET="vllm-wheels"
SUBPATH=$BUILDKITE_COMMIT
S3_COMMIT_PREFIX="s3://$BUCKET/$SUBPATH/"
RELEASE_VERSION=$(buildkite-agent meta-data get release-version)
echo "Release version from Buildkite: $RELEASE_VERSION"
GIT_VERSION=$(git describe --exact-match --tags $BUILDKITE_COMMIT 2>/dev/null)
if [ -z "$GIT_VERSION" ]; then
echo "[FATAL] Not on a git tag, cannot create release."
exit 1
else
echo "Git version for commit $BUILDKITE_COMMIT: $GIT_VERSION"
fi
# sanity check for version mismatch
if [ "v$RELEASE_VERSION" != "$GIT_VERSION" ]; then
if [ "$FORCE_RELEASE_IGNORE_VERSION_MISMATCH" == "true" ]; then
echo "[WARNING] Force release and ignore version mismatch"
else
echo "[FATAL] Release version from Buildkite does not match Git version."
exit 1
fi
fi
# check pypi token
if [ -z "$PYPI_TOKEN" ]; then
echo "[FATAL] PYPI_TOKEN is not set."
exit 1
else
export TWINE_USERNAME="__token__"
export TWINE_PASSWORD="$PYPI_TOKEN"
fi
# check github token
if [ -z "$GITHUB_TOKEN" ]; then
echo "[FATAL] GITHUB_TOKEN is not set."
exit 1
else
export GH_TOKEN="$GITHUB_TOKEN"
fi
set -x # avoid printing secrets above
# download gh CLI from github
# Get latest gh CLI version from GitHub API
GH_VERSION=$(curl -s https://api.github.com/repos/cli/cli/releases/latest | grep '"tag_name":' | sed -E 's/.*"([^"]+)".*/\1/' | sed 's/^v//')
if [ -z "$GH_VERSION" ]; then
echo "[FATAL] Failed to get latest gh CLI version from GitHub"
exit 1
fi
echo "Downloading gh CLI version: $GH_VERSION"
GH_TARBALL="gh_${GH_VERSION}_linux_amd64.tar.gz"
GH_URL="https://github.com/cli/cli/releases/download/v${GH_VERSION}/${GH_TARBALL}"
GH_INSTALL_DIR="/tmp/gh-install"
mkdir -p "$GH_INSTALL_DIR"
pushd "$GH_INSTALL_DIR"
curl -L -o "$GH_TARBALL" "$GH_URL"
tar -xzf "$GH_TARBALL"
GH_BIN=$(realpath $(find . -name "gh" -type f -executable | head -n 1))
if [ -z "$GH_BIN" ]; then
echo "[FATAL] Failed to find gh CLI executable"
exit 1
fi
echo "gh CLI downloaded successfully, version: $($GH_BIN --version)"
echo "Last 5 releases on GitHub:" # as a sanity check of gh and GH_TOKEN
command "$GH_BIN" release list --limit 5
popd
# install twine from pypi
python3 -m venv /tmp/vllm-release-env
source /tmp/vllm-release-env/bin/activate
pip install twine
python3 -m twine --version
# copy release wheels to local directory
DIST_DIR=/tmp/vllm-release-dist
echo "Existing wheels on S3:"
aws s3 ls "$S3_COMMIT_PREFIX"
echo "Copying wheels to local directory"
mkdir -p $DIST_DIR
# include only wheels for the release version, ignore all files with "dev" or "rc" in the name
aws s3 cp --recursive --exclude "*" --include "vllm-${RELEASE_VERSION}*.whl" --exclude "*dev*" --exclude "*rc*" "$S3_COMMIT_PREFIX" $DIST_DIR
echo "Wheels copied to local directory"
# generate source tarball
git archive --format=tar.gz --output="$DIST_DIR/vllm-${RELEASE_VERSION}.tar.gz" $BUILDKITE_COMMIT
ls -la $DIST_DIR
# upload wheels to PyPI (only default variant, i.e. files without '+' in the name)
PYPI_WHEEL_FILES=$(find $DIST_DIR -name "vllm-${RELEASE_VERSION}*.whl" -not -name "*+*")
if [ -z "$PYPI_WHEEL_FILES" ]; then
echo "No default variant wheels found, quitting..."
exit 1
fi
python3 -m twine check $PYPI_WHEEL_FILES
python3 -m twine --non-interactive --verbose upload $PYPI_WHEEL_FILES
echo "Wheels uploaded to PyPI"
# create release on GitHub with the release version and all wheels
command "$GH_BIN" release create $GIT_VERSION -d --latest --notes-from-tag --verify-tag $DIST_DIR/*.whl
#!/usr/bin/env bash
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
#
# Upload ROCm wheels to S3 with proper index generation
#
# Required environment variables:
# AWS_ACCESS_KEY_ID, AWS_SECRET_ACCESS_KEY (or IAM role)
# S3_BUCKET (default: vllm-wheels)
#
# S3 path structure:
# s3://vllm-wheels/rocm/{commit}/ - All wheels for this commit
# s3://vllm-wheels/rocm/nightly/ - Index pointing to latest nightly
# s3://vllm-wheels/rocm/{version}/ - Index for release versions
set -ex
# ======== Configuration ========
BUCKET="${S3_BUCKET:-vllm-wheels}"
ROCM_SUBPATH="rocm/${BUILDKITE_COMMIT}"
S3_COMMIT_PREFIX="s3://$BUCKET/$ROCM_SUBPATH/"
INDICES_OUTPUT_DIR="rocm-indices"
PYTHON="${PYTHON_PROG:-python3}"
# ROCm uses manylinux_2_35 (Ubuntu 22.04 based)
MANYLINUX_VERSION="manylinux_2_35"
echo "========================================"
echo "ROCm Wheel Upload Configuration"
echo "========================================"
echo "S3 Bucket: $BUCKET"
echo "S3 Path: $ROCM_SUBPATH"
echo "Commit: $BUILDKITE_COMMIT"
echo "Branch: $BUILDKITE_BRANCH"
echo "========================================"
# ======== Part 0: Setup Python ========
# Detect if python3.12+ is available
has_new_python=$($PYTHON -c "print(1 if __import__('sys').version_info >= (3,12) else 0)" 2>/dev/null || echo 0)
if [[ "$has_new_python" -eq 0 ]]; then
# Use new python from docker
# Use --user to ensure files are created with correct ownership (not root)
docker pull python:3-slim
PYTHON="docker run --rm --user $(id -u):$(id -g) -v $(pwd):/app -w /app python:3-slim python3"
fi
echo "Using python interpreter: $PYTHON"
echo "Python version: $($PYTHON --version)"
# ======== Part 1: Collect and prepare wheels ========
# Collect all wheels
mkdir -p all-rocm-wheels
cp artifacts/rocm-base-wheels/*.whl all-rocm-wheels/ 2>/dev/null || true
cp artifacts/rocm-vllm-wheel/*.whl all-rocm-wheels/ 2>/dev/null || true
WHEEL_COUNT=$(ls all-rocm-wheels/*.whl 2>/dev/null | wc -l)
echo "Total wheels to upload: $WHEEL_COUNT"
if [ "$WHEEL_COUNT" -eq 0 ]; then
echo "ERROR: No wheels found to upload!"
exit 1
fi
# Rename linux to manylinux in wheel filenames
for wheel in all-rocm-wheels/*.whl; do
if [[ "$wheel" == *"linux"* ]] && [[ "$wheel" != *"manylinux"* ]]; then
new_wheel="${wheel/linux/$MANYLINUX_VERSION}"
mv -- "$wheel" "$new_wheel"
echo "Renamed: $(basename "$wheel") -> $(basename "$new_wheel")"
fi
done
echo ""
echo "Wheels to upload:"
ls -lh all-rocm-wheels/
# ======== Part 2: Upload wheels to S3 ========
echo ""
echo "Uploading wheels to $S3_COMMIT_PREFIX"
for wheel in all-rocm-wheels/*.whl; do
aws s3 cp "$wheel" "$S3_COMMIT_PREFIX"
done
# ======== Part 3: Generate and upload indices ========
# List existing wheels in commit directory
echo ""
echo "Generating indices..."
obj_json="rocm-objects.json"
aws s3api list-objects-v2 --bucket "$BUCKET" --prefix "$ROCM_SUBPATH/" --delimiter / --output json > "$obj_json"
mkdir -p "$INDICES_OUTPUT_DIR"
# Use the existing generate-nightly-index.py
# HACK: Replace regex module with stdlib re (same as CUDA script)
sed -i 's/import regex as re/import re/g' .buildkite/scripts/generate-nightly-index.py
$PYTHON .buildkite/scripts/generate-nightly-index.py \
--version "$ROCM_SUBPATH" \
--current-objects "$obj_json" \
--output-dir "$INDICES_OUTPUT_DIR" \
--comment "ROCm commit $BUILDKITE_COMMIT"
# Upload indices to commit directory
echo "Uploading indices to $S3_COMMIT_PREFIX"
aws s3 cp --recursive "$INDICES_OUTPUT_DIR/" "$S3_COMMIT_PREFIX"
# Update rocm/nightly/ if on main branch and not a PR
if [[ "$BUILDKITE_BRANCH" == "main" && "$BUILDKITE_PULL_REQUEST" == "false" ]] || [[ "$NIGHTLY" == "1" ]]; then
echo "Updating rocm/nightly/ index..."
aws s3 cp --recursive "$INDICES_OUTPUT_DIR/" "s3://$BUCKET/rocm/nightly/"
fi
# Extract version from vLLM wheel and update version-specific index
VLLM_WHEEL=$(ls all-rocm-wheels/vllm*.whl 2>/dev/null | head -1)
if [ -n "$VLLM_WHEEL" ]; then
VERSION=$(unzip -p "$VLLM_WHEEL" '**/METADATA' | grep '^Version: ' | cut -d' ' -f2)
echo "Version in wheel: $VERSION"
PURE_VERSION="${VERSION%%+*}"
PURE_VERSION="${PURE_VERSION%%.rocm}"
echo "Pure version: $PURE_VERSION"
if [[ "$VERSION" != *"dev"* ]]; then
echo "Updating rocm/$PURE_VERSION/ index..."
aws s3 cp --recursive "$INDICES_OUTPUT_DIR/" "s3://$BUCKET/rocm/$PURE_VERSION/"
fi
fi
# ======== Part 4: Summary ========
echo ""
echo "========================================"
echo "ROCm Wheel Upload Complete!"
echo "========================================"
echo ""
echo "Wheels available at:"
echo " s3://$BUCKET/$ROCM_SUBPATH/"
echo ""
echo "Install command (by commit):"
echo " pip install vllm --extra-index-url https://${BUCKET}.s3.amazonaws.com/$ROCM_SUBPATH/"
echo ""
if [[ "$BUILDKITE_BRANCH" == "main" ]] || [[ "$NIGHTLY" == "1" ]]; then
echo "Install command (nightly):"
echo " pip install vllm --extra-index-url https://${BUCKET}.s3.amazonaws.com/rocm/nightly/"
fi
echo ""
echo "Wheel count: $WHEEL_COUNT"
echo "========================================"
...@@ -128,7 +128,7 @@ steps: ...@@ -128,7 +128,7 @@ steps:
- tests/entrypoints/ - tests/entrypoints/
commands: commands:
- pytest -v -s entrypoints/openai/tool_parsers - pytest -v -s entrypoints/openai/tool_parsers
- pytest -v -s entrypoints/ --ignore=entrypoints/llm --ignore=entrypoints/openai --ignore=entrypoints/offline_mode --ignore=entrypoints/test_chat_utils.py --ignore=entrypoints/pooling - pytest -v -s entrypoints/ --ignore=entrypoints/llm --ignore=entrypoints/openai --ignore=entrypoints/rpc --ignore=entrypoints/sleep --ignore=entrypoints/instrumentator --ignore=entrypoints/offline_mode --ignore=entrypoints/test_chat_utils.py --ignore=entrypoints/pooling
- label: Entrypoints Integration Test (LLM) # 30min - label: Entrypoints Integration Test (LLM) # 30min
timeout_in_minutes: 40 timeout_in_minutes: 40
...@@ -148,7 +148,7 @@ steps: ...@@ -148,7 +148,7 @@ steps:
- pytest -v -s entrypoints/llm/test_generate.py # it needs a clean process - pytest -v -s entrypoints/llm/test_generate.py # it needs a clean process
- pytest -v -s entrypoints/offline_mode # Needs to avoid interference with other tests - pytest -v -s entrypoints/offline_mode # Needs to avoid interference with other tests
- label: Entrypoints Integration Test (API Server) # 100min - label: Entrypoints Integration Test (API Server 1) # 100min
timeout_in_minutes: 130 timeout_in_minutes: 130
mirror_hardwares: [amdexperimental] mirror_hardwares: [amdexperimental]
agent_pool: mi325_1 agent_pool: mi325_1
...@@ -162,10 +162,28 @@ steps: ...@@ -162,10 +162,28 @@ steps:
- tests/entrypoints/test_chat_utils - tests/entrypoints/test_chat_utils
commands: commands:
- export VLLM_WORKER_MULTIPROC_METHOD=spawn - export VLLM_WORKER_MULTIPROC_METHOD=spawn
- PYTHONPATH=/vllm-workspace pytest -v -s entrypoints/openai/test_collective_rpc.py # PYTHONPATH is needed to import custom Worker extension - pytest -v -s entrypoints/openai --ignore=entrypoints/openai/test_chat_with_tool_reasoning.py --ignore=entrypoints/openai/test_oot_registration.py --ignore=entrypoints/openai/test_tensorizer_entrypoint.py --ignore=entrypoints/openai/correctness/ --ignore=entrypoints/openai/tool_parsers/ --ignore=entrypoints/openai/responses
- pytest -v -s entrypoints/openai --ignore=entrypoints/openai/test_chat_with_tool_reasoning.py --ignore=entrypoints/openai/test_oot_registration.py --ignore=entrypoints/openai/test_tensorizer_entrypoint.py --ignore=entrypoints/openai/correctness/ --ignore=entrypoints/openai/test_collective_rpc.py --ignore=entrypoints/openai/tool_parsers/
- pytest -v -s entrypoints/test_chat_utils.py - pytest -v -s entrypoints/test_chat_utils.py
- label: Entrypoints Integration Test (API Server 2)
timeout_in_minutes: 50
mirror_hardwares: [amdexperimental]
agent_pool: mi325_1
# grade: Blocking
working_dir: "/vllm-workspace/tests"
fast_check: true
torch_nightly: true
source_file_dependencies:
- vllm/
- tests/entrypoints/sleep
- tests/entrypoints/rpc
- tests/tool_use
commands:
- export VLLM_WORKER_MULTIPROC_METHOD=spawn
- pytest -v -s entrypoints/sleep
- pytest -v -s tool_use
- PYTHONPATH=/vllm-workspace pytest -v -s entrypoints/rpc
- label: Entrypoints Integration Test (Pooling) - label: Entrypoints Integration Test (Pooling)
timeout_in_minutes: 50 timeout_in_minutes: 50
mirror_hardwares: [amdexperimental] mirror_hardwares: [amdexperimental]
...@@ -181,6 +199,21 @@ steps: ...@@ -181,6 +199,21 @@ steps:
- export VLLM_WORKER_MULTIPROC_METHOD=spawn - export VLLM_WORKER_MULTIPROC_METHOD=spawn
- pytest -v -s entrypoints/pooling - pytest -v -s entrypoints/pooling
- label: Entrypoints Integration Test (Responses API)
timeout_in_minutes: 50
mirror_hardwares: [amdexperimental]
agent_pool: mi325_1
# grade: Blocking
working_dir: "/vllm-workspace/tests"
fast_check: true
torch_nightly: true
source_file_dependencies:
- vllm/
- tests/entrypoints/openai/responses
commands:
- export VLLM_WORKER_MULTIPROC_METHOD=spawn
- pytest -v -s entrypoints/openai/responses
- label: Distributed Tests (4 GPUs) # 35min - label: Distributed Tests (4 GPUs) # 35min
timeout_in_minutes: 50 timeout_in_minutes: 50
mirror_hardwares: [amdexperimental] mirror_hardwares: [amdexperimental]
...@@ -201,6 +234,9 @@ steps: ...@@ -201,6 +234,9 @@ steps:
- tests/v1/engine/test_engine_core_client.py - tests/v1/engine/test_engine_core_client.py
- tests/distributed/test_symm_mem_allreduce.py - tests/distributed/test_symm_mem_allreduce.py
commands: commands:
# Work around HIP bug tracked here: https://github.com/ROCm/hip/issues/3876
# TODO: Remove when the bug is fixed in a future ROCm release
- export TORCH_NCCL_BLOCKING_WAIT=1
# test with torchrun tp=2 and external_dp=2 # test with torchrun tp=2 and external_dp=2
- torchrun --nproc-per-node=4 distributed/test_torchrun_example.py - torchrun --nproc-per-node=4 distributed/test_torchrun_example.py
# test with torchrun tp=2 and pp=2 # test with torchrun tp=2 and pp=2
...@@ -249,9 +285,10 @@ steps: ...@@ -249,9 +285,10 @@ steps:
- vllm/v1/executor/uniproc_executor.py - vllm/v1/executor/uniproc_executor.py
- vllm/v1/worker/gpu_worker.py - vllm/v1/worker/gpu_worker.py
commands: commands:
# https://github.com/NVIDIA/nccl/issues/1838
#- export NCCL_CUMEM_HOST_ENABLE=0
# test with torchrun tp=2 and dp=4 with ep # test with torchrun tp=2 and dp=4 with ep
# Work around HIP bug tracked here: https://github.com/ROCm/hip/issues/3876
# TODO: Remove when the bug is fixed in a future ROCm release
- export TORCH_NCCL_BLOCKING_WAIT=1
- torchrun --nproc-per-node=8 ../examples/offline_inference/torchrun_dp_example.py --tp-size=2 --pp-size=1 --dp-size=4 --enable-ep - torchrun --nproc-per-node=8 ../examples/offline_inference/torchrun_dp_example.py --tp-size=2 --pp-size=1 --dp-size=4 --enable-ep
- label: EPLB Algorithm Test # 5min - label: EPLB Algorithm Test # 5min
...@@ -331,7 +368,9 @@ steps: ...@@ -331,7 +368,9 @@ steps:
- label: V1 Test e2e + engine # 65min - label: V1 Test e2e + engine # 65min
timeout_in_minutes: 90 timeout_in_minutes: 90
mirror_hardwares: [amdexperimental] mirror_hardwares: [amdexperimental]
agent_pool: mi325_4 # The test uses 4 GPUs, but we schedule it on 8-GPU machines for stability.
# See discussion here: https://github.com/vllm-project/vllm/pull/31040
agent_pool: mi325_8
# grade: Blocking # grade: Blocking
source_file_dependencies: source_file_dependencies:
- vllm/ - vllm/
...@@ -492,8 +531,7 @@ steps: ...@@ -492,8 +531,7 @@ steps:
- tests/samplers - tests/samplers
- tests/conftest.py - tests/conftest.py
commands: commands:
- pytest -v -s samplers - pytest -v -s -m 'not skip_v1' samplers
- VLLM_USE_FLASHINFER_SAMPLER=1 pytest -v -s samplers
- label: LoRA Test %N # 20min each - label: LoRA Test %N # 20min each
timeout_in_minutes: 30 timeout_in_minutes: 30
...@@ -707,7 +745,7 @@ steps: ...@@ -707,7 +745,7 @@ steps:
- label: Quantization Test # 70min - label: Quantization Test # 70min
timeout_in_minutes: 90 timeout_in_minutes: 90
mirror_hardwares: [amdexperimental] mirror_hardwares: [amdexperimental, amdproduction]
agent_pool: mi325_1 agent_pool: mi325_1
# grade: Blocking # grade: Blocking
source_file_dependencies: source_file_dependencies:
...@@ -722,7 +760,7 @@ steps: ...@@ -722,7 +760,7 @@ steps:
# https://github.com/pytorch/ao/issues/2919, we'll have to skip new torchao tests for now # https://github.com/pytorch/ao/issues/2919, we'll have to skip new torchao tests for now
# we can only upgrade after this is resolved # we can only upgrade after this is resolved
# TODO(jerryzh168): resolve the above comment # TODO(jerryzh168): resolve the above comment
- uv pip install --system torchao==0.13.0 - uv pip install --system torchao==0.14.1
- uv pip install --system conch-triton-kernels - uv pip install --system conch-triton-kernels
- VLLM_TEST_FORCE_LOAD_FORMAT=auto pytest -v -s quantization/ --ignore quantization/test_blackwell_moe.py - VLLM_TEST_FORCE_LOAD_FORMAT=auto pytest -v -s quantization/ --ignore quantization/test_blackwell_moe.py
...@@ -736,7 +774,7 @@ steps: ...@@ -736,7 +774,7 @@ steps:
- vllm/model_executor/layers/quantization - vllm/model_executor/layers/quantization
autorun_on_main: true autorun_on_main: true
commands: commands:
- pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=configs/models-small.txt --tp-size=1 - pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=configs/models-small.txt
- label: OpenAI API correctness # 10min - label: OpenAI API correctness # 10min
timeout_in_minutes: 15 timeout_in_minutes: 15
...@@ -747,21 +785,11 @@ steps: ...@@ -747,21 +785,11 @@ steps:
- csrc/ - csrc/
- vllm/entrypoints/openai/ - vllm/entrypoints/openai/
- vllm/model_executor/models/whisper.py - vllm/model_executor/models/whisper.py
- tools/
commands: # LMEval+Transcription WER check commands: # LMEval+Transcription WER check
# Transcription WER check is skipped because encoder-decoder models are not supported on ROCm, see https://github.com/vllm-project/vllm/issues/27442 - bash ../tools/install_torchcodec_rocm.sh || exit 1
- pytest -s entrypoints/openai/correctness/ - pytest -s entrypoints/openai/correctness/
- label: OpenAI-Compatible Tool Use # 23 min
timeout_in_minutes: 35
mirror_hardwares: [amdexperimental, amdproduction]
agent_pool: mi325_1
# grade: Blocking
fast_check: false
source_file_dependencies:
- vllm/
- tests/tool_use
commands:
- pytest -v -s tool_use
##### models test ##### ##### models test #####
...@@ -854,6 +882,7 @@ steps: ...@@ -854,6 +882,7 @@ steps:
# Shard slow subset of standard language models tests. Only run when model # Shard slow subset of standard language models tests. Only run when model
# source is modified, or when specified test files are modified # source is modified, or when specified test files are modified
- pip freeze | grep -E 'torch' - pip freeze | grep -E 'torch'
- export TORCH_NCCL_BLOCKING_WAIT=1
- pytest -v -s models/language -m 'core_model and slow_test' \ - pytest -v -s models/language -m 'core_model and slow_test' \
--num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT \ --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT \
--shard-id=$$BUILDKITE_PARALLEL_JOB --shard-id=$$BUILDKITE_PARALLEL_JOB
...@@ -871,7 +900,7 @@ steps: ...@@ -871,7 +900,7 @@ steps:
commands: commands:
# Install fast path packages for testing against transformers # Install fast path packages for testing against transformers
# Note: also needed to run plamo2 model in vLLM # Note: also needed to run plamo2 model in vLLM
- uv pip install --system --no-build-isolation 'git+https://github.com/state-spaces/mamba@v2.2.5' - uv pip install --system --no-build-isolation 'git+https://github.com/AndreasKaratzas/mamba@fix-rocm-7.0-warp-size-constexpr'
- uv pip install --system --no-build-isolation 'git+https://github.com/Dao-AILab/causal-conv1d@v1.5.2' - uv pip install --system --no-build-isolation 'git+https://github.com/Dao-AILab/causal-conv1d@v1.5.2'
# Shard hybrid language model tests # Shard hybrid language model tests
- pytest -v -s models/language/generation \ - pytest -v -s models/language/generation \
...@@ -892,7 +921,7 @@ steps: ...@@ -892,7 +921,7 @@ steps:
commands: commands:
# Install fast path packages for testing against transformers # Install fast path packages for testing against transformers
# Note: also needed to run plamo2 model in vLLM # Note: also needed to run plamo2 model in vLLM
- uv pip install --system --no-build-isolation 'git+https://github.com/state-spaces/mamba@v2.2.5' - uv pip install --system --no-build-isolation 'git+https://github.com/AndreasKaratzas/mamba@fix-rocm-7.0-warp-size-constexpr'
- uv pip install --system --no-build-isolation 'git+https://github.com/Dao-AILab/causal-conv1d@v1.5.2' - uv pip install --system --no-build-isolation 'git+https://github.com/Dao-AILab/causal-conv1d@v1.5.2'
- pytest -v -s models/language/generation -m '(not core_model) and (not hybrid_model)' - pytest -v -s models/language/generation -m '(not core_model) and (not hybrid_model)'
...@@ -957,7 +986,7 @@ steps: ...@@ -957,7 +986,7 @@ steps:
- pytest -v -s models/multimodal/processing - pytest -v -s models/multimodal/processing
- label: Multi-Modal Models Test (Standard) # 60min - label: Multi-Modal Models Test (Standard) # 60min
timeout_in_minutes: 80 timeout_in_minutes: 100
mirror_hardwares: [amdexperimental] mirror_hardwares: [amdexperimental]
agent_pool: mi325_1 agent_pool: mi325_1
# grade: Blocking # grade: Blocking
...@@ -966,13 +995,16 @@ steps: ...@@ -966,13 +995,16 @@ steps:
- vllm/ - vllm/
- tests/models/multimodal - tests/models/multimodal
commands: commands:
- export MIOPEN_DEBUG_CONV_DIRECT=0
- export MIOPEN_DEBUG_CONV_GEMM=0
- pip install git+https://github.com/TIGER-AI-Lab/Mantis.git - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git
- pip freeze | grep -E 'torch' - pip freeze | grep -E 'torch'
- pytest -v -s models/multimodal -m core_model --ignore models/multimodal/generation/test_whisper.py --ignore models/multimodal/processing - pytest -v -s models/multimodal -m core_model --ignore models/multimodal/generation/test_whisper.py --ignore models/multimodal/processing --ignore models/multimodal/pooling/test_prithvi_mae.py
- pytest -v -s models/multimodal/pooling/test_prithvi_mae.py -m core_model
- cd .. && VLLM_WORKER_MULTIPROC_METHOD=spawn pytest -v -s tests/models/multimodal/generation/test_whisper.py -m core_model # Otherwise, mp_method="spawn" doesn't work - cd .. && VLLM_WORKER_MULTIPROC_METHOD=spawn pytest -v -s tests/models/multimodal/generation/test_whisper.py -m core_model # Otherwise, mp_method="spawn" doesn't work
- label: Multi-Modal Accuracy Eval (Small Models) # 150min - 180min - label: Multi-Modal Accuracy Eval (Small Models) # 5min
timeout_in_minutes: 180 timeout_in_minutes: 10
mirror_hardwares: [amdexperimental, amdproduction] mirror_hardwares: [amdexperimental, amdproduction]
agent_pool: mi325_1 agent_pool: mi325_1
# grade: Blocking # grade: Blocking
...@@ -982,7 +1014,9 @@ steps: ...@@ -982,7 +1014,9 @@ steps:
- vllm/inputs/ - vllm/inputs/
- vllm/v1/core/ - vllm/v1/core/
commands: commands:
- pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-mm-small.txt --tp-size=1 - export MIOPEN_DEBUG_CONV_DIRECT=0
- export MIOPEN_DEBUG_CONV_GEMM=0
- pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-mm-small.txt
- label: Multi-Modal Models Test (Extended) 1 # 60min - label: Multi-Modal Models Test (Extended) 1 # 60min
timeout_in_minutes: 120 timeout_in_minutes: 120
...@@ -994,10 +1028,13 @@ steps: ...@@ -994,10 +1028,13 @@ steps:
- vllm/ - vllm/
- tests/models/multimodal - tests/models/multimodal
commands: commands:
- export MIOPEN_DEBUG_CONV_DIRECT=0
- export MIOPEN_DEBUG_CONV_GEMM=0
- pip install git+https://github.com/TIGER-AI-Lab/Mantis.git - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git
- pytest -v -s models/multimodal -m 'not core_model' --ignore models/multimodal/generation/test_common.py --ignore models/multimodal/processing - pytest -v -s models/multimodal -m 'not core_model' --ignore models/multimodal/generation/test_common.py --ignore models/multimodal/processing
- label: Multi-Modal Models Test (Extended) 2 - label: Multi-Modal Models Test (Extended) 2 #60min
timeout_in_minutes: 120
mirror_hardwares: [amdexperimental] mirror_hardwares: [amdexperimental]
agent_pool: mi325_1 agent_pool: mi325_1
# grade: Blocking # grade: Blocking
...@@ -1006,6 +1043,8 @@ steps: ...@@ -1006,6 +1043,8 @@ steps:
- vllm/ - vllm/
- tests/models/multimodal - tests/models/multimodal
commands: commands:
- export MIOPEN_DEBUG_CONV_DIRECT=0
- export MIOPEN_DEBUG_CONV_GEMM=0
- pip install git+https://github.com/TIGER-AI-Lab/Mantis.git - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git
- pytest -v -s models/multimodal/generation/test_common.py -m 'split(group=0) and not core_model' - pytest -v -s models/multimodal/generation/test_common.py -m 'split(group=0) and not core_model'
...@@ -1019,6 +1058,8 @@ steps: ...@@ -1019,6 +1058,8 @@ steps:
- vllm/ - vllm/
- tests/models/multimodal - tests/models/multimodal
commands: commands:
- export MIOPEN_DEBUG_CONV_DIRECT=0
- export MIOPEN_DEBUG_CONV_GEMM=0
- pip install git+https://github.com/TIGER-AI-Lab/Mantis.git - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git
- pytest -v -s models/multimodal/generation/test_common.py -m 'split(group=1) and not core_model' - pytest -v -s models/multimodal/generation/test_common.py -m 'split(group=1) and not core_model'
...@@ -1078,8 +1119,8 @@ steps: ...@@ -1078,8 +1119,8 @@ steps:
- vllm/v1/attention/backends/flashinfer.py - vllm/v1/attention/backends/flashinfer.py
- vllm/v1/attention/backends/mla/cutlass_mla.py - vllm/v1/attention/backends/mla/cutlass_mla.py
- vllm/v1/attention/backends/mla/flashinfer_mla.py - vllm/v1/attention/backends/mla/flashinfer_mla.py
- vllm/v1/attention/selector.py
- vllm/platforms/cuda.py - vllm/platforms/cuda.py
- vllm/attention/selector.py
commands: commands:
- nvidia-smi - nvidia-smi
- python3 examples/offline_inference/basic/chat.py - python3 examples/offline_inference/basic/chat.py
...@@ -1196,7 +1237,7 @@ steps: ...@@ -1196,7 +1237,7 @@ steps:
- csrc/ - csrc/
- vllm/model_executor/layers/quantization - vllm/model_executor/layers/quantization
commands: commands:
- pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=configs/models-blackwell.txt --tp-size=1 - pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=configs/models-blackwell.txt
##### 1 GPU test ##### ##### 1 GPU test #####
##### multi gpus test ##### ##### multi gpus test #####
...@@ -1236,13 +1277,13 @@ steps: ...@@ -1236,13 +1277,13 @@ steps:
- # the following commands are for the first node, with ip 192.168.10.10 (ray environment already set up) - # the following commands are for the first node, with ip 192.168.10.10 (ray environment already set up)
- VLLM_TEST_SAME_HOST=0 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_same_node.py | grep 'Same node test passed' - VLLM_TEST_SAME_HOST=0 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_same_node.py | grep 'Same node test passed'
- NUM_NODES=2 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_node_count.py | grep 'Node count test passed' - NUM_NODES=2 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_node_count.py | grep 'Node count test passed'
- python3 ../examples/offline_inference/data_parallel.py --dp-size=2 --tp-size=1 --node-size=2 --node-rank=0 --master-addr=192.168.10.10 --master-port=12345 --enforce-eager --trust-remote-code - python3 ../examples/offline_inference/data_parallel.py -dp=2 -tp=1 --dp-num-nodes=2 --dp-node-rank=0 --dp-master-addr=192.168.10.10 --dp-master-port=12345 --enforce-eager --trust-remote-code
- VLLM_MULTI_NODE=1 pytest -v -s distributed/test_multi_node_assignment.py - VLLM_MULTI_NODE=1 pytest -v -s distributed/test_multi_node_assignment.py
- VLLM_MULTI_NODE=1 pytest -v -s distributed/test_pipeline_parallel.py - VLLM_MULTI_NODE=1 pytest -v -s distributed/test_pipeline_parallel.py
- # the following commands are for the second node, with ip 192.168.10.11 (ray environment already set up) - # the following commands are for the second node, with ip 192.168.10.11 (ray environment already set up)
- VLLM_TEST_SAME_HOST=0 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_same_node.py | grep 'Same node test passed' - VLLM_TEST_SAME_HOST=0 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_same_node.py | grep 'Same node test passed'
- NUM_NODES=2 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_node_count.py | grep 'Node count test passed' - NUM_NODES=2 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_node_count.py | grep 'Node count test passed'
- python3 ../examples/offline_inference/data_parallel.py --dp-size=2 --tp-size=1 --node-size=2 --node-rank=1 --master-addr=192.168.10.10 --master-port=12345 --enforce-eager --trust-remote-code - python3 ../examples/offline_inference/data_parallel.py -dp=2 -tp=1 --dp-num-nodes=2 --dp-node-rank=1 --dp-master-addr=192.168.10.10 --dp-master-port=12345 --enforce-eager --trust-remote-code
- label: Distributed Tests (2 GPUs) # 68min - label: Distributed Tests (2 GPUs) # 68min
timeout_in_minutes: 90 timeout_in_minutes: 90
...@@ -1268,6 +1309,9 @@ steps: ...@@ -1268,6 +1309,9 @@ steps:
- tests/v1/shutdown - tests/v1/shutdown
- tests/v1/worker/test_worker_memory_snapshot.py - tests/v1/worker/test_worker_memory_snapshot.py
commands: commands:
# Work around HIP bug tracked here: https://github.com/ROCm/hip/issues/3876
# TODO: Remove when the bug is fixed in a future ROCm release
- export TORCH_NCCL_BLOCKING_WAIT=1
- TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/distributed/test_async_llm_dp.py - TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/distributed/test_async_llm_dp.py
- TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/distributed/test_eagle_dp.py - TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/distributed/test_eagle_dp.py
- TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/distributed/test_external_lb_dp.py - TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/distributed/test_external_lb_dp.py
...@@ -1417,8 +1461,22 @@ steps: ...@@ -1417,8 +1461,22 @@ steps:
- vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py - vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py
- tests/v1/kv_connector/nixl_integration/ - tests/v1/kv_connector/nixl_integration/
commands: commands:
- uv pip install --system -r /vllm-workspace/requirements/kv_connectors.txt - uv pip install --system -r /vllm-workspace/requirements/kv_connectors_rocm.txt
- bash v1/kv_connector/nixl_integration/tp_config_sweep_accuracy_test.sh - VLLM_ATTENTION_BACKEND=ROCM_ATTN bash v1/kv_connector/nixl_integration/config_sweep_accuracy_test.sh
- label: DP EP NixlConnector PD accuracy tests (Distributed) # 15min
mirror_hardwares: [amdexperimental]
agent_pool: mi325_4
# grade: Blocking
timeout_in_minutes: 15
working_dir: "/vllm-workspace/tests"
num_gpus: 4
source_file_dependencies:
- vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py
- tests/v1/kv_connector/nixl_integration/
commands:
- uv pip install --system -r /vllm-workspace/requirements/kv_connectors_rocm.txt
- VLLM_ATTENTION_BACKEND=ROCM_ATTN DP_EP=1 bash v1/kv_connector/nixl_integration/config_sweep_accuracy_test.sh
##### multi gpus test ##### ##### multi gpus test #####
##### A100 test ##### ##### A100 test #####
...@@ -1490,7 +1548,7 @@ steps: ...@@ -1490,7 +1548,7 @@ steps:
- "VLLM_TEST_CLEAN_GPU_MEMORY=1 pytest -v -s tests/compile/distributed/test_fusions_e2e.py -k 'not Llama-4'" - "VLLM_TEST_CLEAN_GPU_MEMORY=1 pytest -v -s tests/compile/distributed/test_fusions_e2e.py -k 'not Llama-4'"
- VLLM_TEST_CLEAN_GPU_MEMORY=1 pytest -v -s tests/distributed/test_sequence_parallel.py - VLLM_TEST_CLEAN_GPU_MEMORY=1 pytest -v -s tests/distributed/test_sequence_parallel.py
- pytest -v -s tests/distributed/test_context_parallel.py - pytest -v -s tests/distributed/test_context_parallel.py
- HIP_VISIBLE_DEVICES=0,1 VLLM_ALL2ALL_BACKEND=deepep_high_throughput VLLM_USE_DEEP_GEMM=1 VLLM_LOGGING_LEVEL=DEBUG python3 examples/offline_inference/data_parallel.py --model Qwen/Qwen1.5-MoE-A2.7B --tp-size=1 --dp-size=2 --max-model-len 2048 - HIP_VISIBLE_DEVICES=0,1 VLLM_LOGGING_LEVEL=DEBUG python3 examples/offline_inference/data_parallel.py --model=Qwen/Qwen1.5-MoE-A2.7B -tp=1 -dp=2 --max-model-len=2048 --all2all-backend=allgather_reducescatter --disable-nccl-for-dp-synchronization
- pytest -v -s tests/v1/distributed/test_dbo.py - pytest -v -s tests/v1/distributed/test_dbo.py
##### B200 test ##### ##### B200 test #####
...@@ -1514,7 +1572,7 @@ steps: ...@@ -1514,7 +1572,7 @@ steps:
- csrc/ - csrc/
- vllm/model_executor/layers/quantization - vllm/model_executor/layers/quantization
commands: commands:
- pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=configs/models-small.txt --tp-size=1 - pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=configs/models-small.txt
- label: LM Eval Large Models (4 Card) - label: LM Eval Large Models (4 Card)
mirror_hardwares: [amdexperimental, amdproduction] mirror_hardwares: [amdexperimental, amdproduction]
...@@ -1569,6 +1627,8 @@ steps: ...@@ -1569,6 +1627,8 @@ steps:
- .buildkite/scripts/run-prime-rl-test.sh - .buildkite/scripts/run-prime-rl-test.sh
commands: commands:
- bash .buildkite/scripts/run-prime-rl-test.sh - bash .buildkite/scripts/run-prime-rl-test.sh
##### EPLB Accuracy Tests #####
- label: DeepSeek V2-Lite Accuracy - label: DeepSeek V2-Lite Accuracy
mirror_hardwares: [amdexperimental, amdproduction] mirror_hardwares: [amdexperimental, amdproduction]
agent_pool: mi325_4 agent_pool: mi325_4
......
...@@ -114,7 +114,7 @@ steps: ...@@ -114,7 +114,7 @@ steps:
- tests/entrypoints/ - tests/entrypoints/
commands: commands:
- pytest -v -s entrypoints/openai/tool_parsers - pytest -v -s entrypoints/openai/tool_parsers
- pytest -v -s entrypoints/ --ignore=entrypoints/llm --ignore=entrypoints/openai --ignore=entrypoints/offline_mode --ignore=entrypoints/test_chat_utils.py --ignore=entrypoints/pooling - pytest -v -s entrypoints/ --ignore=entrypoints/llm --ignore=entrypoints/rpc --ignore=entrypoints/sleep --ignore=entrypoints/instrumentator --ignore=entrypoints/openai --ignore=entrypoints/offline_mode --ignore=entrypoints/test_chat_utils.py --ignore=entrypoints/pooling
- label: Entrypoints Integration Test (LLM) # 30min - label: Entrypoints Integration Test (LLM) # 30min
timeout_in_minutes: 40 timeout_in_minutes: 40
...@@ -132,7 +132,7 @@ steps: ...@@ -132,7 +132,7 @@ steps:
- pytest -v -s entrypoints/llm/test_generate.py # it needs a clean process - pytest -v -s entrypoints/llm/test_generate.py # it needs a clean process
- pytest -v -s entrypoints/offline_mode # Needs to avoid interference with other tests - pytest -v -s entrypoints/offline_mode # Needs to avoid interference with other tests
- label: Entrypoints Integration Test (API Server) # 100min - label: Entrypoints Integration Test (API Server 1) # 100min
timeout_in_minutes: 130 timeout_in_minutes: 130
mirror_hardwares: [amdexperimental] mirror_hardwares: [amdexperimental]
working_dir: "/vllm-workspace/tests" working_dir: "/vllm-workspace/tests"
...@@ -144,10 +144,26 @@ steps: ...@@ -144,10 +144,26 @@ steps:
- tests/entrypoints/test_chat_utils - tests/entrypoints/test_chat_utils
commands: commands:
- export VLLM_WORKER_MULTIPROC_METHOD=spawn - export VLLM_WORKER_MULTIPROC_METHOD=spawn
- PYTHONPATH=/vllm-workspace pytest -v -s entrypoints/openai/test_collective_rpc.py # PYTHONPATH is needed to import custom Worker extension - pytest -v -s entrypoints/openai --ignore=entrypoints/openai/test_chat_with_tool_reasoning.py --ignore=entrypoints/openai/test_oot_registration.py --ignore=entrypoints/openai/test_tensorizer_entrypoint.py --ignore=entrypoints/openai/correctness/ --ignore=entrypoints/openai/tool_parsers/ --ignore=entrypoints/openai/responses
- pytest -v -s entrypoints/openai --ignore=entrypoints/openai/test_chat_with_tool_reasoning.py --ignore=entrypoints/openai/test_oot_registration.py --ignore=entrypoints/openai/test_tensorizer_entrypoint.py --ignore=entrypoints/openai/correctness/ --ignore=entrypoints/openai/test_collective_rpc.py --ignore=entrypoints/openai/tool_parsers/
- pytest -v -s entrypoints/test_chat_utils.py - pytest -v -s entrypoints/test_chat_utils.py
- label: Entrypoints Integration Test (API Server 2)
timeout_in_minutes: 50
mirror_hardwares: [amdexperimental]
working_dir: "/vllm-workspace/tests"
fast_check: true
torch_nightly: true
source_file_dependencies:
- vllm/
- tests/entrypoints/sleep
- tests/entrypoints/rpc
- tests/tool_use
commands:
- export VLLM_WORKER_MULTIPROC_METHOD=spawn
- pytest -v -s entrypoints/sleep
- PYTHONPATH=/vllm-workspace pytest -v -s entrypoints/rpc
- pytest -v -s tool_use
- label: Entrypoints Integration Test (Pooling) - label: Entrypoints Integration Test (Pooling)
timeout_in_minutes: 50 timeout_in_minutes: 50
mirror_hardwares: [amdexperimental] mirror_hardwares: [amdexperimental]
...@@ -161,6 +177,18 @@ steps: ...@@ -161,6 +177,18 @@ steps:
- export VLLM_WORKER_MULTIPROC_METHOD=spawn - export VLLM_WORKER_MULTIPROC_METHOD=spawn
- pytest -v -s entrypoints/pooling - pytest -v -s entrypoints/pooling
- label: Entrypoints Integration Test (Responses API)
timeout_in_minutes: 50
mirror_hardwares: [amdexperimental]
working_dir: "/vllm-workspace/tests"
fast_check: true
torch_nightly: true
source_file_dependencies:
- vllm/
- tests/entrypoints/openai/responses
commands:
- pytest -v -s entrypoints/openai/responses
- label: Distributed Tests (4 GPUs) # 35min - label: Distributed Tests (4 GPUs) # 35min
timeout_in_minutes: 50 timeout_in_minutes: 50
mirror_hardwares: [amdexperimental] mirror_hardwares: [amdexperimental]
...@@ -303,7 +331,10 @@ steps: ...@@ -303,7 +331,10 @@ steps:
# TODO: accuracy does not match, whether setting # TODO: accuracy does not match, whether setting
# VLLM_USE_FLASHINFER_SAMPLER or not on H100. # VLLM_USE_FLASHINFER_SAMPLER or not on H100.
- pytest -v -s v1/e2e - pytest -v -s v1/e2e
- pytest -v -s v1/engine # Run this test standalone for now;
# need to untangle use (implicit) use of spawn/fork across the tests.
- pytest -v -s v1/engine/test_preprocess_error_handling.py
- pytest -v -s v1/engine --ignore v1/engine/test_preprocess_error_handling.py
- label: V1 Test entrypoints # 35min - label: V1 Test entrypoints # 35min
timeout_in_minutes: 50 timeout_in_minutes: 50
...@@ -642,7 +673,7 @@ steps: ...@@ -642,7 +673,7 @@ steps:
# https://github.com/pytorch/ao/issues/2919, we'll have to skip new torchao tests for now # https://github.com/pytorch/ao/issues/2919, we'll have to skip new torchao tests for now
# we can only upgrade after this is resolved # we can only upgrade after this is resolved
# TODO(jerryzh168): resolve the above comment # TODO(jerryzh168): resolve the above comment
- uv pip install --system torchao==0.13.0 --index-url https://download.pytorch.org/whl/cu129 - uv pip install --system torchao==0.14.1 --index-url https://download.pytorch.org/whl/cu129
- uv pip install --system conch-triton-kernels - uv pip install --system conch-triton-kernels
- VLLM_TEST_FORCE_LOAD_FORMAT=auto pytest -v -s quantization/ --ignore quantization/test_blackwell_moe.py - VLLM_TEST_FORCE_LOAD_FORMAT=auto pytest -v -s quantization/ --ignore quantization/test_blackwell_moe.py
...@@ -654,7 +685,7 @@ steps: ...@@ -654,7 +685,7 @@ steps:
- vllm/model_executor/layers/quantization - vllm/model_executor/layers/quantization
autorun_on_main: true autorun_on_main: true
commands: commands:
- pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=configs/models-small.txt --tp-size=1 - pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=configs/models-small.txt
- label: OpenAI API correctness # 22min - label: OpenAI API correctness # 22min
timeout_in_minutes: 30 timeout_in_minutes: 30
...@@ -666,16 +697,6 @@ steps: ...@@ -666,16 +697,6 @@ steps:
commands: # LMEval+Transcription WER check commands: # LMEval+Transcription WER check
- pytest -s entrypoints/openai/correctness/ - pytest -s entrypoints/openai/correctness/
- label: OpenAI-Compatible Tool Use # 23 min
timeout_in_minutes: 35
mirror_hardwares: [amdexperimental]
fast_check: false
source_file_dependencies:
- vllm/
- tests/tool_use
commands:
- pytest -v -s tool_use
##### models test ##### ##### models test #####
- label: Basic Models Tests (Initialization) - label: Basic Models Tests (Initialization)
...@@ -934,7 +955,6 @@ steps: ...@@ -934,7 +955,6 @@ steps:
timeout_in_minutes: 30 timeout_in_minutes: 30
working_dir: "/vllm-workspace/" working_dir: "/vllm-workspace/"
gpu: b200 gpu: b200
# optional: true
source_file_dependencies: source_file_dependencies:
- csrc/quantization/fp4/ - csrc/quantization/fp4/
- csrc/attention/mla/ - csrc/attention/mla/
...@@ -946,8 +966,8 @@ steps: ...@@ -946,8 +966,8 @@ steps:
- vllm/v1/attention/backends/flashinfer.py - vllm/v1/attention/backends/flashinfer.py
- vllm/v1/attention/backends/mla/cutlass_mla.py - vllm/v1/attention/backends/mla/cutlass_mla.py
- vllm/v1/attention/backends/mla/flashinfer_mla.py - vllm/v1/attention/backends/mla/flashinfer_mla.py
- vllm/v1/attention/selector.py
- vllm/platforms/cuda.py - vllm/platforms/cuda.py
- vllm/attention/selector.py
commands: commands:
- nvidia-smi - nvidia-smi
- python3 examples/offline_inference/basic/chat.py - python3 examples/offline_inference/basic/chat.py
...@@ -1064,7 +1084,7 @@ steps: ...@@ -1064,7 +1084,7 @@ steps:
- csrc/ - csrc/
- vllm/model_executor/layers/quantization - vllm/model_executor/layers/quantization
commands: commands:
- pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=configs/models-blackwell.txt --tp-size=1 - pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=configs/models-blackwell.txt
##### 1 GPU test ##### ##### 1 GPU test #####
##### multi gpus test ##### ##### multi gpus test #####
...@@ -1096,17 +1116,18 @@ steps: ...@@ -1096,17 +1116,18 @@ steps:
- vllm/model_executor/models/ - vllm/model_executor/models/
- tests/distributed/ - tests/distributed/
- tests/examples/offline_inference/data_parallel.py - tests/examples/offline_inference/data_parallel.py
- .buildkite/scripts/run-multi-node-test.sh
commands: commands:
- # the following commands are for the first node, with ip 192.168.10.10 (ray environment already set up) - # the following commands are for the first node, with ip 192.168.10.10 (ray environment already set up)
- VLLM_TEST_SAME_HOST=0 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_same_node.py | grep 'Same node test passed' - VLLM_TEST_SAME_HOST=0 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_same_node.py | grep 'Same node test passed'
- NUM_NODES=2 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_node_count.py | grep 'Node count test passed' - NUM_NODES=2 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_node_count.py | grep 'Node count test passed'
- python3 ../examples/offline_inference/data_parallel.py --dp-size=2 --tp-size=1 --node-size=2 --node-rank=0 --master-addr=192.168.10.10 --master-port=12345 --enforce-eager --trust-remote-code - python3 ../examples/offline_inference/data_parallel.py -dp=2 -tp=1 --dp-num-nodes=2 --dp-node-rank=0 --dp-master-addr=192.168.10.10 --dp-master-port=12345 --enforce-eager --trust-remote-code
- VLLM_MULTI_NODE=1 pytest -v -s distributed/test_multi_node_assignment.py - VLLM_MULTI_NODE=1 pytest -v -s distributed/test_multi_node_assignment.py
- VLLM_MULTI_NODE=1 pytest -v -s distributed/test_pipeline_parallel.py - VLLM_MULTI_NODE=1 pytest -v -s distributed/test_pipeline_parallel.py
- # the following commands are for the second node, with ip 192.168.10.11 (ray environment already set up) - # the following commands are for the second node, with ip 192.168.10.11 (ray environment already set up)
- VLLM_TEST_SAME_HOST=0 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_same_node.py | grep 'Same node test passed' - VLLM_TEST_SAME_HOST=0 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_same_node.py | grep 'Same node test passed'
- NUM_NODES=2 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_node_count.py | grep 'Node count test passed' - NUM_NODES=2 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_node_count.py | grep 'Node count test passed'
- python3 ../examples/offline_inference/data_parallel.py --dp-size=2 --tp-size=1 --node-size=2 --node-rank=1 --master-addr=192.168.10.10 --master-port=12345 --enforce-eager --trust-remote-code - python3 ../examples/offline_inference/data_parallel.py -dp=2 -tp=1 --dp-num-nodes=2 --dp-node-rank=1 --dp-master-addr=192.168.10.10 --dp-master-port=12345 --enforce-eager --trust-remote-code
- label: Distributed Tests (2 GPUs) # 68min - label: Distributed Tests (2 GPUs) # 68min
timeout_in_minutes: 90 timeout_in_minutes: 90
...@@ -1258,8 +1279,19 @@ steps: ...@@ -1258,8 +1279,19 @@ steps:
commands: commands:
- bash weight_loading/run_model_weight_loading_test.sh -c weight_loading/models-large.txt - bash weight_loading/run_model_weight_loading_test.sh -c weight_loading/models-large.txt
- label: NixlConnector PD accuracy tests (Distributed) # 30min - label: NixlConnector PD accuracy tests (Distributed) # 40min
timeout_in_minutes: 30 timeout_in_minutes: 40
working_dir: "/vllm-workspace/tests"
num_gpus: 4
source_file_dependencies:
- vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py
- tests/v1/kv_connector/nixl_integration/
commands:
- uv pip install --system -r /vllm-workspace/requirements/kv_connectors.txt
- bash v1/kv_connector/nixl_integration/config_sweep_accuracy_test.sh
- label: DP EP NixlConnector PD accuracy tests (Distributed) # 15min
timeout_in_minutes: 15
working_dir: "/vllm-workspace/tests" working_dir: "/vllm-workspace/tests"
num_gpus: 4 num_gpus: 4
source_file_dependencies: source_file_dependencies:
...@@ -1267,7 +1299,7 @@ steps: ...@@ -1267,7 +1299,7 @@ steps:
- tests/v1/kv_connector/nixl_integration/ - tests/v1/kv_connector/nixl_integration/
commands: commands:
- uv pip install --system -r /vllm-workspace/requirements/kv_connectors.txt - uv pip install --system -r /vllm-workspace/requirements/kv_connectors.txt
- bash v1/kv_connector/nixl_integration/tp_config_sweep_accuracy_test.sh - DP_EP=1 bash v1/kv_connector/nixl_integration/config_sweep_accuracy_test.sh
##### multi gpus test ##### ##### multi gpus test #####
...@@ -1325,9 +1357,17 @@ steps: ...@@ -1325,9 +1357,17 @@ steps:
- "VLLM_TEST_CLEAN_GPU_MEMORY=1 pytest -v -s tests/compile/distributed/test_fusions_e2e.py -k 'not Llama-4'" - "VLLM_TEST_CLEAN_GPU_MEMORY=1 pytest -v -s tests/compile/distributed/test_fusions_e2e.py -k 'not Llama-4'"
- VLLM_TEST_CLEAN_GPU_MEMORY=1 pytest -v -s tests/distributed/test_sequence_parallel.py - VLLM_TEST_CLEAN_GPU_MEMORY=1 pytest -v -s tests/distributed/test_sequence_parallel.py
- pytest -v -s tests/distributed/test_context_parallel.py - pytest -v -s tests/distributed/test_context_parallel.py
- CUDA_VISIBLE_DEVICES=1,2 VLLM_ALL2ALL_BACKEND=deepep_high_throughput VLLM_USE_DEEP_GEMM=1 VLLM_LOGGING_LEVEL=DEBUG python3 examples/offline_inference/data_parallel.py --model Qwen/Qwen1.5-MoE-A2.7B --tp-size=1 --dp-size=2 --max-model-len 2048 - CUDA_VISIBLE_DEVICES=1,2 VLLM_USE_DEEP_GEMM=1 VLLM_LOGGING_LEVEL=DEBUG python3 examples/offline_inference/data_parallel.py --model=Qwen/Qwen1.5-MoE-A2.7B -tp=1 -dp=2 --max-model-len=2048 --all2all-backend=deepep_high_throughput
- pytest -v -s tests/v1/distributed/test_dbo.py - pytest -v -s tests/v1/distributed/test_dbo.py
- label: LM Eval Large Models (H200) # optional
timeout_in_minutes: 60
gpu: h200
optional: true
num_gpus: 8
commands:
- pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=configs/models-h200.txt
##### B200 test ##### ##### B200 test #####
- label: Distributed Tests (B200) # optional - label: Distributed Tests (B200) # optional
gpu: b200 gpu: b200
...@@ -1350,6 +1390,7 @@ steps: ...@@ -1350,6 +1390,7 @@ steps:
- vllm/ - vllm/
- .buildkite/scripts/run-prime-rl-test.sh - .buildkite/scripts/run-prime-rl-test.sh
commands: commands:
- nvidia-smi
- bash .buildkite/scripts/run-prime-rl-test.sh - bash .buildkite/scripts/run-prime-rl-test.sh
- label: DeepSeek V2-Lite Accuracy - label: DeepSeek V2-Lite Accuracy
...@@ -1378,3 +1419,26 @@ steps: ...@@ -1378,3 +1419,26 @@ steps:
working_dir: "/vllm-workspace" working_dir: "/vllm-workspace"
commands: commands:
- bash .buildkite/scripts/scheduled_integration_test/qwen30b_a3b_fp8_block_ep_eplb.sh 0.8 200 8020 2 1 - bash .buildkite/scripts/scheduled_integration_test/qwen30b_a3b_fp8_block_ep_eplb.sh 0.8 200 8020 2 1
##### MoE Refactor (Temporary) Tests #####
- label: MoE Refactor Integration Test (H100 - TEMPORARY) # optional
gpu: h100
optional: true
num_gpus: 2
commands:
- pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=evals/gsm8k/configs/moe-refactor/config-h100.txt
- label: MoE Refactor Integration Test (B200 - TEMPORARY) # optional
gpu: b200
optional: true
num_gpus: 2
commands:
- pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=evals/gsm8k/configs/moe-refactor/config-b200.txt
- label: MoE Refactor Integration Test (B200 DP - TEMPORARY) # optional
gpu: b200
optional: true
num_gpus: 2
commands:
- pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=evals/gsm8k/configs/moe-refactor-dp-ep/config-b200.txt
...@@ -145,7 +145,7 @@ steps: ...@@ -145,7 +145,7 @@ steps:
- VLLM_TEST_CLEAN_GPU_MEMORY=1 pytest -v -s tests/compile/distributed/test_fusions_e2e.py -k 'not Llama-4' - VLLM_TEST_CLEAN_GPU_MEMORY=1 pytest -v -s tests/compile/distributed/test_fusions_e2e.py -k 'not Llama-4'
- VLLM_TEST_CLEAN_GPU_MEMORY=1 pytest -v -s tests/distributed/test_sequence_parallel.py - VLLM_TEST_CLEAN_GPU_MEMORY=1 pytest -v -s tests/distributed/test_sequence_parallel.py
- pytest -v -s tests/distributed/test_context_parallel.py - pytest -v -s tests/distributed/test_context_parallel.py
- CUDA_VISIBLE_DEVICES=1,2 VLLM_ALL2ALL_BACKEND=deepep_high_throughput VLLM_USE_DEEP_GEMM=1 VLLM_LOGGING_LEVEL=DEBUG python3 examples/offline_inference/data_parallel.py --model Qwen/Qwen1.5-MoE-A2.7B --tp-size=1 --dp-size=2 --max-model-len 2048 - CUDA_VISIBLE_DEVICES=1,2 VLLM_USE_DEEP_GEMM=1 VLLM_LOGGING_LEVEL=DEBUG python3 examples/offline_inference/data_parallel.py --model=Qwen/Qwen1.5-MoE-A2.7B -tp=1 -dp=2 --max-model-len=2048 --all2all-backend=deepep_high_throughput
- pytest -v -s tests/v1/distributed/test_dbo.py - pytest -v -s tests/v1/distributed/test_dbo.py
- label: Distributed Tests (2 GPUs)(B200) - label: Distributed Tests (2 GPUs)(B200)
...@@ -171,7 +171,7 @@ steps: ...@@ -171,7 +171,7 @@ steps:
- tests/distributed/ - tests/distributed/
- tests/examples/offline_inference/data_parallel.py - tests/examples/offline_inference/data_parallel.py
commands: commands:
- ./.buildkite/scripts/run-multi-node-test.sh /vllm-workspace/tests 2 2 public.ecr.aws/q9t5s3a7/vllm-ci-postmerge-repo:0bec63fa317e1fbd62e19b0fc31c43c81bf89077 "VLLM_TEST_SAME_HOST=0 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_same_node.py | grep 'Same node test passed' && NUM_NODES=2 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_node_count.py | grep 'Node count test passed' && python3 ../examples/offline_inference/data_parallel.py --dp-size=2 --tp-size=1 --node-size=2 --node-rank=0 --master-addr=192.168.10.10 --master-port=12345 --enforce-eager --trust-remote-code && VLLM_MULTI_NODE=1 pytest -v -s distributed/test_multi_node_assignment.py && VLLM_MULTI_NODE=1 pytest -v -s distributed/test_pipeline_parallel.py" "VLLM_TEST_SAME_HOST=0 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_same_node.py | grep 'Same node test passed' && NUM_NODES=2 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_node_count.py | grep 'Node count test passed' && python3 ../examples/offline_inference/data_parallel.py --dp-size=2 --tp-size=1 --node-size=2 --node-rank=1 --master-addr=192.168.10.10 --master-port=12345 --enforce-eager --trust-remote-code" - ./.buildkite/scripts/run-multi-node-test.sh /vllm-workspace/tests 2 2 public.ecr.aws/q9t5s3a7/vllm-ci-postmerge-repo:0bec63fa317e1fbd62e19b0fc31c43c81bf89077 "VLLM_TEST_SAME_HOST=0 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_same_node.py | grep 'Same node test passed' && NUM_NODES=2 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_node_count.py | grep 'Node count test passed' && python3 ../examples/offline_inference/data_parallel.py -dp=2 -tp=1 --dp-num-nodes=2 --dp-node-rank=0 --dp-master-addr=192.168.10.10 --dp-master-port=12345 --enforce-eager --trust-remote-code && VLLM_MULTI_NODE=1 pytest -v -s distributed/test_multi_node_assignment.py && VLLM_MULTI_NODE=1 pytest -v -s distributed/test_pipeline_parallel.py" "VLLM_TEST_SAME_HOST=0 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_same_node.py | grep 'Same node test passed' && NUM_NODES=2 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_node_count.py | grep 'Node count test passed' && python3 ../examples/offline_inference/data_parallel.py -dp=2 -tp=1 --dp-num-nodes=2 --dp-node-rank=1 --dp-master-addr=192.168.10.10 --dp-master-port=12345 --enforce-eager --trust-remote-code"
- label: Distributed NixlConnector PD accuracy (4 GPUs) - label: Distributed NixlConnector PD accuracy (4 GPUs)
timeout_in_minutes: 30 timeout_in_minutes: 30
...@@ -182,7 +182,7 @@ steps: ...@@ -182,7 +182,7 @@ steps:
- tests/v1/kv_connector/nixl_integration/ - tests/v1/kv_connector/nixl_integration/
commands: commands:
- uv pip install --system -r /vllm-workspace/requirements/kv_connectors.txt - uv pip install --system -r /vllm-workspace/requirements/kv_connectors.txt
- bash v1/kv_connector/nixl_integration/tp_config_sweep_accuracy_test.sh - bash v1/kv_connector/nixl_integration/config_sweep_accuracy_test.sh
- label: Pipeline + Context Parallelism (4 GPUs)) - label: Pipeline + Context Parallelism (4 GPUs))
timeout_in_minutes: 60 timeout_in_minutes: 60
......
...@@ -32,6 +32,7 @@ steps: ...@@ -32,6 +32,7 @@ steps:
- label: Prime-RL Integration (2 GPUs) - label: Prime-RL Integration (2 GPUs)
timeout_in_minutes: 30 timeout_in_minutes: 30
optional: true optional: true
soft_fail: true
num_gpus: 2 num_gpus: 2
working_dir: "/vllm-workspace" working_dir: "/vllm-workspace"
source_file_dependencies: source_file_dependencies:
...@@ -39,21 +40,3 @@ steps: ...@@ -39,21 +40,3 @@ steps:
- .buildkite/scripts/run-prime-rl-test.sh - .buildkite/scripts/run-prime-rl-test.sh
commands: commands:
- bash .buildkite/scripts/run-prime-rl-test.sh - bash .buildkite/scripts/run-prime-rl-test.sh
- label: DeepSeek V2-Lite Async EPLB Accuracy
timeout_in_minutes: 60
gpu: h100
optional: true
num_gpus: 4
working_dir: "/vllm-workspace"
commands:
- bash .buildkite/scripts/scheduled_integration_test/deepseek_v2_lite_ep_async_eplb.sh 0.25 1319 8030
- label: Qwen3-Next-80B-A3B-Instruct MTP Async EPLB Accuracy
timeout_in_minutes: 60
gpu: h100
optional: true
num_gpus: 4
working_dir: "/vllm-workspace"
commands:
- bash .buildkite/scripts/scheduled_integration_test/qwen3_next_mtp_async_eplb.sh 0.8 1319 8040
...@@ -10,7 +10,7 @@ steps: ...@@ -10,7 +10,7 @@ steps:
- tests/entrypoints/ - tests/entrypoints/
commands: commands:
- pytest -v -s entrypoints/openai/tool_parsers - pytest -v -s entrypoints/openai/tool_parsers
- pytest -v -s entrypoints/ --ignore=entrypoints/llm --ignore=entrypoints/openai --ignore=entrypoints/offline_mode --ignore=entrypoints/test_chat_utils.py --ignore=entrypoints/pooling - pytest -v -s entrypoints/ --ignore=entrypoints/llm --ignore=entrypoints/rpc --ignore=entrypoints/sleep --ignore=entrypoints/instrumentator --ignore=entrypoints/openai --ignore=entrypoints/offline_mode --ignore=entrypoints/test_chat_utils.py --ignore=entrypoints/pooling
- label: Entrypoints Integration (LLM) - label: Entrypoints Integration (LLM)
timeout_in_minutes: 40 timeout_in_minutes: 40
...@@ -25,7 +25,7 @@ steps: ...@@ -25,7 +25,7 @@ steps:
- pytest -v -s entrypoints/llm/test_generate.py # it needs a clean process - pytest -v -s entrypoints/llm/test_generate.py # it needs a clean process
- pytest -v -s entrypoints/offline_mode # Needs to avoid interference with other tests - pytest -v -s entrypoints/offline_mode # Needs to avoid interference with other tests
- label: Entrypoints Integration (API Server) - label: Entrypoints Integration (API Server 1)
timeout_in_minutes: 130 timeout_in_minutes: 130
working_dir: "/vllm-workspace/tests" working_dir: "/vllm-workspace/tests"
source_file_dependencies: source_file_dependencies:
...@@ -34,10 +34,24 @@ steps: ...@@ -34,10 +34,24 @@ steps:
- tests/entrypoints/test_chat_utils - tests/entrypoints/test_chat_utils
commands: commands:
- export VLLM_WORKER_MULTIPROC_METHOD=spawn - export VLLM_WORKER_MULTIPROC_METHOD=spawn
- PYTHONPATH=/vllm-workspace pytest -v -s entrypoints/openai/test_collective_rpc.py # PYTHONPATH is needed to import custom Worker extension - pytest -v -s entrypoints/openai --ignore=entrypoints/openai/test_chat_with_tool_reasoning.py --ignore=entrypoints/openai/test_oot_registration.py --ignore=entrypoints/openai/test_tensorizer_entrypoint.py --ignore=entrypoints/openai/correctness/ --ignore=entrypoints/openai/tool_parsers/ --ignore=entrypoints/openai/responses
- pytest -v -s entrypoints/openai --ignore=entrypoints/openai/test_chat_with_tool_reasoning.py --ignore=entrypoints/openai/test_oot_registration.py --ignore=entrypoints/openai/test_tensorizer_entrypoint.py --ignore=entrypoints/openai/correctness/ --ignore=entrypoints/openai/test_collective_rpc.py --ignore=entrypoints/openai/tool_parsers/
- pytest -v -s entrypoints/test_chat_utils.py - pytest -v -s entrypoints/test_chat_utils.py
- label: Entrypoints Integration (API Server 2)
timeout_in_minutes: 130
working_dir: "/vllm-workspace/tests"
source_file_dependencies:
- vllm/
- tests/tool_use
- tests/entrypoints/sleep
- tests/entrypoints/instrumentator
- tests/entrypoints/rpc
commands:
- export VLLM_WORKER_MULTIPROC_METHOD=spawn
- PYTHONPATH=/vllm-workspace pytest -v -s entrypoints/rpc
- pytest -v -s entrypoints/instrumentator
- pytest -v -s entrypoints/sleep
- pytest -v -s tool_use
- label: Entrypoints Integration (Pooling) - label: Entrypoints Integration (Pooling)
timeout_in_minutes: 50 timeout_in_minutes: 50
...@@ -49,6 +63,14 @@ steps: ...@@ -49,6 +63,14 @@ steps:
- export VLLM_WORKER_MULTIPROC_METHOD=spawn - export VLLM_WORKER_MULTIPROC_METHOD=spawn
- pytest -v -s entrypoints/pooling - pytest -v -s entrypoints/pooling
- label: Entrypoints Integration (Responses API)
timeout_in_minutes: 50
working_dir: "/vllm-workspace/tests"
source_file_dependencies:
- vllm/
- tests/entrypoints/openai/responses
commands:
- pytest -v -s entrypoints/openai/responses
- label: Entrypoints V1 - label: Entrypoints V1
timeout_in_minutes: 50 timeout_in_minutes: 50
......
...@@ -90,8 +90,8 @@ steps: ...@@ -90,8 +90,8 @@ steps:
- vllm/v1/attention/backends/flashinfer.py - vllm/v1/attention/backends/flashinfer.py
- vllm/v1/attention/backends/mla/cutlass_mla.py - vllm/v1/attention/backends/mla/cutlass_mla.py
- vllm/v1/attention/backends/mla/flashinfer_mla.py - vllm/v1/attention/backends/mla/flashinfer_mla.py
- vllm/v1/attention/selector.py
- vllm/platforms/cuda.py - vllm/platforms/cuda.py
- vllm/attention/selector.py
commands: commands:
- nvidia-smi - nvidia-smi
- python3 examples/offline_inference/basic/chat.py - python3 examples/offline_inference/basic/chat.py
......
...@@ -9,7 +9,7 @@ steps: ...@@ -9,7 +9,7 @@ steps:
- vllm/model_executor/layers/quantization - vllm/model_executor/layers/quantization
autorun_on_main: true autorun_on_main: true
commands: commands:
- pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=configs/models-small.txt --tp-size=1 - pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=configs/models-small.txt
- label: LM Eval Large Models (4 GPUs)(A100) - label: LM Eval Large Models (4 GPUs)(A100)
gpu: a100 gpu: a100
...@@ -43,4 +43,4 @@ steps: ...@@ -43,4 +43,4 @@ steps:
- csrc/ - csrc/
- vllm/model_executor/layers/quantization - vllm/model_executor/layers/quantization
commands: commands:
- pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=configs/models-blackwell.txt --tp-size=1 - pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=configs/models-blackwell.txt
...@@ -22,6 +22,8 @@ steps: ...@@ -22,6 +22,8 @@ steps:
# FIXIT: find out which code initialize cuda before running the test # FIXIT: find out which code initialize cuda before running the test
# before the fix, we need to use spawn to test it # before the fix, we need to use spawn to test it
- export VLLM_WORKER_MULTIPROC_METHOD=spawn - export VLLM_WORKER_MULTIPROC_METHOD=spawn
# Alot of these tests are on the edge of OOMing
- export PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True
# There is some Tensor Parallelism related processing logic in LoRA that # There is some Tensor Parallelism related processing logic in LoRA that
# requires multi-GPU testing for validation. # requires multi-GPU testing for validation.
- pytest -v -s -x lora/test_chatglm3_tp.py - pytest -v -s -x lora/test_chatglm3_tp.py
......
...@@ -9,6 +9,7 @@ steps: ...@@ -9,6 +9,7 @@ steps:
source_file_dependencies: source_file_dependencies:
- vllm/ - vllm/
- tests/models/test_initialization.py - tests/models/test_initialization.py
- tests/models/registry.py
commands: commands:
# Run a subset of model initialization tests # Run a subset of model initialization tests
- pytest -v -s models/test_initialization.py::test_can_initialize_small_subset - pytest -v -s models/test_initialization.py::test_can_initialize_small_subset
...@@ -20,6 +21,7 @@ steps: ...@@ -20,6 +21,7 @@ steps:
source_file_dependencies: source_file_dependencies:
- vllm/model_executor/models/ - vllm/model_executor/models/
- tests/models/test_initialization.py - tests/models/test_initialization.py
- tests/models/registry.py
commands: commands:
# Only when vLLM model source is modified - test initialization of a large # Only when vLLM model source is modified - test initialization of a large
# subset of supported models (the complement of the small subset in the above # subset of supported models (the complement of the small subset in the above
......
...@@ -13,7 +13,9 @@ steps: ...@@ -13,7 +13,9 @@ steps:
# tests covered elsewhere. # tests covered elsewhere.
# Use `find` to launch multiple instances of pytest so that # Use `find` to launch multiple instances of pytest so that
# they do not suffer from https://github.com/vllm-project/vllm/issues/28965 # they do not suffer from https://github.com/vllm-project/vllm/issues/28965
- "find compile/ -maxdepth 1 -name 'test_*.py' -exec pytest -s -v {} \\;" # However, find does not normally propagate error codes, so we combine it with xargs
# (using -0 for proper path handling)
- "find compile/ -maxdepth 1 -name 'test_*.py' -print0 | xargs -0 -n1 -I{} pytest -s -v '{}'"
- label: PyTorch Fullgraph Smoke Test - label: PyTorch Fullgraph Smoke Test
timeout_in_minutes: 30 timeout_in_minutes: 30
......
group: Tool use
depends_on:
- image-build
steps:
- label: OpenAI-Compatible Tool Use
timeout_in_minutes: 35
mirror_hardwares: [amdexperimental]
fast_check: false
source_file_dependencies:
- vllm/
- tests/tool_use
commands:
- pytest -v -s tool_use
...@@ -3,7 +3,6 @@ ...@@ -3,7 +3,6 @@
# This lists cover the "core" components of vLLM that require careful review # This lists cover the "core" components of vLLM that require careful review
/vllm/attention @LucasWilkinson /vllm/attention @LucasWilkinson
/vllm/attention/backends/abstract.py @WoosukKwon @zhuohan123 @youkaichao @alexm-redhat @njhill
/vllm/executor/executor_base.py @zhuohan123 @youkaichao @alexm-redhat @njhill @22quinn /vllm/executor/executor_base.py @zhuohan123 @youkaichao @alexm-redhat @njhill @22quinn
/vllm/model_executor/layers/fused_moe @mgoin @pavanimajety /vllm/model_executor/layers/fused_moe @mgoin @pavanimajety
/vllm/model_executor/layers/quantization @mgoin @robertgshaw2-redhat @tlrmchlsmth @yewentao256 @pavanimajety /vllm/model_executor/layers/quantization @mgoin @robertgshaw2-redhat @tlrmchlsmth @yewentao256 @pavanimajety
...@@ -15,6 +14,7 @@ ...@@ -15,6 +14,7 @@
/vllm/lora @jeejeelee /vllm/lora @jeejeelee
/vllm/reasoning @aarnphm @chaunceyjiang /vllm/reasoning @aarnphm @chaunceyjiang
/vllm/entrypoints @aarnphm @chaunceyjiang /vllm/entrypoints @aarnphm @chaunceyjiang
/vllm/tool_parsers @aarnphm @chaunceyjiang
/vllm/compilation @zou3519 @youkaichao @ProExpertProg /vllm/compilation @zou3519 @youkaichao @ProExpertProg
/vllm/distributed/kv_transfer @NickLucche @ApostaC /vllm/distributed/kv_transfer @NickLucche @ApostaC
CMakeLists.txt @tlrmchlsmth @LucasWilkinson CMakeLists.txt @tlrmchlsmth @LucasWilkinson
...@@ -26,6 +26,7 @@ CMakeLists.txt @tlrmchlsmth @LucasWilkinson ...@@ -26,6 +26,7 @@ CMakeLists.txt @tlrmchlsmth @LucasWilkinson
# vLLM V1 # vLLM V1
/vllm/v1/attention @LucasWilkinson /vllm/v1/attention @LucasWilkinson
/vllm/v1/attention/backend.py @WoosukKwon @zhuohan123 @youkaichao @alexm-redhat @njhill
/vllm/v1/attention/backends/mla @pavanimajety /vllm/v1/attention/backends/mla @pavanimajety
/vllm/v1/attention/backends/flashinfer.py @mgoin @pavanimajety /vllm/v1/attention/backends/flashinfer.py @mgoin @pavanimajety
/vllm/v1/attention/backends/triton_attn.py @tdoublep /vllm/v1/attention/backends/triton_attn.py @tdoublep
...@@ -116,15 +117,15 @@ mkdocs.yaml @hmellor ...@@ -116,15 +117,15 @@ mkdocs.yaml @hmellor
/vllm/transformers_utils/tokenizers/mistral.py @patrickvonplaten /vllm/transformers_utils/tokenizers/mistral.py @patrickvonplaten
# Kernels # Kernels
/vllm/attention/ops/chunked_prefill_paged_decode.py @tdoublep /vllm/v1/attention/ops/chunked_prefill_paged_decode.py @tdoublep
/vllm/attention/ops/triton_unified_attention.py @tdoublep /vllm/v1/attention/ops/triton_unified_attention.py @tdoublep
# ROCm related: specify owner with write access to notify AMD folks for careful code review # ROCm related: specify owner with write access to notify AMD folks for careful code review
/vllm/**/*rocm* @tjtanaa /vllm/**/*rocm* @tjtanaa
/docker/Dockerfile.rocm* @gshtras @tjtanaa /docker/Dockerfile.rocm* @gshtras @tjtanaa
/vllm/v1/attention/backends/rocm*.py @gshtras @tjtanaa /vllm/v1/attention/backends/rocm*.py @gshtras @tjtanaa
/vllm/v1/attention/backends/mla/rocm*.py @gshtras @tjtanaa /vllm/v1/attention/backends/mla/rocm*.py @gshtras @tjtanaa
/vllm/attention/ops/rocm*.py @gshtras @tjtanaa /vllm/v1/attention/ops/rocm*.py @gshtras @tjtanaa
/vllm/model_executor/layers/fused_moe/rocm*.py @gshtras @tjtanaa /vllm/model_executor/layers/fused_moe/rocm*.py @gshtras @tjtanaa
/csrc/rocm @gshtras @tjtanaa /csrc/rocm @gshtras @tjtanaa
/requirements/*rocm* @tjtanaa /requirements/*rocm* @tjtanaa
...@@ -152,7 +153,7 @@ mkdocs.yaml @hmellor ...@@ -152,7 +153,7 @@ mkdocs.yaml @hmellor
/vllm/entrypoints/pooling @noooop /vllm/entrypoints/pooling @noooop
/vllm/config/pooler.py @noooop /vllm/config/pooler.py @noooop
/vllm/pooling_params.py @noooop /vllm/pooling_params.py @noooop
/vllm/model_executor/layers/pooler.py @noooop /vllm/model_executor/layers/pooler @noooop
# Security guide and policies # Security guide and policies
/docs/usage/security.md @russellb /docs/usage/security.md @russellb
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment