Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
7e63ef82
Commit
7e63ef82
authored
Jan 21, 2026
by
zhuwenwen
Browse files
Merge tag 'v0.14.0' into v0.14.0-dev
parents
8cbcac5d
b17039bc
Changes
1000
Show whitespace changes
Inline
Side-by-side
Showing
20 changed files
with
517 additions
and
121 deletions
+517
-121
.buildkite/scripts/hardware_ci/run-tpu-v1-test.sh
.buildkite/scripts/hardware_ci/run-tpu-v1-test.sh
+1
-1
.buildkite/scripts/hardware_ci/run-xpu-test.sh
.buildkite/scripts/hardware_ci/run-xpu-test.sh
+1
-1
.buildkite/scripts/run-multi-node-test.sh
.buildkite/scripts/run-multi-node-test.sh
+21
-3
.buildkite/scripts/scheduled_integration_test/qwen30b_a3b_fp8_block_ep_eplb.sh
...heduled_integration_test/qwen30b_a3b_fp8_block_ep_eplb.sh
+1
-1
.buildkite/scripts/scheduled_integration_test/qwen3_next_mtp_async_eplb.sh
...s/scheduled_integration_test/qwen3_next_mtp_async_eplb.sh
+1
-1
.buildkite/scripts/upload-nightly-wheels.sh
.buildkite/scripts/upload-nightly-wheels.sh
+2
-1
.buildkite/scripts/upload-release-wheels.sh
.buildkite/scripts/upload-release-wheels.sh
+103
-0
.buildkite/scripts/upload-rocm-wheels.sh
.buildkite/scripts/upload-rocm-wheels.sh
+151
-0
.buildkite/test-amd.yaml
.buildkite/test-amd.yaml
+100
-40
.buildkite/test-pipeline.yaml
.buildkite/test-pipeline.yaml
+90
-26
.buildkite/test_areas/distributed.yaml
.buildkite/test_areas/distributed.yaml
+3
-3
.buildkite/test_areas/e2e_integration.yaml
.buildkite/test_areas/e2e_integration.yaml
+1
-18
.buildkite/test_areas/entrypoints.yaml
.buildkite/test_areas/entrypoints.yaml
+26
-4
.buildkite/test_areas/kernels.yaml
.buildkite/test_areas/kernels.yaml
+1
-1
.buildkite/test_areas/lm_eval.yaml
.buildkite/test_areas/lm_eval.yaml
+2
-2
.buildkite/test_areas/lora.yaml
.buildkite/test_areas/lora.yaml
+2
-0
.buildkite/test_areas/models_basic.yaml
.buildkite/test_areas/models_basic.yaml
+2
-0
.buildkite/test_areas/pytorch.yaml
.buildkite/test_areas/pytorch.yaml
+3
-1
.buildkite/test_areas/tool_use.yaml
.buildkite/test_areas/tool_use.yaml
+0
-13
.github/CODEOWNERS
.github/CODEOWNERS
+6
-5
No files found.
Too many changes to show.
To preserve performance only
1000 of 1000+
files are displayed.
Plain diff
Email patch
.buildkite/scripts/hardware_ci/run-tpu-v1-test.sh
View file @
7e63ef82
...
...
@@ -61,7 +61,7 @@ echo "Results will be stored in: $RESULTS_DIR"
echo "--- Installing Python dependencies ---"
python3 -m pip install --progress-bar off git+https://github.com/thuml/depyf.git \
&& python3 -m pip install --progress-bar off pytest pytest-asyncio tpu-info \
&& python3 -m pip install --progress-bar off "lm-eval
@ git+https://github.com/EleutherAI/lm-evaluation-harness.git@206b7722158f58c35b7ffcd53b035fdbdda5126d
" \
&& python3 -m pip install --progress-bar off "lm-eval
[api]>=0.4.9.2
" \
&& python3 -m pip install --progress-bar off hf-transfer tblib==3.1.0
echo "--- Python dependencies installed ---"
...
...
.buildkite/scripts/hardware_ci/run-xpu-test.sh
View file @
7e63ef82
...
...
@@ -39,7 +39,7 @@ docker run \
python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager -tp 2 --distributed-executor-backend ray
python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager -tp 2 --distributed-executor-backend mp
python3 examples/offline_inference/basic/generate.py --model Intel/Qwen2.5-0.5B-W4A16-G128-AutoRound-LLMC-TEST-ONLY --enforce-eager
VLLM_ATTENTION_BACKEND=TRITON_ATTN
python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager
python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager
--attention-backend=TRITON_ATTN
cd tests
pytest -v -s v1/core
pytest -v -s v1/engine
...
...
.buildkite/scripts/run-multi-node-test.sh
View file @
7e63ef82
...
...
@@ -2,6 +2,17 @@
set
-euox
pipefail
# To detect ROCm
# Check multiple indicators:
if
[
-e
/dev/kfd
]
||
\
[
-d
/opt/rocm
]
||
\
command
-v
rocm-smi &> /dev/null
||
\
[
-n
"
${
ROCM_HOME
:-}
"
]
;
then
IS_ROCM
=
1
else
IS_ROCM
=
0
fi
if
[[
$#
-lt
4
]]
;
then
echo
"Usage: .buildkite/scripts/run-multi-node-test.sh WORKING_DIR NUM_NODES NUM_GPUS DOCKER_IMAGE COMMAND1 COMMAND2 ... COMMANDN"
exit
1
...
...
@@ -26,13 +37,18 @@ for command in "${COMMANDS[@]}"; do
echo
"
$command
"
done
# Create the Docker network named "docker-net" on subnet 192.168.10.0/24.
# The test containers started later are attached to this network and given
# fixed addresses inside that subnet.
start_network() {
    docker network create \
        --subnet=192.168.10.0/24 \
        docker-net
}
start_nodes
()
{
for
node
in
$(
seq
0
$((
$NUM_NODES
-
1
))
)
;
do
GPU_DEVICES
=
'"device='
if
[
"
$IS_ROCM
"
-eq
1
]
;
then
GPU_DEVICES
=
'--device /dev/kfd --device /dev/dri -e HIP_VISIBLE_DEVICES='
else
GPU_DEVICES
=
'--gpus "device='
fi
for
node_gpu
in
$(
seq
0
$((
$NUM_GPUS
-
1
))
)
;
do
DEVICE_NUM
=
$((
$node
*
$NUM_GPUS
+
$node_gpu
))
GPU_DEVICES+
=
$((
$DEVICE_NUM
))
...
...
@@ -40,7 +56,9 @@ start_nodes() {
GPU_DEVICES+
=
','
fi
done
if
[
"
$IS_ROCM
"
-eq
0
]
;
then
GPU_DEVICES+
=
'"'
fi
# start the container in detached mode
# things to note:
...
...
@@ -49,7 +67,7 @@ start_nodes() {
# 3. map the huggingface cache directory to the container
# 3. assign ip addresses to the containers (head node: 192.168.10.10, worker nodes:
# starting from 192.168.10.11)
docker run
-d
--gpus
"
$GPU_DEVICES
"
--shm-size
=
10.24gb
-e
HF_TOKEN
\
docker run
-d
$GPU_DEVICES
--shm-size
=
10.24gb
-e
HF_TOKEN
\
-v
~/.cache/huggingface:/root/.cache/huggingface
--name
"node
$node
"
\
--network
docker-net
--ip
192.168.10.
$((
10
+
$node
))
--rm
"
$DOCKER_IMAGE
"
\
/bin/bash
-c
"tail -f /dev/null"
...
...
.buildkite/scripts/scheduled_integration_test/qwen30b_a3b_fp8_block_ep_eplb.sh
View file @
7e63ef82
...
...
@@ -44,10 +44,10 @@ trap cleanup EXIT
for
BACK
in
"
${
BACKENDS
[@]
}
"
;
do
VLLM_DEEP_GEMM_WARMUP
=
skip
\
VLLM_ALL2ALL_BACKEND
=
$BACK
\
vllm serve
"
$MODEL
"
\
--enforce-eager
\
--enable-eplb
\
--all2all-backend
$BACK
\
--eplb-config
'{"window_size":10, "step_interval":100, "num_redundant_experts":0, "log_balancedness":true}'
\
--tensor-parallel-size
${
TENSOR_PARALLEL_SIZE
}
\
--data-parallel-size
${
DATA_PARALLEL_SIZE
}
\
...
...
.buildkite/scripts/scheduled_integration_test/qwen3_next_mtp_async_eplb.sh
View file @
7e63ef82
...
...
@@ -43,12 +43,12 @@ trap cleanup EXIT
for
BACK
in
"
${
BACKENDS
[@]
}
"
;
do
VLLM_DEEP_GEMM_WARMUP
=
skip
\
VLLM_ALL2ALL_BACKEND
=
$BACK
\
vllm serve
"
$MODEL
"
\
--enforce-eager
\
--tensor-parallel-size
4
\
--enable-expert-parallel
\
--enable-eplb
\
--all2all-backend
$BACK
\
--eplb-config
'{"window_size":200,"step_interval":600,"use_async":true}'
\
--speculative-config
'{"method":"qwen3_next_mtp","num_speculative_tokens":1}'
\
--trust-remote-code
\
...
...
.buildkite/scripts/upload-wheels.sh
→
.buildkite/scripts/upload-
nightly-
wheels.sh
View file @
7e63ef82
...
...
@@ -102,6 +102,7 @@ if [[ "$version" != *"dev"* ]]; then
echo
"Re-generating indices for /
$pure_version
/"
rm
-rf
"
$INDICES_OUTPUT_DIR
/*"
mkdir
-p
"
$INDICES_OUTPUT_DIR
"
$PYTHON
.buildkite/scripts/generate-nightly-index.py
--version
"
$pure_version
"
--current-objects
"
$obj_json
"
--output-dir
"
$INDICES_OUTPUT_DIR
"
--comment
"version
$pure_version
"
$alias_arg
# wheel-dir is overridden to be the commit directory, so that the indices point to the correct wheel path
$PYTHON
.buildkite/scripts/generate-nightly-index.py
--version
"
$pure_version
"
--wheel-dir
"
$SUBPATH
"
--current-objects
"
$obj_json
"
--output-dir
"
$INDICES_OUTPUT_DIR
"
--comment
"version
$pure_version
"
$alias_arg
aws s3
cp
--recursive
"
$INDICES_OUTPUT_DIR
/"
"s3://
$BUCKET
/
$pure_version
/"
fi
.buildkite/scripts/upload-release-wheels.sh
0 → 100644
View file @
7e63ef82
#!/usr/bin/env bash
#
# Publish a tagged release: upload the default-variant wheels to PyPI and
# create a draft GitHub release carrying all wheels for the commit.
#
# Inputs:
#   BUILDKITE_COMMIT   commit being released; wheels are read from
#                      s3://vllm-wheels/<commit>/
#   PYPI_TOKEN         PyPI API token (exported to twine as TWINE_PASSWORD)
#   GITHUB_TOKEN       GitHub token (exported to gh as GH_TOKEN)
#   FORCE_RELEASE_IGNORE_VERSION_MISMATCH
#                      set to "true" to continue when the Buildkite release
#                      version and the git tag disagree
#   Buildkite meta-data key "release-version"

set -e

# An empty commit would collapse the S3 prefix to the bucket root.
: "${BUILDKITE_COMMIT:?BUILDKITE_COMMIT must be set}"

BUCKET="vllm-wheels"
SUBPATH=$BUILDKITE_COMMIT
S3_COMMIT_PREFIX="s3://$BUCKET/$SUBPATH/"

RELEASE_VERSION=$(buildkite-agent meta-data get release-version)
echo "Release version from Buildkite: $RELEASE_VERSION"

# `git describe --exact-match` exits non-zero when the commit carries no tag.
# Under `set -e` that would abort right here (silently, since stderr is
# discarded) and the explicit check below would never run, so tolerate the
# failure and let the empty-string test report it.
GIT_VERSION=$(git describe --exact-match --tags "$BUILDKITE_COMMIT" 2>/dev/null) || true
if [ -z "$GIT_VERSION" ]; then
    echo "[FATAL] Not on a git tag, cannot create release."
    exit 1
else
    echo "Git version for commit $BUILDKITE_COMMIT: $GIT_VERSION"
fi

# sanity check for version mismatch
if [ "v$RELEASE_VERSION" != "$GIT_VERSION" ]; then
    if [ "$FORCE_RELEASE_IGNORE_VERSION_MISMATCH" == "true" ]; then
        echo "[WARNING] Force release and ignore version mismatch"
    else
        echo "[FATAL] Release version from Buildkite does not match Git version."
        exit 1
    fi
fi

# check pypi token
if [ -z "$PYPI_TOKEN" ]; then
    echo "[FATAL] PYPI_TOKEN is not set."
    exit 1
else
    export TWINE_USERNAME="__token__"
    export TWINE_PASSWORD="$PYPI_TOKEN"
fi

# check github token
if [ -z "$GITHUB_TOKEN" ]; then
    echo "[FATAL] GITHUB_TOKEN is not set."
    exit 1
else
    export GH_TOKEN="$GITHUB_TOKEN"
fi

# Tracing is enabled only from here on, to avoid printing the secrets above.
set -x

# download gh CLI from github
# Get latest gh CLI version from GitHub API
GH_VERSION=$(curl -fsS https://api.github.com/repos/cli/cli/releases/latest \
    | grep '"tag_name":' \
    | sed -E 's/.*"([^"]+)".*/\1/' \
    | sed 's/^v//')
if [ -z "$GH_VERSION" ]; then
    echo "[FATAL] Failed to get latest gh CLI version from GitHub"
    exit 1
fi
echo "Downloading gh CLI version: $GH_VERSION"

GH_TARBALL="gh_${GH_VERSION}_linux_amd64.tar.gz"
GH_URL="https://github.com/cli/cli/releases/download/v${GH_VERSION}/${GH_TARBALL}"
GH_INSTALL_DIR="/tmp/gh-install"
mkdir -p "$GH_INSTALL_DIR"
pushd "$GH_INSTALL_DIR"
# -f: fail on an HTTP error instead of saving the error page as the tarball.
curl -fL -o "$GH_TARBALL" "$GH_URL"
tar -xzf "$GH_TARBALL"
# Locate the binary first and resolve it afterwards: calling realpath with no
# operand fails, which under `set -e` would skip the diagnostic below.
gh_candidate=$(find . -name "gh" -type f -executable | head -n 1)
if [ -z "$gh_candidate" ]; then
    echo "[FATAL] Failed to find gh CLI executable"
    exit 1
fi
GH_BIN=$(realpath "$gh_candidate")
echo "gh CLI downloaded successfully, version: $("$GH_BIN" --version)"
# as a sanity check of gh and GH_TOKEN
echo "Last 5 releases on GitHub:"
"$GH_BIN" release list --limit 5
popd

# install twine from pypi
python3 -m venv /tmp/vllm-release-env
source /tmp/vllm-release-env/bin/activate
pip install twine
python3 -m twine --version

# copy release wheels to local directory
DIST_DIR=/tmp/vllm-release-dist
echo "Existing wheels on S3:"
aws s3 ls "$S3_COMMIT_PREFIX"
echo "Copying wheels to local directory"
mkdir -p "$DIST_DIR"
# include only wheels for the release version, ignore all files with "dev" or "rc" in the name
aws s3 cp --recursive \
    --exclude "*" \
    --include "vllm-${RELEASE_VERSION}*.whl" \
    --exclude "*dev*" \
    --exclude "*rc*" \
    "$S3_COMMIT_PREFIX" "$DIST_DIR"
echo "Wheels copied to local directory"

# generate source tarball
git archive --format=tar.gz --output="$DIST_DIR/vllm-${RELEASE_VERSION}.tar.gz" "$BUILDKITE_COMMIT"
ls -la "$DIST_DIR"

# upload wheels to PyPI (only default variant, i.e. files without '+' in the name)
# Collected NUL-delimited into an array so each path stays a single argument.
PYPI_WHEEL_FILES=()
while IFS= read -r -d '' wheel_file; do
    PYPI_WHEEL_FILES+=("$wheel_file")
done < <(find "$DIST_DIR" -name "vllm-${RELEASE_VERSION}*.whl" -not -name "*+*" -print0)
if [ "${#PYPI_WHEEL_FILES[@]}" -eq 0 ]; then
    echo "No default variant wheels found, quitting..."
    exit 1
fi

python3 -m twine check "${PYPI_WHEEL_FILES[@]}"
# --non-interactive / --verbose are options of the `upload` subcommand, so
# they are passed after it.
python3 -m twine upload --non-interactive --verbose "${PYPI_WHEEL_FILES[@]}"
echo "Wheels uploaded to PyPI"

# create release on GitHub with the release version and all wheels
# NOTE(review): the source tarball generated above is not matched by the
# *.whl glob, so it is not attached to the release -- confirm that is intended.
"$GH_BIN" release create "$GIT_VERSION" -d --latest --notes-from-tag --verify-tag "$DIST_DIR"/*.whl
.buildkite/scripts/upload-rocm-wheels.sh
0 → 100755
View file @
7e63ef82
#!/usr/bin/env bash
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
#
# Upload ROCm wheels to S3 with proper index generation
#
# Required environment variables:
#   AWS_ACCESS_KEY_ID, AWS_SECRET_ACCESS_KEY (or IAM role)
#   S3_BUCKET (default: vllm-wheels)
#   BUILDKITE_COMMIT (names the per-commit upload directory)
#
# Optional environment variables:
#   PYTHON_PROG (default: python3)  interpreter used for index generation
#   BUILDKITE_BRANCH, BUILDKITE_PULL_REQUEST, NIGHTLY
#                                   decide whether rocm/nightly/ is updated
#
# S3 path structure:
#   s3://vllm-wheels/rocm/{commit}/     - All wheels for this commit
#   s3://vllm-wheels/rocm/nightly/      - Index pointing to latest nightly
#   s3://vllm-wheels/rocm/{version}/    - Index for release versions

set -ex

# ======== Configuration ========
# Without a commit the upload prefix would collapse to s3://<bucket>/rocm//.
: "${BUILDKITE_COMMIT:?BUILDKITE_COMMIT must be set}"

BUCKET="${S3_BUCKET:-vllm-wheels}"
ROCM_SUBPATH="rocm/${BUILDKITE_COMMIT}"
S3_COMMIT_PREFIX="s3://$BUCKET/$ROCM_SUBPATH/"
INDICES_OUTPUT_DIR="rocm-indices"
WHEEL_DIR="all-rocm-wheels"
# Held as an array so a multi-word interpreter command (PYTHON_PROG with
# arguments, or the docker fallback below) runs without re-splitting paths.
read -r -a PYTHON <<<"${PYTHON_PROG:-python3}"

# ROCm uses manylinux_2_35 (Ubuntu 22.04 based)
MANYLINUX_VERSION="manylinux_2_35"

# The nightly index is refreshed for non-PR builds of main, or when NIGHTLY=1.
# Computed once so the upload step and the final summary cannot disagree.
update_nightly=0
if [[ "$BUILDKITE_BRANCH" == "main" && "$BUILDKITE_PULL_REQUEST" == "false" ]] || [[ "$NIGHTLY" == "1" ]]; then
    update_nightly=1
fi

echo "========================================"
echo "ROCm Wheel Upload Configuration"
echo "========================================"
echo "S3 Bucket: $BUCKET"
echo "S3 Path: $ROCM_SUBPATH"
echo "Commit: $BUILDKITE_COMMIT"
echo "Branch: $BUILDKITE_BRANCH"
echo "========================================"

# ======== Part 0: Setup Python ========

# Detect if python3.12+ is available
has_new_python=$("${PYTHON[@]}" -c "print(1 if __import__('sys').version_info >= (3,12) else 0)" 2>/dev/null || echo 0)
if [[ "$has_new_python" -eq 0 ]]; then
    # Use new python from docker
    # Use --user to ensure files are created with correct ownership (not root)
    docker pull python:3-slim
    PYTHON=(docker run --rm --user "$(id -u):$(id -g)" -v "$(pwd):/app" -w /app python:3-slim python3)
fi

echo "Using python interpreter: ${PYTHON[*]}"
echo "Python version: $("${PYTHON[@]}" --version)"

# ======== Part 1: Collect and prepare wheels ========

# Collect all wheels
# Either artifact directory may be absent or empty; that is tolerated here
# and caught by the wheel count check below.
mkdir -p "$WHEEL_DIR"
cp artifacts/rocm-base-wheels/*.whl "$WHEEL_DIR"/ 2>/dev/null || true
cp artifacts/rocm-vllm-wheel/*.whl "$WHEEL_DIR"/ 2>/dev/null || true

# Count via the glob itself; an unmatched pattern stays literal, hence -e.
collected_wheels=()
for wheel in "$WHEEL_DIR"/*.whl; do
    if [[ -e "$wheel" ]]; then
        collected_wheels+=("$wheel")
    fi
done
WHEEL_COUNT=${#collected_wheels[@]}
echo "Total wheels to upload: $WHEEL_COUNT"

if [ "$WHEEL_COUNT" -eq 0 ]; then
    echo "ERROR: No wheels found to upload!"
    exit 1
fi

# Rename linux to manylinux in wheel filenames
for wheel in "${collected_wheels[@]}"; do
    wheel_name=${wheel##*/}
    if [[ "$wheel_name" == *"linux"* ]] && [[ "$wheel_name" != *"manylinux"* ]]; then
        # Only the first "linux" occurrence in the file name is replaced.
        new_wheel="$WHEEL_DIR/${wheel_name/linux/$MANYLINUX_VERSION}"
        mv -- "$wheel" "$new_wheel"
        echo "Renamed: $wheel_name -> ${new_wheel##*/}"
    fi
done

echo ""
echo "Wheels to upload:"
ls -lh "$WHEEL_DIR"/

# ======== Part 2: Upload wheels to S3 ========

echo ""
echo "Uploading wheels to $S3_COMMIT_PREFIX"
# Re-glob: the rename step above may have changed the file names.
for wheel in "$WHEEL_DIR"/*.whl; do
    aws s3 cp "$wheel" "$S3_COMMIT_PREFIX"
done

# ======== Part 3: Generate and upload indices ========

# List existing wheels in commit directory
echo ""
echo "Generating indices..."
obj_json="rocm-objects.json"
aws s3api list-objects-v2 --bucket "$BUCKET" --prefix "$ROCM_SUBPATH/" --delimiter / --output json > "$obj_json"

mkdir -p "$INDICES_OUTPUT_DIR"

# Use the existing generate-nightly-index.py
# HACK: Replace regex module with stdlib re (same as CUDA script)
# Note that this rewrites the script in the working tree in place.
sed -i 's/import regex as re/import re/g' .buildkite/scripts/generate-nightly-index.py

"${PYTHON[@]}" .buildkite/scripts/generate-nightly-index.py \
    --version "$ROCM_SUBPATH" \
    --current-objects "$obj_json" \
    --output-dir "$INDICES_OUTPUT_DIR" \
    --comment "ROCm commit $BUILDKITE_COMMIT"

# Upload indices to commit directory
echo "Uploading indices to $S3_COMMIT_PREFIX"
aws s3 cp --recursive "$INDICES_OUTPUT_DIR/" "$S3_COMMIT_PREFIX"

# Update rocm/nightly/ if on main branch and not a PR
if [[ "$update_nightly" -eq 1 ]]; then
    echo "Updating rocm/nightly/ index..."
    aws s3 cp --recursive "$INDICES_OUTPUT_DIR/" "s3://$BUCKET/rocm/nightly/"
fi

# Extract version from vLLM wheel and update version-specific index
VLLM_WHEEL=""
for wheel in "$WHEEL_DIR"/vllm*.whl; do
    if [[ -e "$wheel" ]]; then
        VLLM_WHEEL=$wheel
        break
    fi
done

if [ -n "$VLLM_WHEEL" ]; then
    VERSION=$(unzip -p "$VLLM_WHEEL" '**/METADATA' | grep '^Version: ' | cut -d' ' -f2)
    echo "Version in wheel: $VERSION"
    # Drop the local version part ("+...") and then a trailing ".rocm".
    PURE_VERSION="${VERSION%%+*}"
    PURE_VERSION="${PURE_VERSION%%.rocm}"
    echo "Pure version: $PURE_VERSION"

    if [[ -z "$PURE_VERSION" ]]; then
        # An unreadable METADATA would otherwise publish to s3://<bucket>/rocm//.
        echo "WARNING: could not determine version from $VLLM_WHEEL; skipping version index" >&2
    elif [[ "$VERSION" != *"dev"* ]]; then
        echo "Updating rocm/$PURE_VERSION/ index..."
        aws s3 cp --recursive "$INDICES_OUTPUT_DIR/" "s3://$BUCKET/rocm/$PURE_VERSION/"
    fi
fi

# ======== Part 4: Summary ========

echo ""
echo "========================================"
echo "ROCm Wheel Upload Complete!"
echo "========================================"
echo ""
echo "Wheels available at:"
echo "  s3://$BUCKET/$ROCM_SUBPATH/"
echo ""
echo "Install command (by commit):"
echo "  pip install vllm --extra-index-url https://${BUCKET}.s3.amazonaws.com/$ROCM_SUBPATH/"
echo ""
# Advertise the nightly index only when this run actually refreshed it.
if [[ "$update_nightly" -eq 1 ]]; then
    echo "Install command (nightly):"
    echo "  pip install vllm --extra-index-url https://${BUCKET}.s3.amazonaws.com/rocm/nightly/"
fi
echo ""
echo "Wheel count: $WHEEL_COUNT"
echo "========================================"
.buildkite/test-amd.yaml
View file @
7e63ef82
...
...
@@ -128,7 +128,7 @@ steps:
-
tests/entrypoints/
commands
:
-
pytest -v -s entrypoints/openai/tool_parsers
-
pytest -v -s entrypoints/ --ignore=entrypoints/llm --ignore=entrypoints/openai --ignore=entrypoints/offline_mode --ignore=entrypoints/test_chat_utils.py --ignore=entrypoints/pooling
-
pytest -v -s entrypoints/ --ignore=entrypoints/llm --ignore=entrypoints/openai
--ignore=entrypoints/rpc --ignore=entrypoints/sleep --ignore=entrypoints/instrumentator
--ignore=entrypoints/offline_mode --ignore=entrypoints/test_chat_utils.py --ignore=entrypoints/pooling
-
label
:
Entrypoints Integration Test (LLM)
# 30min
timeout_in_minutes
:
40
...
...
@@ -148,7 +148,7 @@ steps:
-
pytest -v -s entrypoints/llm/test_generate.py
# it needs a clean process
-
pytest -v -s entrypoints/offline_mode
# Needs to avoid interference with other tests
-
label
:
Entrypoints Integration Test (API Server)
# 100min
-
label
:
Entrypoints Integration Test (API Server
1
)
# 100min
timeout_in_minutes
:
130
mirror_hardwares
:
[
amdexperimental
]
agent_pool
:
mi325_1
...
...
@@ -162,10 +162,28 @@ steps:
-
tests/entrypoints/test_chat_utils
commands
:
-
export VLLM_WORKER_MULTIPROC_METHOD=spawn
-
PYTHONPATH=/vllm-workspace pytest -v -s entrypoints/openai/test_collective_rpc.py
# PYTHONPATH is needed to import custom Worker extension
-
pytest -v -s entrypoints/openai --ignore=entrypoints/openai/test_chat_with_tool_reasoning.py --ignore=entrypoints/openai/test_oot_registration.py --ignore=entrypoints/openai/test_tensorizer_entrypoint.py --ignore=entrypoints/openai/correctness/ --ignore=entrypoints/openai/test_collective_rpc.py --ignore=entrypoints/openai/tool_parsers/
-
pytest -v -s entrypoints/openai --ignore=entrypoints/openai/test_chat_with_tool_reasoning.py --ignore=entrypoints/openai/test_oot_registration.py --ignore=entrypoints/openai/test_tensorizer_entrypoint.py --ignore=entrypoints/openai/correctness/ --ignore=entrypoints/openai/tool_parsers/ --ignore=entrypoints/openai/responses
-
pytest -v -s entrypoints/test_chat_utils.py
-
label
:
Entrypoints Integration Test (API Server 2)
timeout_in_minutes
:
50
mirror_hardwares
:
[
amdexperimental
]
agent_pool
:
mi325_1
# grade: Blocking
working_dir
:
"
/vllm-workspace/tests"
fast_check
:
true
torch_nightly
:
true
source_file_dependencies
:
-
vllm/
-
tests/entrypoints/sleep
-
tests/entrypoints/rpc
-
tests/tool_use
commands
:
-
export VLLM_WORKER_MULTIPROC_METHOD=spawn
-
pytest -v -s entrypoints/sleep
-
pytest -v -s tool_use
-
PYTHONPATH=/vllm-workspace pytest -v -s entrypoints/rpc
-
label
:
Entrypoints Integration Test (Pooling)
timeout_in_minutes
:
50
mirror_hardwares
:
[
amdexperimental
]
...
...
@@ -181,6 +199,21 @@ steps:
-
export VLLM_WORKER_MULTIPROC_METHOD=spawn
-
pytest -v -s entrypoints/pooling
-
label
:
Entrypoints Integration Test (Responses API)
timeout_in_minutes
:
50
mirror_hardwares
:
[
amdexperimental
]
agent_pool
:
mi325_1
# grade: Blocking
working_dir
:
"
/vllm-workspace/tests"
fast_check
:
true
torch_nightly
:
true
source_file_dependencies
:
-
vllm/
-
tests/entrypoints/openai/responses
commands
:
-
export VLLM_WORKER_MULTIPROC_METHOD=spawn
-
pytest -v -s entrypoints/openai/responses
-
label
:
Distributed Tests (4 GPUs)
# 35min
timeout_in_minutes
:
50
mirror_hardwares
:
[
amdexperimental
]
...
...
@@ -201,6 +234,9 @@ steps:
-
tests/v1/engine/test_engine_core_client.py
-
tests/distributed/test_symm_mem_allreduce.py
commands
:
# Work around HIP bug tracked here: https://github.com/ROCm/hip/issues/3876
# TODO: Remove when the bug is fixed in a future ROCm release
-
export TORCH_NCCL_BLOCKING_WAIT=1
# test with torchrun tp=2 and external_dp=2
-
torchrun --nproc-per-node=4 distributed/test_torchrun_example.py
# test with torchrun tp=2 and pp=2
...
...
@@ -249,9 +285,10 @@ steps:
-
vllm/v1/executor/uniproc_executor.py
-
vllm/v1/worker/gpu_worker.py
commands
:
# https://github.com/NVIDIA/nccl/issues/1838
#- export NCCL_CUMEM_HOST_ENABLE=0
# test with torchrun tp=2 and dp=4 with ep
# Work around HIP bug tracked here: https://github.com/ROCm/hip/issues/3876
# TODO: Remove when the bug is fixed in a future ROCm release
-
export TORCH_NCCL_BLOCKING_WAIT=1
-
torchrun --nproc-per-node=8 ../examples/offline_inference/torchrun_dp_example.py --tp-size=2 --pp-size=1 --dp-size=4 --enable-ep
-
label
:
EPLB Algorithm Test
# 5min
...
...
@@ -331,7 +368,9 @@ steps:
-
label
:
V1 Test e2e + engine
# 65min
timeout_in_minutes
:
90
mirror_hardwares
:
[
amdexperimental
]
agent_pool
:
mi325_4
# The test uses 4 GPUs, but we schedule it on 8-GPU machines for stability.
# See discussion here: https://github.com/vllm-project/vllm/pull/31040
agent_pool
:
mi325_8
# grade: Blocking
source_file_dependencies
:
-
vllm/
...
...
@@ -492,8 +531,7 @@ steps:
-
tests/samplers
-
tests/conftest.py
commands
:
-
pytest -v -s samplers
-
VLLM_USE_FLASHINFER_SAMPLER=1 pytest -v -s samplers
-
pytest -v -s -m 'not skip_v1' samplers
-
label
:
LoRA Test %N
# 20min each
timeout_in_minutes
:
30
...
...
@@ -707,7 +745,7 @@ steps:
-
label
:
Quantization Test
# 70min
timeout_in_minutes
:
90
mirror_hardwares
:
[
amdexperimental
]
mirror_hardwares
:
[
amdexperimental
,
amdproduction
]
agent_pool
:
mi325_1
# grade: Blocking
source_file_dependencies
:
...
...
@@ -722,7 +760,7 @@ steps:
# https://github.com/pytorch/ao/issues/2919, we'll have to skip new torchao tests for now
# we can only upgrade after this is resolved
# TODO(jerryzh168): resolve the above comment
-
uv pip install --system torchao==0.1
3.0
-
uv pip install --system torchao==0.1
4.1
-
uv pip install --system conch-triton-kernels
-
VLLM_TEST_FORCE_LOAD_FORMAT=auto pytest -v -s quantization/ --ignore quantization/test_blackwell_moe.py
...
...
@@ -736,7 +774,7 @@ steps:
-
vllm/model_executor/layers/quantization
autorun_on_main
:
true
commands
:
-
pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=configs/models-small.txt
--tp-size=1
-
pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=configs/models-small.txt
-
label
:
OpenAI API correctness
# 10min
timeout_in_minutes
:
15
...
...
@@ -747,21 +785,11 @@ steps:
-
csrc/
-
vllm/entrypoints/openai/
-
vllm/model_executor/models/whisper.py
-
tools/
commands
:
# LMEval+Transcription WER check
# Transcription WER check is skipped because encoder-decoder models are not supported on ROCm, see https://github.com/vllm-project/vllm/issues/27442
-
bash ../tools/install_torchcodec_rocm.sh || exit
1
-
pytest -s entrypoints/openai/correctness/
-
label
:
OpenAI-Compatible Tool Use
# 23 min
timeout_in_minutes
:
35
mirror_hardwares
:
[
amdexperimental
,
amdproduction
]
agent_pool
:
mi325_1
# grade: Blocking
fast_check
:
false
source_file_dependencies
:
-
vllm/
-
tests/tool_use
commands
:
-
pytest -v -s tool_use
##### models test #####
...
...
@@ -854,6 +882,7 @@ steps:
# Shard slow subset of standard language models tests. Only run when model
# source is modified, or when specified test files are modified
-
pip freeze | grep -E 'torch'
-
export TORCH_NCCL_BLOCKING_WAIT=1
-
pytest -v -s models/language -m 'core_model and slow_test' \
--num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT \
--shard-id=$$BUILDKITE_PARALLEL_JOB
...
...
@@ -871,7 +900,7 @@ steps:
commands
:
# Install fast path packages for testing against transformers
# Note: also needed to run plamo2 model in vLLM
-
uv pip install --system --no-build-isolation 'git+https://github.com/
state-spaces/mamba@v2.2.5
'
-
uv pip install --system --no-build-isolation 'git+https://github.com/
AndreasKaratzas/mamba@fix-rocm-7.0-warp-size-constexpr
'
-
uv pip install --system --no-build-isolation 'git+https://github.com/Dao-AILab/causal-conv1d@v1.5.2'
# Shard hybrid language model tests
-
pytest -v -s models/language/generation \
...
...
@@ -892,7 +921,7 @@ steps:
commands
:
# Install fast path packages for testing against transformers
# Note: also needed to run plamo2 model in vLLM
-
uv pip install --system --no-build-isolation 'git+https://github.com/
state-spaces/mamba@v2.2.5
'
-
uv pip install --system --no-build-isolation 'git+https://github.com/
AndreasKaratzas/mamba@fix-rocm-7.0-warp-size-constexpr
'
-
uv pip install --system --no-build-isolation 'git+https://github.com/Dao-AILab/causal-conv1d@v1.5.2'
-
pytest -v -s models/language/generation -m '(not core_model) and (not hybrid_model)'
...
...
@@ -957,7 +986,7 @@ steps:
-
pytest -v -s models/multimodal/processing
-
label
:
Multi-Modal Models Test (Standard)
# 60min
timeout_in_minutes
:
8
0
timeout_in_minutes
:
10
0
mirror_hardwares
:
[
amdexperimental
]
agent_pool
:
mi325_1
# grade: Blocking
...
...
@@ -966,13 +995,16 @@ steps:
-
vllm/
-
tests/models/multimodal
commands
:
-
export MIOPEN_DEBUG_CONV_DIRECT=0
-
export MIOPEN_DEBUG_CONV_GEMM=0
-
pip install git+https://github.com/TIGER-AI-Lab/Mantis.git
-
pip freeze | grep -E 'torch'
-
pytest -v -s models/multimodal -m core_model --ignore models/multimodal/generation/test_whisper.py --ignore models/multimodal/processing
-
pytest -v -s models/multimodal -m core_model --ignore models/multimodal/generation/test_whisper.py --ignore models/multimodal/processing --ignore models/multimodal/pooling/test_prithvi_mae.py
-
pytest -v -s models/multimodal/pooling/test_prithvi_mae.py -m core_model
-
cd .. && VLLM_WORKER_MULTIPROC_METHOD=spawn pytest -v -s tests/models/multimodal/generation/test_whisper.py -m core_model
# Otherwise, mp_method="spawn" doesn't work
-
label
:
Multi-Modal Accuracy Eval (Small Models)
#
150min - 180
min
timeout_in_minutes
:
1
8
0
-
label
:
Multi-Modal Accuracy Eval (Small Models)
#
5
min
timeout_in_minutes
:
10
mirror_hardwares
:
[
amdexperimental
,
amdproduction
]
agent_pool
:
mi325_1
# grade: Blocking
...
...
@@ -982,7 +1014,9 @@ steps:
-
vllm/inputs/
-
vllm/v1/core/
commands
:
-
pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-mm-small.txt --tp-size=1
-
export MIOPEN_DEBUG_CONV_DIRECT=0
-
export MIOPEN_DEBUG_CONV_GEMM=0
-
pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-mm-small.txt
-
label
:
Multi-Modal Models Test (Extended)
1
# 60min
timeout_in_minutes
:
120
...
...
@@ -994,10 +1028,13 @@ steps:
-
vllm/
-
tests/models/multimodal
commands
:
-
export MIOPEN_DEBUG_CONV_DIRECT=0
-
export MIOPEN_DEBUG_CONV_GEMM=0
-
pip install git+https://github.com/TIGER-AI-Lab/Mantis.git
-
pytest -v -s models/multimodal -m 'not core_model' --ignore models/multimodal/generation/test_common.py --ignore models/multimodal/processing
-
label
:
Multi-Modal Models Test (Extended)
2
-
label
:
Multi-Modal Models Test (Extended)
2
#60min
timeout_in_minutes
:
120
mirror_hardwares
:
[
amdexperimental
]
agent_pool
:
mi325_1
# grade: Blocking
...
...
@@ -1006,6 +1043,8 @@ steps:
-
vllm/
-
tests/models/multimodal
commands
:
-
export MIOPEN_DEBUG_CONV_DIRECT=0
-
export MIOPEN_DEBUG_CONV_GEMM=0
-
pip install git+https://github.com/TIGER-AI-Lab/Mantis.git
-
pytest -v -s models/multimodal/generation/test_common.py -m 'split(group=0) and not core_model'
...
...
@@ -1019,6 +1058,8 @@ steps:
-
vllm/
-
tests/models/multimodal
commands
:
-
export MIOPEN_DEBUG_CONV_DIRECT=0
-
export MIOPEN_DEBUG_CONV_GEMM=0
-
pip install git+https://github.com/TIGER-AI-Lab/Mantis.git
-
pytest -v -s models/multimodal/generation/test_common.py -m 'split(group=1) and not core_model'
...
...
@@ -1078,8 +1119,8 @@ steps:
-
vllm/v1/attention/backends/flashinfer.py
-
vllm/v1/attention/backends/mla/cutlass_mla.py
-
vllm/v1/attention/backends/mla/flashinfer_mla.py
-
vllm/v1/attention/selector.py
-
vllm/platforms/cuda.py
-
vllm/attention/selector.py
commands
:
-
nvidia-smi
-
python3 examples/offline_inference/basic/chat.py
...
...
@@ -1196,7 +1237,7 @@ steps:
-
csrc/
-
vllm/model_executor/layers/quantization
commands
:
-
pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=configs/models-blackwell.txt
--tp-size=1
-
pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=configs/models-blackwell.txt
##### 1 GPU test #####
##### multi gpus test #####
...
...
@@ -1236,13 +1277,13 @@ steps:
-
# the following commands are for the first node, with ip 192.168.10.10 (ray environment already set up)
-
VLLM_TEST_SAME_HOST=0 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_same_node.py | grep 'Same node test passed'
-
NUM_NODES=2 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_node_count.py | grep 'Node count test passed'
-
python3 ../examples/offline_inference/data_parallel.py
-
-dp
-size
=2
-
-tp
-size=1 --node-size
=2 --node-rank=0 --master-addr=192.168.10.10 --master-port=12345 --enforce-eager --trust-remote-code
-
python3 ../examples/offline_inference/data_parallel.py -dp=2 -tp
=1 --dp-num-nodes
=2 --
dp-
node-rank=0 --
dp-
master-addr=192.168.10.10 --
dp-
master-port=12345 --enforce-eager --trust-remote-code
-
VLLM_MULTI_NODE=1 pytest -v -s distributed/test_multi_node_assignment.py
-
VLLM_MULTI_NODE=1 pytest -v -s distributed/test_pipeline_parallel.py
-
# the following commands are for the second node, with ip 192.168.10.11 (ray environment already set up)
-
VLLM_TEST_SAME_HOST=0 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_same_node.py | grep 'Same node test passed'
-
NUM_NODES=2 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_node_count.py | grep 'Node count test passed'
-
python3 ../examples/offline_inference/data_parallel.py
-
-dp
-size
=2
-
-tp
-size=1 --node-size
=2 --node-rank=1 --master-addr=192.168.10.10 --master-port=12345 --enforce-eager --trust-remote-code
-
python3 ../examples/offline_inference/data_parallel.py -dp=2 -tp
=1 --dp-num-nodes
=2 --
dp-
node-rank=1 --
dp-
master-addr=192.168.10.10 --
dp-
master-port=12345 --enforce-eager --trust-remote-code
-
label
:
Distributed Tests (2 GPUs)
# 68min
timeout_in_minutes
:
90
...
...
@@ -1268,6 +1309,9 @@ steps:
-
tests/v1/shutdown
-
tests/v1/worker/test_worker_memory_snapshot.py
commands
:
# Work around HIP bug tracked here: https://github.com/ROCm/hip/issues/3876
# TODO: Remove when the bug is fixed in a future ROCm release
-
export TORCH_NCCL_BLOCKING_WAIT=1
-
TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/distributed/test_async_llm_dp.py
-
TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/distributed/test_eagle_dp.py
-
TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/distributed/test_external_lb_dp.py
...
...
@@ -1417,8 +1461,22 @@ steps:
-
vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py
-
tests/v1/kv_connector/nixl_integration/
commands
:
-
uv pip install --system -r /vllm-workspace/requirements/kv_connectors.txt
-
bash v1/kv_connector/nixl_integration/tp_config_sweep_accuracy_test.sh
-
uv pip install --system -r /vllm-workspace/requirements/kv_connectors_rocm.txt
-
VLLM_ATTENTION_BACKEND=ROCM_ATTN bash v1/kv_connector/nixl_integration/config_sweep_accuracy_test.sh
-
label
:
DP EP NixlConnector PD accuracy tests (Distributed)
# 15min
mirror_hardwares
:
[
amdexperimental
]
agent_pool
:
mi325_4
# grade: Blocking
timeout_in_minutes
:
15
working_dir
:
"
/vllm-workspace/tests"
num_gpus
:
4
source_file_dependencies
:
-
vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py
-
tests/v1/kv_connector/nixl_integration/
commands
:
-
uv pip install --system -r /vllm-workspace/requirements/kv_connectors_rocm.txt
-
VLLM_ATTENTION_BACKEND=ROCM_ATTN DP_EP=1 bash v1/kv_connector/nixl_integration/config_sweep_accuracy_test.sh
##### multi gpus test #####
##### A100 test #####
...
...
@@ -1490,7 +1548,7 @@ steps:
-
"
VLLM_TEST_CLEAN_GPU_MEMORY=1
pytest
-v
-s
tests/compile/distributed/test_fusions_e2e.py
-k
'not
Llama-4'"
-
VLLM_TEST_CLEAN_GPU_MEMORY=1 pytest -v -s tests/distributed/test_sequence_parallel.py
-
pytest -v -s tests/distributed/test_context_parallel.py
-
HIP_VISIBLE_DEVICES=0,1
VLLM_ALL2ALL_BACKEND=deepep_high_throughput VLLM_USE_DEEP_GEMM=1
VLLM_LOGGING_LEVEL=DEBUG python3 examples/offline_inference/data_parallel.py --model
Qwen/Qwen1.5-MoE-A2.7B
-
-tp
-size
=1
-
-dp
-size
=2 --max-model-len
2048
-
HIP_VISIBLE_DEVICES=0,1 VLLM_LOGGING_LEVEL=DEBUG python3 examples/offline_inference/data_parallel.py --model
=
Qwen/Qwen1.5-MoE-A2.7B -tp=1 -dp=2 --max-model-len
=
2048
--all2all-backend=allgather_reducescatter --disable-nccl-for-dp-synchronization
-
pytest -v -s tests/v1/distributed/test_dbo.py
##### B200 test #####
...
...
@@ -1514,7 +1572,7 @@ steps:
-
csrc/
-
vllm/model_executor/layers/quantization
commands
:
-
pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=configs/models-small.txt
--tp-size=1
-
pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=configs/models-small.txt
-
label
:
LM Eval Large Models (4 Card)
mirror_hardwares
:
[
amdexperimental
,
amdproduction
]
...
...
@@ -1569,6 +1627,8 @@ steps:
-
.buildkite/scripts/run-prime-rl-test.sh
commands
:
-
bash .buildkite/scripts/run-prime-rl-test.sh
##### EPLB Accuracy Tests #####
-
label
:
DeepSeek V2-Lite Accuracy
mirror_hardwares
:
[
amdexperimental
,
amdproduction
]
agent_pool
:
mi325_4
...
...
.buildkite/test-pipeline.yaml
View file @
7e63ef82
...
...
@@ -114,7 +114,7 @@ steps:
-
tests/entrypoints/
commands
:
-
pytest -v -s entrypoints/openai/tool_parsers
-
pytest -v -s entrypoints/ --ignore=entrypoints/llm --ignore=entrypoints/openai --ignore=entrypoints/offline_mode --ignore=entrypoints/test_chat_utils.py --ignore=entrypoints/pooling
-
pytest -v -s entrypoints/ --ignore=entrypoints/llm
--ignore=entrypoints/rpc --ignore=entrypoints/sleep --ignore=entrypoints/instrumentator
--ignore=entrypoints/openai --ignore=entrypoints/offline_mode --ignore=entrypoints/test_chat_utils.py --ignore=entrypoints/pooling
-
label
:
Entrypoints Integration Test (LLM)
# 30min
timeout_in_minutes
:
40
...
...
@@ -132,7 +132,7 @@ steps:
-
pytest -v -s entrypoints/llm/test_generate.py
# it needs a clean process
-
pytest -v -s entrypoints/offline_mode
# Needs to avoid interference with other tests
-
label
:
Entrypoints Integration Test (API Server)
# 100min
-
label
:
Entrypoints Integration Test (API Server
1
)
# 100min
timeout_in_minutes
:
130
mirror_hardwares
:
[
amdexperimental
]
working_dir
:
"
/vllm-workspace/tests"
...
...
@@ -144,10 +144,26 @@ steps:
-
tests/entrypoints/test_chat_utils
commands
:
-
export VLLM_WORKER_MULTIPROC_METHOD=spawn
-
PYTHONPATH=/vllm-workspace pytest -v -s entrypoints/openai/test_collective_rpc.py
# PYTHONPATH is needed to import custom Worker extension
-
pytest -v -s entrypoints/openai --ignore=entrypoints/openai/test_chat_with_tool_reasoning.py --ignore=entrypoints/openai/test_oot_registration.py --ignore=entrypoints/openai/test_tensorizer_entrypoint.py --ignore=entrypoints/openai/correctness/ --ignore=entrypoints/openai/test_collective_rpc.py --ignore=entrypoints/openai/tool_parsers/
-
pytest -v -s entrypoints/openai --ignore=entrypoints/openai/test_chat_with_tool_reasoning.py --ignore=entrypoints/openai/test_oot_registration.py --ignore=entrypoints/openai/test_tensorizer_entrypoint.py --ignore=entrypoints/openai/correctness/ --ignore=entrypoints/openai/tool_parsers/ --ignore=entrypoints/openai/responses
-
pytest -v -s entrypoints/test_chat_utils.py
-
label
:
Entrypoints Integration Test (API Server 2)
timeout_in_minutes
:
50
mirror_hardwares
:
[
amdexperimental
]
working_dir
:
"
/vllm-workspace/tests"
fast_check
:
true
torch_nightly
:
true
source_file_dependencies
:
-
vllm/
-
tests/entrypoints/sleep
-
tests/entrypoints/rpc
-
tests/tool_use
commands
:
-
export VLLM_WORKER_MULTIPROC_METHOD=spawn
-
pytest -v -s entrypoints/sleep
-
PYTHONPATH=/vllm-workspace pytest -v -s entrypoints/rpc
-
pytest -v -s tool_use
-
label
:
Entrypoints Integration Test (Pooling)
timeout_in_minutes
:
50
mirror_hardwares
:
[
amdexperimental
]
...
...
@@ -161,6 +177,18 @@ steps:
-
export VLLM_WORKER_MULTIPROC_METHOD=spawn
-
pytest -v -s entrypoints/pooling
-
label
:
Entrypoints Integration Test (Responses API)
timeout_in_minutes
:
50
mirror_hardwares
:
[
amdexperimental
]
working_dir
:
"
/vllm-workspace/tests"
fast_check
:
true
torch_nightly
:
true
source_file_dependencies
:
-
vllm/
-
tests/entrypoints/openai/responses
commands
:
-
pytest -v -s entrypoints/openai/responses
-
label
:
Distributed Tests (4 GPUs)
# 35min
timeout_in_minutes
:
50
mirror_hardwares
:
[
amdexperimental
]
...
...
@@ -303,7 +331,10 @@ steps:
# TODO: accuracy does not match, whether setting
# VLLM_USE_FLASHINFER_SAMPLER or not on H100.
-
pytest -v -s v1/e2e
-
pytest -v -s v1/engine
# Run this test standalone for now;
# need to untangle the (implicit) use of spawn/fork across the tests.
-
pytest -v -s v1/engine/test_preprocess_error_handling.py
-
pytest -v -s v1/engine --ignore v1/engine/test_preprocess_error_handling.py
-
label
:
V1 Test entrypoints
# 35min
timeout_in_minutes
:
50
...
...
@@ -642,7 +673,7 @@ steps:
# https://github.com/pytorch/ao/issues/2919, we'll have to skip new torchao tests for now
# we can only upgrade after this is resolved
# TODO(jerryzh168): resolve the above comment
-
uv pip install --system torchao==0.1
3.0
--index-url https://download.pytorch.org/whl/cu129
-
uv pip install --system torchao==0.1
4.1
--index-url https://download.pytorch.org/whl/cu129
-
uv pip install --system conch-triton-kernels
-
VLLM_TEST_FORCE_LOAD_FORMAT=auto pytest -v -s quantization/ --ignore quantization/test_blackwell_moe.py
...
...
@@ -654,7 +685,7 @@ steps:
-
vllm/model_executor/layers/quantization
autorun_on_main
:
true
commands
:
-
pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=configs/models-small.txt
--tp-size=1
-
pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=configs/models-small.txt
-
label
:
OpenAI API correctness
# 22min
timeout_in_minutes
:
30
...
...
@@ -666,16 +697,6 @@ steps:
commands
:
# LMEval+Transcription WER check
-
pytest -s entrypoints/openai/correctness/
-
label
:
OpenAI-Compatible Tool Use
# 23 min
timeout_in_minutes
:
35
mirror_hardwares
:
[
amdexperimental
]
fast_check
:
false
source_file_dependencies
:
-
vllm/
-
tests/tool_use
commands
:
-
pytest -v -s tool_use
##### models test #####
-
label
:
Basic Models Tests (Initialization)
...
...
@@ -934,7 +955,6 @@ steps:
timeout_in_minutes
:
30
working_dir
:
"
/vllm-workspace/"
gpu
:
b200
# optional: true
source_file_dependencies
:
-
csrc/quantization/fp4/
-
csrc/attention/mla/
...
...
@@ -946,8 +966,8 @@ steps:
-
vllm/v1/attention/backends/flashinfer.py
-
vllm/v1/attention/backends/mla/cutlass_mla.py
-
vllm/v1/attention/backends/mla/flashinfer_mla.py
-
vllm/v1/attention/selector.py
-
vllm/platforms/cuda.py
-
vllm/attention/selector.py
commands
:
-
nvidia-smi
-
python3 examples/offline_inference/basic/chat.py
...
...
@@ -1064,7 +1084,7 @@ steps:
-
csrc/
-
vllm/model_executor/layers/quantization
commands
:
-
pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=configs/models-blackwell.txt
--tp-size=1
-
pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=configs/models-blackwell.txt
##### 1 GPU test #####
##### multi gpus test #####
...
...
@@ -1096,17 +1116,18 @@ steps:
-
vllm/model_executor/models/
-
tests/distributed/
-
tests/examples/offline_inference/data_parallel.py
-
.buildkite/scripts/run-multi-node-test.sh
commands
:
-
# the following commands are for the first node, with ip 192.168.10.10 (ray environment already set up)
-
VLLM_TEST_SAME_HOST=0 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_same_node.py | grep 'Same node test passed'
-
NUM_NODES=2 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_node_count.py | grep 'Node count test passed'
-
python3 ../examples/offline_inference/data_parallel.py
-
-dp
-size
=2
-
-tp
-size=1 --node-size
=2 --node-rank=0 --master-addr=192.168.10.10 --master-port=12345 --enforce-eager --trust-remote-code
-
python3 ../examples/offline_inference/data_parallel.py -dp=2 -tp
=1 --dp-num-nodes
=2 --
dp-
node-rank=0 --
dp-
master-addr=192.168.10.10 --
dp-
master-port=12345 --enforce-eager --trust-remote-code
-
VLLM_MULTI_NODE=1 pytest -v -s distributed/test_multi_node_assignment.py
-
VLLM_MULTI_NODE=1 pytest -v -s distributed/test_pipeline_parallel.py
-
# the following commands are for the second node, with ip 192.168.10.11 (ray environment already set up)
-
VLLM_TEST_SAME_HOST=0 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_same_node.py | grep 'Same node test passed'
-
NUM_NODES=2 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_node_count.py | grep 'Node count test passed'
-
python3 ../examples/offline_inference/data_parallel.py
-
-dp
-size
=2
-
-tp
-size=1 --node-size
=2 --node-rank=1 --master-addr=192.168.10.10 --master-port=12345 --enforce-eager --trust-remote-code
-
python3 ../examples/offline_inference/data_parallel.py -dp=2 -tp
=1 --dp-num-nodes
=2 --
dp-
node-rank=1 --
dp-
master-addr=192.168.10.10 --
dp-
master-port=12345 --enforce-eager --trust-remote-code
-
label
:
Distributed Tests (2 GPUs)
# 68min
timeout_in_minutes
:
90
...
...
@@ -1258,8 +1279,8 @@ steps:
commands
:
-
bash weight_loading/run_model_weight_loading_test.sh -c weight_loading/models-large.txt
-
label
:
NixlConnector PD accuracy tests (Distributed)
#
3
0min
timeout_in_minutes
:
3
0
-
label
:
NixlConnector PD accuracy tests (Distributed)
#
4
0min
timeout_in_minutes
:
4
0
working_dir
:
"
/vllm-workspace/tests"
num_gpus
:
4
source_file_dependencies
:
...
...
@@ -1267,7 +1288,18 @@ steps:
-
tests/v1/kv_connector/nixl_integration/
commands
:
-
uv pip install --system -r /vllm-workspace/requirements/kv_connectors.txt
-
bash v1/kv_connector/nixl_integration/tp_config_sweep_accuracy_test.sh
-
bash v1/kv_connector/nixl_integration/config_sweep_accuracy_test.sh
-
label
:
DP EP NixlConnector PD accuracy tests (Distributed)
# 15min
timeout_in_minutes
:
15
working_dir
:
"
/vllm-workspace/tests"
num_gpus
:
4
source_file_dependencies
:
-
vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py
-
tests/v1/kv_connector/nixl_integration/
commands
:
-
uv pip install --system -r /vllm-workspace/requirements/kv_connectors.txt
-
DP_EP=1 bash v1/kv_connector/nixl_integration/config_sweep_accuracy_test.sh
##### multi gpus test #####
...
...
@@ -1325,9 +1357,17 @@ steps:
-
"
VLLM_TEST_CLEAN_GPU_MEMORY=1
pytest
-v
-s
tests/compile/distributed/test_fusions_e2e.py
-k
'not
Llama-4'"
-
VLLM_TEST_CLEAN_GPU_MEMORY=1 pytest -v -s tests/distributed/test_sequence_parallel.py
-
pytest -v -s tests/distributed/test_context_parallel.py
-
CUDA_VISIBLE_DEVICES=1,2
VLLM_ALL2ALL_BACKEND=deepep_high_throughput
VLLM_USE_DEEP_GEMM=1 VLLM_LOGGING_LEVEL=DEBUG python3 examples/offline_inference/data_parallel.py --model
Qwen/Qwen1.5-MoE-A2.7B
-
-tp
-size
=1
-
-dp
-size
=2 --max-model-len
2048
-
CUDA_VISIBLE_DEVICES=1,2 VLLM_USE_DEEP_GEMM=1 VLLM_LOGGING_LEVEL=DEBUG python3 examples/offline_inference/data_parallel.py --model
=
Qwen/Qwen1.5-MoE-A2.7B -tp=1 -dp=2 --max-model-len
=
2048
--all2all-backend=deepep_high_throughput
-
pytest -v -s tests/v1/distributed/test_dbo.py
-
label
:
LM Eval Large Models (H200)
# optional
timeout_in_minutes
:
60
gpu
:
h200
optional
:
true
num_gpus
:
8
commands
:
-
pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=configs/models-h200.txt
##### B200 test #####
-
label
:
Distributed Tests (B200)
# optional
gpu
:
b200
...
...
@@ -1350,6 +1390,7 @@ steps:
-
vllm/
-
.buildkite/scripts/run-prime-rl-test.sh
commands
:
-
nvidia-smi
-
bash .buildkite/scripts/run-prime-rl-test.sh
-
label
:
DeepSeek V2-Lite Accuracy
...
...
@@ -1378,3 +1419,26 @@ steps:
working_dir
:
"
/vllm-workspace"
commands
:
-
bash .buildkite/scripts/scheduled_integration_test/qwen30b_a3b_fp8_block_ep_eplb.sh 0.8 200 8020 2
1
##### MoE Refactor (Temporary) Tests #####
-
label
:
MoE Refactor Integration Test (H100 - TEMPORARY)
# optional
gpu
:
h100
optional
:
true
num_gpus
:
2
commands
:
-
pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=evals/gsm8k/configs/moe-refactor/config-h100.txt
-
label
:
MoE Refactor Integration Test (B200 - TEMPORARY)
# optional
gpu
:
b200
optional
:
true
num_gpus
:
2
commands
:
-
pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=evals/gsm8k/configs/moe-refactor/config-b200.txt
-
label
:
MoE Refactor Integration Test (B200 DP - TEMPORARY)
# optional
gpu
:
b200
optional
:
true
num_gpus
:
2
commands
:
-
pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=evals/gsm8k/configs/moe-refactor-dp-ep/config-b200.txt
.buildkite/test_areas/distributed.yaml
View file @
7e63ef82
...
...
@@ -145,7 +145,7 @@ steps:
-
VLLM_TEST_CLEAN_GPU_MEMORY=1 pytest -v -s tests/compile/distributed/test_fusions_e2e.py -k 'not Llama-4'
-
VLLM_TEST_CLEAN_GPU_MEMORY=1 pytest -v -s tests/distributed/test_sequence_parallel.py
-
pytest -v -s tests/distributed/test_context_parallel.py
-
CUDA_VISIBLE_DEVICES=1,2
VLLM_ALL2ALL_BACKEND=deepep_high_throughput
VLLM_USE_DEEP_GEMM=1 VLLM_LOGGING_LEVEL=DEBUG python3 examples/offline_inference/data_parallel.py --model
Qwen/Qwen1.5-MoE-A2.7B
-
-tp
-size
=1
-
-dp
-size
=2 --max-model-len
2048
-
CUDA_VISIBLE_DEVICES=1,2 VLLM_USE_DEEP_GEMM=1 VLLM_LOGGING_LEVEL=DEBUG python3 examples/offline_inference/data_parallel.py --model
=
Qwen/Qwen1.5-MoE-A2.7B -tp=1 -dp=2 --max-model-len
=
2048
--all2all-backend=deepep_high_throughput
-
pytest -v -s tests/v1/distributed/test_dbo.py
-
label
:
Distributed Tests (2 GPUs)(B200)
...
...
@@ -171,7 +171,7 @@ steps:
-
tests/distributed/
-
tests/examples/offline_inference/data_parallel.py
commands
:
-
./.buildkite/scripts/run-multi-node-test.sh /vllm-workspace/tests 2 2 public.ecr.aws/q9t5s3a7/vllm-ci-postmerge-repo:0bec63fa317e1fbd62e19b0fc31c43c81bf89077 "VLLM_TEST_SAME_HOST=0 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_same_node.py | grep 'Same node test passed' && NUM_NODES=2 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_node_count.py | grep 'Node count test passed' && python3 ../examples/offline_inference/data_parallel.py
-
-dp
-size
=2
-
-tp
-size=1 --node-size
=2 --node-rank=0 --master-addr=192.168.10.10 --master-port=12345 --enforce-eager --trust-remote-code && VLLM_MULTI_NODE=1 pytest -v -s distributed/test_multi_node_assignment.py && VLLM_MULTI_NODE=1 pytest -v -s distributed/test_pipeline_parallel.py" "VLLM_TEST_SAME_HOST=0 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_same_node.py | grep 'Same node test passed' && NUM_NODES=2 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_node_count.py | grep 'Node count test passed' && python3 ../examples/offline_inference/data_parallel.py
-
-dp
-size
=2
-
-tp
-size=1 --node-size
=2 --node-rank=1 --master-addr=192.168.10.10 --master-port=12345 --enforce-eager --trust-remote-code"
-
./.buildkite/scripts/run-multi-node-test.sh /vllm-workspace/tests 2 2 public.ecr.aws/q9t5s3a7/vllm-ci-postmerge-repo:0bec63fa317e1fbd62e19b0fc31c43c81bf89077 "VLLM_TEST_SAME_HOST=0 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_same_node.py | grep 'Same node test passed' && NUM_NODES=2 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_node_count.py | grep 'Node count test passed' && python3 ../examples/offline_inference/data_parallel.py -dp=2 -tp
=1 --dp-num-nodes
=2 --
dp-
node-rank=0 --
dp-
master-addr=192.168.10.10 --
dp-
master-port=12345 --enforce-eager --trust-remote-code && VLLM_MULTI_NODE=1 pytest -v -s distributed/test_multi_node_assignment.py && VLLM_MULTI_NODE=1 pytest -v -s distributed/test_pipeline_parallel.py" "VLLM_TEST_SAME_HOST=0 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_same_node.py | grep 'Same node test passed' && NUM_NODES=2 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_node_count.py | grep 'Node count test passed' && python3 ../examples/offline_inference/data_parallel.py -dp=2 -tp
=1 --dp-num-nodes
=2 --
dp-
node-rank=1 --
dp-
master-addr=192.168.10.10 --
dp-
master-port=12345 --enforce-eager --trust-remote-code"
-
label
:
Distributed NixlConnector PD accuracy (4 GPUs)
timeout_in_minutes
:
30
...
...
@@ -182,7 +182,7 @@ steps:
-
tests/v1/kv_connector/nixl_integration/
commands
:
-
uv pip install --system -r /vllm-workspace/requirements/kv_connectors.txt
-
bash v1/kv_connector/nixl_integration/
tp_
config_sweep_accuracy_test.sh
-
bash v1/kv_connector/nixl_integration/config_sweep_accuracy_test.sh
-
label
:
Pipeline + Context Parallelism (4 GPUs))
timeout_in_minutes
:
60
...
...
.buildkite/test_areas/e2e_integration.yaml
View file @
7e63ef82
...
...
@@ -32,6 +32,7 @@ steps:
-
label
:
Prime-RL Integration (2 GPUs)
timeout_in_minutes
:
30
optional
:
true
soft_fail
:
true
num_gpus
:
2
working_dir
:
"
/vllm-workspace"
source_file_dependencies
:
...
...
@@ -39,21 +40,3 @@ steps:
-
.buildkite/scripts/run-prime-rl-test.sh
commands
:
-
bash .buildkite/scripts/run-prime-rl-test.sh
-
label
:
DeepSeek V2-Lite Async EPLB Accuracy
timeout_in_minutes
:
60
gpu
:
h100
optional
:
true
num_gpus
:
4
working_dir
:
"
/vllm-workspace"
commands
:
-
bash .buildkite/scripts/scheduled_integration_test/deepseek_v2_lite_ep_async_eplb.sh 0.25 1319
8030
-
label
:
Qwen3-Next-80B-A3B-Instruct MTP Async EPLB Accuracy
timeout_in_minutes
:
60
gpu
:
h100
optional
:
true
num_gpus
:
4
working_dir
:
"
/vllm-workspace"
commands
:
-
bash .buildkite/scripts/scheduled_integration_test/qwen3_next_mtp_async_eplb.sh 0.8 1319
8040
.buildkite/test_areas/entrypoints.yaml
View file @
7e63ef82
...
...
@@ -10,7 +10,7 @@ steps:
-
tests/entrypoints/
commands
:
-
pytest -v -s entrypoints/openai/tool_parsers
-
pytest -v -s entrypoints/ --ignore=entrypoints/llm --ignore=entrypoints/openai --ignore=entrypoints/offline_mode --ignore=entrypoints/test_chat_utils.py --ignore=entrypoints/pooling
-
pytest -v -s entrypoints/ --ignore=entrypoints/llm
--ignore=entrypoints/rpc --ignore=entrypoints/sleep --ignore=entrypoints/instrumentator
--ignore=entrypoints/openai --ignore=entrypoints/offline_mode --ignore=entrypoints/test_chat_utils.py --ignore=entrypoints/pooling
-
label
:
Entrypoints Integration (LLM)
timeout_in_minutes
:
40
...
...
@@ -25,7 +25,7 @@ steps:
-
pytest -v -s entrypoints/llm/test_generate.py
# it needs a clean process
-
pytest -v -s entrypoints/offline_mode
# Needs to avoid interference with other tests
-
label
:
Entrypoints Integration (API Server)
-
label
:
Entrypoints Integration (API Server
1
)
timeout_in_minutes
:
130
working_dir
:
"
/vllm-workspace/tests"
source_file_dependencies
:
...
...
@@ -34,10 +34,24 @@ steps:
-
tests/entrypoints/test_chat_utils
commands
:
-
export VLLM_WORKER_MULTIPROC_METHOD=spawn
-
PYTHONPATH=/vllm-workspace pytest -v -s entrypoints/openai/test_collective_rpc.py
# PYTHONPATH is needed to import custom Worker extension
-
pytest -v -s entrypoints/openai --ignore=entrypoints/openai/test_chat_with_tool_reasoning.py --ignore=entrypoints/openai/test_oot_registration.py --ignore=entrypoints/openai/test_tensorizer_entrypoint.py --ignore=entrypoints/openai/correctness/ --ignore=entrypoints/openai/test_collective_rpc.py --ignore=entrypoints/openai/tool_parsers/
-
pytest -v -s entrypoints/openai --ignore=entrypoints/openai/test_chat_with_tool_reasoning.py --ignore=entrypoints/openai/test_oot_registration.py --ignore=entrypoints/openai/test_tensorizer_entrypoint.py --ignore=entrypoints/openai/correctness/ --ignore=entrypoints/openai/tool_parsers/ --ignore=entrypoints/openai/responses
-
pytest -v -s entrypoints/test_chat_utils.py
-
label
:
Entrypoints Integration (API Server 2)
timeout_in_minutes
:
130
working_dir
:
"
/vllm-workspace/tests"
source_file_dependencies
:
-
vllm/
-
tests/tool_use
-
tests/entrypoints/sleep
-
tests/entrypoints/instrumentator
-
tests/entrypoints/rpc
commands
:
-
export VLLM_WORKER_MULTIPROC_METHOD=spawn
-
PYTHONPATH=/vllm-workspace pytest -v -s entrypoints/rpc
-
pytest -v -s entrypoints/instrumentator
-
pytest -v -s entrypoints/sleep
-
pytest -v -s tool_use
-
label
:
Entrypoints Integration (Pooling)
timeout_in_minutes
:
50
...
...
@@ -49,6 +63,14 @@ steps:
-
export VLLM_WORKER_MULTIPROC_METHOD=spawn
-
pytest -v -s entrypoints/pooling
-
label
:
Entrypoints Integration (Responses API)
timeout_in_minutes
:
50
working_dir
:
"
/vllm-workspace/tests"
source_file_dependencies
:
-
vllm/
-
tests/entrypoints/openai/responses
commands
:
-
pytest -v -s entrypoints/openai/responses
-
label
:
Entrypoints V1
timeout_in_minutes
:
50
...
...
.buildkite/test_areas/kernels.yaml
View file @
7e63ef82
...
...
@@ -90,8 +90,8 @@ steps:
-
vllm/v1/attention/backends/flashinfer.py
-
vllm/v1/attention/backends/mla/cutlass_mla.py
-
vllm/v1/attention/backends/mla/flashinfer_mla.py
-
vllm/v1/attention/selector.py
-
vllm/platforms/cuda.py
-
vllm/attention/selector.py
commands
:
-
nvidia-smi
-
python3 examples/offline_inference/basic/chat.py
...
...
.buildkite/test_areas/lm_eval.yaml
View file @
7e63ef82
...
...
@@ -9,7 +9,7 @@ steps:
-
vllm/model_executor/layers/quantization
autorun_on_main
:
true
commands
:
-
pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=configs/models-small.txt
--tp-size=1
-
pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=configs/models-small.txt
-
label
:
LM Eval Large Models (4 GPUs)(A100)
gpu
:
a100
...
...
@@ -43,4 +43,4 @@ steps:
-
csrc/
-
vllm/model_executor/layers/quantization
commands
:
-
pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=configs/models-blackwell.txt
--tp-size=1
-
pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=configs/models-blackwell.txt
.buildkite/test_areas/lora.yaml
View file @
7e63ef82
...
...
@@ -22,6 +22,8 @@ steps:
# FIXIT: find out which code initializes CUDA before running the test
# before the fix, we need to use spawn to test it
-
export VLLM_WORKER_MULTIPROC_METHOD=spawn
# A lot of these tests are on the edge of OOMing
-
export PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True
# There is some Tensor Parallelism related processing logic in LoRA that
# requires multi-GPU testing for validation.
-
pytest -v -s -x lora/test_chatglm3_tp.py
...
...
.buildkite/test_areas/models_basic.yaml
View file @
7e63ef82
...
...
@@ -9,6 +9,7 @@ steps:
source_file_dependencies
:
-
vllm/
-
tests/models/test_initialization.py
-
tests/models/registry.py
commands
:
# Run a subset of model initialization tests
-
pytest -v -s models/test_initialization.py::test_can_initialize_small_subset
...
...
@@ -20,6 +21,7 @@ steps:
source_file_dependencies
:
-
vllm/model_executor/models/
-
tests/models/test_initialization.py
-
tests/models/registry.py
commands
:
# Only when vLLM model source is modified - test initialization of a large
# subset of supported models (the complement of the small subset in the above
...
...
.buildkite/test_areas/pytorch.yaml
View file @
7e63ef82
...
...
@@ -13,7 +13,9 @@ steps:
# tests covered elsewhere.
# Use `find` to launch multiple instances of pytest so that
# they do not suffer from https://github.com/vllm-project/vllm/issues/28965
-
"
find
compile/
-maxdepth
1
-name
'test_*.py'
-exec
pytest
-s
-v
{}
\\
;"
# However, find does not normally propagate error codes, so we combine it with xargs
# (using -0 for proper path handling)
-
"
find
compile/
-maxdepth
1
-name
'test_*.py'
-print0
|
xargs
-0
-n1
-I{}
pytest
-s
-v
'{}'"
-
label
:
PyTorch Fullgraph Smoke Test
timeout_in_minutes
:
30
...
...
.buildkite/test_areas/tool_use.yaml
deleted
100644 → 0
View file @
8cbcac5d
group
:
Tool use
depends_on
:
-
image-build
steps
:
-
label
:
OpenAI-Compatible Tool Use
timeout_in_minutes
:
35
mirror_hardwares
:
[
amdexperimental
]
fast_check
:
false
source_file_dependencies
:
-
vllm/
-
tests/tool_use
commands
:
-
pytest -v -s tool_use
.github/CODEOWNERS
View file @
7e63ef82
...
...
@@ -3,7 +3,6 @@
# This list covers the "core" components of vLLM that require careful review
/vllm/attention @LucasWilkinson
/vllm/attention/backends/abstract.py @WoosukKwon @zhuohan123 @youkaichao @alexm-redhat @njhill
/vllm/executor/executor_base.py @zhuohan123 @youkaichao @alexm-redhat @njhill @22quinn
/vllm/model_executor/layers/fused_moe @mgoin @pavanimajety
/vllm/model_executor/layers/quantization @mgoin @robertgshaw2-redhat @tlrmchlsmth @yewentao256 @pavanimajety
...
...
@@ -15,6 +14,7 @@
/vllm/lora @jeejeelee
/vllm/reasoning @aarnphm @chaunceyjiang
/vllm/entrypoints @aarnphm @chaunceyjiang
/vllm/tool_parsers @aarnphm @chaunceyjiang
/vllm/compilation @zou3519 @youkaichao @ProExpertProg
/vllm/distributed/kv_transfer @NickLucche @ApostaC
CMakeLists.txt @tlrmchlsmth @LucasWilkinson
...
...
@@ -26,6 +26,7 @@ CMakeLists.txt @tlrmchlsmth @LucasWilkinson
# vLLM V1
/vllm/v1/attention @LucasWilkinson
/vllm/v1/attention/backend.py @WoosukKwon @zhuohan123 @youkaichao @alexm-redhat @njhill
/vllm/v1/attention/backends/mla @pavanimajety
/vllm/v1/attention/backends/flashinfer.py @mgoin @pavanimajety
/vllm/v1/attention/backends/triton_attn.py @tdoublep
...
...
@@ -116,15 +117,15 @@ mkdocs.yaml @hmellor
/vllm/transformers_utils/tokenizers/mistral.py @patrickvonplaten
# Kernels
/vllm/attention/ops/chunked_prefill_paged_decode.py @tdoublep
/vllm/attention/ops/triton_unified_attention.py @tdoublep
/vllm/
v1/
attention/ops/chunked_prefill_paged_decode.py @tdoublep
/vllm/
v1/
attention/ops/triton_unified_attention.py @tdoublep
# ROCm related: specify owner with write access to notify AMD folks for careful code review
/vllm/**/*rocm* @tjtanaa
/docker/Dockerfile.rocm* @gshtras @tjtanaa
/vllm/v1/attention/backends/rocm*.py @gshtras @tjtanaa
/vllm/v1/attention/backends/mla/rocm*.py @gshtras @tjtanaa
/vllm/attention/ops/rocm*.py @gshtras @tjtanaa
/vllm/
v1/
attention/ops/rocm*.py @gshtras @tjtanaa
/vllm/model_executor/layers/fused_moe/rocm*.py @gshtras @tjtanaa
/csrc/rocm @gshtras @tjtanaa
/requirements/*rocm* @tjtanaa
...
...
@@ -152,7 +153,7 @@ mkdocs.yaml @hmellor
/vllm/entrypoints/pooling @noooop
/vllm/config/pooler.py @noooop
/vllm/pooling_params.py @noooop
/vllm/model_executor/layers/pooler
.py
@noooop
/vllm/model_executor/layers/pooler @noooop
# Security guide and policies
/docs/usage/security.md @russellb
...
...
Prev
1
2
3
4
5
6
…
50
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment