Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
8d75f22e
Commit
8d75f22e
authored
Dec 13, 2025
by
zhuwenwen
Browse files
Merge tag 'v0.13.0rc1' into v0.13.0rc1-ori
parents
ce888aa4
7d80c73d
Changes
706
Hide whitespace changes
Inline
Side-by-side
Showing
20 changed files
with
618 additions
and
112 deletions
+618
-112
.buildkite/ci_config.yaml
.buildkite/ci_config.yaml
+24
-0
.buildkite/generate_index.py
.buildkite/generate_index.py
+0
-46
.buildkite/image_build/image_build.sh
.buildkite/image_build/image_build.sh
+56
-0
.buildkite/image_build/image_build.yaml
.buildkite/image_build/image_build.yaml
+57
-0
.buildkite/image_build/image_build_cpu.sh
.buildkite/image_build/image_build_cpu.sh
+36
-0
.buildkite/image_build/image_build_cpu_arm64.sh
.buildkite/image_build/image_build_cpu_arm64.sh
+33
-0
.buildkite/image_build/image_build_hpu.sh
.buildkite/image_build/image_build_hpu.sh
+34
-0
.buildkite/lm-eval-harness/configs/Meta-Llama-4-Maverick-17B-128E-Instruct-FP8.yaml
.../configs/Meta-Llama-4-Maverick-17B-128E-Instruct-FP8.yaml
+1
-0
.buildkite/lm-eval-harness/configs/models-large-rocm.txt
.buildkite/lm-eval-harness/configs/models-large-rocm.txt
+1
-0
.buildkite/lm-eval-harness/test_lm_eval_correctness.py
.buildkite/lm-eval-harness/test_lm_eval_correctness.py
+55
-20
.buildkite/scripts/generate-nightly-index.py
.buildkite/scripts/generate-nightly-index.py
+28
-8
.buildkite/scripts/hardware_ci/run-cpu-test-arm.sh
.buildkite/scripts/hardware_ci/run-cpu-test-arm.sh
+2
-1
.buildkite/scripts/hardware_ci/run-npu-test.sh
.buildkite/scripts/hardware_ci/run-npu-test.sh
+1
-0
.buildkite/scripts/hardware_ci/run-xpu-test.sh
.buildkite/scripts/hardware_ci/run-xpu-test.sh
+2
-1
.buildkite/scripts/run-prime-rl-test.sh
.buildkite/scripts/run-prime-rl-test.sh
+5
-0
.buildkite/scripts/scheduled_integration_test/deepseek_v2_lite_ep_async_eplb.sh
...eduled_integration_test/deepseek_v2_lite_ep_async_eplb.sh
+73
-0
.buildkite/scripts/scheduled_integration_test/deepseek_v2_lite_ep_eplb.sh
...ts/scheduled_integration_test/deepseek_v2_lite_ep_eplb.sh
+1
-0
.buildkite/scripts/scheduled_integration_test/qwen3_next_mtp_async_eplb.sh
...s/scheduled_integration_test/qwen3_next_mtp_async_eplb.sh
+74
-0
.buildkite/scripts/upload-wheels.sh
.buildkite/scripts/upload-wheels.sh
+4
-1
.buildkite/test-amd.yaml
.buildkite/test-amd.yaml
+131
-35
No files found.
.buildkite/ci_config.yaml
0 → 100644
View file @
8d75f22e
# CI configuration for the "vllm_ci" pipeline.
# NOTE(review): the meaning of each key below is inferred from its name only;
# confirm against the tool that consumes this file.
name: vllm_ci

# Directories that hold job/step definitions.
job_dirs:
  - ".buildkite/test_areas"
  - ".buildkite/image_build"

# presumably: a change touching any of these paths runs every job -- TODO confirm
run_all_patterns:
  - "docker/Dockerfile"
  - "CMakeLists.txt"
  - "requirements/common.txt"
  - "requirements/cuda.txt"
  - "requirements/build.txt"
  - "requirements/test.txt"
  - "setup.py"
  - "csrc/"
  - "cmake/"

# presumably: paths carved out of run_all_patterns (e.g. the platform-specific
# "docker/Dockerfile.<suffix>" files and CPU/ROCm sources) -- TODO confirm
run_all_exclude_patterns:
  - "docker/Dockerfile."
  - "csrc/cpu/"
  - "csrc/rocm/"
  - "cmake/hipify.py"
  - "cmake/cpu_extension.cmake"

# Container registry and the repository names used per branch kind.
registries: public.ecr.aws/q9t5s3a7
repositories:
  main: "vllm-ci-postmerge-repo"
  premerge: "vllm-ci-test-repo"
.buildkite/generate_index.py
deleted
100644 → 0
View file @
ce888aa4
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""Write an ``index.html`` that links the x86_64 and aarch64 builds of a wheel.

Given the path of one wheel (either architecture), the sibling wheel's file
name is derived by swapping the architecture and manylinux tags, and both are
linked from ``index.html`` in the current working directory.
"""

import argparse
import os

template = """<!DOCTYPE html>
<html>
<body>
<h1>Links for vLLM</h1>
<a href="../{x86_wheel_html_escaped}">{x86_wheel}</a><br/>
<a href="../{arm_wheel_html_escaped}">{arm_wheel}</a><br/>
</body>
</html>
"""


def resolve_wheel_pair(filename: str) -> tuple[str, str]:
    """Return ``(x86_wheel, arm_wheel)`` file names derived from *filename*.

    Raises:
        ValueError: if *filename* names neither an x86_64 nor an aarch64 wheel.
    """
    # sync the abi tag with .buildkite/scripts/upload-wheels.sh
    if "x86_64" in filename:
        arm_wheel = filename.replace("x86_64", "aarch64").replace(
            "manylinux1", "manylinux2014"
        )
        return filename, arm_wheel
    if "aarch64" in filename:
        x86_wheel = filename.replace("aarch64", "x86_64").replace(
            "manylinux2014", "manylinux1"
        )
        return x86_wheel, filename
    raise ValueError(f"Unsupported wheel: {filename}")


def render_index(x86_wheel: str, arm_wheel: str) -> str:
    """Return the HTML index linking both wheels."""
    # cloudfront requires escaping the '+' character
    return template.format(
        x86_wheel=x86_wheel,
        x86_wheel_html_escaped=x86_wheel.replace("+", "%2B"),
        arm_wheel=arm_wheel,
        arm_wheel_html_escaped=arm_wheel.replace("+", "%2B"),
    )


def main() -> None:
    """Parse ``--wheel`` and write ``index.html`` in the working directory."""
    parser = argparse.ArgumentParser()
    parser.add_argument("--wheel", help="The wheel path.", required=True)
    args = parser.parse_args()

    filename = os.path.basename(args.wheel)
    # Validate the wheel name *before* opening the output file, so that an
    # unsupported wheel does not leave a truncated, empty index.html behind.
    x86_wheel, arm_wheel = resolve_wheel_pair(filename)

    with open("index.html", "w", encoding="utf-8") as f:
        f.write(render_index(x86_wheel, arm_wheel))
    # Report success only once the file has actually been written.
    print(f"Generated index.html for {args.wheel}")


if __name__ == "__main__":
    main()
.buildkite/image_build/image_build.sh
0 → 100755
View file @
8d75f22e
#!/bin/bash
# Build the CI test image with `docker buildx` and push it to the registry.
#
# Usage:
#   image_build.sh <registry> <repo> <commit> <branch> <vllm_use_precompiled> \
#                  <vllm_merge_base_commit> <cache_from> <cache_to>
#
# The build is skipped when <registry>/<repo>:<commit> already has a manifest.
# When <branch> is "main" the image is additionally tagged ":latest".
set -e

if [[ $# -lt 8 ]]; then
    echo "Usage: $0 <registry> <repo> <commit> <branch> <vllm_use_precompiled> <vllm_merge_base_commit> <cache_from> <cache_to>" >&2
    exit 1
fi

REGISTRY=$1
REPO=$2
BUILDKITE_COMMIT=$3
BRANCH=$4
VLLM_USE_PRECOMPILED=$5
VLLM_MERGE_BASE_COMMIT=$6
CACHE_FROM=$7
CACHE_TO=$8

IMAGE="${REGISTRY}/${REPO}:${BUILDKITE_COMMIT}"

# authenticate with AWS ECR
aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin "${REGISTRY}"
aws ecr get-login-password --region us-east-1 | docker login --username AWS --password-stdin 936637512419.dkr.ecr.us-east-1.amazonaws.com

# docker buildx
docker buildx create --name vllm-builder --driver docker-container --use
docker buildx inspect --bootstrap
docker buildx ls

# skip build if image already exists
# (a failing `docker manifest inspect` prints nothing on stdout, which is what
# marks the image as missing here)
if [[ -z $(docker manifest inspect "${IMAGE}") ]]; then
    echo "Image not found, proceeding with build..."
else
    echo "Image found"
    exit 0
fi

# Optional arguments are collected in arrays so that every value is passed as
# exactly one word, without relying on unquoted word splitting.
merge_base_commit_build_args=()
if [[ "${VLLM_USE_PRECOMPILED:-0}" == "1" ]]; then
    merge_base_commit_build_args+=(--build-arg "VLLM_MERGE_BASE_COMMIT=${VLLM_MERGE_BASE_COMMIT}")
fi

tag_args=(--tag "${IMAGE}")
if [[ "${BRANCH}" == "main" ]]; then
    tag_args+=(--tag "${REGISTRY}/${REPO}:latest")
fi

# build
docker buildx build --file docker/Dockerfile \
    --build-arg max_jobs=16 \
    --build-arg buildkite_commit="${BUILDKITE_COMMIT}" \
    --build-arg USE_SCCACHE=1 \
    --build-arg TORCH_CUDA_ARCH_LIST="8.0 8.9 9.0 10.0" \
    --build-arg FI_TORCH_CUDA_ARCH_LIST="8.0 8.9 9.0a 10.0a" \
    --build-arg VLLM_USE_PRECOMPILED="${VLLM_USE_PRECOMPILED:-0}" \
    "${merge_base_commit_build_args[@]}" \
    --cache-from "type=registry,ref=${CACHE_FROM},mode=max" \
    --cache-to "type=registry,ref=${CACHE_TO},mode=max" \
    "${tag_args[@]}" \
    --push \
    --target test \
    --progress plain .
.buildkite/image_build/image_build.yaml
0 → 100644
View file @
8d75f22e
# Buildkite step group that builds the CI container images.
# Each step runs one script from .buildkite/image_build/ and is retried
# automatically (up to 2 times) on exit status -1 or -10.
group: Abuild
steps:
  # Main image; the only step that also receives branch, precompiled-wheel and
  # build-cache arguments.
  - label: ":docker: Build image"
    key: image-build
    depends_on: []
    commands:
      - .buildkite/image_build/image_build.sh $REGISTRY $REPO $BUILDKITE_COMMIT $BRANCH $VLLM_USE_PRECOMPILED $VLLM_MERGE_BASE_COMMIT $CACHE_FROM $CACHE_TO
    retry:
      automatic:
        - exit_status: -1  # Agent was lost
          limit: 2
        - exit_status: -10  # Agent was lost
          limit: 2

  - label: ":docker: Build CPU image"
    key: image-build-cpu
    depends_on: []
    commands:
      - .buildkite/image_build/image_build_cpu.sh $REGISTRY $REPO $BUILDKITE_COMMIT
    env:
      DOCKER_BUILDKIT: "1"
    retry:
      automatic:
        - exit_status: -1  # Agent was lost
          limit: 2
        - exit_status: -10  # Agent was lost
          limit: 2

  # soft_fail: a failure of this step is not meant to fail the build.
  - label: ":docker: Build HPU image"
    soft_fail: true
    depends_on: []
    key: image-build-hpu
    commands:
      - .buildkite/image_build/image_build_hpu.sh $REGISTRY $REPO $BUILDKITE_COMMIT
    env:
      DOCKER_BUILDKIT: "1"
    retry:
      automatic:
        - exit_status: -1  # Agent was lost
          limit: 2
        - exit_status: -10  # Agent was lost
          limit: 2

  # NOTE(review): `optional` is not explained in this file; presumably the step
  # only runs when explicitly requested -- confirm with the pipeline generator.
  - label: ":docker: Build CPU arm64 image"
    key: cpu-arm64-image-build
    depends_on: []
    optional: true
    commands:
      - .buildkite/image_build/image_build_cpu_arm64.sh $REGISTRY $REPO $BUILDKITE_COMMIT
    env:
      DOCKER_BUILDKIT: "1"
    retry:
      automatic:
        - exit_status: -1  # Agent was lost
          limit: 2
        - exit_status: -10  # Agent was lost
          limit: 2
.buildkite/image_build/image_build_cpu.sh
0 → 100755
View file @
8d75f22e
#!/bin/bash
# Build the CPU CI image from docker/Dockerfile.cpu (AVX512 BF16/VNNI and
# AMX BF16 build flags enabled) and push it as <registry>/<repo>:<commit>-cpu.
#
# Usage: image_build_cpu.sh <registry> <repo> <commit>
#
# The build is skipped when that tag already has a manifest in the registry.
set -e

if [[ $# -lt 3 ]]; then
    echo "Usage: $0 <registry> <repo> <commit>" >&2
    exit 1
fi

REGISTRY=$1
REPO=$2
BUILDKITE_COMMIT=$3

# Single definition of the image reference used for inspect, build and push.
IMAGE="${REGISTRY}/${REPO}:${BUILDKITE_COMMIT}-cpu"

# authenticate with AWS ECR
aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin "${REGISTRY}"

# skip build if image already exists
if [[ -z $(docker manifest inspect "${IMAGE}") ]]; then
    echo "Image not found, proceeding with build..."
else
    echo "Image found"
    exit 0
fi

# build
docker build --file docker/Dockerfile.cpu \
    --build-arg max_jobs=16 \
    --build-arg buildkite_commit="${BUILDKITE_COMMIT}" \
    --build-arg VLLM_CPU_AVX512BF16=true \
    --build-arg VLLM_CPU_AVX512VNNI=true \
    --build-arg VLLM_CPU_AMXBF16=true \
    --tag "${IMAGE}" \
    --target vllm-test \
    --progress plain .

# push
docker push "${IMAGE}"
.buildkite/image_build/image_build_cpu_arm64.sh
0 → 100755
View file @
8d75f22e
#!/bin/bash
# Build the CPU CI image from docker/Dockerfile.cpu (no extra ISA build flags)
# and push it as <registry>/<repo>:<commit>-cpu.
#
# Usage: image_build_cpu_arm64.sh <registry> <repo> <commit>
#
# The build is skipped when that tag already has a manifest in the registry.
set -e

if [[ $# -lt 3 ]]; then
    echo "Usage: $0 <registry> <repo> <commit>" >&2
    exit 1
fi

REGISTRY=$1
REPO=$2
BUILDKITE_COMMIT=$3

# NOTE(review): this is the same "-cpu" tag that image_build_cpu.sh uses. If
# both scripts run for one commit against the same repo, whichever runs second
# will see "Image found" and skip. Confirm whether a distinct arm64 tag is
# intended; the tag is left unchanged here because its consumers are not
# visible from this script.
IMAGE="${REGISTRY}/${REPO}:${BUILDKITE_COMMIT}-cpu"

# authenticate with AWS ECR
aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin "${REGISTRY}"

# skip build if image already exists
if [[ -z $(docker manifest inspect "${IMAGE}") ]]; then
    echo "Image not found, proceeding with build..."
else
    echo "Image found"
    exit 0
fi

# build
docker build --file docker/Dockerfile.cpu \
    --build-arg max_jobs=16 \
    --build-arg buildkite_commit="${BUILDKITE_COMMIT}" \
    --tag "${IMAGE}" \
    --target vllm-test \
    --progress plain .

# push
docker push "${IMAGE}"
.buildkite/image_build/image_build_hpu.sh
0 → 100755
View file @
8d75f22e
#!/bin/bash
# Build the HPU CI image and push it as <registry>/<repo>:<commit>-hpu.
#
# Usage: image_build_hpu.sh <registry> <repo> <commit>
#
# The build context is the remote vllm-gaudi git repository, so --file is
# resolved inside that repository rather than in the local checkout.
# The build is skipped when the tag already has a manifest in the registry.
set -e

if [[ $# -lt 3 ]]; then
    echo "Usage: $0 <registry> <repo> <commit>" >&2
    exit 1
fi

REGISTRY=$1
REPO=$2
BUILDKITE_COMMIT=$3

# Single definition of the image reference used for inspect, build and push.
IMAGE="${REGISTRY}/${REPO}:${BUILDKITE_COMMIT}-hpu"

# authenticate with AWS ECR
aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin "${REGISTRY}"

# skip build if image already exists
if [[ -z $(docker manifest inspect "${IMAGE}") ]]; then
    echo "Image not found, proceeding with build..."
else
    echo "Image found"
    exit 0
fi

# build
docker build \
    --file tests/pytorch_ci_hud_benchmark/Dockerfile.hpu \
    --build-arg max_jobs=16 \
    --build-arg buildkite_commit="${BUILDKITE_COMMIT}" \
    --tag "${IMAGE}" \
    --progress plain \
    https://github.com/vllm-project/vllm-gaudi.git

# push
docker push "${IMAGE}"
.buildkite/lm-eval-harness/configs/Meta-Llama-4-Maverick-17B-128E-Instruct-FP8.yaml
View file @
8d75f22e
...
@@ -8,3 +8,4 @@ tasks:
...
@@ -8,3 +8,4 @@ tasks:
value
:
0.80
value
:
0.80
limit
:
250
# will run on 250 * 14 subjects = 3500 samples
limit
:
250
# will run on 250 * 14 subjects = 3500 samples
num_fewshot
:
5
num_fewshot
:
5
rtol
:
0.05
.buildkite/lm-eval-harness/configs/models-large-rocm.txt
0 → 100644
View file @
8d75f22e
Meta-Llama-4-Maverick-17B-128E-Instruct-FP8.yaml
.buildkite/lm-eval-harness/test_lm_eval_correctness.py
View file @
8d75f22e
...
@@ -9,11 +9,40 @@ pytest -s -v test_lm_eval_correctness.py \
...
@@ -9,11 +9,40 @@ pytest -s -v test_lm_eval_correctness.py \
--tp-size=1
--tp-size=1
"""
"""
import
os
from
contextlib
import
contextmanager
import
lm_eval
import
lm_eval
import
numpy
as
np
import
numpy
as
np
import
yaml
import
yaml
RTOL
=
0.08
DEFAULT_RTOL
=
0.08
@
contextmanager
def
scoped_env_vars
(
new_env
:
dict
[
str
,
str
]):
if
not
new_env
:
# Fast path: nothing to do
yield
return
old_values
=
{}
new_keys
=
[]
try
:
for
key
,
value
in
new_env
.
items
():
if
key
in
os
.
environ
:
old_values
[
key
]
=
os
.
environ
[
key
]
else
:
new_keys
.
append
(
key
)
os
.
environ
[
key
]
=
str
(
value
)
yield
finally
:
# Restore / clean up
for
key
,
value
in
old_values
.
items
():
os
.
environ
[
key
]
=
value
for
key
in
new_keys
:
os
.
environ
.
pop
(
key
,
None
)
def
launch_lm_eval
(
eval_config
,
tp_size
):
def
launch_lm_eval
(
eval_config
,
tp_size
):
...
@@ -32,23 +61,26 @@ def launch_lm_eval(eval_config, tp_size):
...
@@ -32,23 +61,26 @@ def launch_lm_eval(eval_config, tp_size):
f
"trust_remote_code=
{
trust_remote_code
}
,"
f
"trust_remote_code=
{
trust_remote_code
}
,"
f
"max_model_len=
{
max_model_len
}
,"
f
"max_model_len=
{
max_model_len
}
,"
)
)
results
=
lm_eval
.
simple_evaluate
(
model
=
backend
,
env_vars
=
eval_config
.
get
(
"env_vars"
,
None
)
model_args
=
model_args
,
with
scoped_env_vars
(
env_vars
):
tasks
=
[
task
[
"name"
]
for
task
in
eval_config
[
"tasks"
]],
results
=
lm_eval
.
simple_evaluate
(
num_fewshot
=
eval_config
[
"num_fewshot"
],
model
=
backend
,
limit
=
eval_config
[
"limit"
],
model_args
=
model_args
,
# TODO(yeq): using chat template w/ fewshot_as_multiturn is supposed to help
tasks
=
[
task
[
"name"
]
for
task
in
eval_config
[
"tasks"
]],
# text models. however, this is regressing measured strict-match for
num_fewshot
=
eval_config
[
"num_fewshot"
],
# existing text models in CI, so only apply it for mm, or explicitly set
limit
=
eval_config
[
"limit"
],
apply_chat_template
=
eval_config
.
get
(
# TODO(yeq): using chat template w/ fewshot_as_multiturn is supposed to help
"apply_chat_template"
,
backend
==
"vllm-vlm"
# text models. however, this is regressing measured strict-match for
),
# existing text models in CI, so only apply it for mm, or explicitly set
fewshot_as_multiturn
=
eval_config
.
get
(
"fewshot_as_multiturn"
,
False
),
apply_chat_template
=
eval_config
.
get
(
# Forward decoding and early-stop controls (e.g., max_gen_toks, until=...)
"apply_chat_template"
,
backend
==
"vllm-vlm"
gen_kwargs
=
eval_config
.
get
(
"gen_kwargs"
),
),
batch_size
=
batch_size
,
fewshot_as_multiturn
=
eval_config
.
get
(
"fewshot_as_multiturn"
,
False
),
)
# Forward decoding and early-stop controls (e.g., max_gen_toks, until=...)
gen_kwargs
=
eval_config
.
get
(
"gen_kwargs"
),
batch_size
=
batch_size
,
)
return
results
return
results
...
@@ -57,6 +89,8 @@ def test_lm_eval_correctness_param(config_filename, tp_size):
...
@@ -57,6 +89,8 @@ def test_lm_eval_correctness_param(config_filename, tp_size):
results
=
launch_lm_eval
(
eval_config
,
tp_size
)
results
=
launch_lm_eval
(
eval_config
,
tp_size
)
rtol
=
eval_config
.
get
(
"rtol"
,
DEFAULT_RTOL
)
success
=
True
success
=
True
for
task
in
eval_config
[
"tasks"
]:
for
task
in
eval_config
[
"tasks"
]:
for
metric
in
task
[
"metrics"
]:
for
metric
in
task
[
"metrics"
]:
...
@@ -64,8 +98,9 @@ def test_lm_eval_correctness_param(config_filename, tp_size):
...
@@ -64,8 +98,9 @@ def test_lm_eval_correctness_param(config_filename, tp_size):
measured_value
=
results
[
"results"
][
task
[
"name"
]][
metric
[
"name"
]]
measured_value
=
results
[
"results"
][
task
[
"name"
]][
metric
[
"name"
]]
print
(
print
(
f
"
{
task
[
'name'
]
}
|
{
metric
[
'name'
]
}
: "
f
"
{
task
[
'name'
]
}
|
{
metric
[
'name'
]
}
: "
f
"ground_truth=
{
ground_truth
}
| measured=
{
measured_value
}
"
f
"ground_truth=
{
ground_truth
:.
3
f
}
| "
f
"measured=
{
measured_value
:.
3
f
}
| rtol=
{
rtol
}
"
)
)
success
=
success
and
np
.
isclose
(
ground_truth
,
measured_value
,
rtol
=
RTOL
)
success
=
success
and
np
.
isclose
(
ground_truth
,
measured_value
,
rtol
=
rtol
)
assert
success
assert
success
.buildkite/scripts/generate-nightly-index.py
View file @
8d75f22e
...
@@ -7,18 +7,21 @@
...
@@ -7,18 +7,21 @@
import
argparse
import
argparse
import
json
import
json
import
re
import
sys
import
sys
from
dataclasses
import
asdict
,
dataclass
from
dataclasses
import
asdict
,
dataclass
from
datetime
import
datetime
from
pathlib
import
Path
from
pathlib
import
Path
from
typing
import
Any
from
typing
import
Any
from
urllib.parse
import
quote
from
urllib.parse
import
quote
import
regex
as
re
if
not
sys
.
version_info
>=
(
3
,
12
):
if
not
sys
.
version_info
>=
(
3
,
12
):
raise
RuntimeError
(
"This script requires Python 3.12 or higher."
)
raise
RuntimeError
(
"This script requires Python 3.12 or higher."
)
INDEX_HTML_TEMPLATE
=
"""<!DOCTYPE html>
INDEX_HTML_TEMPLATE
=
"""<!DOCTYPE html>
<html>
<html>
<!-- {comment} -->
<meta name="pypi:repository-version" content="1.0">
<meta name="pypi:repository-version" content="1.0">
<body>
<body>
{items}
{items}
...
@@ -89,7 +92,7 @@ def parse_from_filename(file: str) -> WheelFileInfo:
...
@@ -89,7 +92,7 @@ def parse_from_filename(file: str) -> WheelFileInfo:
)
)
def
generate_project_list
(
subdir_names
:
list
[
str
])
->
str
:
def
generate_project_list
(
subdir_names
:
list
[
str
]
,
comment
:
str
=
""
)
->
str
:
"""
"""
Generate project list HTML content linking to each project & variant sub-directory.
Generate project list HTML content linking to each project & variant sub-directory.
"""
"""
...
@@ -97,11 +100,14 @@ def generate_project_list(subdir_names: list[str]) -> str:
...
@@ -97,11 +100,14 @@ def generate_project_list(subdir_names: list[str]) -> str:
for
name
in
sorted
(
subdir_names
):
for
name
in
sorted
(
subdir_names
):
name
=
name
.
strip
(
"/"
).
strip
(
"."
)
name
=
name
.
strip
(
"/"
).
strip
(
"."
)
href_tags
.
append
(
f
' <a href="
{
name
}
/">
{
name
}
/</a><br/>'
)
href_tags
.
append
(
f
' <a href="
{
name
}
/">
{
name
}
/</a><br/>'
)
return
INDEX_HTML_TEMPLATE
.
format
(
items
=
"
\n
"
.
join
(
href_tags
))
return
INDEX_HTML_TEMPLATE
.
format
(
items
=
"
\n
"
.
join
(
href_tags
)
,
comment
=
comment
)
def
generate_package_index_and_metadata
(
def
generate_package_index_and_metadata
(
wheel_files
:
list
[
WheelFileInfo
],
wheel_base_dir
:
Path
,
index_base_dir
:
Path
wheel_files
:
list
[
WheelFileInfo
],
wheel_base_dir
:
Path
,
index_base_dir
:
Path
,
comment
:
str
=
""
,
)
->
tuple
[
str
,
str
]:
)
->
tuple
[
str
,
str
]:
"""
"""
Generate package index HTML content for a specific package, linking to actual wheel files.
Generate package index HTML content for a specific package, linking to actual wheel files.
...
@@ -119,7 +125,7 @@ def generate_package_index_and_metadata(
...
@@ -119,7 +125,7 @@ def generate_package_index_and_metadata(
file_meta
=
asdict
(
file
)
file_meta
=
asdict
(
file
)
file_meta
[
"path"
]
=
file_path_quoted
file_meta
[
"path"
]
=
file_path_quoted
metadata
.
append
(
file_meta
)
metadata
.
append
(
file_meta
)
index_str
=
INDEX_HTML_TEMPLATE
.
format
(
items
=
"
\n
"
.
join
(
href_tags
))
index_str
=
INDEX_HTML_TEMPLATE
.
format
(
items
=
"
\n
"
.
join
(
href_tags
)
,
comment
=
comment
)
metadata_str
=
json
.
dumps
(
metadata
,
indent
=
2
)
metadata_str
=
json
.
dumps
(
metadata
,
indent
=
2
)
return
index_str
,
metadata_str
return
index_str
,
metadata_str
...
@@ -130,6 +136,7 @@ def generate_index_and_metadata(
...
@@ -130,6 +136,7 @@ def generate_index_and_metadata(
index_base_dir
:
Path
,
index_base_dir
:
Path
,
default_variant
:
str
|
None
=
None
,
default_variant
:
str
|
None
=
None
,
alias_to_default
:
str
|
None
=
None
,
alias_to_default
:
str
|
None
=
None
,
comment
:
str
=
""
,
):
):
"""
"""
Generate index for all wheel files.
Generate index for all wheel files.
...
@@ -140,6 +147,7 @@ def generate_index_and_metadata(
...
@@ -140,6 +147,7 @@ def generate_index_and_metadata(
index_base_dir (Path): Base directory to store index files.
index_base_dir (Path): Base directory to store index files.
default_variant (str | None): The default variant name, if any.
default_variant (str | None): The default variant name, if any.
alias_to_default (str | None): Alias variant name for the default variant, if any.
alias_to_default (str | None): Alias variant name for the default variant, if any.
comment (str): Optional comment to include in the generated HTML files (empty string for none).
First, parse all wheel files to extract metadata.
First, parse all wheel files to extract metadata.
We need to collect all wheel files for each variant, and generate an index for it (in a sub-directory).
We need to collect all wheel files for each variant, and generate an index for it (in a sub-directory).
...
@@ -233,6 +241,10 @@ def generate_index_and_metadata(
...
@@ -233,6 +241,10 @@ def generate_index_and_metadata(
variant_to_files
[
alias_to_default
]
=
variant_to_files
[
"default"
].
copy
()
variant_to_files
[
alias_to_default
]
=
variant_to_files
[
"default"
].
copy
()
print
(
f
"Alias variant '
{
alias_to_default
}
' created for default variant."
)
print
(
f
"Alias variant '
{
alias_to_default
}
' created for default variant."
)
# Generate comment in HTML header
comment_str
=
f
" (
{
comment
}
)"
if
comment
else
""
comment_tmpl
=
f
"Generated on
{
datetime
.
now
().
isoformat
()
}{
comment_str
}
"
# Generate index for each variant
# Generate index for each variant
subdir_names
=
set
()
subdir_names
=
set
()
for
variant
,
files
in
variant_to_files
.
items
():
for
variant
,
files
in
variant_to_files
.
items
():
...
@@ -252,7 +264,7 @@ def generate_index_and_metadata(
...
@@ -252,7 +264,7 @@ def generate_index_and_metadata(
subdir_names
=
subdir_names
.
union
(
packages
)
subdir_names
=
subdir_names
.
union
(
packages
)
else
:
else
:
# generate project list for this variant directly
# generate project list for this variant directly
project_list_str
=
generate_project_list
(
sorted
(
packages
))
project_list_str
=
generate_project_list
(
sorted
(
packages
)
,
comment_tmpl
)
with
open
(
variant_dir
/
"index.html"
,
"w"
)
as
f
:
with
open
(
variant_dir
/
"index.html"
,
"w"
)
as
f
:
f
.
write
(
project_list_str
)
f
.
write
(
project_list_str
)
...
@@ -262,7 +274,7 @@ def generate_index_and_metadata(
...
@@ -262,7 +274,7 @@ def generate_index_and_metadata(
package_dir
=
variant_dir
/
package
package_dir
=
variant_dir
/
package
package_dir
.
mkdir
(
parents
=
True
,
exist_ok
=
True
)
package_dir
.
mkdir
(
parents
=
True
,
exist_ok
=
True
)
index_str
,
metadata_str
=
generate_package_index_and_metadata
(
index_str
,
metadata_str
=
generate_package_index_and_metadata
(
package_files
,
wheel_base_dir
,
package_dir
package_files
,
wheel_base_dir
,
package_dir
,
comment
)
)
with
open
(
package_dir
/
"index.html"
,
"w"
)
as
f
:
with
open
(
package_dir
/
"index.html"
,
"w"
)
as
f
:
f
.
write
(
index_str
)
f
.
write
(
index_str
)
...
@@ -270,7 +282,7 @@ def generate_index_and_metadata(
...
@@ -270,7 +282,7 @@ def generate_index_and_metadata(
f
.
write
(
metadata_str
)
f
.
write
(
metadata_str
)
# Generate top-level project list index
# Generate top-level project list index
project_list_str
=
generate_project_list
(
sorted
(
subdir_names
))
project_list_str
=
generate_project_list
(
sorted
(
subdir_names
)
,
comment_tmpl
)
with
open
(
index_base_dir
/
"index.html"
,
"w"
)
as
f
:
with
open
(
index_base_dir
/
"index.html"
,
"w"
)
as
f
:
f
.
write
(
project_list_str
)
f
.
write
(
project_list_str
)
...
@@ -282,6 +294,7 @@ if __name__ == "__main__":
...
@@ -282,6 +294,7 @@ if __name__ == "__main__":
--current-objects <path_to_json> : path to JSON file containing current S3 objects listing in this version directory
--current-objects <path_to_json> : path to JSON file containing current S3 objects listing in this version directory
--output-dir <output_directory> : directory to store generated index files
--output-dir <output_directory> : directory to store generated index files
--alias-to-default <alias_variant_name> : (optional) alias variant name for the default variant
--alias-to-default <alias_variant_name> : (optional) alias variant name for the default variant
--comment <comment_string> : (optional) comment string to include in generated HTML files
"""
"""
parser
=
argparse
.
ArgumentParser
(
parser
=
argparse
.
ArgumentParser
(
...
@@ -311,6 +324,12 @@ if __name__ == "__main__":
...
@@ -311,6 +324,12 @@ if __name__ == "__main__":
default
=
None
,
default
=
None
,
help
=
"Alias variant name for the default variant"
,
help
=
"Alias variant name for the default variant"
,
)
)
parser
.
add_argument
(
"--comment"
,
type
=
str
,
default
=
""
,
help
=
"Optional comment string to include in generated HTML files"
,
)
args
=
parser
.
parse_args
()
args
=
parser
.
parse_args
()
...
@@ -365,5 +384,6 @@ if __name__ == "__main__":
...
@@ -365,5 +384,6 @@ if __name__ == "__main__":
index_base_dir
=
index_base_dir
,
index_base_dir
=
index_base_dir
,
default_variant
=
None
,
default_variant
=
None
,
alias_to_default
=
args
.
alias_to_default
,
alias_to_default
=
args
.
alias_to_default
,
comment
=
args
.
comment
.
strip
(),
)
)
print
(
f
"Successfully generated index and metadata in
{
output_dir
}
"
)
print
(
f
"Successfully generated index and metadata in
{
output_dir
}
"
)
.buildkite/scripts/hardware_ci/run-cpu-test-arm.sh
View file @
8d75f22e
...
@@ -40,7 +40,8 @@ function cpu_tests() {
...
@@ -40,7 +40,8 @@ function cpu_tests() {
docker
exec
cpu-test bash
-c
"
docker
exec
cpu-test bash
-c
"
set -e
set -e
pytest -x -v -s tests/kernels/test_onednn.py
pytest -x -v -s tests/kernels/test_onednn.py
pytest -x -v -s tests/kernels/attention/test_cpu_attn.py"
pytest -x -v -s tests/kernels/attention/test_cpu_attn.py
pytest -x -v -s tests/kernels/moe/test_moe.py -k test_cpu_fused_moe_basic"
# basic online serving
# basic online serving
docker
exec
cpu-test bash
-c
'
docker
exec
cpu-test bash
-c
'
...
...
.buildkite/scripts/hardware_ci/run-npu-test.sh
View file @
8d75f22e
...
@@ -74,6 +74,7 @@ FROM ${BASE_IMAGE_NAME}
...
@@ -74,6 +74,7 @@ FROM ${BASE_IMAGE_NAME}
# Define environments
# Define environments
ENV DEBIAN_FRONTEND=noninteractive
ENV DEBIAN_FRONTEND=noninteractive
ENV SOC_VERSION="ascend910b1"
RUN pip config set global.index-url http://cache-service-vllm.nginx-pypi-cache.svc.cluster.local:
${
PYPI_CACHE_PORT
}
/pypi/simple &&
\
RUN pip config set global.index-url http://cache-service-vllm.nginx-pypi-cache.svc.cluster.local:
${
PYPI_CACHE_PORT
}
/pypi/simple &&
\
pip config set global.trusted-host cache-service-vllm.nginx-pypi-cache.svc.cluster.local &&
\
pip config set global.trusted-host cache-service-vllm.nginx-pypi-cache.svc.cluster.local &&
\
...
...
.buildkite/scripts/hardware_ci/run-xpu-test.sh
View file @
8d75f22e
...
@@ -38,6 +38,7 @@ docker run \
...
@@ -38,6 +38,7 @@ docker run \
python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 -O3 -cc.cudagraph_mode=NONE
python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 -O3 -cc.cudagraph_mode=NONE
python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager -tp 2 --distributed-executor-backend ray
python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager -tp 2 --distributed-executor-backend ray
python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager -tp 2 --distributed-executor-backend mp
python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager -tp 2 --distributed-executor-backend mp
python3 examples/offline_inference/basic/generate.py --model Intel/Qwen2.5-0.5B-W4A16-G128-AutoRound-LLMC-TEST-ONLY --enforce-eager
VLLM_ATTENTION_BACKEND=TRITON_ATTN python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager
VLLM_ATTENTION_BACKEND=TRITON_ATTN python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager
cd tests
cd tests
pytest -v -s v1/core
pytest -v -s v1/core
...
@@ -46,6 +47,6 @@ docker run \
...
@@ -46,6 +47,6 @@ docker run \
pytest -v -s v1/worker --ignore=v1/worker/test_gpu_model_runner.py
pytest -v -s v1/worker --ignore=v1/worker/test_gpu_model_runner.py
pytest -v -s v1/structured_output
pytest -v -s v1/structured_output
pytest -v -s v1/spec_decode --ignore=v1/spec_decode/test_max_len.py --ignore=v1/spec_decode/test_tree_attention.py --ignore=v1/spec_decode/test_speculators_eagle3.py
pytest -v -s v1/spec_decode --ignore=v1/spec_decode/test_max_len.py --ignore=v1/spec_decode/test_tree_attention.py --ignore=v1/spec_decode/test_speculators_eagle3.py
pytest -v -s v1/kv_connector/unit --ignore=v1/kv_connector/unit/test_multi_connector.py --ignore=v1/kv_connector/unit/test_nixl_connector.py --ignore=v1/kv_connector/unit/test_
shared_storag
e_connector.py --ignore=v1/kv_connector/unit/test_lmcache_integration.py
pytest -v -s v1/kv_connector/unit --ignore=v1/kv_connector/unit/test_multi_connector.py --ignore=v1/kv_connector/unit/test_nixl_connector.py --ignore=v1/kv_connector/unit/test_
exampl
e_connector.py --ignore=v1/kv_connector/unit/test_lmcache_integration.py
pytest -v -s v1/test_serial_utils.py
pytest -v -s v1/test_serial_utils.py
'
'
.buildkite/scripts/run-prime-rl-test.sh
View file @
8d75f22e
...
@@ -12,6 +12,11 @@ REPO_ROOT="$(cd "${SCRIPT_DIR}/../.." && pwd)"
...
@@ -12,6 +12,11 @@ REPO_ROOT="$(cd "${SCRIPT_DIR}/../.." && pwd)"
PRIME_RL_REPO
=
"https://github.com/PrimeIntellect-ai/prime-rl.git"
PRIME_RL_REPO
=
"https://github.com/PrimeIntellect-ai/prime-rl.git"
PRIME_RL_DIR
=
"
${
REPO_ROOT
}
/prime-rl"
PRIME_RL_DIR
=
"
${
REPO_ROOT
}
/prime-rl"
if
command
-v
rocm-smi &> /dev/null
||
command
-v
rocminfo &> /dev/null
;
then
echo
"AMD GPU detected. Prime-RL currently only supports NVIDIA. Skipping..."
exit
0
fi
echo
"Setting up Prime-RL integration test environment..."
echo
"Setting up Prime-RL integration test environment..."
# Clean up any existing Prime-RL directory
# Clean up any existing Prime-RL directory
...
...
.buildkite/scripts/scheduled_integration_test/deepseek_v2_lite_ep_async_eplb.sh
0 → 100644
View file @
8d75f22e
#!/usr/bin/env bash
# For each all2all backend, start `vllm serve` for DeepSeek-V2-lite with
# expert parallelism and async EPLB, run the GSM8K eval against it, and fail
# if the reported accuracy is below THRESHOLD.
set -euxo pipefail

# args: [THRESHOLD] [NUM_QUESTIONS] [START_PORT]
THRESHOLD=${1:-0.25}
NUM_Q=${2:-1319}
PORT=${3:-8030}
OUT_DIR=${OUT_DIR:-/tmp/vllm-scheduled}
mkdir -p "${OUT_DIR}"

# Poll http://127.0.0.1:<port>/health once per second, for at most 600 seconds.
# The single-quoted script is closed and reopened around "$port" so that the
# port value is spliced into the text passed to the inner bash.
wait_for_server() {
  local port=$1
  timeout 600 bash -c '
    until curl -sf "http://127.0.0.1:'"$port"'/health" > /dev/null; do
      sleep 1
    done'
}

MODEL="deepseek-ai/DeepSeek-V2-lite"

# Set BACKENDS based on platform
if command -v rocm-smi &> /dev/null || [[ -d /opt/rocm ]] || [[ -n "${ROCM_PATH:-}" ]]; then
  # ROCm platform
  BACKENDS=("allgather_reducescatter")
  # Disable MOE padding for ROCm since it is causing eplb to fail
  export VLLM_ROCM_MOE_PADDING=0
else
  # Non-ROCm platform (CUDA/other)
  BACKENDS=("deepep_high_throughput" "deepep_low_latency")
fi

# Stop the server recorded in SERVER_PID, if it is still running: send the
# default signal, poll up to 20 times at 0.5 s intervals for it to exit, then
# send SIGKILL. Every kill is tolerant of the process already being gone.
cleanup() {
  if [[ -n "${SERVER_PID:-}" ]] && kill -0 "${SERVER_PID}" 2>/dev/null; then
    kill "${SERVER_PID}" 2>/dev/null || true
    for _ in {1..20}; do
      kill -0 "${SERVER_PID}" 2>/dev/null || break
      sleep 0.5
    done
    kill -9 "${SERVER_PID}" 2>/dev/null || true
  fi
}
# Also run cleanup when the script exits for any reason (including set -e).
trap cleanup EXIT

for BACK in "${BACKENDS[@]}"; do
  # Launch the server in the background; the two env vars apply to this
  # command only.
  VLLM_DEEP_GEMM_WARMUP=skip \
  VLLM_ALL2ALL_BACKEND=$BACK \
  vllm serve "$MODEL" \
    --enforce-eager \
    --tensor-parallel-size 2 \
    --data-parallel-size 2 \
    --enable-expert-parallel \
    --enable-eplb \
    --eplb-config '{"window_size":200,"step_interval":600,"use_async":true}' \
    --trust-remote-code \
    --max-model-len 2048 \
    --port $PORT &
  SERVER_PID=$!
  wait_for_server $PORT

  # Derive a file-name-safe tag from the model name.
  # NOTE(review): inside single quotes, tr reads '\\n' as a backslash followed
  # by the letter "n", so a literal "n" in MODEL would also become "_".
  # Harmless for this model name; confirm whether '\n' was intended.
  TAG=$(echo "$MODEL" | tr '/: \\n' '_____')
  OUT="${OUT_DIR}/${TAG}_${BACK}_async_eplb.json"
  python3 tests/evals/gsm8k/gsm8k_eval.py --host http://127.0.0.1 --port $PORT --num-questions ${NUM_Q} --save-results ${OUT}
  # The heredoc delimiter is unquoted, so the shell substitutes ${OUT},
  # ${MODEL}, ${BACK} and ${THRESHOLD} before Python runs; {acc...} has no "$"
  # and is left for the Python f-string.
  python3 - <<PY
import json; acc=json.load(open('${OUT}'))['accuracy']
print(f"${MODEL} ${BACK}: accuracy {acc:.3f}")
assert acc >= ${THRESHOLD}, f"${MODEL} ${BACK} accuracy {acc}"
PY

  cleanup
  # Clear the PID so the EXIT trap does not act on a stale process id.
  SERVER_PID=
  sleep 1
  # Use a fresh port for the next backend.
  PORT=$((PORT+1))
done
.buildkite/scripts/scheduled_integration_test/deepseek_v2_lite_ep_eplb.sh
View file @
8d75f22e
...
@@ -50,6 +50,7 @@ for BACK in "${BACKENDS[@]}"; do
...
@@ -50,6 +50,7 @@ for BACK in "${BACKENDS[@]}"; do
--data-parallel-size
2
\
--data-parallel-size
2
\
--enable-expert-parallel
\
--enable-expert-parallel
\
--enable-eplb
\
--enable-eplb
\
--eplb-config
'{"window_size":200,"step_interval":600}'
\
--trust-remote-code
\
--trust-remote-code
\
--max-model-len
2048
\
--max-model-len
2048
\
--port
$PORT
&
--port
$PORT
&
...
...
.buildkite/scripts/scheduled_integration_test/qwen3_next_mtp_async_eplb.sh
0 → 100644
View file @
8d75f22e
#!/usr/bin/env bash
set -o errexit -o nounset -o xtrace -o pipefail

# Positional arguments (all optional):
#   $1  THRESHOLD  accuracy the eval must reach        (default 0.25)
#   $2  NUM_Q      number of questions to evaluate     (default 1319)
#   $3  PORT       port used for the first server      (default 8040)
THRESHOLD="${1:-0.25}"
NUM_Q="${2:-1319}"
PORT="${3:-8040}"

# Result files go to OUT_DIR; an existing environment value takes precedence.
OUT_DIR="${OUT_DIR:-/tmp/vllm-scheduled}"
mkdir -p "$OUT_DIR"
# Block until the server on 127.0.0.1:<port> answers its /health endpoint
# successfully, polling once per second.
#   $1 - TCP port to probe.
# The whole poll loop runs under `timeout 600`, so the function fails with a
# non-zero status if the endpoint never becomes healthy in that window.
wait_for_server() {
  local port=$1
  # Pass the port to the inner shell as a positional parameter ($1) rather
  # than splicing it into the command string, so its value is only ever
  # treated as data and never re-parsed as shell code.
  timeout 600 bash -c '
    until curl -sf "http://127.0.0.1:$1/health" > /dev/null; do
      sleep 1
    done' wait_for_server "$port"
}
MODEL="Qwen/Qwen3-Next-80B-A3B-Instruct"

# Pick the all2all backends to exercise. Start from the non-ROCm (CUDA/other)
# pair and switch to the single ROCm backend when any ROCm marker is present:
# rocm-smi on PATH, an /opt/rocm directory, or a non-empty ROCM_PATH.
BACKENDS=("deepep_high_throughput" "deepep_low_latency")
if command -v rocm-smi > /dev/null 2>&1 || [[ -d /opt/rocm || -n "${ROCM_PATH:-}" ]]; then
  BACKENDS=("allgather_reducescatter")
  # Disable MOE padding for ROCm since it is causing eplb to fail
  export VLLM_ROCM_MOE_PADDING=0
fi
# Stop the background server recorded in SERVER_PID, if it is still alive:
# send SIGTERM, poll up to 20 times at 0.5 s intervals for it to exit, then
# send SIGKILL. Does nothing when SERVER_PID is unset/empty or the process is
# already gone. Always returns success.
cleanup() {
  local pid=${SERVER_PID:-}
  [[ -n "$pid" ]] || return 0
  kill -0 "$pid" 2>/dev/null || return 0

  kill "$pid" 2>/dev/null || true
  local attempt=0
  while (( attempt < 20 )) && kill -0 "$pid" 2>/dev/null; do
    sleep 0.5
    attempt=$((attempt + 1))
  done
  # Final SIGKILL is unconditional; a "no such process" error is ignored.
  kill -9 "$pid" 2>/dev/null || true
}

# Tear the server down however the script ends.
trap cleanup EXIT
# For each selected all2all backend: start a vLLM server (async EPLB plus
# Qwen3-Next MTP speculative decoding) in the background, wait until it is
# healthy, run the GSM8K eval against it, enforce the accuracy threshold,
# stop the server, and advance to the next port for the following backend.
for BACK in "${BACKENDS[@]}"; do
  VLLM_DEEP_GEMM_WARMUP=skip \
  VLLM_ALL2ALL_BACKEND="$BACK" \
  vllm serve "$MODEL" \
    --enforce-eager \
    --tensor-parallel-size 4 \
    --enable-expert-parallel \
    --enable-eplb \
    --eplb-config '{"window_size":200,"step_interval":600,"use_async":true}' \
    --speculative-config '{"method":"qwen3_next_mtp","num_speculative_tokens":1}' \
    --trust-remote-code \
    --max-model-len 2048 \
    --gpu-memory-utilization 0.9 \
    --port "$PORT" &
  SERVER_PID=$!
  wait_for_server "$PORT"

  # Filesystem-safe tag: map '/', ':', space, backslash and newline to '_'.
  # (The previous set '/: \\n' was read by tr as backslash + the letter "n",
  # which turned "Qwen/Qwen3-Next-..." into "Qwe__Qwe3-Next-..." and never
  # matched a newline.)
  TAG=$(printf '%s' "$MODEL" | tr '/: \\\n' '_____')
  OUT="${OUT_DIR}/${TAG}_${BACK}.json"

  python3 tests/evals/gsm8k/gsm8k_eval.py --host http://127.0.0.1 --port "$PORT" --num-questions "$NUM_Q" --save-results "$OUT"

  # Values are handed to Python through argv and the heredoc is quoted, so
  # the shell never rewrites the Python source (paths or names containing
  # quotes/braces cannot break it). An explicit exit replaces `assert`, which
  # is disabled when Python runs with optimizations enabled.
  python3 - "$OUT" "$MODEL" "$BACK" "$THRESHOLD" <<'PY'
import json
import sys

out_path, model, backend, threshold = sys.argv[1:5]
with open(out_path, encoding="utf-8") as f:
    acc = json.load(f)["accuracy"]
print(f"{model} {backend}: accuracy {acc:.3f}")
if not acc >= float(threshold):
    sys.exit(f"{model} {backend} accuracy {acc}")
PY

  cleanup
  SERVER_PID=
  sleep 1
  PORT=$((PORT+1))
done
.buildkite/scripts/upload-wheels.sh
View file @
8d75f22e
...
@@ -81,7 +81,10 @@ else
...
@@ -81,7 +81,10 @@ else
alias_arg
=
""
alias_arg
=
""
fi
fi
$PYTHON
.buildkite/scripts/generate-nightly-index.py
--version
"
$SUBPATH
"
--current-objects
"
$obj_json
"
--output-dir
"
$INDICES_OUTPUT_DIR
"
$alias_arg
# HACK: we do not need regex module here, but it is required by pre-commit hook
# To avoid any external dependency, we simply replace it back to the stdlib re module
sed
-i
's/import regex as re/import re/g'
.buildkite/scripts/generate-nightly-index.py
$PYTHON
.buildkite/scripts/generate-nightly-index.py
--version
"
$SUBPATH
"
--current-objects
"
$obj_json
"
--output-dir
"
$INDICES_OUTPUT_DIR
"
--comment
"commit
$BUILDKITE_COMMIT
"
$alias_arg
# copy indices to /<commit>/ unconditionally
# copy indices to /<commit>/ unconditionally
echo
"Uploading indices to
$S3_COMMIT_PREFIX
"
echo
"Uploading indices to
$S3_COMMIT_PREFIX
"
...
...
.buildkite/test-amd.yaml
View file @
8d75f22e
...
@@ -398,7 +398,8 @@ steps:
...
@@ -398,7 +398,8 @@ steps:
timeout_in_minutes
:
25
timeout_in_minutes
:
25
gpu
:
h100
gpu
:
h100
source_file_dependencies
:
source_file_dependencies
:
-
vllm/
-
vllm/v1/attention
-
vllm/model_executor/layers
-
tests/v1/determinism/
-
tests/v1/determinism/
commands
:
commands
:
-
export VLLM_WORKER_MULTIPROC_METHOD=spawn
-
export VLLM_WORKER_MULTIPROC_METHOD=spawn
...
@@ -440,23 +441,29 @@ steps:
...
@@ -440,23 +441,29 @@ steps:
working_dir
:
"
/vllm-workspace/examples"
working_dir
:
"
/vllm-workspace/examples"
source_file_dependencies
:
source_file_dependencies
:
-
vllm/entrypoints
-
vllm/entrypoints
-
vllm/multimodal
-
examples/
-
examples/
commands
:
commands
:
-
pip install tensorizer
# for tensorizer test
-
pip install tensorizer
# for tensorizer test
# for basic
-
python3 offline_inference/basic/chat.py
-
python3 offline_inference/basic/generate.py --model facebook/opt-125m
-
python3 offline_inference/basic/generate.py --model facebook/opt-125m
-
python3 offline_inference/basic/generate.py --model meta-llama/Llama-2-13b-chat-hf --cpu-offload-gb
10
-
python3 offline_inference/basic/generate.py --model meta-llama/Llama-2-13b-chat-hf --cpu-offload-gb
10
-
python3 offline_inference/basic/chat.py
-
python3 offline_inference/basic/classify.py
-
python3 offline_inference/prefix_caching.py
-
python3 offline_inference/basic/embed.py
-
python3 offline_inference/llm_engine_example.py
-
python3 offline_inference/basic/score.py
# for multi-modal models
-
python3 offline_inference/audio_language.py --seed
0
-
python3 offline_inference/audio_language.py --seed
0
-
python3 offline_inference/vision_language.py --seed
0
-
python3 offline_inference/vision_language.py --seed
0
-
python3 offline_inference/vision_language_pooling.py --seed
0
-
python3 offline_inference/vision_language_pooling.py --seed
0
-
python3 offline_inference/vision_language_multi_image.py --seed
0
-
python3 offline_inference/vision_language_multi_image.py --seed
0
-
python3 others/tensorize_vllm_model.py --model facebook/opt-125m serialize --serialized-directory /tmp/ --suffix v1 && python3 others/tensorize_vllm_model.py --model facebook/opt-125m deserialize --path-to-tensors /tmp/vllm/facebook/opt-125m/v1/model.tensors
-
python3 offline_inference/encoder_decoder_multimodal.py --model-type whisper --seed
0
-
python3 offline_inference/encoder_decoder_multimodal.py --model-type whisper --seed
0
-
python3 offline_inference/basic/classify.py
# for pooling models
-
python3 offline_inference/basic/embed.py
-
python3 pooling/pooling/vision_language_pooling.py --seed
0
-
python3 offline_inference/basic/score.py
# for features demo
-
python3 offline_inference/prefix_caching.py
-
python3 offline_inference/llm_engine_example.py
-
python3 others/tensorize_vllm_model.py --model facebook/opt-125m serialize --serialized-directory /tmp/ --suffix v1 && python3 others/tensorize_vllm_model.py --model facebook/opt-125m deserialize --path-to-tensors /tmp/vllm/facebook/opt-125m/v1/model.tensors
-
python3 offline_inference/spec_decode.py --test --method eagle --num_spec_tokens 3 --dataset-name hf --dataset-path philschmid/mt-bench --num-prompts 80 --temp 0 --top-p 1.0 --top-k -1 --tp 1 --enable-chunked-prefill --max-model-len
2048
-
python3 offline_inference/spec_decode.py --test --method eagle --num_spec_tokens 3 --dataset-name hf --dataset-path philschmid/mt-bench --num-prompts 80 --temp 0 --top-p 1.0 --top-k -1 --tp 1 --enable-chunked-prefill --max-model-len
2048
# https://github.com/vllm-project/vllm/pull/26682 uses slightly more memory in PyTorch 2.9+ causing this test to OOM in 1xL4 GPU
# https://github.com/vllm-project/vllm/pull/26682 uses slightly more memory in PyTorch 2.9+ causing this test to OOM in 1xL4 GPU
-
python3 offline_inference/spec_decode.py --test --method eagle3 --num_spec_tokens 3 --dataset-name hf --dataset-path philschmid/mt-bench --num-prompts 80 --temp 0 --top-p 1.0 --top-k -1 --tp 1 --enable-chunked-prefill --max-model-len
1536
-
python3 offline_inference/spec_decode.py --test --method eagle3 --num_spec_tokens 3 --dataset-name hf --dataset-path philschmid/mt-bench --num-prompts 80 --temp 0 --top-p 1.0 --top-k -1 --tp 1 --enable-chunked-prefill --max-model-len
1536
...
@@ -715,16 +722,18 @@ steps:
...
@@ -715,16 +722,18 @@ steps:
# we can only upgrade after this is resolved
# we can only upgrade after this is resolved
# TODO(jerryzh168): resolve the above comment
# TODO(jerryzh168): resolve the above comment
-
uv pip install --system torchao==0.13.0
-
uv pip install --system torchao==0.13.0
-
uv pip install --system conch-triton-kernels
-
VLLM_TEST_FORCE_LOAD_FORMAT=auto pytest -v -s quantization/ --ignore quantization/test_blackwell_moe.py
-
VLLM_TEST_FORCE_LOAD_FORMAT=auto pytest -v -s quantization/ --ignore quantization/test_blackwell_moe.py
-
label
:
LM Eval Small Models
#
1
5min
-
label
:
LM Eval Small Models
# 5
3
min
timeout_in_minutes
:
20
timeout_in_minutes
:
75
mirror_hardwares
:
[
amdexperimental
,
amdproduction
]
mirror_hardwares
:
[
amdexperimental
]
agent_pool
:
mi325_1
agent_pool
:
mi325_1
# grade: Blocking
# grade: Blocking
source_file_dependencies
:
source_file_dependencies
:
-
csrc/
-
csrc/
-
vllm/model_executor/layers/quantization
-
vllm/model_executor/layers/quantization
autorun_on_main
:
true
commands
:
commands
:
-
pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=configs/models-small.txt --tp-size=1
-
pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=configs/models-small.txt --tp-size=1
...
@@ -737,7 +746,7 @@ steps:
...
@@ -737,7 +746,7 @@ steps:
-
csrc/
-
csrc/
-
vllm/entrypoints/openai/
-
vllm/entrypoints/openai/
-
vllm/model_executor/models/whisper.py
-
vllm/model_executor/models/whisper.py
commands
:
# LMEval
commands
:
# LMEval
+Transcription WER check
# Transcription WER check is skipped because encoder-decoder models are not supported on ROCm, see https://github.com/vllm-project/vllm/issues/27442
# Transcription WER check is skipped because encoder-decoder models are not supported on ROCm, see https://github.com/vllm-project/vllm/issues/27442
-
pytest -s entrypoints/openai/correctness/
-
pytest -s entrypoints/openai/correctness/
...
@@ -934,6 +943,18 @@ steps:
...
@@ -934,6 +943,18 @@ steps:
commands
:
commands
:
-
pytest -v -s models/language/pooling_mteb_test
-
pytest -v -s models/language/pooling_mteb_test
-
label
:
Multi-Modal Processor Test (CPU)
timeout_in_minutes
:
60
mirror_hardwares
:
[
amdexperimental
]
agent_pool
:
mi325_1
source_file_dependencies
:
-
vllm/
-
tests/models/multimodal
no_gpu
:
true
commands
:
-
pip install git+https://github.com/TIGER-AI-Lab/Mantis.git
-
pytest -v -s models/multimodal/processing --ignore models/multimodal/processing/test_tensor_schema.py
-
label
:
Multi-Modal Processor Test
# 44min
-
label
:
Multi-Modal Processor Test
# 44min
timeout_in_minutes
:
60
timeout_in_minutes
:
60
mirror_hardwares
:
[
amdexperimental
]
mirror_hardwares
:
[
amdexperimental
]
...
@@ -961,8 +982,8 @@ steps:
...
@@ -961,8 +982,8 @@ steps:
-
pytest -v -s models/multimodal -m core_model --ignore models/multimodal/generation/test_whisper.py --ignore models/multimodal/processing
-
pytest -v -s models/multimodal -m core_model --ignore models/multimodal/generation/test_whisper.py --ignore models/multimodal/processing
-
cd .. && VLLM_WORKER_MULTIPROC_METHOD=spawn pytest -v -s tests/models/multimodal/generation/test_whisper.py -m core_model
# Otherwise, mp_method="spawn" doesn't work
-
cd .. && VLLM_WORKER_MULTIPROC_METHOD=spawn pytest -v -s tests/models/multimodal/generation/test_whisper.py -m core_model
# Otherwise, mp_method="spawn" doesn't work
-
label
:
Multi-Modal Accuracy Eval (Small Models)
# 10min
-
label
:
Multi-Modal Accuracy Eval (Small Models)
# 1
50min - 18
0min
timeout_in_minutes
:
7
0
timeout_in_minutes
:
18
0
mirror_hardwares
:
[
amdexperimental
,
amdproduction
]
mirror_hardwares
:
[
amdexperimental
,
amdproduction
]
agent_pool
:
mi325_1
agent_pool
:
mi325_1
# grade: Blocking
# grade: Blocking
...
@@ -974,7 +995,8 @@ steps:
...
@@ -974,7 +995,8 @@ steps:
commands
:
commands
:
-
pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-mm-small.txt --tp-size=1
-
pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-mm-small.txt --tp-size=1
-
label
:
Multi-Modal Models Test (Extended)
1
-
label
:
Multi-Modal Models Test (Extended)
1
# 60min
timeout_in_minutes
:
120
mirror_hardwares
:
[
amdexperimental
]
mirror_hardwares
:
[
amdexperimental
]
agent_pool
:
mi325_1
agent_pool
:
mi325_1
# grade: Blocking
# grade: Blocking
...
@@ -998,7 +1020,8 @@ steps:
...
@@ -998,7 +1020,8 @@ steps:
-
pip install git+https://github.com/TIGER-AI-Lab/Mantis.git
-
pip install git+https://github.com/TIGER-AI-Lab/Mantis.git
-
pytest -v -s models/multimodal/generation/test_common.py -m 'split(group=0) and not core_model'
-
pytest -v -s models/multimodal/generation/test_common.py -m 'split(group=0) and not core_model'
-
label
:
Multi-Modal Models Test (Extended)
3
-
label
:
Multi-Modal Models Test (Extended)
3
# 75min
timeout_in_minutes
:
150
mirror_hardwares
:
[
amdexperimental
]
mirror_hardwares
:
[
amdexperimental
]
agent_pool
:
mi325_1
agent_pool
:
mi325_1
# grade: Blocking
# grade: Blocking
...
@@ -1107,7 +1130,6 @@ steps:
...
@@ -1107,7 +1130,6 @@ steps:
-
vllm/model_executor/layers/layernorm.py
-
vllm/model_executor/layers/layernorm.py
-
vllm/model_executor/layers/activation.py
-
vllm/model_executor/layers/activation.py
-
vllm/model_executor/layers/quantization/input_quant_fp8.py
-
vllm/model_executor/layers/quantization/input_quant_fp8.py
-
vllm/model_executor/layers/fused_moe/layer.py
-
tests/compile/test_fusion_attn.py
-
tests/compile/test_fusion_attn.py
-
tests/compile/test_silu_mul_quant_fusion.py
-
tests/compile/test_silu_mul_quant_fusion.py
-
tests/compile/distributed/test_fusion_all_reduce.py
-
tests/compile/distributed/test_fusion_all_reduce.py
...
@@ -1141,17 +1163,15 @@ steps:
...
@@ -1141,17 +1163,15 @@ steps:
-
vllm/model_executor/layers/activation.py
-
vllm/model_executor/layers/activation.py
-
vllm/model_executor/layers/quantization/input_quant_fp8.py
-
vllm/model_executor/layers/quantization/input_quant_fp8.py
-
tests/compile/distributed/test_fusions_e2e.py
-
tests/compile/distributed/test_fusions_e2e.py
-
tests/compile/fullgraph/test_full_graph.py
commands
:
commands
:
-
nvidia-smi
-
nvidia-smi
# Run all e2e fusion tests
# Run all e2e fusion tests
-
pytest -v -s tests/compile/distributed/test_fusions_e2e.py
-
pytest -v -s tests/compile/distributed/test_fusions_e2e.py
-
label
:
ROCm
GPT-OSS Eval
-
label
:
Blackwell
GPT-OSS Eval
timeout_in_minutes
:
60
timeout_in_minutes
:
60
working_dir
:
"
/vllm-workspace/"
working_dir
:
"
/vllm-workspace/"
agent_pool
:
mi325_1
gpu
:
b200
mirror_hardwares
:
[
amdexperimental
,
amdproduction
]
optional
:
true
# run on nightlies
optional
:
true
# run on nightlies
source_file_dependencies
:
source_file_dependencies
:
-
tests/evals/gpt_oss
-
tests/evals/gpt_oss
...
@@ -1160,7 +1180,7 @@ steps:
...
@@ -1160,7 +1180,7 @@ steps:
-
vllm/v1/attention/backends/flashinfer.py
-
vllm/v1/attention/backends/flashinfer.py
commands
:
commands
:
-
uv pip install --system 'gpt-oss[eval]==0.0.5'
-
uv pip install --system 'gpt-oss[eval]==0.0.5'
-
VLLM_ROCM_USE_AITER_MHA=0 VLLM_ROCM_USE_AITER=1 VLLM_USE_AITER_UNIFIED_ATTENTION=1
pytest -s -v tests/evals/gpt_oss/test_gpqa_correctness.py --model openai/gpt-oss-20b --metric
0.58
-
pytest -s -v tests/evals/gpt_oss/test_gpqa_correctness.py --model openai/gpt-oss-20b --metric
0.58
-
label
:
Blackwell Quantized MoE Test
-
label
:
Blackwell Quantized MoE Test
timeout_in_minutes
:
60
timeout_in_minutes
:
60
...
@@ -1365,7 +1385,7 @@ steps:
...
@@ -1365,7 +1385,7 @@ steps:
-
pytest -v -s -x lora/test_llm_with_multi_loras.py
-
pytest -v -s -x lora/test_llm_with_multi_loras.py
-
pytest -v -s -x lora/test_olmoe_tp.py
-
pytest -v -s -x lora/test_olmoe_tp.py
# Disabled for now because MXFP4 backend on non-cuda platform
# Disabled for now because MXFP4 backend on non-cuda platform
# doesn't support LoRA yet
# doesn't support LoRA yet
#- pytest -v -s -x lora/test_gptoss_tp.py
#- pytest -v -s -x lora/test_gptoss_tp.py
...
@@ -1431,12 +1451,13 @@ steps:
...
@@ -1431,12 +1451,13 @@ steps:
-
TARGET_TEST_SUITE=A100 pytest basic_correctness/ -v -s -m 'distributed(num_gpus=2)'
-
TARGET_TEST_SUITE=A100 pytest basic_correctness/ -v -s -m 'distributed(num_gpus=2)'
-
pytest -v -s -x lora/test_mixtral.py
-
pytest -v -s -x lora/test_mixtral.py
-
label
:
LM Eval Large Models
# optional
-
label
:
LM Eval Large Models
# optional
mirror_hardwares
:
[
amdexperimental
,
amdproduction
]
agent_pool
:
mi325_4
# grade: Blocking
gpu
:
a100
gpu
:
a100
optional
:
true
optional
:
true
mirror_hardwares
:
[
amdexperimental
]
agent_pool
:
mi325_4
# grade: Blocking
num_gpus
:
4
num_gpus
:
4
working_dir
:
"
/vllm-workspace/.buildkite/lm-eval-harness"
working_dir
:
"
/vllm-workspace/.buildkite/lm-eval-harness"
source_file_dependencies
:
source_file_dependencies
:
...
@@ -1448,11 +1469,11 @@ steps:
...
@@ -1448,11 +1469,11 @@ steps:
##### H100 test #####
##### H100 test #####
-
label
:
LM Eval Large Models (H100)
# optional
-
label
:
LM Eval Large Models (H100)
# optional
mirror_hardwares
:
[
amdexperimental
,
amdproduction
]
agent_pool
:
mi325_4
# grade: Blocking
gpu
:
h100
gpu
:
h100
optional
:
true
optional
:
true
mirror_hardwares
:
[
amdexperimental
]
agent_pool
:
mi325_4
# grade: Blocking
num_gpus
:
4
num_gpus
:
4
working_dir
:
"
/vllm-workspace/.buildkite/lm-eval-harness"
working_dir
:
"
/vllm-workspace/.buildkite/lm-eval-harness"
source_file_dependencies
:
source_file_dependencies
:
...
@@ -1462,6 +1483,7 @@ steps:
...
@@ -1462,6 +1483,7 @@ steps:
-
export VLLM_USE_DEEP_GEMM=0
# We found Triton is faster than DeepGEMM for H100
-
export VLLM_USE_DEEP_GEMM=0
# We found Triton is faster than DeepGEMM for H100
-
pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-large-hopper.txt --tp-size=4
-
pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-large-hopper.txt --tp-size=4
##### H200 test #####
##### H200 test #####
-
label
:
Distributed Tests (H200)
# optional
-
label
:
Distributed Tests (H200)
# optional
mirror_hardwares
:
[
amdexperimental
]
mirror_hardwares
:
[
amdexperimental
]
...
@@ -1472,14 +1494,14 @@ steps:
...
@@ -1472,14 +1494,14 @@ steps:
working_dir
:
"
/vllm-workspace/"
working_dir
:
"
/vllm-workspace/"
num_gpus
:
2
num_gpus
:
2
commands
:
commands
:
-
pytest -v -s tests/compile/distributed/test_async_tp.py
-
VLLM_TEST_CLEAN_GPU_MEMORY=1
pytest -v -s tests/compile/distributed/test_async_tp.py
-
pytest -v -s tests/compile/distributed/test_sequence_parallelism.py
-
pytest -v -s tests/compile/distributed/test_sequence_parallelism.py
-
pytest -v -s tests/compile/distributed/test_fusion_all_reduce.py
-
pytest -v -s tests/compile/distributed/test_fusion_all_reduce.py
#- pytest -v -s tests/compile/distributed/test_fusions_e2e.py::test_tp2_attn_quant_allreduce_rmsnorm
#- pytest -v -s tests/compile/distributed/test_fusions_e2e.py::test_tp2_attn_quant_allreduce_rmsnorm
-
"
pytest
-v
-s
tests/compile/distributed/test_fusions_e2e.py
-k
'not
Llama-4'"
-
"
VLLM_TEST_CLEAN_GPU_MEMORY=1
pytest
-v
-s
tests/compile/distributed/test_fusions_e2e.py
-k
'not
Llama-4'"
-
pytest -v -s tests/distributed/test_sequence_parallel.py
-
VLLM_TEST_CLEAN_GPU_MEMORY=1
pytest -v -s tests/distributed/test_sequence_parallel.py
-
pytest -v -s tests/distributed/test_context_parallel.py
-
pytest -v -s tests/distributed/test_context_parallel.py
-
CUDA
_VISIBLE_DEVICES=
1,2
VLLM_ALL2ALL_BACKEND=deepep_high_throughput VLLM_USE_DEEP_GEMM=1 VLLM_LOGGING_LEVEL=DEBUG python3 examples/offline_inference/data_parallel.py --model Qwen/Qwen1.5-MoE-A2.7B --tp-size=1 --dp-size=2 --max-model-len
2048
-
HIP
_VISIBLE_DEVICES=
0,1
VLLM_ALL2ALL_BACKEND=deepep_high_throughput VLLM_USE_DEEP_GEMM=1 VLLM_LOGGING_LEVEL=DEBUG python3 examples/offline_inference/data_parallel.py --model Qwen/Qwen1.5-MoE-A2.7B --tp-size=1 --dp-size=2 --max-model-len
2048
-
pytest -v -s tests/v1/distributed/test_dbo.py
-
pytest -v -s tests/v1/distributed/test_dbo.py
##### B200 test #####
##### B200 test #####
...
@@ -1493,6 +1515,57 @@ steps:
...
@@ -1493,6 +1515,57 @@ steps:
-
pytest -v -s tests/distributed/test_nccl_symm_mem_allreduce.py
-
pytest -v -s tests/distributed/test_nccl_symm_mem_allreduce.py
-
pytest -v -s tests/v1/distributed/test_dbo.py
-
pytest -v -s tests/v1/distributed/test_dbo.py
##### E2E Eval Tests #####
-
label
:
LM Eval Small Models (1 Card)
# 15min
timeout_in_minutes
:
20
mirror_hardwares
:
[
amdexperimental
,
amdproduction
]
agent_pool
:
mi325_1
# grade: Blocking
source_file_dependencies
:
-
csrc/
-
vllm/model_executor/layers/quantization
commands
:
-
pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=configs/models-small.txt --tp-size=1
-
label
:
LM Eval Large Models (4 Card)
mirror_hardwares
:
[
amdexperimental
,
amdproduction
]
agent_pool
:
mi325_4
# grade: Blocking
gpu
:
a100
optional
:
true
num_gpus
:
4
working_dir
:
"
/vllm-workspace/.buildkite/lm-eval-harness"
source_file_dependencies
:
-
csrc/
-
vllm/model_executor/layers/quantization
commands
:
-
export VLLM_WORKER_MULTIPROC_METHOD=spawn
-
pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-large.txt --tp-size=4
-
label
:
ROCm LM Eval Large Models (8 Card)
mirror_hardwares
:
[
amdproduction
]
agent_pool
:
mi325_8
num_gpus
:
8
working_dir
:
"
/vllm-workspace/.buildkite/lm-eval-harness"
commands
:
-
export VLLM_WORKER_MULTIPROC_METHOD=spawn
-
pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-large-rocm.txt --tp-size=8
-
label
:
ROCm GPT-OSS Eval
timeout_in_minutes
:
60
working_dir
:
"
/vllm-workspace/"
agent_pool
:
mi325_1
mirror_hardwares
:
[
amdexperimental
,
amdproduction
]
optional
:
true
# run on nightlies
source_file_dependencies
:
-
tests/evals/gpt_oss
-
vllm/model_executor/models/gpt_oss.py
-
vllm/model_executor/layers/quantization/mxfp4.py
-
vllm/v1/attention/backends/flashinfer.py
commands
:
-
uv pip install --system 'gpt-oss[eval]==0.0.5'
-
VLLM_ROCM_USE_AITER_MHA=0 VLLM_ROCM_USE_AITER=1 VLLM_USE_AITER_UNIFIED_ATTENTION=1 pytest -s -v tests/evals/gpt_oss/test_gpqa_correctness.py --model openai/gpt-oss-20b --metric
0.58
##### RL Integration Tests #####
##### RL Integration Tests #####
-
label
:
Prime-RL Integration Test
# 15min
-
label
:
Prime-RL Integration Test
# 15min
mirror_hardwares
:
[
amdexperimental
]
mirror_hardwares
:
[
amdexperimental
]
...
@@ -1507,7 +1580,6 @@ steps:
...
@@ -1507,7 +1580,6 @@ steps:
-
.buildkite/scripts/run-prime-rl-test.sh
-
.buildkite/scripts/run-prime-rl-test.sh
commands
:
commands
:
-
bash .buildkite/scripts/run-prime-rl-test.sh
-
bash .buildkite/scripts/run-prime-rl-test.sh
-
label
:
DeepSeek V2-Lite Accuracy
-
label
:
DeepSeek V2-Lite Accuracy
mirror_hardwares
:
[
amdexperimental
,
amdproduction
]
mirror_hardwares
:
[
amdexperimental
,
amdproduction
]
agent_pool
:
mi325_4
agent_pool
:
mi325_4
...
@@ -1539,4 +1611,28 @@ steps:
...
@@ -1539,4 +1611,28 @@ steps:
num_gpus
:
2
num_gpus
:
2
working_dir
:
"
/vllm-workspace"
working_dir
:
"
/vllm-workspace"
commands
:
commands
:
-
bash .buildkite/scripts/scheduled_integration_test/qwen30b_a3b_fp8_block_ep_eplb.sh 0.8 200 8020 2 1
-
bash .buildkite/scripts/scheduled_integration_test/qwen30b_a3b_fp8_block_ep_eplb.sh 0.8 200 8020 2
1
\ No newline at end of file
-
label
:
DeepSeek V2-Lite Async EPLB Accuracy
timeout_in_minutes
:
60
mirror_hardwares
:
[
amdexperimental
]
agent_pool
:
mi325_4
# grade: Blocking
gpu
:
h100
optional
:
true
num_gpus
:
4
working_dir
:
"
/vllm-workspace"
commands
:
-
bash .buildkite/scripts/scheduled_integration_test/deepseek_v2_lite_ep_async_eplb.sh 0.25 1319
8030
-
label
:
Qwen3-Next-80B-A3B-Instruct MTP Async EPLB Accuracy
timeout_in_minutes
:
60
mirror_hardwares
:
[
amdexperimental
]
agent_pool
:
mi325_4
# grade: Blocking
gpu
:
h100
optional
:
true
num_gpus
:
4
working_dir
:
"
/vllm-workspace"
commands
:
-
bash .buildkite/scripts/scheduled_integration_test/qwen3_next_mtp_async_eplb.sh 0.8 1319
8040
Prev
1
2
3
4
5
…
36
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment