Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
a3f8d5dd
Commit
a3f8d5dd
authored
Dec 17, 2025
by
zhuwenwen
Browse files
Merge tag 'v0.13.0rc2' into v0.13.0rc2-ori
parents
8d75f22e
f34eca5f
Changes
499
Hide whitespace changes
Inline
Side-by-side
Showing
20 changed files
with
249 additions
and
168 deletions
+249
-168
.buildkite/release-pipeline.yaml
.buildkite/release-pipeline.yaml
+32
-3
.buildkite/scripts/generate-nightly-index.py
.buildkite/scripts/generate-nightly-index.py
+11
-0
.buildkite/scripts/hardware_ci/run-cpu-test-arm.sh
.buildkite/scripts/hardware_ci/run-cpu-test-arm.sh
+5
-0
.buildkite/scripts/scheduled_integration_test/deepseek_v2_lite_ep_async_eplb.sh
...eduled_integration_test/deepseek_v2_lite_ep_async_eplb.sh
+0
-73
.buildkite/scripts/scheduled_integration_test/deepseek_v2_lite_ep_eplb.sh
...ts/scheduled_integration_test/deepseek_v2_lite_ep_eplb.sh
+0
-1
.buildkite/scripts/upload-wheels.sh
.buildkite/scripts/upload-wheels.sh
+8
-4
.buildkite/test-amd.yaml
.buildkite/test-amd.yaml
+9
-21
.buildkite/test-pipeline.yaml
.buildkite/test-pipeline.yaml
+9
-31
.buildkite/test_areas/misc.yaml
.buildkite/test_areas/misc.yaml
+3
-1
.buildkite/test_areas/tool_use.yaml
.buildkite/test_areas/tool_use.yaml
+1
-11
CMakeLists.txt
CMakeLists.txt
+3
-3
README_ORIGIN.md
README_ORIGIN.md
+2
-0
benchmarks/auto_tune/auto_tune.sh
benchmarks/auto_tune/auto_tune.sh
+11
-2
benchmarks/backend_request_func.py
benchmarks/backend_request_func.py
+1
-1
benchmarks/benchmark_ngram_proposer.py
benchmarks/benchmark_ngram_proposer.py
+1
-2
benchmarks/benchmark_serving_structured_output.py
benchmarks/benchmark_serving_structured_output.py
+1
-1
benchmarks/kernels/benchmark_mla_k_concat.py
benchmarks/kernels/benchmark_mla_k_concat.py
+150
-0
benchmarks/kernels/benchmark_mrope.py
benchmarks/kernels/benchmark_mrope.py
+0
-1
benchmarks/kernels/benchmark_rope.py
benchmarks/kernels/benchmark_rope.py
+2
-2
cmake/cpu_extension.cmake
cmake/cpu_extension.cmake
+0
-11
No files found.
.buildkite/release-pipeline.yaml
View file @
a3f8d5dd
...
...
@@ -15,6 +15,21 @@ steps:
env
:
DOCKER_BUILDKIT
:
"
1"
-
label
:
"
Build
arm64
wheel
-
CUDA
13.0"
depends_on
:
~
id
:
build-wheel-arm64-cuda-13-0
agents
:
queue
:
arm64_cpu_queue_postmerge
commands
:
# #NOTE: torch_cuda_arch_list is derived from upstream PyTorch build files here:
# https://github.com/pytorch/pytorch/blob/main/.ci/aarch64_linux/aarch64_ci_build.sh#L7
-
"
DOCKER_BUILDKIT=1
docker
build
--build-arg
max_jobs=16
--build-arg
USE_SCCACHE=1
--build-arg
GIT_REPO_CHECK=1
--build-arg
CUDA_VERSION=13.0.1
--build-arg
torch_cuda_arch_list='8.7
8.9
9.0
10.0+PTX
12.0'
--build-arg
BUILD_BASE_IMAGE=nvidia/cuda:13.0.1-devel-ubuntu22.04
--tag
vllm-ci:build-image
--target
build
--progress
plain
-f
docker/Dockerfile
."
-
"
mkdir
artifacts"
-
"
docker
run
--rm
-v
$(pwd)/artifacts:/artifacts_host
vllm-ci:build-image
bash
-c
'cp
-r
dist
/artifacts_host
&&
chmod
-R
a+rw
/artifacts_host'"
-
"
bash
.buildkite/scripts/upload-wheels.sh
manylinux_2_35"
env
:
DOCKER_BUILDKIT
:
"
1"
# aarch64 build
-
label
:
"
Build
arm64
CPU
wheel"
depends_on
:
~
...
...
@@ -25,7 +40,7 @@ steps:
-
"
DOCKER_BUILDKIT=1
docker
build
--build-arg
max_jobs=16
--build-arg
GIT_REPO_CHECK=1
--build-arg
VLLM_BUILD_ACL=ON
--tag
vllm-ci:build-image
--target
vllm-build
--progress
plain
-f
docker/Dockerfile.cpu
."
-
"
mkdir
artifacts"
-
"
docker
run
--rm
-v
$(pwd)/artifacts:/artifacts_host
vllm-ci:build-image
bash
-c
'cp
-r
dist
/artifacts_host
&&
chmod
-R
a+rw
/artifacts_host'"
-
"
bash
.buildkite/scripts/upload-wheels.sh"
-
"
bash
.buildkite/scripts/upload-wheels.sh
manylinux_2_35
"
env
:
DOCKER_BUILDKIT
:
"
1"
...
...
@@ -39,7 +54,7 @@ steps:
-
"
DOCKER_BUILDKIT=1
docker
build
--build-arg
max_jobs=16
--build-arg
USE_SCCACHE=1
--build-arg
GIT_REPO_CHECK=1
--build-arg
CUDA_VERSION=12.9.1
--tag
vllm-ci:build-image
--target
build
--progress
plain
-f
docker/Dockerfile
."
-
"
mkdir
artifacts"
-
"
docker
run
--rm
-v
$(pwd)/artifacts:/artifacts_host
vllm-ci:build-image
bash
-c
'cp
-r
dist
/artifacts_host
&&
chmod
-R
a+rw
/artifacts_host'"
-
"
bash
.buildkite/scripts/upload-wheels.sh"
-
"
bash
.buildkite/scripts/upload-wheels.sh
manylinux_2_31
"
env
:
DOCKER_BUILDKIT
:
"
1"
...
...
@@ -52,7 +67,21 @@ steps:
-
"
DOCKER_BUILDKIT=1
docker
build
--build-arg
max_jobs=16
--build-arg
USE_SCCACHE=1
--build-arg
GIT_REPO_CHECK=1
--build-arg
CUDA_VERSION=13.0.1
--build-arg
BUILD_BASE_IMAGE=nvidia/cuda:13.0.1-devel-ubuntu22.04
--tag
vllm-ci:build-image
--target
build
--progress
plain
-f
docker/Dockerfile
."
-
"
mkdir
artifacts"
-
"
docker
run
--rm
-v
$(pwd)/artifacts:/artifacts_host
vllm-ci:build-image
bash
-c
'cp
-r
dist
/artifacts_host
&&
chmod
-R
a+rw
/artifacts_host'"
-
"
bash
.buildkite/scripts/upload-wheels.sh"
-
"
bash
.buildkite/scripts/upload-wheels.sh
manylinux_2_35"
env
:
DOCKER_BUILDKIT
:
"
1"
# x86 CPU wheel build
-
label
:
"
Build
x86
CPU
wheel"
depends_on
:
~
id
:
build-wheel-x86-cpu
agents
:
queue
:
cpu_queue_postmerge
commands
:
-
"
DOCKER_BUILDKIT=1
docker
build
--build-arg
max_jobs=16
--build-arg
GIT_REPO_CHECK=1
--build-arg
VLLM_CPU_AVX512BF16=true
--build-arg
VLLM_CPU_AVX512VNNI=true
--build-arg
VLLM_CPU_AMXBF16=true
--tag
vllm-ci:build-image
--target
vllm-build
--progress
plain
-f
docker/Dockerfile.cpu
."
-
"
mkdir
artifacts"
-
"
docker
run
--rm
-v
$(pwd)/artifacts:/artifacts_host
vllm-ci:build-image
bash
-c
'cp
-r
dist
/artifacts_host
&&
chmod
-R
a+rw
/artifacts_host'"
-
"
bash
.buildkite/scripts/upload-wheels.sh
manylinux_2_35"
env
:
DOCKER_BUILDKIT
:
"
1"
...
...
.buildkite/scripts/generate-nightly-index.py
View file @
a3f8d5dd
...
...
@@ -372,6 +372,17 @@ if __name__ == "__main__":
print
(
f
"Found
{
len
(
wheel_files
)
}
wheel files for version
{
version
}
:
{
wheel_files
}
"
)
# keep only "official" files for a non-nightly version (specified by cli args)
PY_VERSION_RE
=
re
.
compile
(
r
"^\d+\.\d+\.\d+([a-zA-Z0-9.+-]*)?$"
)
if
PY_VERSION_RE
.
match
(
version
):
# upload-wheels.sh ensures no "dev" is in args.version
wheel_files
=
list
(
filter
(
lambda
x
:
version
in
x
and
"dev"
not
in
x
,
wheel_files
)
)
print
(
f
"Non-nightly version detected, wheel files used:
{
wheel_files
}
"
)
else
:
print
(
"Nightly version detected, keeping all wheel files."
)
# Generate index and metadata, assuming wheels and indices are stored as:
# s3://vllm-wheels/{version}/<wheel files>
# s3://vllm-wheels/<anything>/<index files>
...
...
.buildkite/scripts/hardware_ci/run-cpu-test-arm.sh
View file @
a3f8d5dd
...
...
@@ -36,6 +36,11 @@ function cpu_tests() {
set -e
python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m"
# Run model tests
docker
exec
cpu-test bash
-c
"
set -e
pytest -x -v -s tests/models/multimodal/generation/test_whisper.py -m cpu_model"
# Run kernel tests
docker
exec
cpu-test bash
-c
"
set -e
...
...
.buildkite/scripts/scheduled_integration_test/deepseek_v2_lite_ep_async_eplb.sh
deleted
100644 → 0
View file @
8d75f22e
#!/usr/bin/env bash
# Accuracy check for DeepSeek-V2-lite served with expert parallelism and
# async EPLB: for each all2all backend, start `vllm serve`, run the GSM8K
# eval against it, and fail if the reported accuracy is below THRESHOLD.
set -euxo pipefail

# args: [THRESHOLD] [NUM_QUESTIONS] [START_PORT]
THRESHOLD=${1:-0.25}
NUM_Q=${2:-1319}
PORT=${3:-8030}
OUT_DIR=${OUT_DIR:-/tmp/vllm-scheduled}
mkdir -p "${OUT_DIR}"

# Poll the server's /health endpoint once per second; `timeout` aborts the
# wait (and, under `set -e`, the script) after 600 seconds.
wait_for_server() {
  local port=$1
  timeout 600 bash -c '
    until curl -sf "http://127.0.0.1:'"$port"'/health" > /dev/null; do
      sleep 1
    done'
}

MODEL="deepseek-ai/DeepSeek-V2-lite"

# Set BACKENDS based on platform
if command -v rocm-smi &> /dev/null || [[ -d /opt/rocm ]] || [[ -n "${ROCM_PATH:-}" ]]; then
  # ROCm platform
  BACKENDS=("allgather_reducescatter")
  # Disable MOE padding for ROCm since it is causing eplb to fail
  export VLLM_ROCM_MOE_PADDING=0
else
  # Non-ROCm platform (CUDA/other)
  BACKENDS=("deepep_high_throughput" "deepep_low_latency")
fi

# Stop the background server if one is still running: send SIGTERM, wait up
# to 20 x 0.5s for it to exit, then SIGKILL. Every step tolerates the process
# already being gone so the EXIT trap never fails the script by itself.
cleanup() {
  if [[ -n "${SERVER_PID:-}" ]] && kill -0 "${SERVER_PID}" 2>/dev/null; then
    kill "${SERVER_PID}" 2>/dev/null || true
    for _ in {1..20}; do
      kill -0 "${SERVER_PID}" 2>/dev/null || break
      sleep 0.5
    done
    kill -9 "${SERVER_PID}" 2>/dev/null || true
  fi
}
trap cleanup EXIT

for BACK in "${BACKENDS[@]}"; do
  VLLM_DEEP_GEMM_WARMUP=skip \
  VLLM_ALL2ALL_BACKEND="$BACK" \
  vllm serve "$MODEL" \
    --enforce-eager \
    --tensor-parallel-size 2 \
    --data-parallel-size 2 \
    --enable-expert-parallel \
    --enable-eplb \
    --eplb-config '{"window_size":200,"step_interval":600,"use_async":true}' \
    --trust-remote-code \
    --max-model-len 2048 \
    --port "$PORT" &
  SERVER_PID=$!
  wait_for_server "$PORT"

  TAG=$(echo "$MODEL" | tr '/: \\n' '_____')
  OUT="${OUT_DIR}/${TAG}_${BACK}_async_eplb.json"
  # Expansions are quoted so an OUT_DIR containing whitespace stays one argument.
  python3 tests/evals/gsm8k/gsm8k_eval.py --host http://127.0.0.1 --port "$PORT" --num-questions "${NUM_Q}" --save-results "${OUT}"
  # The heredoc is unquoted on purpose: the shell substitutes ${OUT}, ${MODEL},
  # ${BACK} and ${THRESHOLD} into the Python source before it runs.
  python3 - <<PY
import json; acc=json.load(open('${OUT}'))['accuracy']
print(f"${MODEL} ${BACK}: accuracy {acc:.3f}")
assert acc >= ${THRESHOLD}, f"${MODEL} ${BACK} accuracy {acc}"
PY

  cleanup
  # Clear the PID so the EXIT trap does not signal an already-stopped server.
  SERVER_PID=
  sleep 1
  # Use a fresh port for the next backend.
  PORT=$((PORT+1))
done
.buildkite/scripts/scheduled_integration_test/deepseek_v2_lite_ep_eplb.sh
View file @
a3f8d5dd
...
...
@@ -50,7 +50,6 @@ for BACK in "${BACKENDS[@]}"; do
--data-parallel-size
2
\
--enable-expert-parallel
\
--enable-eplb
\
--eplb-config
'{"window_size":200,"step_interval":600}'
\
--trust-remote-code
\
--max-model-len
2048
\
--port
$PORT
&
...
...
.buildkite/scripts/upload-wheels.sh
View file @
a3f8d5dd
...
...
@@ -34,9 +34,10 @@ if [[ ${#wheel_files[@]} -ne 1 ]]; then
fi
wheel
=
"
${
wheel_files
[0]
}
"
# current build image uses ubuntu 20.04, which corresponds to manylinux_2_31
# default build image uses ubuntu 20.04, which corresponds to manylinux_2_31
# we also accept params as manylinux tag
# refer to https://github.com/mayeut/pep600_compliance?tab=readme-ov-file#acceptable-distros-to-build-wheels
manylinux_version
=
"manylinux_2_31"
manylinux_version
=
"
${
1
:-
manylinux_2_31
}
"
# Rename 'linux' to the appropriate manylinux version in the wheel filename
if
[[
"
$wheel
"
!=
*
"linux"
*
]]
;
then
...
...
@@ -96,8 +97,11 @@ if [[ "$BUILDKITE_BRANCH" == "main" && "$BUILDKITE_PULL_REQUEST" == "false" ]];
aws s3
cp
--recursive
"
$INDICES_OUTPUT_DIR
/"
"s3://
$BUCKET
/nightly/"
fi
# copy to /<pure_version>/ only if it does not have "dev" in the version
#
re-generate and
copy to /<pure_version>/ only if it does not have "dev" in the version
if
[[
"
$version
"
!=
*
"dev"
*
]]
;
then
echo
"Uploading indices to overwrite /
$pure_version
/"
echo
"Re-generating indices for /
$pure_version
/"
# The glob has to sit outside the quotes: inside them rm looks for an entry
# literally named "*", finds none, and (because of -f) silently leaves the
# previously generated index files in place. ":?" aborts on an unset or empty
# variable so this can never expand to "/*".
rm -rf "${INDICES_OUTPUT_DIR:?}"/*
mkdir
-p
"
$INDICES_OUTPUT_DIR
"
$PYTHON
.buildkite/scripts/generate-nightly-index.py
--version
"
$pure_version
"
--current-objects
"
$obj_json
"
--output-dir
"
$INDICES_OUTPUT_DIR
"
--comment
"version
$pure_version
"
$alias_arg
aws s3
cp
--recursive
"
$INDICES_OUTPUT_DIR
/"
"s3://
$BUCKET
/
$pure_version
/"
fi
.buildkite/test-amd.yaml
View file @
a3f8d5dd
...
...
@@ -61,8 +61,8 @@ steps:
-
pytest -v -s -m 'not cpu_test' multimodal
-
pytest -v -s utils_
-
label
:
Async Engine, Inputs, Utils, Worker, Config Test (CPU)
#
15
min
timeout_in_minutes
:
2
0
-
label
:
Async Engine, Inputs, Utils, Worker, Config Test (CPU)
#
20
min
timeout_in_minutes
:
3
0
mirror_hardwares
:
[
amdexperimental
,
amdproduction
,
amdtentative
]
agent_pool
:
mi325_1
grade
:
Blocking
...
...
@@ -73,6 +73,7 @@ steps:
-
tests/multimodal
-
tests/standalone_tests/lazy_imports.py
-
tests/tokenizers_
-
tests/tool_parsers
-
tests/transformers_utils
-
tests/config
no_gpu
:
true
...
...
@@ -82,6 +83,7 @@ steps:
-
pytest -v -s test_outputs.py
-
pytest -v -s -m 'cpu_test' multimodal
-
pytest -v -s tokenizers_
-
pytest -v -s tool_parsers
-
pytest -v -s transformers_utils
-
pytest -v -s config
...
...
@@ -326,10 +328,10 @@ steps:
commands
:
-
pytest -v -s engine test_sequence.py test_config.py test_logger.py test_vllm_port.py
-
label
:
V1 Test e2e + engine
#
30
min
timeout_in_minutes
:
45
-
label
:
V1 Test e2e + engine
#
65
min
timeout_in_minutes
:
90
mirror_hardwares
:
[
amdexperimental
]
agent_pool
:
mi325_
1
agent_pool
:
mi325_
4
# grade: Blocking
source_file_dependencies
:
-
vllm/
...
...
@@ -435,7 +437,7 @@ steps:
-
label
:
Examples Test
# 30min
timeout_in_minutes
:
45
mirror_hardwares
:
[
amdexperimental
]
mirror_hardwares
:
[
amdexperimental
,
amdproduction
]
agent_pool
:
mi325_1
# grade: Blocking
working_dir
:
"
/vllm-workspace/examples"
...
...
@@ -455,7 +457,6 @@ steps:
# for multi-modal models
-
python3 offline_inference/audio_language.py --seed
0
-
python3 offline_inference/vision_language.py --seed
0
-
python3 offline_inference/vision_language_pooling.py --seed
0
-
python3 offline_inference/vision_language_multi_image.py --seed
0
-
python3 offline_inference/encoder_decoder_multimodal.py --model-type whisper --seed
0
# for pooling models
...
...
@@ -760,19 +761,7 @@ steps:
-
vllm/
-
tests/tool_use
commands
:
-
pytest -v -s -m 'not cpu_test' tool_use
-
label
:
OpenAI-Compatible Tool Use (CPU)
# 5 mins
mirror_hardwares
:
[
amdexperimental
,
amdproduction
]
agent_pool
:
mi325_1
# grade: Blocking
timeout_in_minutes
:
10
source_file_dependencies
:
-
vllm/
-
tests/tool_use
no_gpu
:
true
commands
:
-
pytest -v -s -m 'cpu_test' tool_use
-
pytest -v -s tool_use
##### models test #####
...
...
@@ -1630,7 +1619,6 @@ steps:
mirror_hardwares
:
[
amdexperimental
]
agent_pool
:
mi325_4
# grade: Blocking
gpu
:
h100
optional
:
true
num_gpus
:
4
working_dir
:
"
/vllm-workspace"
...
...
.buildkite/test-pipeline.yaml
View file @
a3f8d5dd
...
...
@@ -57,8 +57,8 @@ steps:
-
pytest -v -s -m 'not cpu_test' multimodal
-
pytest -v -s utils_
-
label
:
Async Engine, Inputs, Utils, Worker, Config Test (CPU)
#
15
min
timeout_in_minutes
:
2
0
-
label
:
Async Engine, Inputs, Utils, Worker, Config Test (CPU)
#
20
min
timeout_in_minutes
:
3
0
source_file_dependencies
:
-
vllm/
-
tests/test_inputs.py
...
...
@@ -66,6 +66,7 @@ steps:
-
tests/multimodal
-
tests/standalone_tests/lazy_imports.py
-
tests/tokenizers_
-
tests/tool_parsers
-
tests/transformers_utils
-
tests/config
no_gpu
:
true
...
...
@@ -75,6 +76,7 @@ steps:
-
pytest -v -s test_outputs.py
-
pytest -v -s -m 'cpu_test' multimodal
-
pytest -v -s tokenizers_
-
pytest -v -s tool_parsers
-
pytest -v -s transformers_utils
-
pytest -v -s config
...
...
@@ -672,16 +674,7 @@ steps:
-
vllm/
-
tests/tool_use
commands
:
-
pytest -v -s -m 'not cpu_test' tool_use
-
label
:
OpenAI-Compatible Tool Use (CPU)
# 5 mins
timeout_in_minutes
:
10
source_file_dependencies
:
-
vllm/
-
tests/tool_use
no_gpu
:
true
commands
:
-
pytest -v -s -m 'cpu_test' tool_use
-
pytest -v -s tool_use
##### models test #####
...
...
@@ -692,6 +685,7 @@ steps:
source_file_dependencies
:
-
vllm/
-
tests/models/test_initialization.py
-
tests/models/registry.py
commands
:
# Run a subset of model initialization tests
-
pytest -v -s models/test_initialization.py::test_can_initialize_small_subset
...
...
@@ -704,6 +698,7 @@ steps:
-
vllm/model_executor/models/
-
vllm/transformers_utils/
-
tests/models/test_initialization.py
-
tests/models/registry.py
commands
:
# Only when vLLM model source is modified - test initialization of a large
# subset of supported models (the complement of the small subset in the above
...
...
@@ -836,7 +831,7 @@ steps:
-
tests/models/multimodal
no_gpu
:
true
commands
:
-
pip install git+https://github.com/TIGER-AI-Lab/Mantis.git
-
"
pip
install
git+https://github.com/TIGER-AI-Lab/Mantis.git
||
echo
'Mantis
installation
skipped
(decord
not
available
on
CPU-only
environment)'"
-
pytest -v -s models/multimodal/processing --ignore models/multimodal/processing/test_tensor_schema.py
-
label
:
Multi-Modal Processor Test
...
...
@@ -1346,6 +1341,7 @@ steps:
-
label
:
Prime-RL Integration Test
# 15min
timeout_in_minutes
:
30
optional
:
true
soft_fail
:
true
num_gpus
:
2
working_dir
:
"
/vllm-workspace"
source_file_dependencies
:
...
...
@@ -1380,21 +1376,3 @@ steps:
working_dir
:
"
/vllm-workspace"
commands
:
-
bash .buildkite/scripts/scheduled_integration_test/qwen30b_a3b_fp8_block_ep_eplb.sh 0.8 200 8020 2
1
-
label
:
DeepSeek V2-Lite Async EPLB Accuracy
timeout_in_minutes
:
60
gpu
:
h100
optional
:
true
num_gpus
:
4
working_dir
:
"
/vllm-workspace"
commands
:
-
bash .buildkite/scripts/scheduled_integration_test/deepseek_v2_lite_ep_async_eplb.sh 0.25 1319
8030
-
label
:
Qwen3-Next-80B-A3B-Instruct MTP Async EPLB Accuracy
timeout_in_minutes
:
60
gpu
:
h100
optional
:
true
num_gpus
:
4
working_dir
:
"
/vllm-workspace"
commands
:
-
bash .buildkite/scripts/scheduled_integration_test/qwen3_next_mtp_async_eplb.sh 0.8 1319
8040
.buildkite/test_areas/misc.yaml
View file @
a3f8d5dd
...
...
@@ -115,7 +115,7 @@ steps:
-
label
:
Async Engine, Inputs, Utils, Worker, Config (CPU)
depends_on
:
~
timeout_in_minutes
:
2
0
timeout_in_minutes
:
3
0
source_file_dependencies
:
-
vllm/
-
tests/test_inputs.py
...
...
@@ -123,6 +123,7 @@ steps:
-
tests/multimodal
-
tests/standalone_tests/lazy_imports.py
-
tests/tokenizers_
-
tests/tool_parsers
-
tests/transformers_utils
-
tests/config
no_gpu
:
true
...
...
@@ -132,6 +133,7 @@ steps:
-
pytest -v -s test_outputs.py
-
pytest -v -s -m 'cpu_test' multimodal
-
pytest -v -s tokenizers_
-
pytest -v -s tool_parsers
-
pytest -v -s transformers_utils
-
pytest -v -s config
...
...
.buildkite/test_areas/tool_use.yaml
View file @
a3f8d5dd
...
...
@@ -10,14 +10,4 @@ steps:
-
vllm/
-
tests/tool_use
commands
:
-
pytest -v -s -m 'not cpu_test' tool_use
-
label
:
OpenAI-Compatible Tool Use (CPU)
depends_on
:
~
timeout_in_minutes
:
10
source_file_dependencies
:
-
vllm/
-
tests/tool_use
no_gpu
:
true
commands
:
-
pytest -v -s -m 'cpu_test' tool_use
-
pytest -v -s tool_use
CMakeLists.txt
View file @
a3f8d5dd
...
...
@@ -384,7 +384,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
OR NOT $CACHE{MARLIN_GEN_SCRIPT_HASH_AND_ARCH} STREQUAL
${
MARLIN_GEN_SCRIPT_HASH_AND_ARCH
}
)
execute_process
(
COMMAND
${
CMAKE_COMMAND
}
-E env
PYTHONPATH=$PYTHONPATH
PYTHONPATH=$
ENV{
PYTHONPATH
}
${
Python_EXECUTABLE
}
${
MARLIN_GEN_SCRIPT
}
${
CUDA_ARCHS_STR
}
RESULT_VARIABLE marlin_generation_result
OUTPUT_VARIABLE marlin_generation_result
...
...
@@ -822,7 +822,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
OR NOT $CACHE{MACHETE_GEN_SCRIPT_HASH} STREQUAL
${
MACHETE_GEN_SCRIPT_HASH
}
)
execute_process
(
COMMAND
${
CMAKE_COMMAND
}
-E env
PYTHONPATH=
${
CMAKE_CURRENT_SOURCE_DIR
}
/csrc/cutlass_extensions/:
${
CUTLASS_DIR
}
/python/:
${
VLLM_PYTHON_PATH
}
:$PYTHONPATH
PYTHONPATH=
${
CMAKE_CURRENT_SOURCE_DIR
}
/csrc/cutlass_extensions/:
${
CUTLASS_DIR
}
/python/:
${
VLLM_PYTHON_PATH
}
:$
ENV{
PYTHONPATH
}
${
Python_EXECUTABLE
}
${
MACHETE_GEN_SCRIPT
}
RESULT_VARIABLE machete_generation_result
OUTPUT_VARIABLE machete_generation_output
...
...
@@ -1004,7 +1004,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
OR NOT $CACHE{MOE_MARLIN_GEN_SCRIPT_HASH_AND_ARCH} STREQUAL
${
MOE_MARLIN_GEN_SCRIPT_HASH_AND_ARCH
}
)
execute_process
(
COMMAND
${
CMAKE_COMMAND
}
-E env
PYTHONPATH=$PYTHONPATH
PYTHONPATH=$
ENV{
PYTHONPATH
}
${
Python_EXECUTABLE
}
${
MOE_MARLIN_GEN_SCRIPT
}
${
CUDA_ARCHS_STR
}
RESULT_VARIABLE moe_marlin_generation_result
OUTPUT_VARIABLE moe_marlin_generation_output
...
...
README_ORIGIN.md
View file @
a3f8d5dd
...
...
@@ -143,11 +143,13 @@ Compute Resources:
-
Databricks
-
DeepInfra
-
Google Cloud
-
IBM
-
Intel
-
Lambda Lab
-
Nebius
-
Novita AI
-
NVIDIA
-
Red Hat
-
Replicate
-
Roblox
-
RunPod
...
...
benchmarks/auto_tune/auto_tune.sh
View file @
a3f8d5dd
...
...
@@ -18,6 +18,11 @@ MIN_CACHE_HIT_PCT=${MIN_CACHE_HIT_PCT:-0}
MAX_LATENCY_ALLOWED_MS
=
${
MAX_LATENCY_ALLOWED_MS
:-
100000000000
}
NUM_SEQS_LIST
=
${
NUM_SEQS_LIST
:-
"128 256"
}
NUM_BATCHED_TOKENS_LIST
=
${
NUM_BATCHED_TOKENS_LIST
:-
"512 1024 2048 4096"
}
HOSTNAME
=
$(
hostname
)
if
[[
-z
"
$HOSTNAME
"
]]
;
then
echo
"Error: Failed to determine hostname."
>
&2
exit
1
fi
LOG_FOLDER
=
"
$BASE
/auto-benchmark/
$TAG
"
RESULT
=
"
$LOG_FOLDER
/result.txt"
...
...
@@ -82,6 +87,7 @@ start_server() {
"
$MODEL
"
"--disable-log-requests"
"--port"
"8004"
"--host"
"
$HOSTNAME
"
"--gpu-memory-utilization"
"
$gpu_memory_utilization
"
"--max-num-seqs"
"
$max_num_seqs
"
"--max-num-batched-tokens"
"
$max_num_batched_tokens
"
...
...
@@ -113,7 +119,7 @@ start_server() {
# since that we should always have permission to send signal to the server process.
kill
-0
$server_pid
2> /dev/null
||
break
RESPONSE
=
$(
curl
-s
-X
GET
"http://
0.0.0.0
:8004/health"
-w
"%{http_code}"
-o
/dev/stdout
)
RESPONSE
=
$(
curl
-s
-X
GET
"http://
${
HOSTNAME
}
:8004/health"
-w
"%{http_code}"
-o
/dev/stdout
)
STATUS_CODE
=
$(
echo
"
$RESPONSE
"
|
tail
-n
1
)
if
[[
"
$STATUS_CODE
"
-eq
200
]]
;
then
server_started
=
1
...
...
@@ -173,6 +179,7 @@ run_benchmark() {
--goodput
e2el:
$MAX_LATENCY_ALLOWED_MS
\
--num-prompts
1000
\
--random-prefix-len
$prefix_len
\
--host
"
$HOSTNAME
"
\
--port
8004 &>
"
$bm_log
"
throughput
=
$(
grep
"Request throughput (req/s):"
"
$bm_log
"
|
sed
's/[^0-9.]//g'
)
e2el
=
$(
grep
"P99 E2EL (ms):"
"
$bm_log
"
|
awk
'{print $NF}'
)
...
...
@@ -188,7 +195,7 @@ run_benchmark() {
request_rate
=
$((${
throughput
%.*
}
+
1
))
while
((
request_rate
>
0
))
;
do
# clear prefix cache
curl
-X
POST http://
0.0.0.0
:8004/reset_prefix_cache
curl
-X
POST http://
${
HOSTNAME
}
:8004/reset_prefix_cache
sleep
5
bm_log
=
"
$LOG_FOLDER
/bm_log_
${
max_num_seqs
}
_
${
max_num_batched_tokens
}
_requestrate_
${
request_rate
}
.txt"
vllm bench serve
\
...
...
@@ -204,6 +211,7 @@ run_benchmark() {
--goodput
e2el:
$MAX_LATENCY_ALLOWED_MS
\
--num-prompts
100
\
--random-prefix-len
$prefix_len
\
--host
"
$HOSTNAME
"
\
--port
8004 &>
"
$bm_log
"
throughput
=
$(
grep
"Request throughput (req/s):"
"
$bm_log
"
|
sed
's/[^0-9.]//g'
)
e2el
=
$(
grep
"P99 E2EL (ms):"
"
$bm_log
"
|
awk
'{print $NF}'
)
...
...
@@ -304,6 +312,7 @@ if (( $(echo "$best_throughput > 0" | bc -l) )); then
--goodput
e2el:
$MAX_LATENCY_ALLOWED_MS
\
--num-prompts
100
\
--random-prefix-len
$prefix_len
\
--host
"
$HOSTNAME
"
\
--port
8004
\
--profile
&>
"
$bm_log
"
else
...
...
benchmarks/backend_request_func.py
View file @
a3f8d5dd
...
...
@@ -620,7 +620,7 @@ def get_tokenizer(
kwargs
[
"use_fast"
]
=
False
if
tokenizer_mode
==
"mistral"
:
try
:
from
vllm.tokenizers
import
MistralTokenizer
from
vllm.tokenizers
.mistral
import
MistralTokenizer
except
ImportError
as
e
:
raise
ImportError
(
"MistralTokenizer requires vllm package.
\n
"
...
...
benchmarks/benchmark_ngram_proposer.py
View file @
a3f8d5dd
...
...
@@ -32,12 +32,11 @@ def benchmark_propose(args):
model_config
=
ModelConfig
(
model
=
"facebook/opt-125m"
,
task
=
"generate"
,
max_model_len
=
args
.
num_token
+
args
.
num_spec_token
,
tokenizer
=
"facebook/opt-125m"
,
tokenizer_mode
=
"auto"
,
dtype
=
"auto"
,
seed
=
None
,
seed
=
0
,
trust_remote_code
=
False
,
)
proposer
=
NgramProposer
(
...
...
benchmarks/benchmark_serving_structured_output.py
View file @
a3f8d5dd
...
...
@@ -574,7 +574,7 @@ async def benchmark(
)
print
(
"{:<40} {:<10.2f}"
.
format
(
"Total
T
oken throughput (tok/s):"
,
metrics
.
total_token_throughput
"Total
t
oken throughput (tok/s):"
,
metrics
.
total_token_throughput
)
)
...
...
benchmarks/kernels/benchmark_mla_k_concat.py
0 → 100644
View file @
a3f8d5dd
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""
Benchmark script comparing torch.cat vs direct copy for k_nope/k_pe concatenation
in MLA (Multi-head Latent Attention) prefill.
This validates that the optimization from commit 8d4142bd is beneficial across
various batch sizes, not just the originally tested batch size of 32768.
"""
import
time
from
collections.abc
import
Callable
import
torch
# DeepSeek-V3 MLA dimensions
NUM_HEADS = 128  # size of dim 1 of the k_nope benchmark tensor
QK_NOPE_HEAD_DIM = 128  # size of the last dim of k_nope
PE_DIM = 64  # size of the last dim of k_pe
def cat_method(k_nope: torch.Tensor, k_pe: torch.Tensor) -> torch.Tensor:
    """Baseline: expand ``k_pe`` to ``k_nope``'s leading dims, then ``torch.cat``.

    Args:
        k_nope: Tensor whose leading dimensions define the output's.
        k_pe: Tensor expanded to those leading dimensions; its last dimension
            is kept as-is (``-1`` in the expand call).

    Returns:
        ``k_nope`` and the expanded ``k_pe`` joined along the last dimension.
    """
    leading_dims = k_nope.shape[:-1]
    k_pe_expanded = k_pe.expand((*leading_dims, -1))
    return torch.cat((k_nope, k_pe_expanded), dim=-1)
def direct_copy_method(k_nope: torch.Tensor, k_pe: torch.Tensor) -> torch.Tensor:
    """Preallocate the result and fill it with two slice assignments.

    Args:
        k_nope: Tensor copied into the leading part of the last dimension.
        k_pe: Tensor copied into the trailing part of the last dimension.

    Returns:
        A new tensor with ``k_nope``'s dtype, device and leading dims, whose
        last dimension is ``k_nope.shape[-1] + k_pe.shape[-1]``.
    """
    nope_dim = k_nope.shape[-1]
    out_shape = (*k_nope.shape[:-1], nope_dim + k_pe.shape[-1])
    k = torch.empty(out_shape, dtype=k_nope.dtype, device=k_nope.device)
    k[..., :nope_dim] = k_nope
    # Slice assignment broadcasts k_pe over any singleton dims it has.
    k[..., nope_dim:] = k_pe
    return k
def benchmark_method(
    method: Callable,
    k_nope: torch.Tensor,
    k_pe: torch.Tensor,
    num_warmup: int = 10,
    num_iters: int = 100,
) -> float:
    """Time ``method(k_nope, k_pe)`` and return the mean latency in milliseconds.

    Args:
        method: Callable invoked as ``method(k_nope, k_pe)``; its result is discarded.
        k_nope: First argument forwarded to ``method``.
        k_pe: Second argument forwarded to ``method``.
        num_warmup: Untimed calls made before measuring.
        num_iters: Timed calls the total elapsed time is averaged over.

    Returns:
        Wall-clock time per call in milliseconds.
    """

    def _run_batch(count: int) -> None:
        # Issue `count` calls, then wait for the CUDA device to finish them.
        for _ in range(count):
            method(k_nope, k_pe)
        torch.cuda.synchronize()

    _run_batch(num_warmup)

    t0 = time.perf_counter()
    _run_batch(num_iters)
    elapsed_s = time.perf_counter() - t0

    return elapsed_s / num_iters * 1000  # seconds -> ms
@torch.inference_mode()
def run_benchmark(dtype: torch.dtype, dtype_name: str):
    """Benchmark ``cat_method`` against ``direct_copy_method`` for one dtype.

    Prints a latency table, a speedup summary and a conclusion to stdout.
    Side effect: sets the process-wide torch default device to ``"cuda"``.

    Args:
        dtype: Dtype the randomly generated inputs are converted to before timing.
        dtype_name: Label printed in the report header.

    Returns:
        One ``(batch_size, cat_ms, direct_ms, speedup, reduction_pct)`` tuple
        per tested batch size, in the order the sizes were run.
    """
    torch.set_default_device("cuda")

    # Batch sizes to test (powers of 2 from 32 to 65536)
    batch_sizes = [32, 64, 128, 256, 512, 1024, 2048, 4096, 8192, 16384, 32768, 65536]

    print("=" * 80)
    print("Benchmark: torch.cat vs direct copy for MLA k_nope/k_pe concatenation")
    print("=" * 80)
    print(
        f"Tensor shapes: k_nope=[B, {NUM_HEADS}, {QK_NOPE_HEAD_DIM}], "
        f"k_pe=[B, 1, {PE_DIM}]"
    )
    print(f"dtype: {dtype_name}")
    print()
    print(
        f"{'Batch Size':>12} | {'cat (ms)':>10} | {'direct (ms)':>12} | "
        f"{'Speedup':>8} | {'Reduction':>10}"
    )
    print("-" * 70)

    results = []
    for batch_size in batch_sizes:
        # Create input tensors (generate in float32 then convert for FP8 compatibility)
        k_nope = torch.randn(
            batch_size, NUM_HEADS, QK_NOPE_HEAD_DIM, dtype=torch.float32, device="cuda"
        ).to(dtype)
        k_pe = torch.randn(
            batch_size, 1, PE_DIM, dtype=torch.float32, device="cuda"
        ).to(dtype)

        # Benchmark both methods
        cat_time = benchmark_method(cat_method, k_nope, k_pe)
        direct_time = benchmark_method(direct_copy_method, k_nope, k_pe)

        speedup = cat_time / direct_time
        reduction = (1 - direct_time / cat_time) * 100

        results.append((batch_size, cat_time, direct_time, speedup, reduction))

        print(
            f"{batch_size:>12} | {cat_time:>10.3f} | {direct_time:>12.3f} | "
            f"{speedup:>7.2f}x | {reduction:>9.1f}%"
        )

    print("=" * 80)

    # Summary statistics
    speedups = [r[3] for r in results]
    print("\nSpeedup summary:")
    print(f"  Min:  {min(speedups):.2f}x")
    print(f"  Max:  {max(speedups):.2f}x")
    print(f"  Mean: {sum(speedups) / len(speedups):.2f}x")

    # Find crossover point: the first tested batch size where direct copy is
    # at least as fast as torch.cat.
    crossover_batch = None
    for batch_size, _, _, speedup, _ in results:
        if speedup >= 1.0:
            crossover_batch = batch_size
            break

    print("\nConclusion:")
    if crossover_batch:
        print(f"  - Direct copy becomes beneficial at batch size >= {crossover_batch}")
    else:
        # Previously nothing was reported when direct copy never won.
        print("  - Direct copy was not faster than torch.cat at any tested batch size")

    # Filter for large batches (>= 512 which is typical for prefill)
    large_batch_speedups = [r[3] for r in results if r[0] >= 512]
    if large_batch_speedups:
        avg_large = sum(large_batch_speedups) / len(large_batch_speedups)
        print(f"  - For batch sizes >= 512: avg speedup = {avg_large:.2f}x")
        # State the verdict the measurements support instead of asserting
        # effectiveness unconditionally.
        if avg_large >= 1.0:
            print(
                "  - MLA prefill typically uses large batches, so optimization is effective"
            )
        else:
            print(
                "  - MLA prefill typically uses large batches, so optimization is "
                "NOT effective in this run"
            )

    return results
@torch.inference_mode()
def main():
    """Run the benchmark for bfloat16 and then float8_e4m3fn, separated by blank output."""
    dtype_cases = (
        (torch.bfloat16, "bfloat16"),
        (torch.float8_e4m3fn, "float8_e4m3fn"),
    )
    for dtype, dtype_name in dtype_cases:
        print("\n")
        run_benchmark(dtype, dtype_name)


if __name__ == "__main__":
    main()
benchmarks/kernels/benchmark_mrope.py
View file @
a3f8d5dd
...
...
@@ -99,7 +99,6 @@ def benchmark_mrope(
# the parameters to compute the q k v size based on tp_size
mrope_helper_class
=
get_rope
(
head_size
=
head_dim
,
rotary_dim
=
head_dim
,
max_position
=
max_position
,
is_neox_style
=
is_neox_style
,
rope_parameters
=
rope_parameters
,
...
...
benchmarks/kernels/benchmark_rope.py
View file @
a3f8d5dd
...
...
@@ -32,8 +32,8 @@ def get_benchmark(head_size, rotary_dim, is_neox_style, device):
def
benchmark
(
batch_size
,
seq_len
,
num_heads
,
provider
):
dtype
=
torch
.
bfloat16
max_position
=
8192
base
=
10000
rope
=
get_rope
(
head_size
,
rotary_dim
,
max_position
,
base
,
is_neox_style
)
rope_parameters
=
{
"partial_rotary_factor"
:
rotary_dim
/
head_size
}
rope
=
get_rope
(
head_size
,
max_position
,
is_neox_style
,
rope_parameters
)
rope
=
rope
.
to
(
dtype
=
dtype
,
device
=
device
)
cos_sin_cache
=
rope
.
cos_sin_cache
.
to
(
dtype
=
torch
.
float
,
device
=
device
)
...
...
cmake/cpu_extension.cmake
View file @
a3f8d5dd
...
...
@@ -251,17 +251,6 @@ if ((AVX512_FOUND AND NOT AVX512_DISABLED) OR (ASIMD_FOUND AND NOT APPLE_SILICON
endif
()
# Build ACL with CMake
set
(
ARM_COMPUTE_BUILD_SHARED_LIB
"OFF"
)
set
(
CMAKE_BUILD_TYPE
"Release"
)
set
(
ARM_COMPUTE_ARCH
"armv8.2-a"
)
set
(
ARM_COMPUTE_ENABLE_ASSERTS
"OFF"
)
set
(
ARM_COMPUTE_ENABLE_CPPTHREADS
"OFF"
)
set
(
ONEDNN_ENABLE_PRIMITIVE
"MATMUL;REORDER"
)
set
(
ARM_COMPUTE_ENABLE_OPENMP
"ON"
)
set
(
ARM_COMPUTE_ENABLE_WERROR
"OFF"
)
set
(
ARM_COMPUTE_BUILD_EXAMPLES
"OFF"
)
set
(
ARM_COMPUTE_BUILD_TESTING
"OFF"
)
set
(
_cmake_config_cmd
${
CMAKE_COMMAND
}
-G Ninja -B build
-DARM_COMPUTE_BUILD_SHARED_LIB=OFF
...
...
Prev
1
2
3
4
5
…
25
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment