Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
45a060d6
Commit
45a060d6
authored
Feb 05, 2026
by
zhuwenwen
Browse files
Merge tag 'v0.15.1' into v0.15.1-dev
parents
99fc9fc3
1892993b
Changes
64
Hide whitespace changes
Inline
Side-by-side
Showing
20 changed files
with
413 additions
and
204 deletions
+413
-204
.buildkite/release-pipeline.yaml
.buildkite/release-pipeline.yaml
+2
-2
.buildkite/scripts/annotate-release.sh
.buildkite/scripts/annotate-release.sh
+39
-17
.buildkite/scripts/upload-release-wheels-pypi.sh
.buildkite/scripts/upload-release-wheels-pypi.sh
+10
-44
README.md
README.md
+9
-21
docs/models/supported_models.md
docs/models/supported_models.md
+1
-0
examples/pooling/score/vision_rerank_api_online.py
examples/pooling/score/vision_rerank_api_online.py
+46
-49
examples/pooling/score/vision_score_api_online.py
examples/pooling/score/vision_score_api_online.py
+54
-45
requirements/build.txt
requirements/build.txt
+1
-1
requirements/common.txt
requirements/common.txt
+2
-2
requirements/kv_connectors.txt
requirements/kv_connectors.txt
+1
-1
requirements/rocm-test.txt
requirements/rocm-test.txt
+1
-1
requirements/test.txt
requirements/test.txt
+1
-1
setup.py
setup.py
+3
-3
tests/compile/test_cold_start.py
tests/compile/test_cold_start.py
+48
-0
tests/compile/test_graph_partition.py
tests/compile/test_graph_partition.py
+62
-0
tests/entrypoints/pooling/classify/test_online_vision.py
tests/entrypoints/pooling/classify/test_online_vision.py
+2
-2
tests/entrypoints/pooling/score/test_online_score_vision.py
tests/entrypoints/pooling/score/test_online_score_vision.py
+122
-0
tests/entrypoints/test_utils.py
tests/entrypoints/test_utils.py
+0
-12
tests/kernels/core/test_activation.py
tests/kernels/core/test_activation.py
+8
-2
tests/kernels/moe/test_moe.py
tests/kernels/moe/test_moe.py
+1
-1
No files found.
.buildkite/release-pipeline.yaml
View file @
45a060d6
...
@@ -274,14 +274,14 @@ steps:
...
@@ -274,14 +274,14 @@ steps:
-
input-release-version
-
input-release-version
-
build-wheels
-
build-wheels
-
label
:
"
Upload
release
wheels
to
PyPI
and
GitHub
"
-
label
:
"
Upload
release
wheels
to
PyPI"
depends_on
:
depends_on
:
-
block-upload-release-wheels
-
block-upload-release-wheels
id
:
upload-release-wheels
id
:
upload-release-wheels
agents
:
agents
:
queue
:
small_cpu_queue_postmerge
queue
:
small_cpu_queue_postmerge
commands
:
commands
:
-
"
bash
.buildkite/scripts/upload-release-wheels.sh"
-
"
bash
.buildkite/scripts/upload-release-wheels
-pypi
.sh"
# =============================================================================
# =============================================================================
# ROCm Release Pipeline (x86_64 only)
# ROCm Release Pipeline (x86_64 only)
...
...
.buildkite/scripts/annotate-release.sh
View file @
45a060d6
...
@@ -11,58 +11,80 @@ fi
...
@@ -11,58 +11,80 @@ fi
buildkite-agent annotate
--style
'info'
--context
'release-workflow'
<<
EOF
buildkite-agent annotate
--style
'info'
--context
'release-workflow'
<<
EOF
To download the wheel (by commit):
To download the wheel (by commit):
\`\`\`
\`\`\`
aws s3 cp s3://vllm-wheels/
${
BUILDKITE_COMMIT
}
/vllm-
${
RELEASE_VERSION
}
-cp38-abi3-manylinux1_x86_64.whl .
aws s3 cp s3://vllm-wheels/
${
BUILDKITE_COMMIT
}
/vllm-
${
RELEASE_VERSION
}
-cp38-abi3-manylinux
_2_3
1_x86_64.whl .
aws s3 cp s3://vllm-wheels/
${
BUILDKITE_COMMIT
}
/vllm-
${
RELEASE_VERSION
}
-cp38-abi3-manylinux
2014
_aarch64.whl .
aws s3 cp s3://vllm-wheels/
${
BUILDKITE_COMMIT
}
/vllm-
${
RELEASE_VERSION
}
-cp38-abi3-manylinux
_2_31
_aarch64.whl .
aws s3 cp s3://vllm-wheels/
${
BUILDKITE_COMMIT
}
/vllm-
${
RELEASE_VERSION
}
+cu129-cp38-abi3-manylinux1_x86_64.whl .
(Optional) For CUDA 13.0:
aws s3 cp s3://vllm-wheels/
${
BUILDKITE_COMMIT
}
/vllm-
${
RELEASE_VERSION
}
+cu1
29
-cp38-abi3-manylinux
1
_x86_64.whl .
aws s3 cp s3://vllm-wheels/
${
BUILDKITE_COMMIT
}
/vllm-
${
RELEASE_VERSION
}
+cu1
30
-cp38-abi3-manylinux
_2_35
_x86_64.whl .
\`\`\`
aws s3 cp s3://vllm-wheels/
${
BUILDKITE_COMMIT
}
/vllm-
${
RELEASE_VERSION
}
+cu130-cp38-abi3-manylinux_2_35_aarch64.whl .
To download the wheel (by version):
(Optional) For CPU:
aws s3 cp s3://vllm-wheels/
${
BUILDKITE_COMMIT
}
/vllm-
${
RELEASE_VERSION
}
+cpu-cp38-abi3-manylinux_2_35_x86_64.whl .
aws s3 cp s3://vllm-wheels/
${
BUILDKITE_COMMIT
}
/vllm-
${
RELEASE_VERSION
}
+cpu-cp38-abi3-manylinux_2_35_aarch64.whl .
\`\`\`
\`\`\`
aws s3 cp s3://vllm-wheels/
${
RELEASE_VERSION
}
/vllm-
${
RELEASE_VERSION
}
-cp38-abi3-manylinux1_x86_64.whl .
aws s3 cp s3://vllm-wheels/
${
RELEASE_VERSION
}
/vllm-
${
RELEASE_VERSION
}
-cp38-abi3-manylinux2014_aarch64.whl .
aws s3 cp s3://vllm-wheels/
${
RELEASE_VERSION
}
+cu129/vllm-
${
RELEASE_VERSION
}
+cu129-cp38-abi3-manylinux1_x86_64.whl .
aws s3 cp s3://vllm-wheels/
${
RELEASE_VERSION
}
+cu130/vllm-
${
RELEASE_VERSION
}
+cu130-cp38-abi3-manylinux1_x86_64.whl .
\`\`\`
To download and upload the image:
To download and upload the image:
\`\`\`
\`\`\`
Download images:
docker pull public.ecr.aws/q9t5s3a7/vllm-release-repo:
${
BUILDKITE_COMMIT
}
-x86_64
docker pull public.ecr.aws/q9t5s3a7/vllm-release-repo:
${
BUILDKITE_COMMIT
}
-x86_64
docker pull public.ecr.aws/q9t5s3a7/vllm-release-repo:
${
BUILDKITE_COMMIT
}
-aarch64
docker pull public.ecr.aws/q9t5s3a7/vllm-release-repo:
${
BUILDKITE_COMMIT
}
-aarch64
docker pull public.ecr.aws/q9t5s3a7/vllm-release-repo:
${
BUILDKITE_COMMIT
}
-x86_64-cu130
docker pull public.ecr.aws/q9t5s3a7/vllm-release-repo:
${
BUILDKITE_COMMIT
}
-aarch64-cu130
docker pull public.ecr.aws/q9t5s3a7/vllm-release-repo:
${
BUILDKITE_COMMIT
}
-rocm-base
docker pull public.ecr.aws/q9t5s3a7/vllm-release-repo:
${
BUILDKITE_COMMIT
}
-rocm-base
docker pull public.ecr.aws/q9t5s3a7/vllm-release-repo:
${
BUILDKITE_COMMIT
}
-rocm
docker pull public.ecr.aws/q9t5s3a7/vllm-release-repo:
${
BUILDKITE_COMMIT
}
-rocm
Tag and push images:
docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:
${
BUILDKITE_COMMIT
}
-x86_64 vllm/vllm-openai:x86_64
docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:
${
BUILDKITE_COMMIT
}
-x86_64 vllm/vllm-openai:x86_64
docker tag vllm/vllm-openai:x86_64 vllm/vllm-openai:latest-x86_64
docker tag vllm/vllm-openai:x86_64 vllm/vllm-openai:latest-x86_64
docker tag vllm/vllm-openai:x86_64 vllm/vllm-openai:v
${
RELEASE_VERSION
}
-x86_64
docker tag vllm/vllm-openai:x86_64 vllm/vllm-openai:v
${
RELEASE_VERSION
}
-x86_64
docker push vllm/vllm-openai:latest-x86_64
docker push vllm/vllm-openai:latest-x86_64
docker push vllm/vllm-openai:v
${
RELEASE_VERSION
}
-x86_64
docker push vllm/vllm-openai:v
${
RELEASE_VERSION
}
-x86_64
docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:
${
BUILDKITE_COMMIT
}
-x86_64-cu130 vllm/vllm-openai:x86_64-cu130
docker tag vllm/vllm-openai:x86_64-cu130 vllm/vllm-openai:latest-x86_64-cu130
docker tag vllm/vllm-openai:x86_64-cu130 vllm/vllm-openai:v
${
RELEASE_VERSION
}
-x86_64-cu130
docker push vllm/vllm-openai:latest-x86_64-cu130
docker push vllm/vllm-openai:v
${
RELEASE_VERSION
}
-x86_64-cu130
docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:
${
BUILDKITE_COMMIT
}
-aarch64 vllm/vllm-openai:aarch64
docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:
${
BUILDKITE_COMMIT
}
-aarch64 vllm/vllm-openai:aarch64
docker tag vllm/vllm-openai:aarch64 vllm/vllm-openai:latest-aarch64
docker tag vllm/vllm-openai:aarch64 vllm/vllm-openai:latest-aarch64
docker tag vllm/vllm-openai:aarch64 vllm/vllm-openai:v
${
RELEASE_VERSION
}
-aarch64
docker tag vllm/vllm-openai:aarch64 vllm/vllm-openai:v
${
RELEASE_VERSION
}
-aarch64
docker push vllm/vllm-openai:latest-aarch64
docker push vllm/vllm-openai:latest-aarch64
docker push vllm/vllm-openai:v
${
RELEASE_VERSION
}
-aarch64
docker push vllm/vllm-openai:v
${
RELEASE_VERSION
}
-aarch64
docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:
${
BUILDKITE_COMMIT
}
-aarch64-cu130 vllm/vllm-openai:aarch64-cu130
docker tag vllm/vllm-openai:aarch64-cu130 vllm/vllm-openai:latest-aarch64-cu130
docker tag vllm/vllm-openai:aarch64-cu130 vllm/vllm-openai:v
${
RELEASE_VERSION
}
-aarch64-cu130
docker push vllm/vllm-openai:latest-aarch64-cu130
docker push vllm/vllm-openai:v
${
RELEASE_VERSION
}
-aarch64-cu130
docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:
${
BUILDKITE_COMMIT
}
-rocm vllm/vllm-openai-rocm:
${
BUILDKITE_COMMIT
}
-rocm
docker tag vllm/vllm-openai-rocm:
${
BUILDKITE_COMMIT
}
-rocm vllm/vllm-openai-rocm:latest
docker tag vllm/vllm-openai-rocm:
${
BUILDKITE_COMMIT
}
-rocm vllm/vllm-openai-rocm:v
${
RELEASE_VERSION
}
-rocm
docker push vllm/vllm-openai-rocm:latest
docker push vllm/vllm-openai-rocm:v
${
RELEASE_VERSION
}
-rocm
Create multi-arch manifest:
docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:
${
BUILDKITE_COMMIT
}
-rocm-base vllm/vllm-openai-rocm:
${
BUILDKITE_COMMIT
}
-base
docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:
${
BUILDKITE_COMMIT
}
-rocm-base vllm/vllm-openai-rocm:
${
BUILDKITE_COMMIT
}
-base
docker tag vllm/vllm-openai-rocm:
${
BUILDKITE_COMMIT
}
-base vllm/vllm-openai-rocm:latest-base
docker tag vllm/vllm-openai-rocm:
${
BUILDKITE_COMMIT
}
-base vllm/vllm-openai-rocm:latest-base
docker tag vllm/vllm-openai-rocm:
${
BUILDKITE_COMMIT
}
-base vllm/vllm-openai-rocm:v
${
RELEASE_VERSION
}
-base
docker tag vllm/vllm-openai-rocm:
${
BUILDKITE_COMMIT
}
-base vllm/vllm-openai-rocm:v
${
RELEASE_VERSION
}
-base
docker push vllm/vllm-openai-rocm:latest-base
docker push vllm/vllm-openai-rocm:latest-base
docker push vllm/vllm-openai-rocm:v
${
RELEASE_VERSION
}
-base
docker push vllm/vllm-openai-rocm:v
${
RELEASE_VERSION
}
-base
docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:
${
BUILDKITE_COMMIT
}
-rocm vllm/vllm-openai-rocm:
${
BUILDKITE_COMMIT
}
docker tag vllm/vllm-openai-rocm:
${
BUILDKITE_COMMIT
}
vllm/vllm-openai-rocm:latest
docker tag vllm/vllm-openai-rocm:
${
BUILDKITE_COMMIT
}
vllm/vllm-openai-rocm:v
${
RELEASE_VERSION
}
docker push vllm/vllm-openai-rocm:latest
docker push vllm/vllm-openai-rocm:v
${
RELEASE_VERSION
}
docker manifest rm vllm/vllm-openai:latest
docker manifest rm vllm/vllm-openai:latest
docker manifest create vllm/vllm-openai:latest vllm/vllm-openai:latest-x86_64 vllm/vllm-openai:latest-aarch64
docker manifest create vllm/vllm-openai:latest vllm/vllm-openai:latest-x86_64 vllm/vllm-openai:latest-aarch64
docker manifest create vllm/vllm-openai:v
${
RELEASE_VERSION
}
vllm/vllm-openai:v
${
RELEASE_VERSION
}
-x86_64 vllm/vllm-openai:v
${
RELEASE_VERSION
}
-aarch64
docker manifest create vllm/vllm-openai:v
${
RELEASE_VERSION
}
vllm/vllm-openai:v
${
RELEASE_VERSION
}
-x86_64 vllm/vllm-openai:v
${
RELEASE_VERSION
}
-aarch64
docker manifest push vllm/vllm-openai:latest
docker manifest push vllm/vllm-openai:latest
docker manifest push vllm/vllm-openai:v
${
RELEASE_VERSION
}
docker manifest push vllm/vllm-openai:v
${
RELEASE_VERSION
}
docker manifest rm vllm/vllm-openai:latest-cu130
docker manifest create vllm/vllm-openai:latest-cu130 vllm/vllm-openai:latest-x86_64-cu130 vllm/vllm-openai:latest-aarch64-cu130
docker manifest create vllm/vllm-openai:v
${
RELEASE_VERSION
}
-cu130 vllm/vllm-openai:v
${
RELEASE_VERSION
}
-x86_64-cu130 vllm/vllm-openai:v
${
RELEASE_VERSION
}
-aarch64-cu130
docker manifest push vllm/vllm-openai:latest-cu130
docker manifest push vllm/vllm-openai:v
${
RELEASE_VERSION
}
-cu130
\`\`\`
\`\`\`
EOF
EOF
.buildkite/scripts/upload-release-wheels.sh
→
.buildkite/scripts/upload-release-wheels
-pypi
.sh
View file @
45a060d6
...
@@ -7,17 +7,19 @@ SUBPATH=$BUILDKITE_COMMIT
...
@@ -7,17 +7,19 @@ SUBPATH=$BUILDKITE_COMMIT
S3_COMMIT_PREFIX
=
"s3://
$BUCKET
/
$SUBPATH
/"
S3_COMMIT_PREFIX
=
"s3://
$BUCKET
/
$SUBPATH
/"
RELEASE_VERSION
=
$(
buildkite-agent meta-data get release-version
)
RELEASE_VERSION
=
$(
buildkite-agent meta-data get release-version
)
echo
"Release version from Buildkite:
$RELEASE_VERSION
"
GIT_VERSION
=
$(
git describe
--exact-match
--tags
$BUILDKITE_COMMIT
2>/dev/null
)
GIT_VERSION
=
$(
git describe
--exact-match
--tags
$BUILDKITE_COMMIT
2>/dev/null
)
if
[
-z
"
$GIT_VERSION
"
]
;
then
echo
"Release version from Buildkite:
$RELEASE_VERSION
"
if
[[
-z
"
$GIT_VERSION
"
]]
;
then
echo
"[FATAL] Not on a git tag, cannot create release."
echo
"[FATAL] Not on a git tag, cannot create release."
exit
1
exit
1
else
else
echo
"Git version for commit
$BUILDKITE_COMMIT
:
$GIT_VERSION
"
echo
"Git version for commit
$BUILDKITE_COMMIT
:
$GIT_VERSION
"
fi
fi
# sanity check for version mismatch
# sanity check for version mismatch
if
[
"
$RELEASE_VERSION
"
!=
"
$GIT_VERSION
"
]
;
then
if
[
[
"
$RELEASE_VERSION
"
!=
"
$GIT_VERSION
"
]
]
;
then
if
[
"
$FORCE_RELEASE_IGNORE_VERSION_MISMATCH
"
==
"true"
]
;
then
if
[
[
"
$FORCE_RELEASE_IGNORE_VERSION_MISMATCH
"
==
"true"
]
]
;
then
echo
"[WARNING] Force release and ignore version mismatch"
echo
"[WARNING] Force release and ignore version mismatch"
else
else
echo
"[FATAL] Release version from Buildkite does not match Git version."
echo
"[FATAL] Release version from Buildkite does not match Git version."
...
@@ -27,7 +29,7 @@ fi
...
@@ -27,7 +29,7 @@ fi
PURE_VERSION
=
${
RELEASE_VERSION
#v
}
# remove leading 'v'
PURE_VERSION
=
${
RELEASE_VERSION
#v
}
# remove leading 'v'
# check pypi token
# check pypi token
if
[
-z
"
$PYPI_TOKEN
"
]
;
then
if
[
[
-z
"
$PYPI_TOKEN
"
]
]
;
then
echo
"[FATAL] PYPI_TOKEN is not set."
echo
"[FATAL] PYPI_TOKEN is not set."
exit
1
exit
1
else
else
...
@@ -35,41 +37,8 @@ else
...
@@ -35,41 +37,8 @@ else
export
TWINE_PASSWORD
=
"
$PYPI_TOKEN
"
export
TWINE_PASSWORD
=
"
$PYPI_TOKEN
"
fi
fi
# check github token
if
[
-z
"
$GITHUB_TOKEN
"
]
;
then
echo
"[FATAL] GITHUB_TOKEN is not set."
exit
1
else
export
GH_TOKEN
=
"
$GITHUB_TOKEN
"
fi
set
-x
# avoid printing secrets above
set
-x
# avoid printing secrets above
# download gh CLI from github
# Get latest gh CLI version from GitHub API
GH_VERSION
=
$(
curl
-s
https://api.github.com/repos/cli/cli/releases/latest |
grep
'"tag_name":'
|
sed
-E
's/.*"([^"]+)".*/\1/'
|
sed
's/^v//'
)
if
[
-z
"
$GH_VERSION
"
]
;
then
echo
"[FATAL] Failed to get latest gh CLI version from GitHub"
exit
1
fi
echo
"Downloading gh CLI version:
$GH_VERSION
"
GH_TARBALL
=
"gh_
${
GH_VERSION
}
_linux_amd64.tar.gz"
GH_URL
=
"https://github.com/cli/cli/releases/download/v
${
GH_VERSION
}
/
${
GH_TARBALL
}
"
GH_INSTALL_DIR
=
"/tmp/gh-install"
mkdir
-p
"
$GH_INSTALL_DIR
"
pushd
"
$GH_INSTALL_DIR
"
curl
-L
-o
"
$GH_TARBALL
"
"
$GH_URL
"
tar
-xzf
"
$GH_TARBALL
"
GH_BIN
=
$(
realpath
$(
find
.
-name
"gh"
-type
f
-executable
|
head
-n
1
))
if
[
-z
"
$GH_BIN
"
]
;
then
echo
"[FATAL] Failed to find gh CLI executable"
exit
1
fi
echo
"gh CLI downloaded successfully, version:
$(
$GH_BIN
--version
)
"
echo
"Last 5 releases on GitHub:"
# as a sanity check of gh and GH_TOKEN
command
"
$GH_BIN
"
release list
--limit
5
popd
# install twine from pypi
# install twine from pypi
python3
-m
venv /tmp/vllm-release-env
python3
-m
venv /tmp/vllm-release-env
source
/tmp/vllm-release-env/bin/activate
source
/tmp/vllm-release-env/bin/activate
...
@@ -89,16 +58,13 @@ echo "Wheels copied to local directory"
...
@@ -89,16 +58,13 @@ echo "Wheels copied to local directory"
git archive
--format
=
tar.gz
--output
=
"
$DIST_DIR
/vllm-
${
PURE_VERSION
}
.tar.gz"
$BUILDKITE_COMMIT
git archive
--format
=
tar.gz
--output
=
"
$DIST_DIR
/vllm-
${
PURE_VERSION
}
.tar.gz"
$BUILDKITE_COMMIT
ls
-la
$DIST_DIR
ls
-la
$DIST_DIR
# upload wheels to PyPI (only default variant, i.e. files without '+' in the name)
# upload wheels to PyPI (only default variant, i.e. files without '+' in the name)
PYPI_WHEEL_FILES
=
$(
find
$DIST_DIR
-name
"vllm-
${
PURE_VERSION
}
*.whl"
-not
-name
"*+*"
)
PYPI_WHEEL_FILES
=
$(
find
$DIST_DIR
-name
"vllm-
${
PURE_VERSION
}
*.whl"
-not
-name
"*+*"
)
if
[
-z
"
$PYPI_WHEEL_FILES
"
]
;
then
if
[
[
-z
"
$PYPI_WHEEL_FILES
"
]
]
;
then
echo
"No default variant wheels found, quitting..."
echo
"No default variant wheels found, quitting..."
exit
1
exit
1
fi
fi
python3
-m
twine check
$PYPI_WHEEL_FILES
python3
-m
twine check
$PYPI_WHEEL_FILES
python3
-m
twine
--non-interactive
--verbose
upload
$PYPI_WHEEL_FILES
python3
-m
twine
upload
--non-interactive
--verbose
$PYPI_WHEEL_FILES
echo
"Wheels uploaded to PyPI"
echo
"Wheels uploaded to PyPI"
# create release on GitHub with the release version and all wheels
command
"
$GH_BIN
"
release create
$GIT_VERSION
-d
--latest
--notes-from-tag
--verify-tag
$DIST_DIR
/
*
.whl
README.md
View file @
45a060d6
...
@@ -2,10 +2,6 @@
...
@@ -2,10 +2,6 @@
## 简介
## 简介
vLLM是一个快速且易于使用的LLM推理和服务库,使用PageAttention高效管理kv内存,Continuous batching传入请求,支持很多Hugging Face模型,如LLaMA & LLaMA-2、Qwen、Chatglm2 & Chatglm3等。
vLLM是一个快速且易于使用的LLM推理和服务库,使用PageAttention高效管理kv内存,Continuous batching传入请求,支持很多Hugging Face模型,如LLaMA & LLaMA-2、Qwen、Chatglm2 & Chatglm3等。
## 暂不支持的官方功能
-
**量化推理**
:目前不支持marlin的权重量化
-
**模块支持**
:目前不支持Sliding window attention
## 支持模型结构列表
## 支持模型结构列表
...
@@ -51,29 +47,23 @@ vLLM是一个快速且易于使用的LLM推理和服务库,使用PageAttention
...
@@ -51,29 +47,23 @@ vLLM是一个快速且易于使用的LLM推理和服务库,使用PageAttention
| XLMRobertaForSequenceClassification | bge-reranker-v2-m3 | Yes | No | - | v0.7.2 | No |
| XLMRobertaForSequenceClassification | bge-reranker-v2-m3 | Yes | No | - | v0.7.2 | No |
## 安装
vLLM支持
+
Python 3.9.
+
Python 3.10.
+
Python 3.11.
+
Python 3.12.
##
#
使用源码编译方式安装
## 使用源码编译方式安装
###
#
编译环境准备
### 编译环境准备
提供2种环境准备方式:
提供2种环境准备方式:
1.
基于光源pytorch2.9.0
1
基础镜像环境:根据pytorch2.9.0、python、dtk及系统下载对应的镜像版本。
1.
基于光源pytorch2.9.0基础镜像环境:根据pytorch2.9.0、python、dtk及系统下载对应的镜像版本。
2.
基于现有python环境:安装pytorch2.9.0
1
,pytorch whl包下载目录:
[
https://cancon.hpccube.com:65024/4/main/pytorch
](
https://cancon.hpccube.com:65024/4/main/pytorch
)
,根据python、dtk版本,下载对应pytorch2.9.0的whl包。安装命令如下:
2.
基于现有python环境:安装pytorch2.9.0,pytorch whl包下载目录:
[
https://cancon.hpccube.com:65024/4/main/pytorch
](
https://cancon.hpccube.com:65024/4/main/pytorch
)
,根据python、dtk版本,下载对应pytorch2.9.0的whl包。安装命令如下:
```
shell
```
shell
pip
install
torch
*
(
下载的torch的whl包
)
pip
install
torch
*
(
下载的torch的whl包
)
pip
install
setuptools wheel
pip
install
setuptools wheel
```
```
###
#
源码编译安装
### 源码编译安装
```
shell
```
shell
git clone http://
developer.hpccube.com/codes/OpenDAS
/vllm.git
# 根据需要的分支进行切换
git clone http://
10.16.6.30/dcutoolkit/deeplearing
/vllm.git
# 根据需要的分支进行切换
```
```
安装依赖:
安装依赖:
```
shell
```
shell
...
@@ -91,10 +81,8 @@ python3 setup.py install (若调试,可使用python3 setup.py develop)
...
@@ -91,10 +81,8 @@ python3 setup.py install (若调试,可使用python3 setup.py develop)
```
```
若需要添加git号,设置环境变量: export ADD_GIT_VERSION=1
若需要添加git号,设置环境变量: export ADD_GIT_VERSION=1
3.
跳过编译(适用于未改变csrc目录kernel并多次编译情况)
将编译后的so文件拷贝至csrc目录,并设置环境变量: export SKIP_VLLM_BUILD=1
###
#
运行基础环境准备
### 运行基础环境准备
1、使用上面基于光源pytorch2.9.0基础镜像环境
1、使用上面基于光源pytorch2.9.0基础镜像环境
2、根据pytorch2.9.0、python、dtk及系统下载对应的依赖包:
2、根据pytorch2.9.0、python、dtk及系统下载对应的依赖包:
...
@@ -104,11 +92,11 @@ python3 setup.py install (若调试,可使用python3 setup.py develop)
...
@@ -104,11 +92,11 @@ python3 setup.py install (若调试,可使用python3 setup.py develop)
-
lightop:
[
https://cancon.hpccube.com:65024/4/main/lightop
](
https://cancon.hpccube.com:65024/4/main/lightop
)
-
lightop:
[
https://cancon.hpccube.com:65024/4/main/lightop
](
https://cancon.hpccube.com:65024/4/main/lightop
)
-
lmslim:
[
https://cancon.hpccube.com:65024/4/main/lmslim
](
https://cancon.hpccube.com:65024/4/main/lmslim
)
-
lmslim:
[
https://cancon.hpccube.com:65024/4/main/lmslim
](
https://cancon.hpccube.com:65024/4/main/lmslim
)
###
#
注意事项
### 注意事项
+
若使用 pip install 下载安装过慢,可添加源: -i https://pypi.tuna.tsinghua.edu.cn/simple/
+
若使用 pip install 下载安装过慢,可添加源: -i https://pypi.tuna.tsinghua.edu.cn/simple/
## 验证
## 验证
-
python -c "import vllm; print(vllm.
\_\_
version__)",版本号与官方版本同步,查询该软件的版本号,例如0.15.
0
;
-
python -c "import vllm; print(vllm.
\_\_
version__)",版本号与官方版本同步,查询该软件的版本号,例如0.15.
1
;
## Known Issue
## Known Issue
-
无
-
无
...
...
docs/models/supported_models.md
View file @
45a060d6
...
@@ -456,6 +456,7 @@ th {
...
@@ -456,6 +456,7 @@ th {
|
`StableLmForCausalLM`
| StableLM |
`stabilityai/stablelm-3b-4e1t`
,
`stabilityai/stablelm-base-alpha-7b-v2`
, etc. | | |
|
`StableLmForCausalLM`
| StableLM |
`stabilityai/stablelm-3b-4e1t`
,
`stabilityai/stablelm-base-alpha-7b-v2`
, etc. | | |
|
`Starcoder2ForCausalLM`
| Starcoder2 |
`bigcode/starcoder2-3b`
,
`bigcode/starcoder2-7b`
,
`bigcode/starcoder2-15b`
, etc. | | ✅︎ |
|
`Starcoder2ForCausalLM`
| Starcoder2 |
`bigcode/starcoder2-3b`
,
`bigcode/starcoder2-7b`
,
`bigcode/starcoder2-15b`
, etc. | | ✅︎ |
|
`Step1ForCausalLM`
| Step-Audio |
`stepfun-ai/Step-Audio-EditX`
, etc. | ✅︎ | ✅︎ |
|
`Step1ForCausalLM`
| Step-Audio |
`stepfun-ai/Step-Audio-EditX`
, etc. | ✅︎ | ✅︎ |
|
`Step3p5ForCausalLM`
| Step-3.5-flash |
`stepfun-ai/step-3.5-flash`
, etc. | | ✅︎ |
|
`TeleChat2ForCausalLM`
| TeleChat2 |
`Tele-AI/TeleChat2-3B`
,
`Tele-AI/TeleChat2-7B`
,
`Tele-AI/TeleChat2-35B`
, etc. | ✅︎ | ✅︎ |
|
`TeleChat2ForCausalLM`
| TeleChat2 |
`Tele-AI/TeleChat2-3B`
,
`Tele-AI/TeleChat2-7B`
,
`Tele-AI/TeleChat2-35B`
, etc. | ✅︎ | ✅︎ |
|
`TeleFLMForCausalLM`
| TeleFLM |
`CofeAI/FLM-2-52B-Instruct-2407`
,
`CofeAI/Tele-FLM`
, etc. | ✅︎ | ✅︎ |
|
`TeleFLMForCausalLM`
| TeleFLM |
`CofeAI/FLM-2-52B-Instruct-2407`
,
`CofeAI/Tele-FLM`
, etc. | ✅︎ | ✅︎ |
|
`XverseForCausalLM`
| XVERSE |
`xverse/XVERSE-7B-Chat`
,
`xverse/XVERSE-13B-Chat`
,
`xverse/XVERSE-65B-Chat`
, etc. | ✅︎ | ✅︎ |
|
`XverseForCausalLM`
| XVERSE |
`xverse/XVERSE-7B-Chat`
,
`xverse/XVERSE-13B-Chat`
,
`xverse/XVERSE-65B-Chat`
, etc. | ✅︎ | ✅︎ |
...
...
examples/pooling/score/vision_rerank_api_online.py
View file @
45a060d6
...
@@ -18,48 +18,32 @@ e.g.
...
@@ -18,48 +18,32 @@ e.g.
"""
"""
import
argparse
import
argparse
import
base64
import
pprint
import
json
import
requests
import
requests
from
vllm.multimodal.utils
import
encode_image_url
,
fetch_image
def
encode_base64_content_from_url
(
content_url
:
str
)
->
dict
[
str
,
str
]:
"""Encode a content retrieved from a remote url to base64 format."""
with
requests
.
get
(
content_url
,
headers
=
headers
)
as
response
:
response
.
raise_for_status
()
result
=
base64
.
b64encode
(
response
.
content
).
decode
(
"utf-8"
)
return
{
"url"
:
f
"data:image/jpeg;base64,
{
result
}
"
}
headers
=
{
"accept"
:
"application/json"
,
"Content-Type"
:
"application/json"
}
query
=
"A woman playing with her dog on a beach at sunset."
query
=
"A woman playing with her dog on a beach at sunset."
documents
=
{
document
=
(
"content"
:
[
"A woman shares a joyful moment with her golden retriever on a sun-drenched beach at sunset, "
{
"as the dog offers its paw in a heartwarming display of companionship and trust."
"type"
:
"text"
,
)
"text"
:
(
image_url
=
"https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen-VL/assets/demo.jpeg"
"A woman shares a joyful moment with her golden retriever on a sun-drenched beach at sunset, "
documents
=
[
"as the dog offers its paw in a heartwarming display of companionship and trust."
{
),
"type"
:
"text"
,
},
"text"
:
document
,
{
},
"type"
:
"image_url"
,
{
"image_url"
:
{
"type"
:
"image_url"
,
"url"
:
"https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen-VL/assets/demo.jpeg"
"image_url"
:
{
"url"
:
image_url
},
},
},
},
{
{
"type"
:
"image_url"
,
"type"
:
"image_url"
,
"image_url"
:
{
"url"
:
encode_image_url
(
fetch_image
(
image_url
))},
"image_url"
:
encode_base64_content_from_url
(
},
"https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen-VL/assets/demo.jpeg"
]
),
},
]
}
def
parse_args
():
def
parse_args
():
...
@@ -74,23 +58,36 @@ def main(args):
...
@@ -74,23 +58,36 @@ def main(args):
models_url
=
base_url
+
"/v1/models"
models_url
=
base_url
+
"/v1/models"
rerank_url
=
base_url
+
"/rerank"
rerank_url
=
base_url
+
"/rerank"
response
=
requests
.
get
(
models_url
,
headers
=
headers
)
response
=
requests
.
get
(
models_url
)
model
=
response
.
json
()[
"data"
][
0
][
"id"
]
model
=
response
.
json
()[
"data"
][
0
][
"id"
]
data
=
{
print
(
"Query: string & Document: list of string"
)
prompt
=
{
"model"
:
model
,
"query"
:
query
,
"documents"
:
[
document
]}
response
=
requests
.
post
(
rerank_url
,
json
=
prompt
)
pprint
.
pprint
(
response
.
json
())
print
(
"Query: string & Document: text"
)
prompt
=
{
"model"
:
model
,
"query"
:
query
,
"documents"
:
{
"content"
:
[
documents
[
0
]]}}
response
=
requests
.
post
(
rerank_url
,
json
=
prompt
)
pprint
.
pprint
(
response
.
json
())
print
(
"Query: string & Document: image url"
)
prompt
=
{
"model"
:
model
,
"query"
:
query
,
"documents"
:
{
"content"
:
[
documents
[
1
]]},
}
response
=
requests
.
post
(
rerank_url
,
json
=
prompt
)
pprint
.
pprint
(
response
.
json
())
print
(
"Query: string & Document: image base64"
)
prompt
=
{
"model"
:
model
,
"model"
:
model
,
"query"
:
query
,
"query"
:
query
,
"documents"
:
documents
,
"documents"
:
{
"content"
:
[
documents
[
2
]]}
,
}
}
response
=
requests
.
post
(
rerank_url
,
headers
=
headers
,
json
=
data
)
response
=
requests
.
post
(
rerank_url
,
json
=
prompt
)
pprint
.
pprint
(
response
.
json
())
# Check the response
if
response
.
status_code
==
200
:
print
(
"Request successful!"
)
print
(
json
.
dumps
(
response
.
json
(),
indent
=
2
))
else
:
print
(
f
"Request failed with status code:
{
response
.
status_code
}
"
)
print
(
response
.
text
)
if
__name__
==
"__main__"
:
if
__name__
==
"__main__"
:
...
...
examples/pooling/score/vision_score_api_online.py
View file @
45a060d6
...
@@ -17,48 +17,32 @@ e.g.
...
@@ -17,48 +17,32 @@ e.g.
"""
"""
import
argparse
import
argparse
import
base64
import
json
import
pprint
import
pprint
import
requests
import
requests
from
vllm.multimodal.utils
import
encode_image_url
,
fetch_image
def
encode_base64_content_from_url
(
content_url
:
str
)
->
dict
[
str
,
str
]:
"""Encode a content retrieved from a remote url to base64 format."""
query
=
"A woman playing with her dog on a beach at sunset."
document
=
(
with
requests
.
get
(
content_url
,
headers
=
headers
)
as
response
:
"A woman shares a joyful moment with her golden retriever on a sun-drenched beach at sunset, "
response
.
raise_for_status
()
"as the dog offers its paw in a heartwarming display of companionship and trust."
result
=
base64
.
b64encode
(
response
.
content
).
decode
(
"utf-8"
)
)
image_url
=
"https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen-VL/assets/demo.jpeg"
return
{
"url"
:
f
"data:image/jpeg;base64,
{
result
}
"
}
documents
=
[
{
"type"
:
"text"
,
headers
=
{
"accept"
:
"application/json"
,
"Content-Type"
:
"application/json"
}
"text"
:
document
,
},
queries
=
"slm markdown"
{
documents
=
{
"type"
:
"image_url"
,
"content"
:
[
"image_url"
:
{
"url"
:
image_url
},
{
},
"type"
:
"image_url"
,
{
"image_url"
:
{
"type"
:
"image_url"
,
"url"
:
"https://raw.githubusercontent.com/jina-ai/multimodal-reranker-test/main/handelsblatt-preview.png"
"image_url"
:
{
"url"
:
encode_image_url
(
fetch_image
(
image_url
))},
},
},
},
]
{
"type"
:
"image_url"
,
"image_url"
:
{
"url"
:
"https://raw.githubusercontent.com/jina-ai/multimodal-reranker-test/main/paper-11.png"
},
},
{
"type"
:
"image_url"
,
"image_url"
:
encode_base64_content_from_url
(
"https://raw.githubusercontent.com/jina-ai/multimodal-reranker-test/main/paper-11.png"
),
},
]
}
def
parse_args
():
def
parse_args
():
...
@@ -73,15 +57,40 @@ def main(args):
...
@@ -73,15 +57,40 @@ def main(args):
models_url
=
base_url
+
"/v1/models"
models_url
=
base_url
+
"/v1/models"
score_url
=
base_url
+
"/score"
score_url
=
base_url
+
"/score"
response
=
requests
.
get
(
models_url
,
headers
=
headers
)
response
=
requests
.
get
(
models_url
)
model
=
response
.
json
()[
"data"
][
0
][
"id"
]
model
=
response
.
json
()[
"data"
][
0
][
"id"
]
prompt
=
{
"model"
:
model
,
"queries"
:
queries
,
"documents"
:
documents
}
print
(
"Query: string & Document: string"
)
response
=
requests
.
post
(
score_url
,
headers
=
headers
,
json
=
prompt
)
prompt
=
{
"model"
:
model
,
"queries"
:
query
,
"documents"
:
document
}
print
(
"
\n
Prompt when queries is string and documents is a image list:"
)
response
=
requests
.
post
(
score_url
,
json
=
prompt
)
pprint
.
pprint
(
prompt
)
pprint
.
pprint
(
response
.
json
())
print
(
"
\n
Score Response:"
)
print
(
json
.
dumps
(
response
.
json
(),
indent
=
2
))
print
(
"Query: string & Document: text"
)
prompt
=
{
"model"
:
model
,
"queries"
:
query
,
"documents"
:
{
"content"
:
[
documents
[
0
]]},
}
response
=
requests
.
post
(
score_url
,
json
=
prompt
)
pprint
.
pprint
(
response
.
json
())
print
(
"Query: string & Document: image url"
)
prompt
=
{
"model"
:
model
,
"queries"
:
query
,
"documents"
:
{
"content"
:
[
documents
[
1
]]},
}
response
=
requests
.
post
(
score_url
,
json
=
prompt
)
pprint
.
pprint
(
response
.
json
())
print
(
"Query: string & Document: image base64"
)
prompt
=
{
"model"
:
model
,
"queries"
:
query
,
"documents"
:
{
"content"
:
[
documents
[
2
]]},
}
response
=
requests
.
post
(
score_url
,
json
=
prompt
)
pprint
.
pprint
(
response
.
json
())
if
__name__
==
"__main__"
:
if
__name__
==
"__main__"
:
...
...
requirements/build.txt
View file @
45a060d6
...
@@ -9,5 +9,5 @@ wheel
...
@@ -9,5 +9,5 @@ wheel
jinja2>=3.1.6
jinja2>=3.1.6
regex
regex
build
build
protobuf
protobuf
>= 6.33.5
grpcio-tools
grpcio-tools
requirements/common.txt
View file @
45a060d6
...
@@ -9,9 +9,9 @@ blake3
...
@@ -9,9 +9,9 @@ blake3
py-cpuinfo
py-cpuinfo
transformers >= 4.56.0, < 5
transformers >= 4.56.0, < 5
tokenizers >= 0.21.1 # Required for fast incremental detokenization.
tokenizers >= 0.21.1 # Required for fast incremental detokenization.
protobuf # Required by LlamaTokenizer, gRPC.
protobuf
>= 6.33.5
# Required by LlamaTokenizer, gRPC.
CVE-2026-0994
fastapi[standard] >= 0.115.0 # Required by FastAPI's form models in the OpenAI API server's audio transcriptions endpoint.
fastapi[standard] >= 0.115.0 # Required by FastAPI's form models in the OpenAI API server's audio transcriptions endpoint.
aiohttp
aiohttp
>= 3.13.3
openai >= 1.99.1 # For Responses API with reasoning content
openai >= 1.99.1 # For Responses API with reasoning content
pydantic >= 2.12.0
pydantic >= 2.12.0
prometheus_client >= 0.18.0
prometheus_client >= 0.18.0
...
...
requirements/kv_connectors.txt
View file @
45a060d6
lmcache
lmcache
>= 0.3.9
nixl >= 0.7.1 # Required for disaggregated prefill
nixl >= 0.7.1 # Required for disaggregated prefill
requirements/rocm-test.txt
View file @
45a060d6
...
@@ -14,7 +14,7 @@ pytest-shard==0.1.2
...
@@ -14,7 +14,7 @@ pytest-shard==0.1.2
# Async/HTTP dependencies
# Async/HTTP dependencies
anyio==4.6.2.post1
anyio==4.6.2.post1
# via httpx, starlette
# via httpx, starlette
aiohttp==3.13.
0
aiohttp==3.13.
3
# via gpt-oss
# via gpt-oss
httpx==0.27.2
httpx==0.27.2
# HTTP testing
# HTTP testing
...
...
requirements/test.txt
View file @
45a060d6
...
@@ -12,7 +12,7 @@ affine==2.4.0
...
@@ -12,7 +12,7 @@ affine==2.4.0
# via rasterio
# via rasterio
aiohappyeyeballs==2.6.1
aiohappyeyeballs==2.6.1
# via aiohttp
# via aiohttp
aiohttp==3.13.
0
aiohttp==3.13.
3
# via
# via
# aiohttp-cors
# aiohttp-cors
# datasets
# datasets
...
...
setup.py
View file @
45a060d6
...
@@ -949,9 +949,9 @@ def get_version_add(sha: Optional[str] = None) -> str:
...
@@ -949,9 +949,9 @@ def get_version_add(sha: Optional[str] = None) -> str:
new_version_content
=
f
"""
new_version_content
=
f
"""
try:
try:
__version__ = "0.15.
0
"
__version__ = "0.15.
1
"
__version_tuple__ = (0, 15,
0
)
__version_tuple__ = (0, 15,
1
)
__hcu_version__ = f'0.15.
0
+
{
version
}
'
__hcu_version__ = f'0.15.
1
+
{
version
}
'
from vllm.version import __version__, __version_tuple__, __hcu_version__
from vllm.version import __version__, __version_tuple__, __hcu_version__
except Exception as e:
except Exception as e:
...
...
tests/compile/test_cold_start.py
0 → 100644
View file @
45a060d6
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from
torch._dynamo.utils
import
counters
from
vllm
import
LLM
from
vllm.config
import
CompilationConfig
,
CompilationMode
,
CUDAGraphMode
def test_moe_compilation_cold_start(monkeypatch, use_fresh_inductor_cache):
    """Check the aot_autograd cache hit/miss counters after a cold vLLM compile.

    Builds an ``LLM`` for a small MoE model with dummy weights and then
    inspects ``torch._dynamo.utils.counters``.
    """
    env_overrides = {
        # Run in same process so we can access PyTorch's internal counters
        "VLLM_ENABLE_V1_MULTIPROCESSING": "0",
        # I'm not sure if this is going to affect the numbers
        "VLLM_USE_AOT_COMPILE": "0",
        # Force cold compilation
        "VLLM_DISABLE_COMPILE_CACHE": "1",
    }
    for env_name, env_value in env_overrides.items():
        monkeypatch.setenv(env_name, env_value)

    compilation_config = CompilationConfig(
        mode=CompilationMode.VLLM_COMPILE,
        cudagraph_mode=CUDAGraphMode.NONE,  # make the model loading faster
    )

    # Start from zeroed counters so the assertions below only see this run.
    counters.clear()

    _ = LLM(
        model="microsoft/Phi-tiny-MoE-instruct",
        max_model_len=256,
        load_format="dummy",  # make the model loading faster
        compilation_config=compilation_config,
        num_gpu_blocks_override=8,  # make the model loading faster
    )

    # vLLM-compile cold start is special. By default, we do
    # one full dynamo capture of the entire forward pass.
    # The forward pass consists of 32 transformer layers.
    # Then, we split on the attention operation. This results in
    # 33 subgraphs (not including the attention operation).
    # The 33 subgraphs then get standalone_compile'd.
    #
    # There are actually only 3 unique subgraphs for this model
    # (all of its transformer layers are the same modulo weights);
    # this is true for most vLLM models.
    # So we test that during cold start, the aot_autograd cache
    # misses for 3 subgraphs and hits for the rest (33 - 3 = 30).
    aot_autograd_counters = counters["aot_autograd"]
    assert aot_autograd_counters["autograd_cache_miss"] == 3
    assert aot_autograd_counters["autograd_cache_hit"] == 30
tests/compile/test_graph_partition.py
View file @
45a060d6
...
@@ -8,6 +8,10 @@ import torch
...
@@ -8,6 +8,10 @@ import torch
from
torch.fx.experimental.proxy_tensor
import
make_fx
from
torch.fx.experimental.proxy_tensor
import
make_fx
from
vllm.compilation.backends
import
split_graph
from
vllm.compilation.backends
import
split_graph
from
vllm.compilation.fx_utils
import
find_op_nodes
# This import automatically registers `torch.ops.silly.attention`
from
.
import
silly_attention
# noqa: F401
def
test_getitem_moved_to_producer_subgraph
():
def
test_getitem_moved_to_producer_subgraph
():
...
@@ -122,3 +126,61 @@ def test_no_tuple_inputs_with_multiple_consumers():
...
@@ -122,3 +126,61 @@ def test_no_tuple_inputs_with_multiple_consumers():
output_split
=
split_gm
(
new_x
)
output_split
=
split_gm
(
new_x
)
assert
torch
.
allclose
(
output_original
,
output_split
),
"Output mismatch after split"
assert
torch
.
allclose
(
output_original
,
output_split
),
"Output mismatch after split"
def test_consecutive_ops_in_split():
    """
    Test that consecutive splitting operations are grouped into the same subgraph
    """

    def model_fn(x: torch.Tensor) -> torch.Tensor:
        """
        Small model: relu -> sqrt -> silly attention -> sigmoid.

        sqrt and the attention op are adjacent and both are configured as
        splitting ops below, so they are the candidates for being grouped.
        """
        intermediate = torch.relu(x)
        attn_inout = torch.sqrt(intermediate)
        torch.ops.silly.attention(intermediate, intermediate, attn_inout, attn_inout)
        return torch.sigmoid(attn_inout)

    def count_ops(op, graph) -> int:
        # Number of nodes in `graph` that find_op_nodes reports for `op`.
        return len(list(find_op_nodes(op, graph)))

    torch.set_default_device("cuda")

    # Create the traced FX graph for the model
    x = torch.randn(8, 4)
    gm = make_fx(model_fn)(x)

    # Assert presence of the expected operations in the setup
    assert (
        count_ops(torch.ops.aten.relu, gm.graph) == 1
        and count_ops(torch.ops.aten.sqrt, gm.graph) == 1
    ), "Test setup failed: Expected sqrt and relu operations in the graph."

    # Configure split operations to test
    splitting_ops = ["silly::attention", "aten::sqrt"]
    split_gm, split_items = split_graph(gm, splitting_ops)

    # Validate the number of partitions
    assert len(split_items) == 3, (
        "Consecutive splitting operations were not grouped correctly."
    )

    # Validate that correctness is preserved
    new_x = torch.randn(8, 4)
    output_original = gm(new_x)
    output_split = split_gm(new_x)
    assert torch.allclose(output_original, output_split), (
        "Output mismatch after splitting."
    )

    # Exactly one partition should be flagged as a splitting graph, and it
    # should hold the two splitting ops (sqrt and attention) between a single
    # placeholder and the output node.
    splitting_items = [item for item in split_items if item.is_splitting_graph]
    assert len(splitting_items) == 1, "Expecting a single splitting graph"

    splitting_gm = splitting_items[0].graph
    print(splitting_gm.graph)
    assert len(splitting_gm.graph.nodes) == 4, "Expecting 4 nodes in splitting graph"

    expected_node_ops = ["placeholder"] + 2 * ["call_function"] + ["output"]
    assert [node.op for node in splitting_gm.graph.nodes] == expected_node_ops
tests/entrypoints/pooling/classify/test_online_vision.py
View file @
45a060d6
...
@@ -5,9 +5,9 @@ import json
...
@@ -5,9 +5,9 @@ import json
import
pytest
import
pytest
import
requests
import
requests
from
tests.entrypoints.test_utils
import
encode_base64_content_from_url
from
tests.utils
import
RemoteOpenAIServer
from
tests.utils
import
RemoteOpenAIServer
from
vllm.entrypoints.pooling.classify.protocol
import
ClassificationResponse
from
vllm.entrypoints.pooling.classify.protocol
import
ClassificationResponse
from
vllm.multimodal.utils
import
encode_image_url
,
fetch_image
MODEL_NAME
=
"muziyongshixin/Qwen2.5-VL-7B-for-VideoCls"
MODEL_NAME
=
"muziyongshixin/Qwen2.5-VL-7B-for-VideoCls"
MAXIMUM_VIDEOS
=
1
MAXIMUM_VIDEOS
=
1
...
@@ -19,7 +19,7 @@ HF_OVERRIDES = {
...
@@ -19,7 +19,7 @@ HF_OVERRIDES = {
}
}
input_text
=
"This product was excellent and exceeded my expectations"
input_text
=
"This product was excellent and exceeded my expectations"
image_url
=
"https://vllm-public-assets.s3.us-west-2.amazonaws.com/multimodal_asset/cat_snow.jpg"
image_url
=
"https://vllm-public-assets.s3.us-west-2.amazonaws.com/multimodal_asset/cat_snow.jpg"
image_base64
=
encode_base64_content_from_url
(
image_url
)
image_base64
=
{
"url"
:
encode_image_url
(
fetch_image
(
image_url
)
)}
video_url
=
"https://www.bogotobogo.com/python/OpenCV_Python/images/mean_shift_tracking/slow_traffic_small.mp4"
video_url
=
"https://www.bogotobogo.com/python/OpenCV_Python/images/mean_shift_tracking/slow_traffic_small.mp4"
...
...
tests/entrypoints/pooling/score/test_online_score_vision.py
0 → 100644
View file @
45a060d6
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import
pytest
import
requests
from
tests.utils
import
VLLM_PATH
,
RemoteOpenAIServer
from
vllm.entrypoints.pooling.score.protocol
import
ScoreResponse
from
vllm.multimodal.utils
import
encode_image_url
,
fetch_image
# Model served by the `server` fixture and named in every score request.
MODEL_NAME = "Qwen/Qwen3-VL-Reranker-2B"

# HF config overrides handed to RemoteOpenAIServer via `override_hf_configs`.
HF_OVERRIDES = {
    "architectures": ["Qwen3VLForSequenceClassification"],
    "classifier_from_token": ["no", "yes"],
    "is_original_qwen3_reranker": True,
}

query = "A cat standing in the snow."
image_url = "https://vllm-public-assets.s3.us-west-2.amazonaws.com/multimodal_asset/cat_snow.jpg"

# Content parts used by the tests, selected by index:
#   0 -> text part, 1 -> image referenced by URL,
#   2 -> the same image passed through fetch_image + encode_image_url.
# NOTE: the last entry is evaluated when this module is imported.
documents = [
    {
        "type": "text",
        "text": query,
    },
    {
        "type": "image_url",
        "image_url": {"url": image_url},
    },
    {
        "type": "image_url",
        "image_url": {"url": encode_image_url(fetch_image(image_url))},
    },
]
@pytest.fixture(scope="module")
def server():
    """Yield a RemoteOpenAIServer for MODEL_NAME, shared by this module's tests."""
    chat_template_path = (
        VLLM_PATH / "examples/pooling/score/template/qwen3_vl_reranker.jinja"
    )
    cli_args = [
        "--enforce-eager",
        "--max-model-len",
        "8192",
        "--chat-template",
        str(chat_template_path),
    ]

    # The context manager owns the server; it is exited once the
    # module-scoped fixture is finalized.
    with RemoteOpenAIServer(
        MODEL_NAME, cli_args, override_hf_configs=HF_OVERRIDES
    ) as remote_server:
        yield remote_server
def test_score_api_queries_str_documents_str(server: RemoteOpenAIServer):
    """POST a plain-string query and a plain-string document to the score route."""
    payload = {
        "model": MODEL_NAME,
        "queries": "What is the capital of France?",
        "documents": "The capital of France is Paris.",
    }

    http_response = requests.post(server.url_for("score"), json=payload)
    http_response.raise_for_status()

    # Validate the JSON body against the ScoreResponse schema.
    score = ScoreResponse.model_validate(http_response.json())
    assert score.id is not None
    assert score.data is not None
    assert len(score.data) == 1
def test_score_api_queries_str_documents_text_content(server: RemoteOpenAIServer):
    """POST a string query and a document whose content is a single text part."""
    text_part = documents[0]
    payload = {
        "model": MODEL_NAME,
        "queries": query,
        "documents": {"content": [text_part]},
    }

    http_response = requests.post(server.url_for("score"), json=payload)
    http_response.raise_for_status()

    # Validate the JSON body against the ScoreResponse schema.
    score = ScoreResponse.model_validate(http_response.json())
    assert score.id is not None
    assert score.data is not None
    assert len(score.data) == 1
def test_score_api_queries_str_documents_image_url_content(
    server: RemoteOpenAIServer,
):
    """POST a string query and a document whose content is an image URL part."""
    image_url_part = documents[1]
    payload = {
        "model": MODEL_NAME,
        "queries": query,
        "documents": {"content": [image_url_part]},
    }

    http_response = requests.post(server.url_for("score"), json=payload)
    http_response.raise_for_status()

    # Validate the JSON body against the ScoreResponse schema.
    score = ScoreResponse.model_validate(http_response.json())
    assert score.id is not None
    assert score.data is not None
    assert len(score.data) == 1
def test_score_api_queries_str_documents_image_base64_content(
    server: RemoteOpenAIServer,
):
    """POST a string query and a document holding the pre-encoded image part."""
    # documents[2] carries the URL produced by encode_image_url(fetch_image(...)).
    encoded_image_part = documents[2]
    payload = {
        "model": MODEL_NAME,
        "queries": query,
        "documents": {"content": [encoded_image_part]},
    }

    http_response = requests.post(server.url_for("score"), json=payload)
    http_response.raise_for_status()

    # Validate the JSON body against the ScoreResponse schema.
    score = ScoreResponse.model_validate(http_response.json())
    assert score.id is not None
    assert score.data is not None
    assert len(score.data) == 1
tests/entrypoints/test_utils.py
View file @
45a060d6
# SPDX-License-Identifier: Apache-2.0
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import
base64
import
requests
from
vllm.entrypoints.utils
import
sanitize_message
from
vllm.entrypoints.utils
import
sanitize_message
...
@@ -12,11 +8,3 @@ def test_sanitize_message():
...
@@ -12,11 +8,3 @@ def test_sanitize_message():
sanitize_message
(
"<_io.BytesIO object at 0x7a95e299e750>"
)
sanitize_message
(
"<_io.BytesIO object at 0x7a95e299e750>"
)
==
"<_io.BytesIO object>"
==
"<_io.BytesIO object>"
)
)
def encode_base64_content_from_url(
    content_url: str, *, timeout: float = 30.0
) -> dict[str, str]:
    """Download ``content_url`` and wrap its bytes in a base64 ``data:`` URL.

    Args:
        content_url: URL to fetch with an HTTP GET.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` waits indefinitely, which can hang a test run.

    Returns:
        A dict of the form ``{"url": "data:image/jpeg;base64,<payload>"}``.

    Raises:
        requests.HTTPError: If the response status indicates an error.
        requests.Timeout: If the server does not respond within ``timeout``.
    """
    with requests.get(content_url, timeout=timeout) as response:
        response.raise_for_status()
        result = base64.b64encode(response.content).decode("utf-8")

    # NOTE: the MIME type is always reported as image/jpeg, regardless of what
    # the server actually returned.
    return {"url": f"data:image/jpeg;base64,{result}"}
tests/kernels/core/test_activation.py
View file @
45a060d6
...
@@ -17,6 +17,8 @@ from vllm.model_executor.layers.activation import (
...
@@ -17,6 +17,8 @@ from vllm.model_executor.layers.activation import (
QuickGELU
,
QuickGELU
,
SiluAndMul
,
SiluAndMul
,
SwigluOAIAndMul
,
SwigluOAIAndMul
,
SwigluStepAndMul
,
swiglustep_and_mul_triton
,
)
)
from
vllm.utils.torch_utils
import
set_random_seed
from
vllm.utils.torch_utils
import
set_random_seed
...
@@ -36,6 +38,7 @@ CUDA_DEVICES = [f"cuda:{i}" for i in range(1 if torch.cuda.device_count() == 1 e
...
@@ -36,6 +38,7 @@ CUDA_DEVICES = [f"cuda:{i}" for i in range(1 if torch.cuda.device_count() == 1 e
"gelu_tanh"
,
"gelu_tanh"
,
"fatrelu"
,
"fatrelu"
,
"swigluoai_and_mul"
,
"swigluoai_and_mul"
,
"swiglustep_and_mul"
,
],
],
)
)
@
pytest
.
mark
.
parametrize
(
"num_tokens"
,
NUM_TOKENS
)
@
pytest
.
mark
.
parametrize
(
"num_tokens"
,
NUM_TOKENS
)
...
@@ -75,9 +78,12 @@ def test_act_and_mul(
...
@@ -75,9 +78,12 @@ def test_act_and_mul(
elif
activation
==
"swigluoai_and_mul"
:
elif
activation
==
"swigluoai_and_mul"
:
layer
=
SwigluOAIAndMul
()
layer
=
SwigluOAIAndMul
()
fn
=
torch
.
ops
.
_C
.
swigluoai_and_mul
fn
=
torch
.
ops
.
_C
.
swigluoai_and_mul
elif
activation
==
"swiglustep_and_mul"
:
layer
=
SwigluStepAndMul
()
fn
=
swiglustep_and_mul_triton
out
=
layer
(
x
)
out
=
layer
(
x
)
ref_out
=
layer
.
forward_native
(
x
)
ref_out
=
layer
.
forward_native
(
x
)
if
activation
==
"swigluoai_and_mul"
:
if
activation
in
[
"swigluoai_and_mul"
,
"swiglustep_and_mul"
]
:
rtol
=
{
rtol
=
{
# For fp16, change the relative tolerance from 1e-3 to 2e-3
# For fp16, change the relative tolerance from 1e-3 to 2e-3
torch
.
float16
:
2e-3
,
torch
.
float16
:
2e-3
,
...
@@ -104,7 +110,7 @@ def test_act_and_mul(
...
@@ -104,7 +110,7 @@ def test_act_and_mul(
opcheck
(
fn
,
(
out
,
x
,
threshold
))
opcheck
(
fn
,
(
out
,
x
,
threshold
))
elif
activation
==
"swigluoai_and_mul"
:
elif
activation
==
"swigluoai_and_mul"
:
opcheck
(
fn
,
(
out
,
x
,
layer
.
alpha
,
layer
.
limit
))
opcheck
(
fn
,
(
out
,
x
,
layer
.
alpha
,
layer
.
limit
))
el
se
:
el
if
activation
!=
"swiglustep_and_mul"
:
opcheck
(
fn
,
(
out
,
x
))
opcheck
(
fn
,
(
out
,
x
))
...
...
tests/kernels/moe/test_moe.py
View file @
45a060d6
...
@@ -722,7 +722,7 @@ def test_mixtral_moe(
...
@@ -722,7 +722,7 @@ def test_mixtral_moe(
# need to override the forward context for unittests, otherwise it assumes
# need to override the forward context for unittests, otherwise it assumes
# we're running the model forward pass (the model specified in vllm_config)
# we're running the model forward pass (the model specified in vllm_config)
get_forward_context
().
remaining
_moe_layers
=
None
get_forward_context
().
all
_moe_layers
=
None
# Run forward passes for both MoE blocks
# Run forward passes for both MoE blocks
hf_states
,
_
=
hf_moe
.
forward
(
hf_inputs
)
hf_states
,
_
=
hf_moe
.
forward
(
hf_inputs
)
...
...
Prev
1
2
3
4
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment