Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
fc7980db
Commit
fc7980db
authored
Feb 05, 2026
by
zhuwenwen
Browse files
Merge tag 'v0.15.1' into v0.15.1-ori
parents
3eab7fef
1892993b
Changes
62
Show whitespace changes
Inline
Side-by-side
Showing
20 changed files
with
425 additions
and
182 deletions
+425
-182
.buildkite/release-pipeline.yaml
.buildkite/release-pipeline.yaml
+2
-2
.buildkite/scripts/annotate-release.sh
.buildkite/scripts/annotate-release.sh
+39
-17
.buildkite/scripts/upload-release-wheels-pypi.sh
.buildkite/scripts/upload-release-wheels-pypi.sh
+10
-44
docs/models/supported_models.md
docs/models/supported_models.md
+1
-0
examples/pooling/score/vision_rerank_api_online.py
examples/pooling/score/vision_rerank_api_online.py
+46
-49
examples/pooling/score/vision_score_api_online.py
examples/pooling/score/vision_score_api_online.py
+54
-45
requirements/build.txt
requirements/build.txt
+1
-1
requirements/common.txt
requirements/common.txt
+2
-2
requirements/kv_connectors.txt
requirements/kv_connectors.txt
+1
-1
requirements/rocm-test.txt
requirements/rocm-test.txt
+1
-1
requirements/test.txt
requirements/test.txt
+1
-1
tests/compile/test_cold_start.py
tests/compile/test_cold_start.py
+48
-0
tests/compile/test_graph_partition.py
tests/compile/test_graph_partition.py
+62
-0
tests/entrypoints/pooling/classify/test_online_vision.py
tests/entrypoints/pooling/classify/test_online_vision.py
+2
-2
tests/entrypoints/pooling/score/test_online_score_vision.py
tests/entrypoints/pooling/score/test_online_score_vision.py
+122
-0
tests/entrypoints/test_utils.py
tests/entrypoints/test_utils.py
+0
-12
tests/kernels/core/test_activation.py
tests/kernels/core/test_activation.py
+8
-2
tests/kernels/moe/test_moe.py
tests/kernels/moe/test_moe.py
+1
-1
tests/kernels/quantization/test_rocm_skinny_gemms.py
tests/kernels/quantization/test_rocm_skinny_gemms.py
+15
-2
tests/models/registry.py
tests/models/registry.py
+9
-0
No files found.
.buildkite/release-pipeline.yaml
View file @
fc7980db
...
...
@@ -274,14 +274,14 @@ steps:
-
input-release-version
-
build-wheels
-
label
:
"
Upload
release
wheels
to
PyPI
and
GitHub
"
-
label
:
"
Upload
release
wheels
to
PyPI"
depends_on
:
-
block-upload-release-wheels
id
:
upload-release-wheels
agents
:
queue
:
small_cpu_queue_postmerge
commands
:
-
"
bash
.buildkite/scripts/upload-release-wheels.sh"
-
"
bash
.buildkite/scripts/upload-release-wheels
-pypi
.sh"
# =============================================================================
# ROCm Release Pipeline (x86_64 only)
...
...
.buildkite/scripts/annotate-release.sh
View file @
fc7980db
...
...
@@ -11,58 +11,80 @@ fi
buildkite-agent annotate
--style
'info'
--context
'release-workflow'
<<
EOF
To download the wheel (by commit):
\`\`\`
aws s3 cp s3://vllm-wheels/
${
BUILDKITE_COMMIT
}
/vllm-
${
RELEASE_VERSION
}
-cp38-abi3-manylinux1_x86_64.whl .
aws s3 cp s3://vllm-wheels/
${
BUILDKITE_COMMIT
}
/vllm-
${
RELEASE_VERSION
}
-cp38-abi3-manylinux
2014
_aarch64.whl .
aws s3 cp s3://vllm-wheels/
${
BUILDKITE_COMMIT
}
/vllm-
${
RELEASE_VERSION
}
-cp38-abi3-manylinux
_2_3
1_x86_64.whl .
aws s3 cp s3://vllm-wheels/
${
BUILDKITE_COMMIT
}
/vllm-
${
RELEASE_VERSION
}
-cp38-abi3-manylinux
_2_31
_aarch64.whl .
aws s3 cp s3://vllm-wheels/
${
BUILDKITE_COMMIT
}
/vllm-
${
RELEASE_VERSION
}
+cu129-cp38-abi3-manylinux1_x86_64.whl .
aws s3 cp s3://vllm-wheels/
${
BUILDKITE_COMMIT
}
/vllm-
${
RELEASE_VERSION
}
+cu1
29
-cp38-abi3-manylinux
1
_x86_64.whl .
\`\`\`
(Optional) For CUDA 13.0:
aws s3 cp s3://vllm-wheels/
${
BUILDKITE_COMMIT
}
/vllm-
${
RELEASE_VERSION
}
+cu1
30
-cp38-abi3-manylinux
_2_35
_x86_64.whl .
aws s3 cp s3://vllm-wheels/
${
BUILDKITE_COMMIT
}
/vllm-
${
RELEASE_VERSION
}
+cu130-cp38-abi3-manylinux_2_35_aarch64.whl .
To download the wheel (by version):
(Optional) For CPU:
aws s3 cp s3://vllm-wheels/
${
BUILDKITE_COMMIT
}
/vllm-
${
RELEASE_VERSION
}
+cpu-cp38-abi3-manylinux_2_35_x86_64.whl .
aws s3 cp s3://vllm-wheels/
${
BUILDKITE_COMMIT
}
/vllm-
${
RELEASE_VERSION
}
+cpu-cp38-abi3-manylinux_2_35_aarch64.whl .
\`\`\`
aws s3 cp s3://vllm-wheels/
${
RELEASE_VERSION
}
/vllm-
${
RELEASE_VERSION
}
-cp38-abi3-manylinux1_x86_64.whl .
aws s3 cp s3://vllm-wheels/
${
RELEASE_VERSION
}
/vllm-
${
RELEASE_VERSION
}
-cp38-abi3-manylinux2014_aarch64.whl .
aws s3 cp s3://vllm-wheels/
${
RELEASE_VERSION
}
+cu129/vllm-
${
RELEASE_VERSION
}
+cu129-cp38-abi3-manylinux1_x86_64.whl .
aws s3 cp s3://vllm-wheels/
${
RELEASE_VERSION
}
+cu130/vllm-
${
RELEASE_VERSION
}
+cu130-cp38-abi3-manylinux1_x86_64.whl .
\`\`\`
To download and upload the image:
\`\`\`
Download images:
docker pull public.ecr.aws/q9t5s3a7/vllm-release-repo:
${
BUILDKITE_COMMIT
}
-x86_64
docker pull public.ecr.aws/q9t5s3a7/vllm-release-repo:
${
BUILDKITE_COMMIT
}
-aarch64
docker pull public.ecr.aws/q9t5s3a7/vllm-release-repo:
${
BUILDKITE_COMMIT
}
-x86_64-cu130
docker pull public.ecr.aws/q9t5s3a7/vllm-release-repo:
${
BUILDKITE_COMMIT
}
-aarch64-cu130
docker pull public.ecr.aws/q9t5s3a7/vllm-release-repo:
${
BUILDKITE_COMMIT
}
-rocm-base
docker pull public.ecr.aws/q9t5s3a7/vllm-release-repo:
${
BUILDKITE_COMMIT
}
-rocm
Tag and push images:
docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:
${
BUILDKITE_COMMIT
}
-x86_64 vllm/vllm-openai:x86_64
docker tag vllm/vllm-openai:x86_64 vllm/vllm-openai:latest-x86_64
docker tag vllm/vllm-openai:x86_64 vllm/vllm-openai:v
${
RELEASE_VERSION
}
-x86_64
docker push vllm/vllm-openai:latest-x86_64
docker push vllm/vllm-openai:v
${
RELEASE_VERSION
}
-x86_64
docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:
${
BUILDKITE_COMMIT
}
-x86_64-cu130 vllm/vllm-openai:x86_64-cu130
docker tag vllm/vllm-openai:x86_64-cu130 vllm/vllm-openai:latest-x86_64-cu130
docker tag vllm/vllm-openai:x86_64-cu130 vllm/vllm-openai:v
${
RELEASE_VERSION
}
-x86_64-cu130
docker push vllm/vllm-openai:latest-x86_64-cu130
docker push vllm/vllm-openai:v
${
RELEASE_VERSION
}
-x86_64-cu130
docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:
${
BUILDKITE_COMMIT
}
-aarch64 vllm/vllm-openai:aarch64
docker tag vllm/vllm-openai:aarch64 vllm/vllm-openai:latest-aarch64
docker tag vllm/vllm-openai:aarch64 vllm/vllm-openai:v
${
RELEASE_VERSION
}
-aarch64
docker push vllm/vllm-openai:latest-aarch64
docker push vllm/vllm-openai:v
${
RELEASE_VERSION
}
-aarch64
docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:
${
BUILDKITE_COMMIT
}
-aarch64-cu130 vllm/vllm-openai:aarch64-cu130
docker tag vllm/vllm-openai:aarch64-cu130 vllm/vllm-openai:latest-aarch64-cu130
docker tag vllm/vllm-openai:aarch64-cu130 vllm/vllm-openai:v
${
RELEASE_VERSION
}
-aarch64-cu130
docker push vllm/vllm-openai:latest-aarch64-cu130
docker push vllm/vllm-openai:v
${
RELEASE_VERSION
}
-aarch64-cu130
docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:
${
BUILDKITE_COMMIT
}
-rocm vllm/vllm-openai-rocm:
${
BUILDKITE_COMMIT
}
-rocm
docker tag vllm/vllm-openai-rocm:
${
BUILDKITE_COMMIT
}
-rocm vllm/vllm-openai-rocm:latest
docker tag vllm/vllm-openai-rocm:
${
BUILDKITE_COMMIT
}
-rocm vllm/vllm-openai-rocm:v
${
RELEASE_VERSION
}
-rocm
docker push vllm/vllm-openai-rocm:latest
docker push vllm/vllm-openai-rocm:v
${
RELEASE_VERSION
}
-rocm
Create multi-arch manifest:
docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:
${
BUILDKITE_COMMIT
}
-rocm-base vllm/vllm-openai-rocm:
${
BUILDKITE_COMMIT
}
-base
docker tag vllm/vllm-openai-rocm:
${
BUILDKITE_COMMIT
}
-base vllm/vllm-openai-rocm:latest-base
docker tag vllm/vllm-openai-rocm:
${
BUILDKITE_COMMIT
}
-base vllm/vllm-openai-rocm:v
${
RELEASE_VERSION
}
-base
docker push vllm/vllm-openai-rocm:latest-base
docker push vllm/vllm-openai-rocm:v
${
RELEASE_VERSION
}
-base
docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:
${
BUILDKITE_COMMIT
}
-rocm vllm/vllm-openai-rocm:
${
BUILDKITE_COMMIT
}
docker tag vllm/vllm-openai-rocm:
${
BUILDKITE_COMMIT
}
vllm/vllm-openai-rocm:latest
docker tag vllm/vllm-openai-rocm:
${
BUILDKITE_COMMIT
}
vllm/vllm-openai-rocm:v
${
RELEASE_VERSION
}
docker push vllm/vllm-openai-rocm:latest
docker push vllm/vllm-openai-rocm:v
${
RELEASE_VERSION
}
docker manifest rm vllm/vllm-openai:latest
docker manifest create vllm/vllm-openai:latest vllm/vllm-openai:latest-x86_64 vllm/vllm-openai:latest-aarch64
docker manifest create vllm/vllm-openai:v
${
RELEASE_VERSION
}
vllm/vllm-openai:v
${
RELEASE_VERSION
}
-x86_64 vllm/vllm-openai:v
${
RELEASE_VERSION
}
-aarch64
docker manifest push vllm/vllm-openai:latest
docker manifest push vllm/vllm-openai:v
${
RELEASE_VERSION
}
docker manifest rm vllm/vllm-openai:latest-cu130
docker manifest create vllm/vllm-openai:latest-cu130 vllm/vllm-openai:latest-x86_64-cu130 vllm/vllm-openai:latest-aarch64-cu130
docker manifest create vllm/vllm-openai:v
${
RELEASE_VERSION
}
-cu130 vllm/vllm-openai:v
${
RELEASE_VERSION
}
-x86_64-cu130 vllm/vllm-openai:v
${
RELEASE_VERSION
}
-aarch64-cu130
docker manifest push vllm/vllm-openai:latest-cu130
docker manifest push vllm/vllm-openai:v
${
RELEASE_VERSION
}
-cu130
\`\`\`
EOF
.buildkite/scripts/upload-release-wheels.sh
→
.buildkite/scripts/upload-release-wheels
-pypi
.sh
View file @
fc7980db
...
...
@@ -7,17 +7,19 @@ SUBPATH=$BUILDKITE_COMMIT
S3_COMMIT_PREFIX
=
"s3://
$BUCKET
/
$SUBPATH
/"
RELEASE_VERSION
=
$(
buildkite-agent meta-data get release-version
)
echo
"Release version from Buildkite:
$RELEASE_VERSION
"
GIT_VERSION
=
$(
git describe
--exact-match
--tags
$BUILDKITE_COMMIT
2>/dev/null
)
if
[
-z
"
$GIT_VERSION
"
]
;
then
echo
"Release version from Buildkite:
$RELEASE_VERSION
"
if
[[
-z
"
$GIT_VERSION
"
]]
;
then
echo
"[FATAL] Not on a git tag, cannot create release."
exit
1
else
echo
"Git version for commit
$BUILDKITE_COMMIT
:
$GIT_VERSION
"
fi
# sanity check for version mismatch
if
[
"
$RELEASE_VERSION
"
!=
"
$GIT_VERSION
"
]
;
then
if
[
"
$FORCE_RELEASE_IGNORE_VERSION_MISMATCH
"
==
"true"
]
;
then
if
[
[
"
$RELEASE_VERSION
"
!=
"
$GIT_VERSION
"
]
]
;
then
if
[
[
"
$FORCE_RELEASE_IGNORE_VERSION_MISMATCH
"
==
"true"
]
]
;
then
echo
"[WARNING] Force release and ignore version mismatch"
else
echo
"[FATAL] Release version from Buildkite does not match Git version."
...
...
@@ -27,7 +29,7 @@ fi
PURE_VERSION
=
${
RELEASE_VERSION
#v
}
# remove leading 'v'
# check pypi token
if
[
-z
"
$PYPI_TOKEN
"
]
;
then
if
[
[
-z
"
$PYPI_TOKEN
"
]
]
;
then
echo
"[FATAL] PYPI_TOKEN is not set."
exit
1
else
...
...
@@ -35,41 +37,8 @@ else
export
TWINE_PASSWORD
=
"
$PYPI_TOKEN
"
fi
# check github token
if
[
-z
"
$GITHUB_TOKEN
"
]
;
then
echo
"[FATAL] GITHUB_TOKEN is not set."
exit
1
else
export
GH_TOKEN
=
"
$GITHUB_TOKEN
"
fi
set
-x
# avoid printing secrets above
# download gh CLI from github
# Get latest gh CLI version from GitHub API
GH_VERSION
=
$(
curl
-s
https://api.github.com/repos/cli/cli/releases/latest |
grep
'"tag_name":'
|
sed
-E
's/.*"([^"]+)".*/\1/'
|
sed
's/^v//'
)
if
[
-z
"
$GH_VERSION
"
]
;
then
echo
"[FATAL] Failed to get latest gh CLI version from GitHub"
exit
1
fi
echo
"Downloading gh CLI version:
$GH_VERSION
"
GH_TARBALL
=
"gh_
${
GH_VERSION
}
_linux_amd64.tar.gz"
GH_URL
=
"https://github.com/cli/cli/releases/download/v
${
GH_VERSION
}
/
${
GH_TARBALL
}
"
GH_INSTALL_DIR
=
"/tmp/gh-install"
mkdir
-p
"
$GH_INSTALL_DIR
"
pushd
"
$GH_INSTALL_DIR
"
curl
-L
-o
"
$GH_TARBALL
"
"
$GH_URL
"
tar
-xzf
"
$GH_TARBALL
"
GH_BIN
=
$(
realpath
$(
find
.
-name
"gh"
-type
f
-executable
|
head
-n
1
))
if
[
-z
"
$GH_BIN
"
]
;
then
echo
"[FATAL] Failed to find gh CLI executable"
exit
1
fi
echo
"gh CLI downloaded successfully, version:
$(
$GH_BIN
--version
)
"
echo
"Last 5 releases on GitHub:"
# as a sanity check of gh and GH_TOKEN
command
"
$GH_BIN
"
release list
--limit
5
popd
# install twine from pypi
python3
-m
venv /tmp/vllm-release-env
source
/tmp/vllm-release-env/bin/activate
...
...
@@ -89,16 +58,13 @@ echo "Wheels copied to local directory"
git archive
--format
=
tar.gz
--output
=
"
$DIST_DIR
/vllm-
${
PURE_VERSION
}
.tar.gz"
$BUILDKITE_COMMIT
ls
-la
$DIST_DIR
# upload wheels to PyPI (only default variant, i.e. files without '+' in the name)
PYPI_WHEEL_FILES
=
$(
find
$DIST_DIR
-name
"vllm-
${
PURE_VERSION
}
*.whl"
-not
-name
"*+*"
)
if
[
-z
"
$PYPI_WHEEL_FILES
"
]
;
then
if
[
[
-z
"
$PYPI_WHEEL_FILES
"
]
]
;
then
echo
"No default variant wheels found, quitting..."
exit
1
fi
python3
-m
twine check
$PYPI_WHEEL_FILES
python3
-m
twine
--non-interactive
--verbose
upload
$PYPI_WHEEL_FILES
python3
-m
twine
upload
--non-interactive
--verbose
$PYPI_WHEEL_FILES
echo
"Wheels uploaded to PyPI"
# create release on GitHub with the release version and all wheels
command
"
$GH_BIN
"
release create
$GIT_VERSION
-d
--latest
--notes-from-tag
--verify-tag
$DIST_DIR
/
*
.whl
docs/models/supported_models.md
View file @
fc7980db
...
...
@@ -456,6 +456,7 @@ th {
|
`StableLmForCausalLM`
| StableLM |
`stabilityai/stablelm-3b-4e1t`
,
`stabilityai/stablelm-base-alpha-7b-v2`
, etc. | | |
|
`Starcoder2ForCausalLM`
| Starcoder2 |
`bigcode/starcoder2-3b`
,
`bigcode/starcoder2-7b`
,
`bigcode/starcoder2-15b`
, etc. | | ✅︎ |
|
`Step1ForCausalLM`
| Step-Audio |
`stepfun-ai/Step-Audio-EditX`
, etc. | ✅︎ | ✅︎ |
|
`Step3p5ForCausalLM`
| Step-3.5-flash |
`stepfun-ai/step-3.5-flash`
, etc. | | ✅︎ |
|
`TeleChat2ForCausalLM`
| TeleChat2 |
`Tele-AI/TeleChat2-3B`
,
`Tele-AI/TeleChat2-7B`
,
`Tele-AI/TeleChat2-35B`
, etc. | ✅︎ | ✅︎ |
|
`TeleFLMForCausalLM`
| TeleFLM |
`CofeAI/FLM-2-52B-Instruct-2407`
,
`CofeAI/Tele-FLM`
, etc. | ✅︎ | ✅︎ |
|
`XverseForCausalLM`
| XVERSE |
`xverse/XVERSE-7B-Chat`
,
`xverse/XVERSE-13B-Chat`
,
`xverse/XVERSE-65B-Chat`
, etc. | ✅︎ | ✅︎ |
...
...
examples/pooling/score/vision_rerank_api_online.py
View file @
fc7980db
...
...
@@ -18,48 +18,32 @@ e.g.
"""
import
argparse
import
base64
import
json
import
pprint
import
requests
def
encode_base64_content_from_url
(
content_url
:
str
)
->
dict
[
str
,
str
]:
"""Encode a content retrieved from a remote url to base64 format."""
with
requests
.
get
(
content_url
,
headers
=
headers
)
as
response
:
response
.
raise_for_status
()
result
=
base64
.
b64encode
(
response
.
content
).
decode
(
"utf-8"
)
return
{
"url"
:
f
"data:image/jpeg;base64,
{
result
}
"
}
headers
=
{
"accept"
:
"application/json"
,
"Content-Type"
:
"application/json"
}
from
vllm.multimodal.utils
import
encode_image_url
,
fetch_image
query
=
"A woman playing with her dog on a beach at sunset."
documents
=
{
"content"
:
[
{
"type"
:
"text"
,
"text"
:
(
document
=
(
"A woman shares a joyful moment with her golden retriever on a sun-drenched beach at sunset, "
"as the dog offers its paw in a heartwarming display of companionship and trust."
),
)
image_url
=
"https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen-VL/assets/demo.jpeg"
documents
=
[
{
"type"
:
"text"
,
"text"
:
document
,
},
{
"type"
:
"image_url"
,
"image_url"
:
{
"url"
:
"https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen-VL/assets/demo.jpeg"
},
"image_url"
:
{
"url"
:
image_url
},
},
{
"type"
:
"image_url"
,
"image_url"
:
encode_base64_content_from_url
(
"https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen-VL/assets/demo.jpeg"
),
"image_url"
:
{
"url"
:
encode_image_url
(
fetch_image
(
image_url
))},
},
]
}
]
def
parse_args
():
...
...
@@ -74,23 +58,36 @@ def main(args):
models_url
=
base_url
+
"/v1/models"
rerank_url
=
base_url
+
"/rerank"
response
=
requests
.
get
(
models_url
,
headers
=
headers
)
response
=
requests
.
get
(
models_url
)
model
=
response
.
json
()[
"data"
][
0
][
"id"
]
data
=
{
print
(
"Query: string & Document: list of string"
)
prompt
=
{
"model"
:
model
,
"query"
:
query
,
"documents"
:
[
document
]}
response
=
requests
.
post
(
rerank_url
,
json
=
prompt
)
pprint
.
pprint
(
response
.
json
())
print
(
"Query: string & Document: text"
)
prompt
=
{
"model"
:
model
,
"query"
:
query
,
"documents"
:
{
"content"
:
[
documents
[
0
]]}}
response
=
requests
.
post
(
rerank_url
,
json
=
prompt
)
pprint
.
pprint
(
response
.
json
())
print
(
"Query: string & Document: image url"
)
prompt
=
{
"model"
:
model
,
"query"
:
query
,
"documents"
:
{
"content"
:
[
documents
[
1
]]},
}
response
=
requests
.
post
(
rerank_url
,
json
=
prompt
)
pprint
.
pprint
(
response
.
json
())
print
(
"Query: string & Document: image base64"
)
prompt
=
{
"model"
:
model
,
"query"
:
query
,
"documents"
:
documents
,
"documents"
:
{
"content"
:
[
documents
[
2
]]}
,
}
response
=
requests
.
post
(
rerank_url
,
headers
=
headers
,
json
=
data
)
# Check the response
if
response
.
status_code
==
200
:
print
(
"Request successful!"
)
print
(
json
.
dumps
(
response
.
json
(),
indent
=
2
))
else
:
print
(
f
"Request failed with status code:
{
response
.
status_code
}
"
)
print
(
response
.
text
)
response
=
requests
.
post
(
rerank_url
,
json
=
prompt
)
pprint
.
pprint
(
response
.
json
())
if
__name__
==
"__main__"
:
...
...
examples/pooling/score/vision_score_api_online.py
View file @
fc7980db
...
...
@@ -17,48 +17,32 @@ e.g.
"""
import
argparse
import
base64
import
json
import
pprint
import
requests
from
vllm.multimodal.utils
import
encode_image_url
,
fetch_image
def
encode_base64_content_from_url
(
content_url
:
str
)
->
dict
[
str
,
str
]:
"""Encode a content retrieved from a remote url to base64 format."""
with
requests
.
get
(
content_url
,
headers
=
headers
)
as
response
:
response
.
raise_for_status
()
result
=
base64
.
b64encode
(
response
.
content
).
decode
(
"utf-8"
)
return
{
"url"
:
f
"data:image/jpeg;base64,
{
result
}
"
}
headers
=
{
"accept"
:
"application/json"
,
"Content-Type"
:
"application/json"
}
queries
=
"slm markdown"
documents
=
{
"content"
:
[
query
=
"A woman playing with her dog on a beach at sunset."
document
=
(
"A woman shares a joyful moment with her golden retriever on a sun-drenched beach at sunset, "
"as the dog offers its paw in a heartwarming display of companionship and trust."
)
image_url
=
"https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen-VL/assets/demo.jpeg"
documents
=
[
{
"type"
:
"image_url"
,
"image_url"
:
{
"url"
:
"https://raw.githubusercontent.com/jina-ai/multimodal-reranker-test/main/handelsblatt-preview.png"
},
"type"
:
"text"
,
"text"
:
document
,
},
{
"type"
:
"image_url"
,
"image_url"
:
{
"url"
:
"https://raw.githubusercontent.com/jina-ai/multimodal-reranker-test/main/paper-11.png"
},
"image_url"
:
{
"url"
:
image_url
},
},
{
"type"
:
"image_url"
,
"image_url"
:
encode_base64_content_from_url
(
"https://raw.githubusercontent.com/jina-ai/multimodal-reranker-test/main/paper-11.png"
),
"image_url"
:
{
"url"
:
encode_image_url
(
fetch_image
(
image_url
))},
},
]
}
]
def
parse_args
():
...
...
@@ -73,15 +57,40 @@ def main(args):
models_url
=
base_url
+
"/v1/models"
score_url
=
base_url
+
"/score"
response
=
requests
.
get
(
models_url
,
headers
=
headers
)
response
=
requests
.
get
(
models_url
)
model
=
response
.
json
()[
"data"
][
0
][
"id"
]
prompt
=
{
"model"
:
model
,
"queries"
:
queries
,
"documents"
:
documents
}
response
=
requests
.
post
(
score_url
,
headers
=
headers
,
json
=
prompt
)
print
(
"
\n
Prompt when queries is string and documents is a image list:"
)
pprint
.
pprint
(
prompt
)
print
(
"
\n
Score Response:"
)
print
(
json
.
dumps
(
response
.
json
(),
indent
=
2
))
print
(
"Query: string & Document: string"
)
prompt
=
{
"model"
:
model
,
"queries"
:
query
,
"documents"
:
document
}
response
=
requests
.
post
(
score_url
,
json
=
prompt
)
pprint
.
pprint
(
response
.
json
())
print
(
"Query: string & Document: text"
)
prompt
=
{
"model"
:
model
,
"queries"
:
query
,
"documents"
:
{
"content"
:
[
documents
[
0
]]},
}
response
=
requests
.
post
(
score_url
,
json
=
prompt
)
pprint
.
pprint
(
response
.
json
())
print
(
"Query: string & Document: image url"
)
prompt
=
{
"model"
:
model
,
"queries"
:
query
,
"documents"
:
{
"content"
:
[
documents
[
1
]]},
}
response
=
requests
.
post
(
score_url
,
json
=
prompt
)
pprint
.
pprint
(
response
.
json
())
print
(
"Query: string & Document: image base64"
)
prompt
=
{
"model"
:
model
,
"queries"
:
query
,
"documents"
:
{
"content"
:
[
documents
[
2
]]},
}
response
=
requests
.
post
(
score_url
,
json
=
prompt
)
pprint
.
pprint
(
response
.
json
())
if
__name__
==
"__main__"
:
...
...
requirements/build.txt
View file @
fc7980db
...
...
@@ -9,5 +9,5 @@ wheel
jinja2>=3.1.6
regex
build
protobuf
protobuf
>= 6.33.5
grpcio-tools
requirements/common.txt
View file @
fc7980db
...
...
@@ -9,9 +9,9 @@ blake3
py-cpuinfo
transformers >= 4.56.0, < 5
tokenizers >= 0.21.1 # Required for fast incremental detokenization.
protobuf # Required by LlamaTokenizer, gRPC.
protobuf
>= 6.33.5
# Required by LlamaTokenizer, gRPC.
CVE-2026-0994
fastapi[standard] >= 0.115.0 # Required by FastAPI's form models in the OpenAI API server's audio transcriptions endpoint.
aiohttp
aiohttp
>= 3.13.3
openai >= 1.99.1 # For Responses API with reasoning content
pydantic >= 2.12.0
prometheus_client >= 0.18.0
...
...
requirements/kv_connectors.txt
View file @
fc7980db
lmcache
lmcache
>= 0.3.9
nixl >= 0.7.1 # Required for disaggregated prefill
requirements/rocm-test.txt
View file @
fc7980db
...
...
@@ -14,7 +14,7 @@ pytest-shard==0.1.2
# Async/HTTP dependencies
anyio==4.6.2.post1
# via httpx, starlette
aiohttp==3.13.
0
aiohttp==3.13.
3
# via gpt-oss
httpx==0.27.2
# HTTP testing
...
...
requirements/test.txt
View file @
fc7980db
...
...
@@ -12,7 +12,7 @@ affine==2.4.0
# via rasterio
aiohappyeyeballs==2.6.1
# via aiohttp
aiohttp==3.13.
0
aiohttp==3.13.
3
# via
# aiohttp-cors
# datasets
...
...
tests/compile/test_cold_start.py
0 → 100644
View file @
fc7980db
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from
torch._dynamo.utils
import
counters
from
vllm
import
LLM
from
vllm.config
import
CompilationConfig
,
CompilationMode
,
CUDAGraphMode
def test_moe_compilation_cold_start(monkeypatch, use_fresh_inductor_cache):
    """Check the AOT-autograd cache hit/miss counts of a cold vLLM compile.

    Builds an ``LLM`` for a small MoE model with dummy weights and then
    inspects ``torch._dynamo.utils.counters`` for the number of
    ``aot_autograd`` cache misses and hits recorded during start-up.
    """
    env_overrides = {
        # Stay in the same process so PyTorch's internal counters are visible.
        "VLLM_ENABLE_V1_MULTIPROCESSING": "0",
        # Unclear whether AOT compile would affect the numbers; turn it off.
        "VLLM_USE_AOT_COMPILE": "0",
        # Force a cold compilation.
        "VLLM_DISABLE_COMPILE_CACHE": "1",
    }
    for env_name, env_value in env_overrides.items():
        monkeypatch.setenv(env_name, env_value)

    cold_start_config = CompilationConfig(
        mode=CompilationMode.VLLM_COMPILE,
        cudagraph_mode=CUDAGraphMode.NONE,  # make the model loading faster
    )

    # Start from a clean slate so only this engine start-up is counted.
    counters.clear()

    _ = LLM(
        model="microsoft/Phi-tiny-MoE-instruct",
        max_model_len=256,
        load_format="dummy",  # make the model loading faster
        compilation_config=cold_start_config,
        num_gpu_blocks_override=8,  # make the model loading faster
    )

    # vLLM-compile cold start is special. By default, we do
    # one full dynamo capture of the entire forward pass.
    # The forward pass consists of 32 transformer layers.
    # Then, we split on the attention operation. This results in
    # 33 subgraphs (not including the attention operation).
    # The 33 subgraphs then get standalone_compile'd.
    #
    # There are actually only 3 unique subgraphs for this model
    # (all of its transformer layers are the same modulo weights);
    # this is true for most vLLM models.
    # So we test that during cold start, the aot_autograd cache
    # misses for 3 subgraphs and hits for the rest.
    aot_stats = counters["aot_autograd"]
    assert aot_stats["autograd_cache_miss"] == 3
    assert aot_stats["autograd_cache_hit"] == 30
tests/compile/test_graph_partition.py
View file @
fc7980db
...
...
@@ -8,6 +8,10 @@ import torch
from
torch.fx.experimental.proxy_tensor
import
make_fx
from
vllm.compilation.backends
import
split_graph
from
vllm.compilation.fx_utils
import
find_op_nodes
# This import automatically registers `torch.ops.silly.attention`
from
.
import
silly_attention
# noqa: F401
def
test_getitem_moved_to_producer_subgraph
():
...
...
@@ -122,3 +126,61 @@ def test_no_tuple_inputs_with_multiple_consumers():
output_split
=
split_gm
(
new_x
)
assert
torch
.
allclose
(
output_original
,
output_split
),
"Output mismatch after split"
def test_consecutive_ops_in_split():
    """
    Test that consecutive splitting operations are grouped into the same subgraph
    """

    def model_fn(x: torch.Tensor) -> torch.Tensor:
        """
        Define a simple model where consecutive operations create opportunities
        for splitting subgraphs.
        """
        # relu feeds sqrt, and sqrt is immediately followed by silly attention,
        # so the two splitting ops (sqrt, attention) are adjacent in the graph.
        intermediate = torch.relu(x)
        attn_inout = torch.sqrt(intermediate)
        torch.ops.silly.attention(intermediate, intermediate, attn_inout, attn_inout)
        final_result = torch.sigmoid(attn_inout)
        return final_result

    torch.set_default_device("cuda")

    # Create the traced FX graph for the model
    x = torch.randn(8, 4)
    gm = make_fx(model_fn)(x)

    # Assert presence of the expected operations in the setup.
    # Checked separately so a failure points at the missing op.
    relu_nodes = list(find_op_nodes(torch.ops.aten.relu, gm.graph))
    assert len(relu_nodes) == 1, (
        "Test setup failed: Expected sqrt and relu operations in the graph."
    )
    sqrt_nodes = list(find_op_nodes(torch.ops.aten.sqrt, gm.graph))
    assert len(sqrt_nodes) == 1, (
        "Test setup failed: Expected sqrt and relu operations in the graph."
    )

    # Configure split operations to test
    splitting_ops = ["silly::attention", "aten::sqrt"]
    split_gm, split_items = split_graph(gm, splitting_ops)

    # Validate the number of partitions
    assert len(split_items) == 3, (
        "Consecutive splitting operations were not grouped correctly."
    )

    # Validate that correctness is preserved
    new_x = torch.randn(8, 4)
    output_original = gm(new_x)
    output_split = split_gm(new_x)
    assert torch.allclose(output_original, output_split), (
        "Output mismatch after splitting."
    )

    # Check the splitting subgraph holds exactly the 2 splitting ops
    # (sqrt and attention), plus one placeholder and the output node.
    splitting_items = [s for s in split_items if s.is_splitting_graph]
    assert len(splitting_items) == 1, "Expecting a single splitting graph"
    print(splitting_items[0].graph.graph)
    splitting_gm = splitting_items[0].graph
    assert len(splitting_gm.graph.nodes) == 4, "Expecting 4 nodes in splitting graph"
    assert [node.op for node in splitting_gm.graph.nodes] == ["placeholder"] + 2 * [
        "call_function"
    ] + ["output"]
tests/entrypoints/pooling/classify/test_online_vision.py
View file @
fc7980db
...
...
@@ -5,9 +5,9 @@ import json
import
pytest
import
requests
from
tests.entrypoints.test_utils
import
encode_base64_content_from_url
from
tests.utils
import
RemoteOpenAIServer
from
vllm.entrypoints.pooling.classify.protocol
import
ClassificationResponse
from
vllm.multimodal.utils
import
encode_image_url
,
fetch_image
MODEL_NAME
=
"muziyongshixin/Qwen2.5-VL-7B-for-VideoCls"
MAXIMUM_VIDEOS
=
1
...
...
@@ -19,7 +19,7 @@ HF_OVERRIDES = {
}
input_text
=
"This product was excellent and exceeded my expectations"
image_url
=
"https://vllm-public-assets.s3.us-west-2.amazonaws.com/multimodal_asset/cat_snow.jpg"
image_base64
=
encode_base64_content_from_url
(
image_url
)
image_base64
=
{
"url"
:
encode_image_url
(
fetch_image
(
image_url
)
)}
video_url
=
"https://www.bogotobogo.com/python/OpenCV_Python/images/mean_shift_tracking/slow_traffic_small.mp4"
...
...
tests/entrypoints/pooling/score/test_online_score_vision.py
0 → 100644
View file @
fc7980db
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import
pytest
import
requests
from
tests.utils
import
VLLM_PATH
,
RemoteOpenAIServer
from
vllm.entrypoints.pooling.score.protocol
import
ScoreResponse
from
vllm.multimodal.utils
import
encode_image_url
,
fetch_image
# Model served by the `server` fixture and named in every score request.
MODEL_NAME = "Qwen/Qwen3-VL-Reranker-2B"
# Passed to RemoteOpenAIServer as `override_hf_configs`.
HF_OVERRIDES = {
    "architectures": ["Qwen3VLForSequenceClassification"],
    "classifier_from_token": ["no", "yes"],
    "is_original_qwen3_reranker": True,
}

# Shared query text; also reused as the text document below.
query = "A cat standing in the snow."
image_url = "https://vllm-public-assets.s3.us-west-2.amazonaws.com/multimodal_asset/cat_snow.jpg"
# Content parts indexed by the tests:
#   [0] plain text, [1] image referenced by URL,
#   [2] the same image passed through fetch_image + encode_image_url
#       (presumably a base64 data URL, going by the test that uses it).
# NOTE(review): fetch_image is given a remote URL and runs at import time,
# so collecting this module likely needs network access - confirm.
documents = [
    {
        "type": "text",
        "text": query,
    },
    {
        "type": "image_url",
        "image_url": {"url": image_url},
    },
    {
        "type": "image_url",
        "image_url": {"url": encode_image_url(fetch_image(image_url))},
    },
]
@pytest.fixture(scope="module")
def server():
    """Start one RemoteOpenAIServer for MODEL_NAME and share it module-wide."""
    chat_template = VLLM_PATH / "examples/pooling/score/template/qwen3_vl_reranker.jinja"
    cli_args = ["--enforce-eager"]
    cli_args += ["--max-model-len", "8192"]
    cli_args += ["--chat-template", str(chat_template)]

    # The context manager owns the server lifetime; it exits after the last
    # test in this module has finished with the fixture.
    with RemoteOpenAIServer(
        MODEL_NAME, cli_args, override_hf_configs=HF_OVERRIDES
    ) as running_server:
        yield running_server
def _assert_single_score(server: RemoteOpenAIServer, queries, document_input) -> None:
    """POST one query/document pair to the score endpoint and validate it.

    Args:
        server: Running server fixture; supplies the URL of the score route.
        queries: Value sent as the ``queries`` field of the request body.
        document_input: Value sent as the ``documents`` field of the request
            body (a plain string or a ``{"content": [...]}`` mapping).

    Raises:
        requests.HTTPError: If the server answers with an error status.
        AssertionError: If the parsed response lacks an id or data, or does
            not contain exactly one score entry.
    """
    score_response = requests.post(
        server.url_for("score"),
        json={
            "model": MODEL_NAME,
            "queries": queries,
            "documents": document_input,
        },
    )
    score_response.raise_for_status()
    score = ScoreResponse.model_validate(score_response.json())

    assert score.id is not None
    assert score.data is not None
    assert len(score.data) == 1


def test_score_api_queries_str_documents_str(server: RemoteOpenAIServer):
    """Score a plain-string query against a plain-string document."""
    _assert_single_score(
        server,
        "What is the capital of France?",
        "The capital of France is Paris.",
    )


def test_score_api_queries_str_documents_text_content(server: RemoteOpenAIServer):
    """Score the shared query against the text content part."""
    _assert_single_score(server, query, {"content": [documents[0]]})


def test_score_api_queries_str_documents_image_url_content(server: RemoteOpenAIServer):
    """Score the shared query against the image referenced by URL."""
    _assert_single_score(server, query, {"content": [documents[1]]})


def test_score_api_queries_str_documents_image_base64_content(
    server: RemoteOpenAIServer,
):
    """Score the shared query against the pre-encoded image content part."""
    _assert_single_score(server, query, {"content": [documents[2]]})
tests/entrypoints/test_utils.py
View file @
fc7980db
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import
base64
import
requests
from
vllm.entrypoints.utils
import
sanitize_message
...
...
@@ -12,11 +8,3 @@ def test_sanitize_message():
sanitize_message
(
"<_io.BytesIO object at 0x7a95e299e750>"
)
==
"<_io.BytesIO object>"
)
def
encode_base64_content_from_url
(
content_url
:
str
)
->
dict
[
str
,
str
]:
with
requests
.
get
(
content_url
)
as
response
:
response
.
raise_for_status
()
result
=
base64
.
b64encode
(
response
.
content
).
decode
(
"utf-8"
)
return
{
"url"
:
f
"data:image/jpeg;base64,
{
result
}
"
}
tests/kernels/core/test_activation.py
View file @
fc7980db
...
...
@@ -17,6 +17,8 @@ from vllm.model_executor.layers.activation import (
QuickGELU
,
SiluAndMul
,
SwigluOAIAndMul
,
SwigluStepAndMul
,
swiglustep_and_mul_triton
,
)
from
vllm.utils.torch_utils
import
set_random_seed
...
...
@@ -36,6 +38,7 @@ CUDA_DEVICES = [f"cuda:{i}" for i in range(1 if torch.cuda.device_count() == 1 e
"gelu_tanh"
,
"fatrelu"
,
"swigluoai_and_mul"
,
"swiglustep_and_mul"
,
],
)
@
pytest
.
mark
.
parametrize
(
"num_tokens"
,
NUM_TOKENS
)
...
...
@@ -75,9 +78,12 @@ def test_act_and_mul(
elif
activation
==
"swigluoai_and_mul"
:
layer
=
SwigluOAIAndMul
()
fn
=
torch
.
ops
.
_C
.
swigluoai_and_mul
elif
activation
==
"swiglustep_and_mul"
:
layer
=
SwigluStepAndMul
()
fn
=
swiglustep_and_mul_triton
out
=
layer
(
x
)
ref_out
=
layer
.
forward_native
(
x
)
if
activation
==
"swigluoai_and_mul"
:
if
activation
in
[
"swigluoai_and_mul"
,
"swiglustep_and_mul"
]
:
rtol
=
{
# For fp16, change the relative tolerance from 1e-3 to 2e-3
torch
.
float16
:
2e-3
,
...
...
@@ -104,7 +110,7 @@ def test_act_and_mul(
opcheck
(
fn
,
(
out
,
x
,
threshold
))
elif
activation
==
"swigluoai_and_mul"
:
opcheck
(
fn
,
(
out
,
x
,
layer
.
alpha
,
layer
.
limit
))
el
se
:
el
if
activation
!=
"swiglustep_and_mul"
:
opcheck
(
fn
,
(
out
,
x
))
...
...
tests/kernels/moe/test_moe.py
View file @
fc7980db
...
...
@@ -715,7 +715,7 @@ def test_mixtral_moe(
# need to override the forward context for unittests, otherwise it assumes
# we're running the model forward pass (the model specified in vllm_config)
get_forward_context
().
remaining
_moe_layers
=
None
get_forward_context
().
all
_moe_layers
=
None
# Run forward passes for both MoE blocks
hf_states
,
_
=
hf_moe
.
forward
(
hf_inputs
)
...
...
tests/kernels/quantization/test_rocm_skinny_gemms.py
View file @
fc7980db
...
...
@@ -87,6 +87,13 @@ NKM_FACTORS_WVSPLITK_FP8 = [
SEEDS
=
[
0
]
def
pad_weights_fp8
(
weight
):
num_pad
=
256
//
weight
.
element_size
()
import
torch.nn.functional
as
F
return
F
.
pad
(
weight
,
(
0
,
num_pad
),
"constant"
,
0
)[...,
:
-
num_pad
]
@
pytest
.
mark
.
parametrize
(
"n,k,m"
,
NKM_FACTORS_WVSPLITKRC
)
@
pytest
.
mark
.
parametrize
(
"dtype"
,
DTYPES
)
@
pytest
.
mark
.
parametrize
(
"seed"
,
SEEDS
)
...
...
@@ -191,11 +198,12 @@ def test_rocm_wvsplitk_bias2D_kernel(n, k, m, dtype, seed):
@
pytest
.
mark
.
parametrize
(
"n,k,m"
,
NKM_FACTORS_WVSPLITK_FP8
)
@
pytest
.
mark
.
parametrize
(
"dtype"
,
DTYPES
)
@
pytest
.
mark
.
parametrize
(
"seed"
,
SEEDS
)
@
pytest
.
mark
.
parametrize
(
"padded"
,
[
False
,
True
])
@
pytest
.
mark
.
skipif
(
not
(
current_platform
.
is_rocm
()
and
current_platform
.
supports_fp8
()),
reason
=
"only test for rocm fp8"
,
)
def
test_rocm_wvsplitk_fp8_kernel
(
n
,
k
,
m
,
dtype
,
seed
):
def
test_rocm_wvsplitk_fp8_kernel
(
n
,
k
,
m
,
dtype
,
seed
,
padded
):
torch
.
manual_seed
(
seed
)
A
=
torch
.
rand
(
n
,
k
,
device
=
"cuda"
)
-
0.5
...
...
@@ -203,6 +211,8 @@ def test_rocm_wvsplitk_fp8_kernel(n, k, m, dtype, seed):
A
,
scale_a
=
ref_dynamic_per_tensor_fp8_quant
(
A
)
B
,
scale_b
=
ref_dynamic_per_tensor_fp8_quant
(
B
)
if
padded
:
B
=
pad_weights_fp8
(
B
)
ref_out
=
torch
.
_scaled_mm
(
A
,
B
.
t
(),
out_dtype
=
dtype
,
scale_a
=
scale_a
,
scale_b
=
scale_b
...
...
@@ -222,11 +232,12 @@ def test_rocm_wvsplitk_fp8_kernel(n, k, m, dtype, seed):
@
pytest
.
mark
.
parametrize
(
"n,k,m"
,
NKM_FACTORS_WVSPLITK_FP8
)
@
pytest
.
mark
.
parametrize
(
"dtype"
,
DTYPES
)
@
pytest
.
mark
.
parametrize
(
"seed"
,
SEEDS
)
@
pytest
.
mark
.
parametrize
(
"padded"
,
[
False
,
True
])
@
pytest
.
mark
.
skipif
(
not
(
current_platform
.
is_rocm
()
and
current_platform
.
supports_fp8
()),
reason
=
"only test for rocm fp8"
,
)
def
test_rocm_wvsplitk_fp8_bias1D_kernel
(
n
,
k
,
m
,
dtype
,
seed
):
def
test_rocm_wvsplitk_fp8_bias1D_kernel
(
n
,
k
,
m
,
dtype
,
seed
,
padded
):
torch
.
manual_seed
(
seed
)
xavier
=
math
.
sqrt
(
2
/
k
)
# normalize to avoid large output-bias deltas
...
...
@@ -236,6 +247,8 @@ def test_rocm_wvsplitk_fp8_bias1D_kernel(n, k, m, dtype, seed):
A
,
scale_a
=
ref_dynamic_per_tensor_fp8_quant
(
A
)
B
,
scale_b
=
ref_dynamic_per_tensor_fp8_quant
(
B
)
if
padded
:
B
=
pad_weights_fp8
(
B
)
ref_out
=
torch
.
_scaled_mm
(
A
,
B
.
t
(),
out_dtype
=
dtype
,
scale_a
=
scale_a
,
scale_b
=
scale_b
,
bias
=
BIAS
...
...
tests/models/registry.py
View file @
fc7980db
...
...
@@ -479,6 +479,9 @@ _TEXT_GENERATION_EXAMPLE_MODELS = {
"Step1ForCausalLM"
:
_HfExamplesInfo
(
"stepfun-ai/Step-Audio-EditX"
,
trust_remote_code
=
True
),
"Step3p5ForCausalLM"
:
_HfExamplesInfo
(
"stepfun-ai/step-3.5-flash"
,
is_available_online
=
False
),
"SmolLM3ForCausalLM"
:
_HfExamplesInfo
(
"HuggingFaceTB/SmolLM3-3B"
),
"StableLMEpochForCausalLM"
:
_HfExamplesInfo
(
"stabilityai/stablelm-zephyr-3b"
),
"StableLmForCausalLM"
:
_HfExamplesInfo
(
"stabilityai/stablelm-3b-4e1t"
),
...
...
@@ -1091,6 +1094,12 @@ _SPECULATIVE_DECODING_EXAMPLE_MODELS = {
"Qwen3NextMTP"
:
_HfExamplesInfo
(
"Qwen/Qwen3-Next-80B-A3B-Instruct"
,
min_transformers_version
=
"4.56.3"
),
"Step3p5MTP"
:
_HfExamplesInfo
(
"stepfun-ai/Step-3.5-Flash"
,
trust_remote_code
=
True
,
speculative_model
=
"stepfun-ai/Step-3.5-Flash"
,
is_available_online
=
False
,
),
}
_TRANSFORMERS_BACKEND_MODELS
=
{
...
...
Prev
1
2
3
4
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment