Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
5b1db8b2
Commit
5b1db8b2
authored
Jan 19, 2026
by
zhuwenwen
Browse files
Merge tag 'v0.14.0' into v0.14.0-ori
parents
6fa64fbe
b17039bc
Changes
15
Show whitespace changes
Inline
Side-by-side
Showing
15 changed files
with
242 additions
and
174 deletions
+242
-174
.buildkite/release-pipeline.yaml
.buildkite/release-pipeline.yaml
+81
-23
.buildkite/scripts/upload-nightly-wheels.sh
.buildkite/scripts/upload-nightly-wheels.sh
+0
-0
.buildkite/scripts/upload-release-wheels.sh
.buildkite/scripts/upload-release-wheels.sh
+103
-0
docker/Dockerfile.rocm
docker/Dockerfile.rocm
+6
-0
docker/Dockerfile.rocm_base
docker/Dockerfile.rocm_base
+0
-100
vllm/entrypoints/openai/api_server.py
vllm/entrypoints/openai/api_server.py
+1
-7
vllm/entrypoints/openai/serving_engine.py
vllm/entrypoints/openai/serving_engine.py
+8
-3
vllm/entrypoints/openai/serving_models.py
vllm/entrypoints/openai/serving_models.py
+6
-1
vllm/entrypoints/pooling/classify/api_router.py
vllm/entrypoints/pooling/classify/api_router.py
+3
-5
vllm/entrypoints/pooling/embed/api_router.py
vllm/entrypoints/pooling/embed/api_router.py
+2
-4
vllm/entrypoints/pooling/pooling/api_router.py
vllm/entrypoints/pooling/pooling/api_router.py
+3
-4
vllm/entrypoints/pooling/score/api_router.py
vllm/entrypoints/pooling/score/api_router.py
+5
-7
vllm/entrypoints/serve/disagg/api_router.py
vllm/entrypoints/serve/disagg/api_router.py
+2
-3
vllm/entrypoints/serve/tokenize/api_router.py
vllm/entrypoints/serve/tokenize/api_router.py
+1
-7
vllm/entrypoints/utils.py
vllm/entrypoints/utils.py
+21
-10
No files found.
.buildkite/release-pipeline.yaml
View file @
5b1db8b2
steps
:
steps
:
# aarch64 + CUDA builds
# aarch64 + CUDA builds
-
label
:
"
Build
arm64
wheel
-
CUDA
12.9"
-
label
:
"
Build
wheel
-
aarch64
-
CUDA
12.9"
depends_on
:
~
depends_on
:
~
id
:
build-wheel-arm64-cuda-12-9
id
:
build-wheel-arm64-cuda-12-9
agents
:
agents
:
...
@@ -11,11 +11,11 @@ steps:
...
@@ -11,11 +11,11 @@ steps:
-
"
DOCKER_BUILDKIT=1
docker
build
--build-arg
max_jobs=16
--build-arg
USE_SCCACHE=1
--build-arg
GIT_REPO_CHECK=1
--build-arg
CUDA_VERSION=12.9.1
--build-arg
torch_cuda_arch_list='8.7
8.9
9.0
10.0+PTX
12.0'
--tag
vllm-ci:build-image
--target
build
--progress
plain
-f
docker/Dockerfile
."
-
"
DOCKER_BUILDKIT=1
docker
build
--build-arg
max_jobs=16
--build-arg
USE_SCCACHE=1
--build-arg
GIT_REPO_CHECK=1
--build-arg
CUDA_VERSION=12.9.1
--build-arg
torch_cuda_arch_list='8.7
8.9
9.0
10.0+PTX
12.0'
--tag
vllm-ci:build-image
--target
build
--progress
plain
-f
docker/Dockerfile
."
-
"
mkdir
artifacts"
-
"
mkdir
artifacts"
-
"
docker
run
--rm
-v
$(pwd)/artifacts:/artifacts_host
vllm-ci:build-image
bash
-c
'cp
-r
dist
/artifacts_host
&&
chmod
-R
a+rw
/artifacts_host'"
-
"
docker
run
--rm
-v
$(pwd)/artifacts:/artifacts_host
vllm-ci:build-image
bash
-c
'cp
-r
dist
/artifacts_host
&&
chmod
-R
a+rw
/artifacts_host'"
-
"
bash
.buildkite/scripts/upload-wheels.sh"
-
"
bash
.buildkite/scripts/upload-
nightly-
wheels.sh"
env
:
env
:
DOCKER_BUILDKIT
:
"
1"
DOCKER_BUILDKIT
:
"
1"
-
label
:
"
Build
arm64
wheel
-
CUDA
13.0"
-
label
:
"
Build
wheel
-
aarch64
-
CUDA
13.0"
depends_on
:
~
depends_on
:
~
id
:
build-wheel-arm64-cuda-13-0
id
:
build-wheel-arm64-cuda-13-0
agents
:
agents
:
...
@@ -26,12 +26,12 @@ steps:
...
@@ -26,12 +26,12 @@ steps:
-
"
DOCKER_BUILDKIT=1
docker
build
--build-arg
max_jobs=16
--build-arg
USE_SCCACHE=1
--build-arg
GIT_REPO_CHECK=1
--build-arg
CUDA_VERSION=13.0.1
--build-arg
torch_cuda_arch_list='8.7
8.9
9.0
10.0+PTX
12.0'
--build-arg
BUILD_BASE_IMAGE=nvidia/cuda:13.0.1-devel-ubuntu22.04
--tag
vllm-ci:build-image
--target
build
--progress
plain
-f
docker/Dockerfile
."
-
"
DOCKER_BUILDKIT=1
docker
build
--build-arg
max_jobs=16
--build-arg
USE_SCCACHE=1
--build-arg
GIT_REPO_CHECK=1
--build-arg
CUDA_VERSION=13.0.1
--build-arg
torch_cuda_arch_list='8.7
8.9
9.0
10.0+PTX
12.0'
--build-arg
BUILD_BASE_IMAGE=nvidia/cuda:13.0.1-devel-ubuntu22.04
--tag
vllm-ci:build-image
--target
build
--progress
plain
-f
docker/Dockerfile
."
-
"
mkdir
artifacts"
-
"
mkdir
artifacts"
-
"
docker
run
--rm
-v
$(pwd)/artifacts:/artifacts_host
vllm-ci:build-image
bash
-c
'cp
-r
dist
/artifacts_host
&&
chmod
-R
a+rw
/artifacts_host'"
-
"
docker
run
--rm
-v
$(pwd)/artifacts:/artifacts_host
vllm-ci:build-image
bash
-c
'cp
-r
dist
/artifacts_host
&&
chmod
-R
a+rw
/artifacts_host'"
-
"
bash
.buildkite/scripts/upload-wheels.sh
manylinux_2_35"
-
"
bash
.buildkite/scripts/upload-
nightly-
wheels.sh
manylinux_2_35"
env
:
env
:
DOCKER_BUILDKIT
:
"
1"
DOCKER_BUILDKIT
:
"
1"
# aarch64 build
# aarch64 build
-
label
:
"
Build
arm
64
CPU
wheel
"
-
label
:
"
Build
wheel
-
aarch
64
-
CPU"
depends_on
:
~
depends_on
:
~
id
:
build-wheel-arm64-cpu
id
:
build-wheel-arm64-cpu
agents
:
agents
:
...
@@ -40,39 +40,39 @@ steps:
...
@@ -40,39 +40,39 @@ steps:
-
"
DOCKER_BUILDKIT=1
docker
build
--build-arg
max_jobs=16
--build-arg
GIT_REPO_CHECK=1
--build-arg
VLLM_BUILD_ACL=ON
--tag
vllm-ci:build-image
--target
vllm-build
--progress
plain
-f
docker/Dockerfile.cpu
."
-
"
DOCKER_BUILDKIT=1
docker
build
--build-arg
max_jobs=16
--build-arg
GIT_REPO_CHECK=1
--build-arg
VLLM_BUILD_ACL=ON
--tag
vllm-ci:build-image
--target
vllm-build
--progress
plain
-f
docker/Dockerfile.cpu
."
-
"
mkdir
artifacts"
-
"
mkdir
artifacts"
-
"
docker
run
--rm
-v
$(pwd)/artifacts:/artifacts_host
vllm-ci:build-image
bash
-c
'cp
-r
dist
/artifacts_host
&&
chmod
-R
a+rw
/artifacts_host'"
-
"
docker
run
--rm
-v
$(pwd)/artifacts:/artifacts_host
vllm-ci:build-image
bash
-c
'cp
-r
dist
/artifacts_host
&&
chmod
-R
a+rw
/artifacts_host'"
-
"
bash
.buildkite/scripts/upload-wheels.sh
manylinux_2_35"
-
"
bash
.buildkite/scripts/upload-
nightly-
wheels.sh
manylinux_2_35"
env
:
env
:
DOCKER_BUILDKIT
:
"
1"
DOCKER_BUILDKIT
:
"
1"
# x86 + CUDA builds
# x86 + CUDA builds
-
label
:
"
Build
wheel
-
CUDA
12.9"
-
label
:
"
Build
wheel
-
x86_64
-
CUDA
12.9"
depends_on
:
~
depends_on
:
~
id
:
build-wheel-cuda-12-9
id
:
build-wheel-
x86-
cuda-12-9
agents
:
agents
:
queue
:
cpu_queue_postmerge
queue
:
cpu_queue_postmerge
commands
:
commands
:
-
"
DOCKER_BUILDKIT=1
docker
build
--build-arg
max_jobs=16
--build-arg
USE_SCCACHE=1
--build-arg
GIT_REPO_CHECK=1
--build-arg
CUDA_VERSION=12.9.1
--tag
vllm-ci:build-image
--target
build
--progress
plain
-f
docker/Dockerfile
."
-
"
DOCKER_BUILDKIT=1
docker
build
--build-arg
max_jobs=16
--build-arg
USE_SCCACHE=1
--build-arg
GIT_REPO_CHECK=1
--build-arg
CUDA_VERSION=12.9.1
--tag
vllm-ci:build-image
--target
build
--progress
plain
-f
docker/Dockerfile
."
-
"
mkdir
artifacts"
-
"
mkdir
artifacts"
-
"
docker
run
--rm
-v
$(pwd)/artifacts:/artifacts_host
vllm-ci:build-image
bash
-c
'cp
-r
dist
/artifacts_host
&&
chmod
-R
a+rw
/artifacts_host'"
-
"
docker
run
--rm
-v
$(pwd)/artifacts:/artifacts_host
vllm-ci:build-image
bash
-c
'cp
-r
dist
/artifacts_host
&&
chmod
-R
a+rw
/artifacts_host'"
-
"
bash
.buildkite/scripts/upload-wheels.sh
manylinux_2_31"
-
"
bash
.buildkite/scripts/upload-
nightly-
wheels.sh
manylinux_2_31"
env
:
env
:
DOCKER_BUILDKIT
:
"
1"
DOCKER_BUILDKIT
:
"
1"
-
label
:
"
Build
wheel
-
CUDA
13.0"
-
label
:
"
Build
wheel
-
x86_64
-
CUDA
13.0"
depends_on
:
~
depends_on
:
~
id
:
build-wheel-cuda-13-0
id
:
build-wheel-
x86-
cuda-13-0
agents
:
agents
:
queue
:
cpu_queue_postmerge
queue
:
cpu_queue_postmerge
commands
:
commands
:
-
"
DOCKER_BUILDKIT=1
docker
build
--build-arg
max_jobs=16
--build-arg
USE_SCCACHE=1
--build-arg
GIT_REPO_CHECK=1
--build-arg
CUDA_VERSION=13.0.1
--build-arg
BUILD_BASE_IMAGE=nvidia/cuda:13.0.1-devel-ubuntu22.04
--tag
vllm-ci:build-image
--target
build
--progress
plain
-f
docker/Dockerfile
."
-
"
DOCKER_BUILDKIT=1
docker
build
--build-arg
max_jobs=16
--build-arg
USE_SCCACHE=1
--build-arg
GIT_REPO_CHECK=1
--build-arg
CUDA_VERSION=13.0.1
--build-arg
BUILD_BASE_IMAGE=nvidia/cuda:13.0.1-devel-ubuntu22.04
--tag
vllm-ci:build-image
--target
build
--progress
plain
-f
docker/Dockerfile
."
-
"
mkdir
artifacts"
-
"
mkdir
artifacts"
-
"
docker
run
--rm
-v
$(pwd)/artifacts:/artifacts_host
vllm-ci:build-image
bash
-c
'cp
-r
dist
/artifacts_host
&&
chmod
-R
a+rw
/artifacts_host'"
-
"
docker
run
--rm
-v
$(pwd)/artifacts:/artifacts_host
vllm-ci:build-image
bash
-c
'cp
-r
dist
/artifacts_host
&&
chmod
-R
a+rw
/artifacts_host'"
-
"
bash
.buildkite/scripts/upload-wheels.sh
manylinux_2_35"
-
"
bash
.buildkite/scripts/upload-
nightly-
wheels.sh
manylinux_2_35"
env
:
env
:
DOCKER_BUILDKIT
:
"
1"
DOCKER_BUILDKIT
:
"
1"
# x86 CPU wheel build
# x86 CPU wheel build
-
label
:
"
Build
x86
CPU
wheel
"
-
label
:
"
Build
wheel
-
x86_64
-
CPU
"
depends_on
:
~
depends_on
:
~
id
:
build-wheel-x86-cpu
id
:
build-wheel-x86-cpu
agents
:
agents
:
...
@@ -81,12 +81,12 @@ steps:
...
@@ -81,12 +81,12 @@ steps:
-
"
DOCKER_BUILDKIT=1
docker
build
--build-arg
max_jobs=16
--build-arg
GIT_REPO_CHECK=1
--build-arg
VLLM_CPU_AVX512BF16=true
--build-arg
VLLM_CPU_AVX512VNNI=true
--build-arg
VLLM_CPU_AMXBF16=true
--tag
vllm-ci:build-image
--target
vllm-build
--progress
plain
-f
docker/Dockerfile.cpu
."
-
"
DOCKER_BUILDKIT=1
docker
build
--build-arg
max_jobs=16
--build-arg
GIT_REPO_CHECK=1
--build-arg
VLLM_CPU_AVX512BF16=true
--build-arg
VLLM_CPU_AVX512VNNI=true
--build-arg
VLLM_CPU_AMXBF16=true
--tag
vllm-ci:build-image
--target
vllm-build
--progress
plain
-f
docker/Dockerfile.cpu
."
-
"
mkdir
artifacts"
-
"
mkdir
artifacts"
-
"
docker
run
--rm
-v
$(pwd)/artifacts:/artifacts_host
vllm-ci:build-image
bash
-c
'cp
-r
dist
/artifacts_host
&&
chmod
-R
a+rw
/artifacts_host'"
-
"
docker
run
--rm
-v
$(pwd)/artifacts:/artifacts_host
vllm-ci:build-image
bash
-c
'cp
-r
dist
/artifacts_host
&&
chmod
-R
a+rw
/artifacts_host'"
-
"
bash
.buildkite/scripts/upload-wheels.sh
manylinux_2_35"
-
"
bash
.buildkite/scripts/upload-
nightly-
wheels.sh
manylinux_2_35"
env
:
env
:
DOCKER_BUILDKIT
:
"
1"
DOCKER_BUILDKIT
:
"
1"
# Build release images (12.9)
# Build release images (
CUDA
12.9)
-
label
:
"
Build
release
image
(
x86
)
"
-
label
:
"
Build
release
image
-
x86
_64
-
CUDA
12.9
"
depends_on
:
~
depends_on
:
~
id
:
build-release-image-x86
id
:
build-release-image-x86
agents
:
agents
:
...
@@ -99,7 +99,7 @@ steps:
...
@@ -99,7 +99,7 @@ steps:
-
"
docker
tag
public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname
-m)
public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT"
-
"
docker
tag
public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname
-m)
public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT"
-
"
docker
push
public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT"
-
"
docker
push
public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT"
-
label
:
"
Build
release
image
(arm64)
"
-
label
:
"
Build
release
image
-
aarch64
-
CUDA
12.9
"
depends_on
:
~
depends_on
:
~
id
:
build-release-image-arm64
id
:
build-release-image-arm64
agents
:
agents
:
...
@@ -109,34 +109,92 @@ steps:
...
@@ -109,34 +109,92 @@ steps:
-
"
DOCKER_BUILDKIT=1
docker
build
--build-arg
max_jobs=16
--build-arg
USE_SCCACHE=1
--build-arg
GIT_REPO_CHECK=1
--build-arg
CUDA_VERSION=12.9.1
--build-arg
FLASHINFER_AOT_COMPILE=true
--build-arg
torch_cuda_arch_list='8.7
8.9
9.0
10.0+PTX
12.0'
--build-arg
INSTALL_KV_CONNECTORS=true
--tag
public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname
-m)
--target
vllm-openai
--progress
plain
-f
docker/Dockerfile
."
-
"
DOCKER_BUILDKIT=1
docker
build
--build-arg
max_jobs=16
--build-arg
USE_SCCACHE=1
--build-arg
GIT_REPO_CHECK=1
--build-arg
CUDA_VERSION=12.9.1
--build-arg
FLASHINFER_AOT_COMPILE=true
--build-arg
torch_cuda_arch_list='8.7
8.9
9.0
10.0+PTX
12.0'
--build-arg
INSTALL_KV_CONNECTORS=true
--tag
public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname
-m)
--target
vllm-openai
--progress
plain
-f
docker/Dockerfile
."
-
"
docker
push
public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname
-m)"
-
"
docker
push
public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname
-m)"
# Add job to create multi-arch manifest
-
label
:
"
Create
multi-arch
manifest
-
CUDA
12.9"
-
label
:
"
Create
multi-arch
manifest"
depends_on
:
depends_on
:
-
build-release-image-x86
-
build-release-image-x86
-
build-release-image-arm64
-
build-release-image-arm64
id
:
create-multi-arch-manifest
id
:
create-multi-arch-manifest
agents
:
agents
:
queue
:
cpu_queue_postmerge
queue
:
small_
cpu_queue_postmerge
commands
:
commands
:
-
"
aws
ecr-public
get-login-password
--region
us-east-1
|
docker
login
--username
AWS
--password-stdin
public.ecr.aws/q9t5s3a7"
-
"
aws
ecr-public
get-login-password
--region
us-east-1
|
docker
login
--username
AWS
--password-stdin
public.ecr.aws/q9t5s3a7"
-
"
docker
manifest
create
public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT
public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-x86_64
public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-aarch64
--amend"
-
"
docker
manifest
create
public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT
public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-x86_64
public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-aarch64
--amend"
-
"
docker
manifest
push
public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT"
-
"
docker
manifest
push
public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT"
-
label
:
"
Annotate
release
workflow"
-
label
:
"
Annotate
release
workflow
-
CUDA
12.9
"
depends_on
:
depends_on
:
-
create-multi-arch-manifest
-
create-multi-arch-manifest
id
:
annotate-release-workflow
id
:
annotate-release-workflow
agents
:
agents
:
queue
:
cpu_queue_postmerge
queue
:
small_
cpu_queue_postmerge
commands
:
commands
:
-
"
bash
.buildkite/scripts/annotate-release.sh"
-
"
bash
.buildkite/scripts/annotate-release.sh"
-
block
:
"
Build
CUDA
13.0
release
images"
key
:
block-release-image-build-cuda-13-0
depends_on
:
~
-
label
:
"
Build
release
image
-
x86_64
-
CUDA
13.0"
depends_on
:
block-release-image-build-cuda-13-0
id
:
build-release-image-x86-cuda-13-0
agents
:
queue
:
cpu_queue_postmerge
commands
:
-
"
aws
ecr-public
get-login-password
--region
us-east-1
|
docker
login
--username
AWS
--password-stdin
public.ecr.aws/q9t5s3a7"
-
"
DOCKER_BUILDKIT=1
docker
build
--build-arg
max_jobs=16
--build-arg
USE_SCCACHE=1
--build-arg
GIT_REPO_CHECK=1
--build-arg
CUDA_VERSION=13.0.2
--build-arg
FLASHINFER_AOT_COMPILE=true
--build-arg
INSTALL_KV_CONNECTORS=true
--build-arg
BUILD_BASE_IMAGE=nvidia/cuda:13.0.2-devel-ubuntu22.04
--tag
public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname
-m)-cu130
--target
vllm-openai
--progress
plain
-f
docker/Dockerfile
."
-
"
docker
push
public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname
-m)-cu130"
# re-tag to default image tag and push, just in case arm64 build fails
-
"
docker
tag
public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname
-m)-cu130
public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-cu130"
-
"
docker
push
public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-cu130"
-
label
:
"
Build
release
image
-
aarch64
-
CUDA
13.0"
depends_on
:
block-release-image-build-cuda-13-0
id
:
build-release-image-arm64-cuda-13-0
agents
:
queue
:
arm64_cpu_queue_postmerge
commands
:
-
"
aws
ecr-public
get-login-password
--region
us-east-1
|
docker
login
--username
AWS
--password-stdin
public.ecr.aws/q9t5s3a7"
-
"
DOCKER_BUILDKIT=1
docker
build
--build-arg
max_jobs=16
--build-arg
USE_SCCACHE=1
--build-arg
GIT_REPO_CHECK=1
--build-arg
CUDA_VERSION=13.0.2
--build-arg
FLASHINFER_AOT_COMPILE=true
--build-arg
torch_cuda_arch_list='8.7
8.9
9.0
10.0+PTX
12.0'
--build-arg
INSTALL_KV_CONNECTORS=true
--build-arg
BUILD_BASE_IMAGE=nvidia/cuda:13.0.2-devel-ubuntu22.04
--tag
public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname
-m)-cu130
--target
vllm-openai
--progress
plain
-f
docker/Dockerfile
."
-
"
docker
push
public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname
-m)-cu130"
-
label
:
"
Create
multi-arch
manifest
-
CUDA
13.0"
depends_on
:
-
build-release-image-x86-cuda-13-0
-
build-release-image-arm64-cuda-13-0
id
:
create-multi-arch-manifest-cuda-13-0
agents
:
queue
:
small_cpu_queue_postmerge
commands
:
-
"
aws
ecr-public
get-login-password
--region
us-east-1
|
docker
login
--username
AWS
--password-stdin
public.ecr.aws/q9t5s3a7"
-
"
docker
manifest
create
public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-cu130
public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-x86_64-cu130
public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-aarch64-cu130
--amend"
-
"
docker
manifest
push
public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-cu130"
-
input
:
"
Provide
Release
version
here"
-
input
:
"
Provide
Release
version
here"
id
:
input-release-version
id
:
input-release-version
fields
:
fields
:
-
text
:
"
What
is
the
release
version?"
-
text
:
"
What
is
the
release
version?"
key
:
release-version
key
:
release-version
-
block
:
"
Confirm
update
release
wheels
to
PyPI
(experimental,
use
with
caution)?"
key
:
block-upload-release-wheels
depends_on
:
-
input-release-version
-
build-wheel-x86-cuda-12-9
-
build-wheel-x86-cuda-13-0
-
build-wheel-x86-cpu
-
build-wheel-arm64-cuda-12-9
-
build-wheel-arm64-cuda-13-0
-
build-wheel-arm64-cpu
-
label
:
"
Upload
release
wheels
to
PyPI
and
GitHub"
depends_on
:
-
block-upload-release-wheels
id
:
upload-release-wheels
agents
:
queue
:
small_cpu_queue_postmerge
commands
:
-
"
bash
.buildkite/scripts/upload-release-wheels.sh"
-
block
:
"
Build
CPU
release
image"
-
block
:
"
Build
CPU
release
image"
key
:
block-cpu-release-image-build
key
:
block-cpu-release-image-build
depends_on
:
~
depends_on
:
~
...
@@ -192,7 +250,7 @@ steps:
...
@@ -192,7 +250,7 @@ steps:
-
create-multi-arch-manifest
-
create-multi-arch-manifest
if
:
build.env("NIGHTLY") == "1"
if
:
build.env("NIGHTLY") == "1"
agents
:
agents
:
queue
:
cpu_queue_postmerge
queue
:
small_
cpu_queue_postmerge
commands
:
commands
:
-
"
aws
ecr-public
get-login-password
--region
us-east-1
|
docker
login
--username
AWS
--password-stdin
public.ecr.aws/q9t5s3a7"
-
"
aws
ecr-public
get-login-password
--region
us-east-1
|
docker
login
--username
AWS
--password-stdin
public.ecr.aws/q9t5s3a7"
-
"
docker
pull
public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-x86_64"
-
"
docker
pull
public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-x86_64"
...
...
.buildkite/scripts/upload-wheels.sh
→
.buildkite/scripts/upload-
nightly-
wheels.sh
View file @
5b1db8b2
File moved
.buildkite/scripts/upload-release-wheels.sh
0 → 100644
View file @
5b1db8b2
#!/usr/bin/env bash
set
-e
BUCKET
=
"vllm-wheels"
SUBPATH
=
$BUILDKITE_COMMIT
S3_COMMIT_PREFIX
=
"s3://
$BUCKET
/
$SUBPATH
/"
RELEASE_VERSION
=
$(
buildkite-agent meta-data get release-version
)
echo
"Release version from Buildkite:
$RELEASE_VERSION
"
GIT_VERSION
=
$(
git describe
--exact-match
--tags
$BUILDKITE_COMMIT
2>/dev/null
)
if
[
-z
"
$GIT_VERSION
"
]
;
then
echo
"[FATAL] Not on a git tag, cannot create release."
exit
1
else
echo
"Git version for commit
$BUILDKITE_COMMIT
:
$GIT_VERSION
"
fi
# sanity check for version mismatch
if
[
"v
$RELEASE_VERSION
"
!=
"
$GIT_VERSION
"
]
;
then
if
[
"
$FORCE_RELEASE_IGNORE_VERSION_MISMATCH
"
==
"true"
]
;
then
echo
"[WARNING] Force release and ignore version mismatch"
else
echo
"[FATAL] Release version from Buildkite does not match Git version."
exit
1
fi
fi
# check pypi token
if
[
-z
"
$PYPI_TOKEN
"
]
;
then
echo
"[FATAL] PYPI_TOKEN is not set."
exit
1
else
export
TWINE_USERNAME
=
"__token__"
export
TWINE_PASSWORD
=
"
$PYPI_TOKEN
"
fi
# check github token
if
[
-z
"
$GITHUB_TOKEN
"
]
;
then
echo
"[FATAL] GITHUB_TOKEN is not set."
exit
1
else
export
GH_TOKEN
=
"
$GITHUB_TOKEN
"
fi
set
-x
# avoid printing secrets above
# download gh CLI from github
# Get latest gh CLI version from GitHub API
GH_VERSION
=
$(
curl
-s
https://api.github.com/repos/cli/cli/releases/latest |
grep
'"tag_name":'
|
sed
-E
's/.*"([^"]+)".*/\1/'
|
sed
's/^v//'
)
if
[
-z
"
$GH_VERSION
"
]
;
then
echo
"[FATAL] Failed to get latest gh CLI version from GitHub"
exit
1
fi
echo
"Downloading gh CLI version:
$GH_VERSION
"
GH_TARBALL
=
"gh_
${
GH_VERSION
}
_linux_amd64.tar.gz"
GH_URL
=
"https://github.com/cli/cli/releases/download/v
${
GH_VERSION
}
/
${
GH_TARBALL
}
"
GH_INSTALL_DIR
=
"/tmp/gh-install"
mkdir
-p
"
$GH_INSTALL_DIR
"
pushd
"
$GH_INSTALL_DIR
"
curl
-L
-o
"
$GH_TARBALL
"
"
$GH_URL
"
tar
-xzf
"
$GH_TARBALL
"
GH_BIN
=
$(
realpath
$(
find
.
-name
"gh"
-type
f
-executable
|
head
-n
1
))
if
[
-z
"
$GH_BIN
"
]
;
then
echo
"[FATAL] Failed to find gh CLI executable"
exit
1
fi
echo
"gh CLI downloaded successfully, version:
$(
$GH_BIN
--version
)
"
echo
"Last 5 releases on GitHub:"
# as a sanity check of gh and GH_TOKEN
command
"
$GH_BIN
"
release list
--limit
5
popd
# install twine from pypi
python3
-m
venv /tmp/vllm-release-env
source
/tmp/vllm-release-env/bin/activate
pip
install
twine
python3
-m
twine
--version
# copy release wheels to local directory
DIST_DIR
=
/tmp/vllm-release-dist
echo
"Existing wheels on S3:"
aws s3
ls
"
$S3_COMMIT_PREFIX
"
echo
"Copying wheels to local directory"
mkdir
-p
$DIST_DIR
# include only wheels for the release version, ignore all files with "dev" or "rc" in the name
aws s3
cp
--recursive
--exclude
"*"
--include
"vllm-
${
RELEASE_VERSION
}
*.whl"
--exclude
"*dev*"
--exclude
"*rc*"
"
$S3_COMMIT_PREFIX
"
$DIST_DIR
echo
"Wheels copied to local directory"
# generate source tarball
git archive
--format
=
tar.gz
--output
=
"
$DIST_DIR
/vllm-
${
RELEASE_VERSION
}
.tar.gz"
$BUILDKITE_COMMIT
ls
-la
$DIST_DIR
# upload wheels to PyPI (only default variant, i.e. files without '+' in the name)
PYPI_WHEEL_FILES
=
$(
find
$DIST_DIR
-name
"vllm-
${
RELEASE_VERSION
}
*.whl"
-not
-name
"*+*"
)
if
[
-z
"
$PYPI_WHEEL_FILES
"
]
;
then
echo
"No default variant wheels found, quitting..."
exit
1
fi
python3
-m
twine check
$PYPI_WHEEL_FILES
python3
-m
twine
--non-interactive
--verbose
upload
$PYPI_WHEEL_FILES
echo
"Wheels uploaded to PyPI"
# create release on GitHub with the release version and all wheels
command
"
$GH_BIN
"
release create
$GIT_VERSION
-d
--latest
--notes-from-tag
--verify-tag
$DIST_DIR
/
*
.whl
docker/Dockerfile.rocm
View file @
5b1db8b2
...
@@ -85,6 +85,8 @@ ONBUILD COPY ./ vllm/
...
@@ -85,6 +85,8 @@ ONBUILD COPY ./ vllm/
FROM base AS fetch_vllm_1
FROM base AS fetch_vllm_1
ARG VLLM_REPO="https://github.com/vllm-project/vllm.git"
ARG VLLM_REPO="https://github.com/vllm-project/vllm.git"
ARG VLLM_BRANCH="main"
ARG VLLM_BRANCH="main"
ENV VLLM_REPO=${VLLM_REPO}
ENV VLLM_BRANCH=${VLLM_BRANCH}
ONBUILD RUN git clone ${VLLM_REPO} \
ONBUILD RUN git clone ${VLLM_REPO} \
&& cd vllm \
&& cd vllm \
&& git fetch -v --prune -- origin ${VLLM_BRANCH} \
&& git fetch -v --prune -- origin ${VLLM_BRANCH} \
...
@@ -301,6 +303,10 @@ RUN --mount=type=bind,from=export_vllm,src=/,target=/install \
...
@@ -301,6 +303,10 @@ RUN --mount=type=bind,from=export_vllm,src=/,target=/install \
&& pip uninstall -y vllm \
&& pip uninstall -y vllm \
&& uv pip install --system *.whl
&& uv pip install --system *.whl
# Install RIXL wheel
RUN --mount=type=bind,from=build_rixl,src=/app/install,target=/rixl_install \
uv pip install --system /rixl_install/*.whl
WORKDIR /vllm-workspace
WORKDIR /vllm-workspace
ARG COMMON_WORKDIR
ARG COMMON_WORKDIR
COPY --from=build_vllm ${COMMON_WORKDIR}/vllm /vllm-workspace
COPY --from=build_vllm ${COMMON_WORKDIR}/vllm /vllm-workspace
...
...
docker/Dockerfile.rocm_base
View file @
5b1db8b2
...
@@ -198,92 +198,6 @@ RUN cd mori \
...
@@ -198,92 +198,6 @@ RUN cd mori \
RUN mkdir -p /app/install && cp /app/mori/dist/*.whl /app/install
RUN mkdir -p /app/install && cp /app/mori/dist/*.whl /app/install
###
### RIXL Build
###
FROM build_pytorch AS build_rixl
ARG RIXL_BRANCH
ARG RIXL_REPO
ARG ETCD_BRANCH
ARG ETCD_REPO
ARG UCX_BRANCH
ARG UCX_REPO
ENV ROCM_PATH=/opt/rocm
ENV UCX_HOME=/usr/local/ucx
ENV RIXL_HOME=/usr/local/rixl
ENV RIXL_BENCH_HOME=/usr/local/rixl_bench
# RIXL build system dependences and RDMA support
RUN apt-get -y update && apt-get -y install autoconf libtool pkg-config \
libgrpc-dev \
libgrpc++-dev \
libprotobuf-dev \
protobuf-compiler-grpc \
libcpprest-dev \
libaio-dev \
librdmacm1 \
librdmacm-dev \
libibverbs1 \
libibverbs-dev \
ibverbs-utils \
rdmacm-utils \
ibverbs-providers
RUN pip install meson auditwheel patchelf tomlkit
WORKDIR /workspace
RUN git clone ${ETCD_REPO} && \
cd etcd-cpp-apiv3 && \
git checkout ${ETCD_BRANCH} && \
mkdir build && cd build && \
cmake .. -DCMAKE_POLICY_VERSION_MINIMUM=3.5 && \
make -j$(nproc) && \
make install
RUN cd /usr/local/src && \
git clone ${UCX_REPO} && \
cd ucx && \
git checkout ${UCX_BRANCH} && \
./autogen.sh && \
mkdir build && cd build && \
../configure \
--prefix=/usr/local/ucx \
--enable-shared \
--disable-static \
--disable-doxygen-doc \
--enable-optimizations \
--enable-devel-headers \
--with-rocm=/opt/rocm \
--with-verbs \
--with-dm \
--enable-mt && \
make -j && \
make -j install
ENV PATH=/usr/local/ucx/bin:$PATH
ENV LD_LIBRARY_PATH=${UCX_HOME}/lib:${LD_LIBRARY_PATH}
RUN git clone ${RIXL_REPO} /opt/rixl && \
cd /opt/rixl && \
git checkout ${RIXL_BRANCH} && \
meson setup build --prefix=${RIXL_HOME} \
-Ducx_path=${UCX_HOME} \
-Drocm_path=${ROCM_PATH} && \
cd build && \
ninja && \
ninja install
# Generate RIXL wheel
RUN cd /opt/rixl && mkdir -p /app/install && \
./contrib/build-wheel.sh \
--output-dir /app/install \
--rocm-dir ${ROCM_PATH} \
--ucx-plugins-dir ${UCX_HOME}/lib/ucx \
--nixl-plugins-dir ${RIXL_HOME}/lib/x86_64-linux-gnu/plugins
###
###
### FlashAttention Build
### FlashAttention Build
###
###
...
@@ -365,8 +279,6 @@ RUN --mount=type=bind,from=build_aiter,src=/app/install/,target=/install \
...
@@ -365,8 +279,6 @@ RUN --mount=type=bind,from=build_aiter,src=/app/install/,target=/install \
cp /install/*.whl /app/debs
cp /install/*.whl /app/debs
RUN --mount=type=bind,from=build_mori,src=/app/install/,target=/install \
RUN --mount=type=bind,from=build_mori,src=/app/install/,target=/install \
cp /install/*.whl /app/debs
cp /install/*.whl /app/debs
RUN --mount=type=bind,from=build_rixl,src=/app/install/,target=/install \
cp /install/*.whl /app/debs
FROM base AS final
FROM base AS final
RUN --mount=type=bind,from=debs,src=/app/debs,target=/install \
RUN --mount=type=bind,from=debs,src=/app/debs,target=/install \
...
@@ -385,12 +297,6 @@ ARG FA_BRANCH
...
@@ -385,12 +297,6 @@ ARG FA_BRANCH
ARG FA_REPO
ARG FA_REPO
ARG AITER_BRANCH
ARG AITER_BRANCH
ARG AITER_REPO
ARG AITER_REPO
ARG RIXL_BRANCH
ARG RIXL_REPO
ARG ETCD_BRANCH
ARG ETCD_REPO
ARG UCX_BRANCH
ARG UCX_REPO
ARG MORI_BRANCH
ARG MORI_BRANCH
ARG MORI_REPO
ARG MORI_REPO
RUN echo "BASE_IMAGE: ${BASE_IMAGE}" > /app/versions.txt \
RUN echo "BASE_IMAGE: ${BASE_IMAGE}" > /app/versions.txt \
...
@@ -406,11 +312,5 @@ RUN echo "BASE_IMAGE: ${BASE_IMAGE}" > /app/versions.txt \
...
@@ -406,11 +312,5 @@ RUN echo "BASE_IMAGE: ${BASE_IMAGE}" > /app/versions.txt \
&& echo "FA_REPO: ${FA_REPO}" >> /app/versions.txt \
&& echo "FA_REPO: ${FA_REPO}" >> /app/versions.txt \
&& echo "AITER_BRANCH: ${AITER_BRANCH}" >> /app/versions.txt \
&& echo "AITER_BRANCH: ${AITER_BRANCH}" >> /app/versions.txt \
&& echo "AITER_REPO: ${AITER_REPO}" >> /app/versions.txt \
&& echo "AITER_REPO: ${AITER_REPO}" >> /app/versions.txt \
&& echo "RIXL_BRANCH: ${RIXL_BRANCH}" >> /app/versions.txt \
&& echo "RIXL_REPO: ${RIXL_REPO}" >> /app/versions.txt \
&& echo "ETCD_BRANCH: ${ETCD_BRANCH}" >> /app/versions.txt \
&& echo "ETCD_REPO: ${ETCD_REPO}" >> /app/versions.txt \
&& echo "UCX_BRANCH: ${UCX_BRANCH}" >> /app/versions.txt \
&& echo "UCX_REPO: ${UCX_REPO}" >> /app/versions.txt \
&& echo "MORI_BRANCH: ${MORI_BRANCH}" >> /app/versions.txt \
&& echo "MORI_BRANCH: ${MORI_BRANCH}" >> /app/versions.txt \
&& echo "MORI_REPO: ${MORI_REPO}" >> /app/versions.txt
&& echo "MORI_REPO: ${MORI_REPO}" >> /app/versions.txt
vllm/entrypoints/openai/api_server.py
View file @
5b1db8b2
...
@@ -540,14 +540,8 @@ async def create_completion(request: CompletionRequest, raw_request: Request):
...
@@ -540,14 +540,8 @@ async def create_completion(request: CompletionRequest, raw_request: Request):
try
:
try
:
generator
=
await
handler
.
create_completion
(
request
,
raw_request
)
generator
=
await
handler
.
create_completion
(
request
,
raw_request
)
except
OverflowError
as
e
:
raise
HTTPException
(
status_code
=
HTTPStatus
.
BAD_REQUEST
.
value
,
detail
=
str
(
e
)
)
from
e
except
Exception
as
e
:
except
Exception
as
e
:
raise
HTTPException
(
return
handler
.
create_error_response
(
e
)
status_code
=
HTTPStatus
.
INTERNAL_SERVER_ERROR
.
value
,
detail
=
str
(
e
)
)
from
e
if
isinstance
(
generator
,
ErrorResponse
):
if
isinstance
(
generator
,
ErrorResponse
):
return
JSONResponse
(
return
JSONResponse
(
...
...
vllm/entrypoints/openai/serving_engine.py
View file @
5b1db8b2
...
@@ -86,7 +86,7 @@ from vllm.entrypoints.responses_utils import (
...
@@ -86,7 +86,7 @@ from vllm.entrypoints.responses_utils import (
construct_input_messages
,
construct_input_messages
,
)
)
from
vllm.entrypoints.serve.disagg.protocol
import
GenerateRequest
,
GenerateResponse
from
vllm.entrypoints.serve.disagg.protocol
import
GenerateRequest
,
GenerateResponse
from
vllm.entrypoints.utils
import
_validate_truncation_size
from
vllm.entrypoints.utils
import
_validate_truncation_size
,
sanitize_message
from
vllm.inputs.data
import
PromptType
,
TokensPrompt
from
vllm.inputs.data
import
PromptType
,
TokensPrompt
from
vllm.inputs.parse
import
(
from
vllm.inputs.parse
import
(
PromptComponents
,
PromptComponents
,
...
@@ -760,11 +760,15 @@ class OpenAIServing:
...
@@ -760,11 +760,15 @@ class OpenAIServing:
err_type
=
"BadRequestError"
err_type
=
"BadRequestError"
status_code
=
HTTPStatus
.
BAD_REQUEST
status_code
=
HTTPStatus
.
BAD_REQUEST
param
=
exc
.
parameter
param
=
exc
.
parameter
elif
isinstance
(
exc
,
(
ValueError
,
TypeError
,
RuntimeError
)):
elif
isinstance
(
exc
,
(
ValueError
,
TypeError
,
RuntimeError
,
OverflowError
)):
# Common validation errors from user input
# Common validation errors from user input
err_type
=
"BadRequestError"
err_type
=
"BadRequestError"
status_code
=
HTTPStatus
.
BAD_REQUEST
status_code
=
HTTPStatus
.
BAD_REQUEST
param
=
None
param
=
None
elif
isinstance
(
exc
,
NotImplementedError
):
err_type
=
"NotImplementedError"
status_code
=
HTTPStatus
.
NOT_IMPLEMENTED
param
=
None
elif
exc
.
__class__
.
__name__
==
"TemplateError"
:
elif
exc
.
__class__
.
__name__
==
"TemplateError"
:
# jinja2.TemplateError (avoid importing jinja2)
# jinja2.TemplateError (avoid importing jinja2)
err_type
=
"BadRequestError"
err_type
=
"BadRequestError"
...
@@ -783,9 +787,10 @@ class OpenAIServing:
...
@@ -783,9 +787,10 @@ class OpenAIServing:
traceback
.
print_exc
()
traceback
.
print_exc
()
else
:
else
:
traceback
.
print_stack
()
traceback
.
print_stack
()
return
ErrorResponse
(
return
ErrorResponse
(
error
=
ErrorInfo
(
error
=
ErrorInfo
(
message
=
message
,
message
=
sanitize_message
(
message
)
,
type
=
err_type
,
type
=
err_type
,
code
=
status_code
.
value
,
code
=
status_code
.
value
,
param
=
param
,
param
=
param
,
...
...
vllm/entrypoints/openai/serving_models.py
View file @
5b1db8b2
...
@@ -16,6 +16,7 @@ from vllm.entrypoints.openai.protocol import (
...
@@ -16,6 +16,7 @@ from vllm.entrypoints.openai.protocol import (
ModelPermission
,
ModelPermission
,
UnloadLoRAAdapterRequest
,
UnloadLoRAAdapterRequest
,
)
)
from
vllm.entrypoints.utils
import
sanitize_message
from
vllm.logger
import
init_logger
from
vllm.logger
import
init_logger
from
vllm.lora.request
import
LoRARequest
from
vllm.lora.request
import
LoRARequest
from
vllm.lora.resolver
import
LoRAResolver
,
LoRAResolverRegistry
from
vllm.lora.resolver
import
LoRAResolver
,
LoRAResolverRegistry
...
@@ -300,5 +301,9 @@ def create_error_response(
...
@@ -300,5 +301,9 @@ def create_error_response(
status_code
:
HTTPStatus
=
HTTPStatus
.
BAD_REQUEST
,
status_code
:
HTTPStatus
=
HTTPStatus
.
BAD_REQUEST
,
)
->
ErrorResponse
:
)
->
ErrorResponse
:
return
ErrorResponse
(
return
ErrorResponse
(
error
=
ErrorInfo
(
message
=
message
,
type
=
err_type
,
code
=
status_code
.
value
)
error
=
ErrorInfo
(
message
=
sanitize_message
(
message
),
type
=
err_type
,
code
=
status_code
.
value
,
)
)
)
vllm/entrypoints/pooling/classify/api_router.py
View file @
5b1db8b2
# SPDX-License-Identifier: Apache-2.0
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from
http
import
HTTPStatus
from
fastapi
import
APIRouter
,
Depends
,
HTTPException
,
Request
from
fastapi
import
APIRouter
,
Depends
,
Request
from
starlette.responses
import
JSONResponse
from
starlette.responses
import
JSONResponse
from
typing_extensions
import
assert_never
from
typing_extensions
import
assert_never
...
@@ -36,9 +35,8 @@ async def create_classify(request: ClassificationRequest, raw_request: Request):
...
@@ -36,9 +35,8 @@ async def create_classify(request: ClassificationRequest, raw_request: Request):
try
:
try
:
generator
=
await
handler
.
create_classify
(
request
,
raw_request
)
generator
=
await
handler
.
create_classify
(
request
,
raw_request
)
except
Exception
as
e
:
except
Exception
as
e
:
raise
HTTPException
(
return
handler
.
create_error_response
(
e
)
status_code
=
HTTPStatus
.
INTERNAL_SERVER_ERROR
.
value
,
detail
=
str
(
e
)
)
from
e
if
isinstance
(
generator
,
ErrorResponse
):
if
isinstance
(
generator
,
ErrorResponse
):
return
JSONResponse
(
return
JSONResponse
(
content
=
generator
.
model_dump
(),
status_code
=
generator
.
error
.
code
content
=
generator
.
model_dump
(),
status_code
=
generator
.
error
.
code
...
...
vllm/entrypoints/pooling/embed/api_router.py
View file @
5b1db8b2
...
@@ -2,7 +2,7 @@
...
@@ -2,7 +2,7 @@
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from
http
import
HTTPStatus
from
http
import
HTTPStatus
from
fastapi
import
APIRouter
,
Depends
,
HTTPException
,
Request
from
fastapi
import
APIRouter
,
Depends
,
Request
from
fastapi.responses
import
JSONResponse
,
StreamingResponse
from
fastapi.responses
import
JSONResponse
,
StreamingResponse
from
typing_extensions
import
assert_never
from
typing_extensions
import
assert_never
...
@@ -47,9 +47,7 @@ async def create_embedding(
...
@@ -47,9 +47,7 @@ async def create_embedding(
try
:
try
:
generator
=
await
handler
.
create_embedding
(
request
,
raw_request
)
generator
=
await
handler
.
create_embedding
(
request
,
raw_request
)
except
Exception
as
e
:
except
Exception
as
e
:
raise
HTTPException
(
return
handler
.
create_error_response
(
e
)
status_code
=
HTTPStatus
.
INTERNAL_SERVER_ERROR
.
value
,
detail
=
str
(
e
)
)
from
e
if
isinstance
(
generator
,
ErrorResponse
):
if
isinstance
(
generator
,
ErrorResponse
):
return
JSONResponse
(
return
JSONResponse
(
...
...
vllm/entrypoints/pooling/pooling/api_router.py
View file @
5b1db8b2
...
@@ -2,7 +2,7 @@
...
@@ -2,7 +2,7 @@
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from
http
import
HTTPStatus
from
http
import
HTTPStatus
from
fastapi
import
APIRouter
,
Depends
,
HTTPException
,
Request
from
fastapi
import
APIRouter
,
Depends
,
Request
from
fastapi.responses
import
JSONResponse
,
StreamingResponse
from
fastapi.responses
import
JSONResponse
,
StreamingResponse
from
typing_extensions
import
assert_never
from
typing_extensions
import
assert_never
...
@@ -44,9 +44,8 @@ async def create_pooling(request: PoolingRequest, raw_request: Request):
...
@@ -44,9 +44,8 @@ async def create_pooling(request: PoolingRequest, raw_request: Request):
try
:
try
:
generator
=
await
handler
.
create_pooling
(
request
,
raw_request
)
generator
=
await
handler
.
create_pooling
(
request
,
raw_request
)
except
Exception
as
e
:
except
Exception
as
e
:
raise
HTTPException
(
return
handler
.
create_error_response
(
e
)
status_code
=
HTTPStatus
.
INTERNAL_SERVER_ERROR
.
value
,
detail
=
str
(
e
)
)
from
e
if
isinstance
(
generator
,
ErrorResponse
):
if
isinstance
(
generator
,
ErrorResponse
):
return
JSONResponse
(
return
JSONResponse
(
content
=
generator
.
model_dump
(),
status_code
=
generator
.
error
.
code
content
=
generator
.
model_dump
(),
status_code
=
generator
.
error
.
code
...
...
vllm/entrypoints/pooling/score/api_router.py
View file @
5b1db8b2
...
@@ -2,7 +2,7 @@
...
@@ -2,7 +2,7 @@
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from
http
import
HTTPStatus
from
http
import
HTTPStatus
from
fastapi
import
APIRouter
,
Depends
,
HTTPException
,
Request
from
fastapi
import
APIRouter
,
Depends
,
Request
from
fastapi.responses
import
JSONResponse
from
fastapi.responses
import
JSONResponse
from
typing_extensions
import
assert_never
from
typing_extensions
import
assert_never
...
@@ -52,9 +52,8 @@ async def create_score(request: ScoreRequest, raw_request: Request):
...
@@ -52,9 +52,8 @@ async def create_score(request: ScoreRequest, raw_request: Request):
try
:
try
:
generator
=
await
handler
.
create_score
(
request
,
raw_request
)
generator
=
await
handler
.
create_score
(
request
,
raw_request
)
except
Exception
as
e
:
except
Exception
as
e
:
raise
HTTPException
(
return
handler
.
create_error_response
(
e
)
status_code
=
HTTPStatus
.
INTERNAL_SERVER_ERROR
.
value
,
detail
=
str
(
e
)
)
from
e
if
isinstance
(
generator
,
ErrorResponse
):
if
isinstance
(
generator
,
ErrorResponse
):
return
JSONResponse
(
return
JSONResponse
(
content
=
generator
.
model_dump
(),
status_code
=
generator
.
error
.
code
content
=
generator
.
model_dump
(),
status_code
=
generator
.
error
.
code
...
@@ -104,9 +103,8 @@ async def do_rerank(request: RerankRequest, raw_request: Request):
...
@@ -104,9 +103,8 @@ async def do_rerank(request: RerankRequest, raw_request: Request):
try
:
try
:
generator
=
await
handler
.
do_rerank
(
request
,
raw_request
)
generator
=
await
handler
.
do_rerank
(
request
,
raw_request
)
except
Exception
as
e
:
except
Exception
as
e
:
raise
HTTPException
(
return
handler
.
create_error_response
(
e
)
status_code
=
HTTPStatus
.
INTERNAL_SERVER_ERROR
.
value
,
detail
=
str
(
e
)
)
from
e
if
isinstance
(
generator
,
ErrorResponse
):
if
isinstance
(
generator
,
ErrorResponse
):
return
JSONResponse
(
return
JSONResponse
(
content
=
generator
.
model_dump
(),
status_code
=
generator
.
error
.
code
content
=
generator
.
model_dump
(),
status_code
=
generator
.
error
.
code
...
...
vllm/entrypoints/serve/disagg/api_router.py
View file @
5b1db8b2
...
@@ -67,9 +67,8 @@ async def generate(request: GenerateRequest, raw_request: Request):
...
@@ -67,9 +67,8 @@ async def generate(request: GenerateRequest, raw_request: Request):
try
:
try
:
generator
=
await
handler
.
serve_tokens
(
request
,
raw_request
)
generator
=
await
handler
.
serve_tokens
(
request
,
raw_request
)
except
Exception
as
e
:
except
Exception
as
e
:
raise
HTTPException
(
return
handler
.
create_error_response
(
e
)
status_code
=
HTTPStatus
.
INTERNAL_SERVER_ERROR
.
value
,
detail
=
str
(
e
)
)
from
e
if
isinstance
(
generator
,
ErrorResponse
):
if
isinstance
(
generator
,
ErrorResponse
):
return
JSONResponse
(
return
JSONResponse
(
content
=
generator
.
model_dump
(),
status_code
=
generator
.
error
.
code
content
=
generator
.
model_dump
(),
status_code
=
generator
.
error
.
code
...
...
vllm/entrypoints/serve/tokenize/api_router.py
View file @
5b1db8b2
...
@@ -49,14 +49,8 @@ async def tokenize(request: TokenizeRequest, raw_request: Request):
...
@@ -49,14 +49,8 @@ async def tokenize(request: TokenizeRequest, raw_request: Request):
try
:
try
:
generator
=
await
handler
.
create_tokenize
(
request
,
raw_request
)
generator
=
await
handler
.
create_tokenize
(
request
,
raw_request
)
except
NotImplementedError
as
e
:
raise
HTTPException
(
status_code
=
HTTPStatus
.
NOT_IMPLEMENTED
.
value
,
detail
=
str
(
e
)
)
from
e
except
Exception
as
e
:
except
Exception
as
e
:
raise
HTTPException
(
return
handler
.
create_error_response
(
e
)
status_code
=
HTTPStatus
.
INTERNAL_SERVER_ERROR
.
value
,
detail
=
str
(
e
)
)
from
e
if
isinstance
(
generator
,
ErrorResponse
):
if
isinstance
(
generator
,
ErrorResponse
):
return
JSONResponse
(
return
JSONResponse
(
...
...
vllm/entrypoints/utils.py
View file @
5b1db8b2
...
@@ -7,7 +7,7 @@ import functools
...
@@ -7,7 +7,7 @@ import functools
import
os
import
os
from
argparse
import
Namespace
from
argparse
import
Namespace
from
pathlib
import
Path
from
pathlib
import
Path
from
typing
import
Any
from
typing
import
TYPE_CHECKING
,
Any
import
regex
as
re
import
regex
as
re
from
fastapi
import
Request
from
fastapi
import
Request
...
@@ -22,18 +22,25 @@ from vllm.entrypoints.chat_utils import (
...
@@ -22,18 +22,25 @@ from vllm.entrypoints.chat_utils import (
resolve_hf_chat_template
,
resolve_hf_chat_template
,
resolve_mistral_chat_template
,
resolve_mistral_chat_template
,
)
)
from
vllm.entrypoints.openai.cli_args
import
make_arg_parser
from
vllm.entrypoints.openai.protocol
import
(
ChatCompletionRequest
,
CompletionRequest
,
StreamOptions
,
)
from
vllm.entrypoints.openai.serving_models
import
LoRAModulePath
from
vllm.logger
import
init_logger
from
vllm.logger
import
init_logger
from
vllm.platforms
import
current_platform
from
vllm.platforms
import
current_platform
from
vllm.tokenizers.mistral
import
MistralTokenizer
from
vllm.tokenizers.mistral
import
MistralTokenizer
from
vllm.utils.argparse_utils
import
FlexibleArgumentParser
from
vllm.utils.argparse_utils
import
FlexibleArgumentParser
if
TYPE_CHECKING
:
from
vllm.entrypoints.openai.protocol
import
(
ChatCompletionRequest
,
CompletionRequest
,
StreamOptions
,
)
from
vllm.entrypoints.openai.serving_models
import
LoRAModulePath
else
:
ChatCompletionRequest
=
object
CompletionRequest
=
object
StreamOptions
=
object
LoRAModulePath
=
object
logger
=
init_logger
(
__name__
)
logger
=
init_logger
(
__name__
)
VLLM_SUBCMD_PARSER_EPILOG
=
(
VLLM_SUBCMD_PARSER_EPILOG
=
(
...
@@ -206,7 +213,7 @@ def _validate_truncation_size(
...
@@ -206,7 +213,7 @@ def _validate_truncation_size(
def
get_max_tokens
(
def
get_max_tokens
(
max_model_len
:
int
,
max_model_len
:
int
,
request
:
ChatCompletionRequest
|
CompletionRequest
,
request
:
"
ChatCompletionRequest | CompletionRequest
"
,
input_length
:
int
,
input_length
:
int
,
default_sampling_params
:
dict
,
default_sampling_params
:
dict
,
)
->
int
:
)
->
int
:
...
@@ -227,6 +234,8 @@ def get_max_tokens(
...
@@ -227,6 +234,8 @@ def get_max_tokens(
def
log_non_default_args
(
args
:
Namespace
|
EngineArgs
):
def
log_non_default_args
(
args
:
Namespace
|
EngineArgs
):
from
vllm.entrypoints.openai.cli_args
import
make_arg_parser
non_default_args
=
{}
non_default_args
=
{}
# Handle Namespace
# Handle Namespace
...
@@ -255,7 +264,7 @@ def log_non_default_args(args: Namespace | EngineArgs):
...
@@ -255,7 +264,7 @@ def log_non_default_args(args: Namespace | EngineArgs):
def
should_include_usage
(
def
should_include_usage
(
stream_options
:
StreamOptions
|
None
,
enable_force_include_usage
:
bool
stream_options
:
"
StreamOptions | None
"
,
enable_force_include_usage
:
bool
)
->
tuple
[
bool
,
bool
]:
)
->
tuple
[
bool
,
bool
]:
if
stream_options
:
if
stream_options
:
include_usage
=
stream_options
.
include_usage
or
enable_force_include_usage
include_usage
=
stream_options
.
include_usage
or
enable_force_include_usage
...
@@ -270,6 +279,8 @@ def should_include_usage(
...
@@ -270,6 +279,8 @@ def should_include_usage(
def
process_lora_modules
(
def
process_lora_modules
(
args_lora_modules
:
list
[
LoRAModulePath
],
default_mm_loras
:
dict
[
str
,
str
]
|
None
args_lora_modules
:
list
[
LoRAModulePath
],
default_mm_loras
:
dict
[
str
,
str
]
|
None
)
->
list
[
LoRAModulePath
]:
)
->
list
[
LoRAModulePath
]:
from
vllm.entrypoints.openai.serving_models
import
LoRAModulePath
lora_modules
=
args_lora_modules
lora_modules
=
args_lora_modules
if
default_mm_loras
:
if
default_mm_loras
:
default_mm_lora_paths
=
[
default_mm_lora_paths
=
[
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment