Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
dynamo
Commits
e82bc4ec
Unverified
Commit
e82bc4ec
authored
Jul 28, 2025
by
ptarasiewiczNV
Committed by
GitHub
Jul 28, 2025
Browse files
chore: update vLLM to 0.10.0 (#2114)
Co-authored-by: alec-flowers <aflowers@nvidia.com>
parent
615580d8
Changes
7
Hide whitespace changes
Inline
Side-by-side
Showing
7 changed files
with
27 additions
and
25 deletions
+27
-25
components/backends/vllm/src/dynamo/vllm/args.py
components/backends/vllm/src/dynamo/vllm/args.py
+5
-1
components/backends/vllm/src/dynamo/vllm/handlers.py
components/backends/vllm/src/dynamo/vllm/handlers.py
+2
-0
components/backends/vllm/src/dynamo/vllm/publisher.py
components/backends/vllm/src/dynamo/vllm/publisher.py
+5
-1
container/Dockerfile.vllm
container/Dockerfile.vllm
+10
-18
container/deps/vllm/install_vllm.sh
container/deps/vllm/install_vllm.sh
+3
-3
pyproject.toml
pyproject.toml
+1
-1
tests/serve/test_vllm.py
tests/serve/test_vllm.py
+1
-1
No files found.
components/backends/vllm/src/dynamo/vllm/args.py
View file @
e82bc4ec
...
@@ -207,7 +207,11 @@ def overwrite_args(config):
...
@@ -207,7 +207,11 @@ def overwrite_args(config):
defaults
=
{
defaults
=
{
"task"
:
"generate"
,
"task"
:
"generate"
,
"skip_tokenizer_init"
:
True
,
# As of vLLM >=0.10.0 the engine unconditionally calls
# `sampling_params.update_from_tokenizer(...)`, so we can no longer
# skip tokenizer initialisation. Setting this to **False** avoids
# a NoneType error when the processor accesses the tokenizer.
"skip_tokenizer_init"
:
False
,
"disable_log_requests"
:
True
,
"disable_log_requests"
:
True
,
# KV routing relies on logging KV metrics
# KV routing relies on logging KV metrics
"disable_log_stats"
:
False
,
"disable_log_stats"
:
False
,
...
...
components/backends/vllm/src/dynamo/vllm/handlers.py
View file @
e82bc4ec
...
@@ -110,6 +110,8 @@ class DecodeWorkerHandler(BaseWorkerHandler):
...
@@ -110,6 +110,8 @@ class DecodeWorkerHandler(BaseWorkerHandler):
prompt
=
TokensPrompt
(
prompt_token_ids
=
request
[
"token_ids"
])
prompt
=
TokensPrompt
(
prompt_token_ids
=
request
[
"token_ids"
])
sampling_params
=
SamplingParams
(
**
self
.
default_sampling_params
)
sampling_params
=
SamplingParams
(
**
self
.
default_sampling_params
)
sampling_params
.
detokenize
=
False
for
key
,
value
in
request
[
"sampling_options"
].
items
():
for
key
,
value
in
request
[
"sampling_options"
].
items
():
if
value
is
not
None
and
hasattr
(
sampling_params
,
key
):
if
value
is
not
None
and
hasattr
(
sampling_params
,
key
):
setattr
(
sampling_params
,
key
,
value
)
setattr
(
sampling_params
,
key
,
value
)
...
...
components/backends/vllm/src/dynamo/vllm/publisher.py
View file @
e82bc4ec
...
@@ -25,6 +25,7 @@ class NullStatLogger(StatLoggerBase):
...
@@ -25,6 +25,7 @@ class NullStatLogger(StatLoggerBase):
self
,
self
,
scheduler_stats
:
Optional
[
SchedulerStats
],
scheduler_stats
:
Optional
[
SchedulerStats
],
iteration_stats
:
Optional
[
IterationStats
],
iteration_stats
:
Optional
[
IterationStats
],
engine_idx
:
int
=
0
,
):
):
pass
pass
...
@@ -51,7 +52,10 @@ class DynamoStatLoggerPublisher(StatLoggerBase):
...
@@ -51,7 +52,10 @@ class DynamoStatLoggerPublisher(StatLoggerBase):
self
.
request_total_slots
=
request_total_slots
self
.
request_total_slots
=
request_total_slots
def
record
(
def
record
(
self
,
scheduler_stats
:
SchedulerStats
,
iteration_stats
:
Optional
[
IterationStats
]
self
,
scheduler_stats
:
SchedulerStats
,
iteration_stats
:
Optional
[
IterationStats
],
engine_idx
:
int
=
0
,
):
):
# request_total_slots and kv_total_blocks are properties of model + gpu
# request_total_slots and kv_total_blocks are properties of model + gpu
# we should only publish them once, not every metric update
# we should only publish them once, not every metric update
...
...
container/Dockerfile.vllm
View file @
e82bc4ec
...
@@ -10,16 +10,15 @@ ARG BASE_IMAGE_TAG="25.01-cuda12.8-devel-ubuntu24.04"
...
@@ -10,16 +10,15 @@ ARG BASE_IMAGE_TAG="25.01-cuda12.8-devel-ubuntu24.04"
ARG RELEASE_BUILD
ARG RELEASE_BUILD
ARG RUNTIME_IMAGE="nvcr.io/nvidia/cuda"
ARG RUNTIME_IMAGE="nvcr.io/nvidia/cuda"
ARG RUNTIME_IMAGE_TAG="12.8.1-runtime-ubuntu24.04"
ARG RUNTIME_IMAGE_TAG="12.8.1-runtime-ubuntu24.04"
ARG VLLM_REF="059d4cd"
ARG TORCH_BACKEND="cu128"
# After this commit deepgemm API changed
# 1.0.0 -> 2.0.0
ARG DEEPGEMM_REF="03d0be3"
ARG FLASHINF_REF="1d72ed4"
# Make sure to update the dependency version in pyproject.toml when updating this
# Make sure to update the dependency version in pyproject.toml when updating this
ARG VLLM_VERSION="0.9.2"
ARG VLLM_REF="v0.10.0"
ARG TORCH_BACKEND="cu128"
# Match 0.10.0 vLLM release
# https://github.com/vllm-project/vllm/releases/tag/v0.10.0
ARG DEEPGEMM_REF="1876566"
ARG FLASHINF_REF="v0.2.8rc1"
# Define general architecture ARGs for supporting both x86 and aarch64 builds.
# Define general architecture ARGs for supporting both x86 and aarch64 builds.
# ARCH: Used for package suffixes (e.g., amd64, arm64)
# ARCH: Used for package suffixes (e.g., amd64, arm64)
...
@@ -42,11 +41,10 @@ ARG ARCH_ALT=x86_64
...
@@ -42,11 +41,10 @@ ARG ARCH_ALT=x86_64
FROM ${BASE_IMAGE}:${BASE_IMAGE_TAG} AS base
FROM ${BASE_IMAGE}:${BASE_IMAGE_TAG} AS base
# Redeclare ARCH, ARCH_ALT, TORCH_BACKEND, VLLM_VERSION so they're available in this stage
# Redeclare ARCH, ARCH_ALT, TORCH_BACKEND so they're available in this stage
ARG ARCH
ARG ARCH
ARG ARCH_ALT
ARG ARCH_ALT
ARG TORCH_BACKEND
ARG TORCH_BACKEND
ARG VLLM_VERSION
USER root
USER root
ARG PYTHON_VERSION=3.12
ARG PYTHON_VERSION=3.12
...
@@ -195,15 +193,11 @@ ENV CUDA_HOME=/usr/local/cuda
...
@@ -195,15 +193,11 @@ ENV CUDA_HOME=/usr/local/cuda
RUN --mount=type=bind,source=./container/deps/,target=/tmp/deps \
RUN --mount=type=bind,source=./container/deps/,target=/tmp/deps \
--mount=type=cache,target=/root/.cache/uv \
--mount=type=cache,target=/root/.cache/uv \
if [ "$ARCH" = "arm64" ]; then \
# TODO - split vllm, DeepEP, DeepGeMM, PPLX installs
# TODO - split vllm, DeepEP, DeepGeMM, PPLX installs
# Should be able to select how you want your build to go
# Should be able to select how you want your build to go
cp /tmp/deps/vllm/install_vllm.sh /tmp/install_vllm.sh && \
cp /tmp/deps/vllm/install_vllm.sh /tmp/install_vllm.sh && \
chmod +x /tmp/install_vllm.sh && \
chmod +x /tmp/install_vllm.sh && \
/tmp/install_vllm.sh --editable --vllm-ref $VLLM_REF --max-jobs $MAX_JOBS --arch $ARCH --installation-dir /opt --deepgemm-ref $DEEPGEMM_REF --flashinf-ref $FLASHINF_REF --torch-backend $TORCH_BACKEND; \
/tmp/install_vllm.sh --editable --vllm-ref $VLLM_REF --max-jobs $MAX_JOBS --arch $ARCH --installation-dir /opt --deepgemm-ref $DEEPGEMM_REF --flashinf-ref $FLASHINF_REF --torch-backend $TORCH_BACKEND;
else \
uv pip install "vllm==${VLLM_VERSION}"; \
fi
ENV LD_LIBRARY_PATH=\
ENV LD_LIBRARY_PATH=\
/opt/vllm/tools/ep_kernels/ep_kernels_workspace/nvshmem_install/lib:\
/opt/vllm/tools/ep_kernels/ep_kernels_workspace/nvshmem_install/lib:\
...
@@ -464,9 +458,7 @@ COPY --from=wheel_builder $NIXL_PREFIX $NIXL_PREFIX
...
@@ -464,9 +458,7 @@ COPY --from=wheel_builder $NIXL_PREFIX $NIXL_PREFIX
COPY --from=wheel_builder /workspace/dist/*.whl wheelhouse/
COPY --from=wheel_builder /workspace/dist/*.whl wheelhouse/
# Copies vllm, DeepEP, DeepGEMM, PPLX repos (all editable installs) and nvshmem binaries
# Copies vllm, DeepEP, DeepGEMM, PPLX repos (all editable installs) and nvshmem binaries
RUN if [ "$ARCH" = "arm64" ]; then \
COPY --from=base /opt/vllm /opt/vllm
COPY --from=base /opt/vllm /opt/vllm; \
fi
ENV LD_LIBRARY_PATH=\
ENV LD_LIBRARY_PATH=\
/opt/vllm/tools/ep_kernels/ep_kernels_workspace/nvshmem_install/lib:\
/opt/vllm/tools/ep_kernels/ep_kernels_workspace/nvshmem_install/lib:\
...
...
container/deps/vllm/install_vllm.sh
View file @
e82bc4ec
...
@@ -20,12 +20,12 @@ set -euo pipefail
...
@@ -20,12 +20,12 @@ set -euo pipefail
# Parse arguments
# Parse arguments
EDITABLE
=
true
EDITABLE
=
true
VLLM_REF
=
"059d4cd"
VLLM_REF
=
"v0.10.0"
MAX_JOBS
=
16
MAX_JOBS
=
16
INSTALLATION_DIR
=
/tmp
INSTALLATION_DIR
=
/tmp
ARCH
=
$(
uname
-m
)
ARCH
=
$(
uname
-m
)
DEEPGEMM_REF
=
"6c9558e"
DEEPGEMM_REF
=
"1876566"
FLASHINF_REF
=
"1d72ed4"
FLASHINF_REF
=
"v0.2.8rc1"
TORCH_BACKEND
=
"cu128"
TORCH_BACKEND
=
"cu128"
# Convert x86_64 to amd64 for consistency with Docker ARG
# Convert x86_64 to amd64 for consistency with Docker ARG
...
...
pyproject.toml
View file @
e82bc4ec
...
@@ -67,7 +67,7 @@ trtllm =[
...
@@ -67,7 +67,7 @@ trtllm =[
vllm
=
[
vllm
=
[
"uvloop"
,
"uvloop"
,
"nixl"
,
"nixl"
,
"vllm==0.9.2"
,
"vllm==0.10.0"
,
]
]
sglang
=
[
sglang
=
[
...
...
tests/serve/test_vllm.py
View file @
e82bc4ec
...
@@ -59,7 +59,7 @@ class VLLMConfig:
...
@@ -59,7 +59,7 @@ class VLLMConfig:
endpoints
:
List
[
str
]
endpoints
:
List
[
str
]
response_handlers
:
List
[
Callable
[[
Any
],
str
]]
response_handlers
:
List
[
Callable
[[
Any
],
str
]]
model
:
str
model
:
str
timeout
:
int
=
60
timeout
:
int
=
120
delayed_start
:
int
=
0
delayed_start
:
int
=
0
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment