chore: update vLLM to 0.10.0 (#2114)

Co-authored-by: alec-flowers <aflowers@nvidia.com>

chore: update vLLM to 0.10.0 (#2114)
Co-authored-by: alec-flowers <aflowers@nvidia.com>
e82bc4ec · ptarasiewiczNV · GitHub · 615580d8 · e82bc4ec · e82bc4ec
Unverified commit e82bc4ec — authored Jul 28, 2025 by ptarasiewiczNV; committed by GitHub on Jul 28, 2025.
7 changed files
--- a/components/backends/vllm/src/dynamo/vllm/args.py
+++ b/components/backends/vllm/src/dynamo/vllm/args.py
@@ -207,7 +207,11 @@ def overwrite_args(config):
    defaults = {
        "task": "generate",
-        "skip_tokenizer_init": True,
+        # As of vLLM >=0.10.0 the engine unconditionally calls
+        # `sampling_params.update_from_tokenizer(...)`, so we can no longer
+        # skip tokenizer initialisation.  Setting this to **False** avoids
+        # a NoneType error when the processor accesses the tokenizer.
+        "skip_tokenizer_init": False,
        "disable_log_requests": True,
        # KV routing relies on logging KV metrics
        "disable_log_stats": False,

--- a/components/backends/vllm/src/dynamo/vllm/handlers.py
+++ b/components/backends/vllm/src/dynamo/vllm/handlers.py
@@ -110,6 +110,8 @@ class DecodeWorkerHandler(BaseWorkerHandler):
        prompt = TokensPrompt(prompt_token_ids=request["token_ids"])
        sampling_params = SamplingParams(**self.default_sampling_params)
+        sampling_params.detokenize = False
        for key, value in request["sampling_options"].items():
            if value is not None and hasattr(sampling_params, key):
                setattr(sampling_params, key, value)

--- a/components/backends/vllm/src/dynamo/vllm/publisher.py
+++ b/components/backends/vllm/src/dynamo/vllm/publisher.py
@@ -25,6 +25,7 @@ class NullStatLogger(StatLoggerBase):
        self,
        scheduler_stats: Optional[SchedulerStats],
        iteration_stats: Optional[IterationStats],
+        engine_idx: int = 0,
    ):
        pass
@@ -51,7 +52,10 @@ class DynamoStatLoggerPublisher(StatLoggerBase):
        self.request_total_slots = request_total_slots
    def record(
-        self, scheduler_stats: SchedulerStats, iteration_stats: Optional[IterationStats]
+        self,
+        scheduler_stats: SchedulerStats,
+        iteration_stats: Optional[IterationStats],
+        engine_idx: int = 0,
    ):
        # request_total_slots and kv_total_blocks are properties of model + gpu
        # we should only publish them once, not every metric update

--- a/container/Dockerfile.vllm
+++ b/container/Dockerfile.vllm
@@ -10,16 +10,15 @@ ARG BASE_IMAGE_TAG="25.01-cuda12.8-devel-ubuntu24.04"
 ARG RELEASE_BUILD
 ARG RUNTIME_IMAGE="nvcr.io/nvidia/cuda"
 ARG RUNTIME_IMAGE_TAG="12.8.1-runtime-ubuntu24.04"
-ARG VLLM_REF="059d4cd"
-ARG TORCH_BACKEND="cu128"
-# After this commit deepgemm API changed
-# 1.0.0 -> 2.0.0
-ARG DEEPGEMM_REF="03d0be3"
-ARG FLASHINF_REF="1d72ed4"
 # Make sure to update the dependency version in pyproject.toml when updating this
-ARG VLLM_VERSION="0.9.2"
+ARG VLLM_REF="v0.10.0"
+ARG TORCH_BACKEND="cu128"
+# Match 0.10.0 vLLM release
+# https://github.com/vllm-project/vllm/releases/tag/v0.10.0
+ARG DEEPGEMM_REF="1876566"
+ARG FLASHINF_REF="v0.2.8rc1"
 # Define general architecture ARGs for supporting both x86 and aarch64 builds.
 #   ARCH: Used for package suffixes (e.g., amd64, arm64)
@@ -42,11 +41,10 @@ ARG ARCH_ALT=x86_64
 FROM ${BASE_IMAGE}:${BASE_IMAGE_TAG} AS base
-# Redeclare ARCH, ARCH_ALT, TORCH_BACKEND, VLLM_VERSION so they're available in this stage
+# Redeclare ARCH, ARCH_ALT, TORCH_BACKEND so they're available in this stage
 ARG ARCH
 ARG ARCH_ALT
 ARG TORCH_BACKEND
-ARG VLLM_VERSION
 USER root
 ARG PYTHON_VERSION=3.12
@@ -195,15 +193,11 @@ ENV CUDA_HOME=/usr/local/cuda
 RUN --mount=type=bind,source=./container/deps/,target=/tmp/deps \
    --mount=type=cache,target=/root/.cache/uv \
-    if [ "$ARCH" = "arm64" ]; then \
        # TODO - split vllm, DeepEP, DeepGeMM, PPLX installs
        # Should be able to select how you want your build to go
        cp /tmp/deps/vllm/install_vllm.sh /tmp/install_vllm.sh && \
        chmod +x /tmp/install_vllm.sh && \
-        /tmp/install_vllm.sh --editable --vllm-ref $VLLM_REF --max-jobs $MAX_JOBS --arch $ARCH --installation-dir /opt --deepgemm-ref $DEEPGEMM_REF --flashinf-ref $FLASHINF_REF --torch-backend $TORCH_BACKEND; \
+        /tmp/install_vllm.sh --editable --vllm-ref $VLLM_REF --max-jobs $MAX_JOBS --arch $ARCH --installation-dir /opt --deepgemm-ref $DEEPGEMM_REF --flashinf-ref $FLASHINF_REF --torch-backend $TORCH_BACKEND;
-    else \
-        uv pip install "vllm==${VLLM_VERSION}"; \
-    fi
 ENV LD_LIBRARY_PATH=\
 /opt/vllm/tools/ep_kernels/ep_kernels_workspace/nvshmem_install/lib:\
@@ -464,9 +458,7 @@ COPY --from=wheel_builder $NIXL_PREFIX $NIXL_PREFIX
 COPY --from=wheel_builder /workspace/dist/*.whl wheelhouse/
 # Copies vllm, DeepEP, DeepGEMM, PPLX repos (all editable installs) and nvshmem binaries
-RUN if [ "$ARCH" = "arm64" ]; then \
+COPY --from=base /opt/vllm /opt/vllm
-        COPY --from=base /opt/vllm /opt/vllm; \
-    fi
 ENV LD_LIBRARY_PATH=\
 /opt/vllm/tools/ep_kernels/ep_kernels_workspace/nvshmem_install/lib:\

--- a/container/deps/vllm/install_vllm.sh
+++ b/container/deps/vllm/install_vllm.sh
@@ -20,12 +20,12 @@ set -euo pipefail
 # Parse arguments
 EDITABLE=true
-VLLM_REF="059d4cd"
+VLLM_REF="v0.10.0"
 MAX_JOBS=16
 INSTALLATION_DIR=/tmp
 ARCH=$(uname -m)
-DEEPGEMM_REF="6c9558e"
+DEEPGEMM_REF="1876566"
-FLASHINF_REF="1d72ed4"
+FLASHINF_REF="v0.2.8rc1"
 TORCH_BACKEND="cu128"
 # Convert x86_64 to amd64 for consistency with Docker ARG

--- a/pyproject.toml
+++ b/pyproject.toml
@@ -67,7 +67,7 @@ trtllm =[
 vllm = [
    "uvloop",
    "nixl",
-    "vllm==0.9.2",
+    "vllm==0.10.0",
 ]
 sglang = [

--- a/tests/serve/test_vllm.py
+++ b/tests/serve/test_vllm.py
@@ -59,7 +59,7 @@ class VLLMConfig:
    endpoints: List[str]
    response_handlers: List[Callable[[Any], str]]
    model: str
-    timeout: int = 60
+    timeout: int = 120
    delayed_start: int = 0