chore(deps): bump vLLM to 0.16.0 (#6614)

Signed-off-by: alec-flowers <aflowers@nvidia.com>
Co-authored-by: Claude Opus 4.6 <noreply@anthropic.com>

chore(deps): bump vLLM to 0.16.0 (#6614)
Signed-off-by: alec-flowers <aflowers@nvidia.com> Co-authored-by: Claude Opus 4.6 <noreply@anthropic.com>
64574a3e · Alec · GitHub · 3659b76a · 64574a3e · 64574a3e
Unverified Commit 64574a3e authored Feb 26, 2026 by Alec Committed by GitHub Feb 26, 2026
9 changed files
--- a/components/src/dynamo/vllm/main.py
+++ b/components/src/dynamo/vllm/main.py
@@ -487,7 +487,9 @@ def setup_vllm_engine(config, stat_logger=None):
                "Continuing without KV event consolidation. "
                "Ensure 'kvbm' package is installed if this feature is needed."
            )
-    vllm_config.consolidator_endpoints = consolidator_endpoints
+    # Store consolidator endpoints in additional_config (vLLM 0.16+ uses strict
+    # dataclass fields; monkey-patching attributes onto VllmConfig is no longer safe).
+    vllm_config.additional_config["consolidator_endpoints"] = consolidator_endpoints
    factory = []
    if stat_logger:
@@ -646,13 +648,11 @@ async def init_prefill(
    consolidator_enabled = False
    consolidator_port = None
-    if (
+    _consolidator_eps = vllm_config.additional_config.get("consolidator_endpoints")
-        hasattr(vllm_config, "consolidator_endpoints")
+    if _consolidator_eps:
-        and vllm_config.consolidator_endpoints
-    ):
        # Extract connect endpoint (third element) for clients to subscribe
        # consolidator_endpoints = (vllm_endpoint, bind_endpoint, connect_endpoint)
-        consolidator_output_endpoint = vllm_config.consolidator_endpoints[2]
+        consolidator_output_endpoint = _consolidator_eps[2]
        consolidator_port = int(consolidator_output_endpoint.split(":")[-1])
        consolidator_enabled = True
@@ -831,13 +831,11 @@ async def init(
    consolidator_enabled = False
    consolidator_port = None
-    if (
+    _consolidator_eps = vllm_config.additional_config.get("consolidator_endpoints")
-        hasattr(vllm_config, "consolidator_endpoints")
+    if _consolidator_eps:
-        and vllm_config.consolidator_endpoints
-    ):
        # Extract connect endpoint (third element) for clients to subscribe
        # consolidator_endpoints = (vllm_endpoint, bind_endpoint, connect_endpoint)
-        consolidator_output_endpoint = vllm_config.consolidator_endpoints[2]
+        consolidator_output_endpoint = _consolidator_eps[2]
        consolidator_port = int(consolidator_output_endpoint.split(":")[-1])
        consolidator_enabled = True

--- a/components/src/dynamo/vllm/omni/base_handler.py
+++ b/components/src/dynamo/vllm/omni/base_handler.py
@@ -27,7 +27,6 @@ class BaseOmniHandler(BaseWorkerHandler):
    def __init__(
        self,
        runtime,
-        component,
        config,
        default_sampling_params: Dict[str, Any],
        shutdown_event: asyncio.Event | None = None,
@@ -36,7 +35,6 @@ class BaseOmniHandler(BaseWorkerHandler):
        Args:
            runtime: Dynamo distributed runtime.
-            component: Dynamo component handle.
            config: Parsed Config object from args.py.
            default_sampling_params: Default sampling parameters dict.
            shutdown_event: Optional asyncio event for graceful shutdown.
@@ -56,7 +54,6 @@ class BaseOmniHandler(BaseWorkerHandler):
        # TODO: Kv publishers not supported yet
        # TODO: Adapt to baseworker initialization pattern
        self.runtime = runtime
-        self.component = component
        self.default_sampling_params = default_sampling_params
        self.config = config
        self.model_max_len = config.engine_args.max_model_len

--- a/components/src/dynamo/vllm/tests/test_vllm_renderer_api.py
+++ b/components/src/dynamo/vllm/tests/test_vllm_renderer_api.py
@@ -144,7 +144,6 @@ class TestVllmRendererApi:
            "role",
            "content",
            "reasoning",
-            "reasoning_content",
            "tool_calls",
        }
        actual_fields = set(DeltaMessage.model_fields)
@@ -376,6 +375,7 @@ class TestVllmRendererApi:
            "trace_headers",
            "resumable",
            "external_req_id",
+            "reasoning_ended",
        )
        # vllm-omni monkey-patches EngineCoreRequest with an extra field
        # (only installed on amd64, not arm64)
@@ -401,6 +401,7 @@ class TestVllmRendererApi:
            "kv_transfer_params",
            "trace_headers",
            "num_cached_tokens",
+            "num_external_computed_tokens",
            "routed_experts",
            "num_nans_in_logits",
        )

--- a/container/context.yaml
+++ b/container/context.yaml
@@ -42,10 +42,10 @@ vllm:
  cuda13.0:
    base_image_tag: 25.11-cuda13.0-devel-ubuntu24.04
    runtime_image_tag: 13.0.2-runtime-ubuntu24.04
-  vllm_ref: v0.15.1
+  vllm_ref: v0.16.0
-  flashinf_ref: v0.6.1
+  flashinf_ref: v0.6.3
-  lmcache_ref: 0.3.13
+  lmcache_ref: 0.3.14
-  vllm_omni_ref: "0.14.0"
+  vllm_omni_ref: "v0.16.0rc1"
  max_jobs: "10"
  enable_media_ffmpeg: "false"
  enable_gpu_memory_service: "true"

--- a/container/deps/vllm/install_vllm.sh
+++ b/container/deps/vllm/install_vllm.sh
@@ -12,7 +12,7 @@
 set -euo pipefail
-VLLM_VER="0.15.1"
+VLLM_VER="0.16.0"
 VLLM_REF="v${VLLM_VER}"
 # Basic Configurations
@@ -24,9 +24,9 @@ INSTALLATION_DIR=/tmp
 TORCH_CUDA_ARCH_LIST="9.0;10.0" # For EP Kernels -- TODO: check if we need to add 12.0+PTX
 DEEPGEMM_REF=""
 CUDA_VERSION="12.9"
-FLASHINF_REF="v0.6.1"
+FLASHINF_REF="v0.6.3"
-LMCACHE_REF="0.3.13"
+LMCACHE_REF="0.3.14"
-VLLM_OMNI_REF="0.14.0"
+VLLM_OMNI_REF="v0.16.0rc1"
 while [[ $# -gt 0 ]]; do
    case $1 in
@@ -146,25 +146,46 @@ echo "✓ vLLM repository cloned"
 echo "\n=== Installing vLLM & FlashInfer ==="
+# Build GitHub release wheel URL per CUDA version
+# CUDA 12 wheels have no +cu suffix and use manylinux_2_31
+# CUDA 13 wheels have +cu130 suffix and use manylinux_2_35
 if [[ "$CUDA_VERSION_MAJOR" == "12" ]]; then
-    echo "Installing vLLM $VLLM_REF from PyPI..."
+    VLLM_GITHUB_WHEEL="vllm-${VLLM_VER}-cp38-abi3-manylinux_2_31_${ALT_ARCH}.whl"
-    uv pip install vllm[flashinfer,runai]==$VLLM_REF --torch-backend=${TORCH_BACKEND}
+    EXTRA_PIP_ARGS=""
-    uv pip install flashinfer-cubin==$FLASHINF_REF
-    uv pip install flashinfer-jit-cache==$FLASHINF_REF --extra-index-url https://flashinfer.ai/whl/${TORCH_BACKEND}
 elif [[ "$CUDA_VERSION_MAJOR" == "13" ]]; then
-    echo "⚠ Skipping LMCache on CUDA 13 env since LMCache doesn't support CUDA 13 "
+    VLLM_GITHUB_WHEEL="vllm-${VLLM_VER}+${TORCH_BACKEND}-cp38-abi3-manylinux_2_35_${ALT_ARCH}.whl"
-    echo "Installing vLLM $VLLM_REF from GitHub since CUDA 13 x86_64 wheel is only present on GitHub..."
+    EXTRA_PIP_ARGS="--index-strategy=unsafe-best-match --extra-index-url https://download.pytorch.org/whl/${TORCH_BACKEND}"
-    uv pip install \
-        --index-strategy=unsafe-best-match \
-        --extra-index-url https://download.pytorch.org/whl/${TORCH_BACKEND} \
-        https://github.com/vllm-project/vllm/releases/download/v${VLLM_VER}/vllm-${VLLM_VER}+${TORCH_BACKEND}-cp38-abi3-manylinux_2_35_${ALT_ARCH}.whl[flashinfer,runai] \
-        --torch-backend=${TORCH_BACKEND}
-    uv pip install flashinfer-cubin==$FLASHINF_REF
-    uv pip install flashinfer-jit-cache==$FLASHINF_REF --extra-index-url https://flashinfer.ai/whl/${TORCH_BACKEND}
 else
    echo "❌ Unsupported CUDA version for vLLM installation: ${CUDA_VERSION}"
    exit 1
 fi
+VLLM_GITHUB_URL="https://github.com/vllm-project/vllm/releases/download/v${VLLM_VER}/${VLLM_GITHUB_WHEEL}"
+# Install vLLM wheel
+# CUDA 12: Try PyPI first, fall back to GitHub release
+# CUDA 13: Always use GitHub release (PyPI only has cu12 wheels, --torch-backend
+#           does not prevent uv from resolving the cu12 variant)
+echo "Installing vLLM $VLLM_VER (torch backend: $TORCH_BACKEND)..."
+if [[ "$CUDA_VERSION_MAJOR" == "12" ]]; then
+    if uv pip install "vllm[flashinfer,runai]==${VLLM_VER}" ${EXTRA_PIP_ARGS} --torch-backend=${TORCH_BACKEND} 2>&1; then
+        echo "✓ vLLM ${VLLM_VER} installed from PyPI"
+    else
+        echo "⚠ PyPI install failed, installing from GitHub release..."
+        uv pip install ${EXTRA_PIP_ARGS} \
+            "${VLLM_GITHUB_URL}[flashinfer,runai]" \
+            --torch-backend=${TORCH_BACKEND}
+        echo "✓ vLLM ${VLLM_VER} installed from GitHub"
+    fi
+else
+    echo "Installing vLLM from GitHub release (cu130 wheel not available on PyPI)..."
+    uv pip install ${EXTRA_PIP_ARGS} \
+        "${VLLM_GITHUB_URL}[flashinfer,runai]" \
+        --torch-backend=${TORCH_BACKEND}
+    echo "✓ vLLM ${VLLM_VER} installed from GitHub"
+fi
+uv pip install flashinfer-cubin==$FLASHINF_REF
+uv pip install flashinfer-jit-cache==$FLASHINF_REF --extra-index-url https://flashinfer.ai/whl/${TORCH_BACKEND}
 echo "✓ vLLM installation completed"
 echo "\n=== Installing vLLM-Omni ==="
@@ -172,10 +193,19 @@ if [ -n "$VLLM_OMNI_REF" ] && [ "$ARCH" = "amd64" ]; then
    # Save original vllm entrypoint before vllm-omni overwrites it
    VLLM_BIN=$(which vllm)
    cp "$VLLM_BIN" /tmp/vllm-entrypoint-backup
-    uv pip install vllm-omni==${VLLM_OMNI_REF}
+    # Try PyPI first, fall back to building from source
+    if uv pip install vllm-omni==${VLLM_OMNI_REF#v} 2>&1; then
+        echo "✓ vLLM-Omni ${VLLM_OMNI_REF} installed from PyPI"
+    else
+        echo "⚠ PyPI install failed, building from source..."
+        git clone --depth 1 --branch ${VLLM_OMNI_REF} https://github.com/vllm-project/vllm-omni.git $INSTALLATION_DIR/vllm-omni
+        uv pip install $INSTALLATION_DIR/vllm-omni
+        rm -rf $INSTALLATION_DIR/vllm-omni
+        echo "✓ vLLM-Omni ${VLLM_OMNI_REF} installed from source"
+    fi
    # Restore original vllm CLI entrypoint (vllm-omni replaces it with its own)
    cp /tmp/vllm-entrypoint-backup "$VLLM_BIN"
-    echo "✓ vLLM-Omni ${VLLM_OMNI_REF} installed (original vllm entrypoint preserved)"
+    echo "✓ Original vllm entrypoint preserved"
 else
    echo "⚠ Skipping vLLM-Omni (no ref provided or ARM64 not supported)"
 fi

--- a/docs/pages/backends/vllm/vllm-omni.md
+++ b/docs/pages/backends/vllm/vllm-omni.md
@@ -10,6 +10,14 @@ Dynamo supports multimodal generation through the [vLLM-Omni](https://github.com
 This guide assumes familiarity with deploying Dynamo with vLLM as described in the [vLLM backend guide](/docs/pages/backends/vllm/README.md).
+### Installation
+Dynamo container images include vLLM-Omni pre-installed. If you are using `pip install ai-dynamo[vllm]`, vLLM-Omni is **not** included automatically because the matching release is not yet available on PyPI. Install it separately from source:
+```bash
+pip install git+https://github.com/vllm-project/vllm-omni.git@v0.16.0rc1
+```
 ## Supported Modalities
 | Modality | Endpoint(s) | `--output-modalities` |

--- a/docs/pages/reference/support-matrix.md
+++ b/docs/pages/reference/support-matrix.md
@@ -14,7 +14,7 @@ The following table shows the backend framework versions included with each Dyna
 | **Dynamo** | **SGLang** | **TensorRT-LLM** | **vLLM** | **NIXL** |
 | :--- | :--- | :--- | :--- | :--- |
-| **main (ToT)** | `0.5.9` | `1.3.0rc5` | `0.15.1` | `0.10.0` |
+| **main (ToT)** | `0.5.9` | `1.3.0rc5` | `0.16.0` | `0.10.0` |
 | **v1.0.0** *(in progress)* | `0.5.9` | `1.3.0rc5` | `0.15.1` | `0.10.1` |
 | **v0.9.1** | `0.5.8` | `1.3.0rc3` | `0.14.1` | `0.9.0` |
 | **v0.9.0** | `0.5.8` | `1.3.0rc1` | `0.14.1` | `0.9.0` |

--- a/lib/bindings/kvbm/python/kvbm/vllm_integration/connector_leader.py
+++ b/lib/bindings/kvbm/python/kvbm/vllm_integration/connector_leader.py
@@ -74,10 +74,8 @@ class KvConnectorLeader:
        consolidator_output_endpoint = None
        self._consolidator_output_port = None
-        if (
+        _consolidator_eps = vllm_config.additional_config.get("consolidator_endpoints")
-            hasattr(vllm_config, "consolidator_endpoints")
+        if _consolidator_eps:
-            and vllm_config.consolidator_endpoints
-        ):
            # Unpack all three endpoints
            # [0]: vllm_endpoint (for consolidator to subscribe to vLLM)
            # [1]: output_bind_endpoint (for consolidator to bind/publish)
@@ -86,7 +84,7 @@ class KvConnectorLeader:
                consolidator_vllm_endpoint,
                consolidator_output_endpoint,
                _consolidator_output_connect_endpoint,  # Not needed here
-            ) = vllm_config.consolidator_endpoints
+            ) = _consolidator_eps
            self._consolidator_output_port = int(
                consolidator_output_endpoint.split(":")[-1]
            )

--- a/pyproject.toml
+++ b/pyproject.toml
@@ -57,8 +57,11 @@ trtllm =[
 vllm = [
    "uvloop",
    "nixl[cu12]<=0.9.0",
-    "vllm[flashinfer,runai]==0.15.1",
+    "vllm[flashinfer,runai]==0.16.0",
-    "vllm-omni==0.14.0",
+    # vllm-omni 0.16.0rc1 is not on PyPI; installed from source in container builds
+    # (see container/deps/vllm/install_vllm.sh). pip install ai-dynamo[vllm] will
+    # not include vllm-omni — install it separately from source if needed.
+    # "vllm-omni==0.16.0rc1",
    "blake3>=1.0.0,<2.0.0",
 ]
@@ -188,6 +191,7 @@ filterwarnings = [
    "ignore:.*Exception ignored in.*:pytest.PytestUnraisableExceptionWarning", # Ignore unraisable exception warnings
    "ignore:The pynvml package is deprecated.*:FutureWarning", # Ignore pynvml deprecation warning, temporary until upstream library updates to nvidia-ml-py
    "ignore:The behavior of DataFrame concatenation with empty or all-NA entries is deprecated.*:FutureWarning", # pandas 2.x concat deprecation in AIC SDK TODO: fix in AIC
+    "ignore:Automatic KV events configuration is deprecated.*:FutureWarning", # Ignore Dynamo's own KV events deprecation warning in tests
    # Pydantic V2 deprecation warnings from TRTLLM dependencies (raised at import time during collection)
    "ignore:Support for class-based `config`.*:pydantic.warnings.PydanticDeprecatedSince20",
    "ignore:Using extra keyword arguments on `Field`.*:pydantic.warnings.PydanticDeprecatedSince20",