chore: Bump vLLM to 0.18.0 (#7584)

3a1561fb · Alec · GitHub · aaa8a567 · 3a1561fb · 3a1561fb
Unverified commit 3a1561fb, authored Mar 23, 2026 by Alec; committed by GitHub on Mar 24, 2026
8 changed files
--- a/container/context.yaml
+++ b/container/context.yaml
@@ -40,13 +40,13 @@ vllm:
    runtime_image: nvcr.io/nvidia/cuda
    base_image_tag: 25.06-cuda12.9-devel-ubuntu24.04
    runtime_image_tag: 12.9.1-runtime-ubuntu24.04
-    vllm_ref: v0.17.1
+    vllm_ref: v0.18.0
  cuda13.0:
    base_image: nvcr.io/nvidia/cuda-dl-base
    runtime_image: nvcr.io/nvidia/cuda
    base_image_tag: 25.11-cuda13.0-devel-ubuntu24.04
    runtime_image_tag: 13.0.2-runtime-ubuntu24.04
-    vllm_ref: v0.17.1
+    vllm_ref: v0.18.0
  xpu:
    base_image: intel/deep-learning-essentials
    runtime_image: intel/deep-learning-essentials
@@ -59,8 +59,8 @@ vllm:
    base_image_tag: 24.04
    runtime_image_tag: 24.04
    vllm_ref: v0.16.0
-  flashinf_ref: v0.6.4
+  flashinf_ref: v0.6.6
-  lmcache_ref: 0.4.1
+  lmcache_ref: 0.4.2
  vllm_omni_ref: "v0.16.0"
  max_jobs: "10"
  enable_media_ffmpeg: "false"

--- a/container/deps/requirements.test.txt
+++ b/container/deps/requirements.test.txt
@@ -14,7 +14,7 @@ kr8s==0.20.13
 kubernetes_asyncio==32.0.0
 matplotlib==3.10.7
 matplotlib-stubs
-mistral-common==1.9.1
+mistral-common>=1.10.0
 mypy==1.18.2
 # For NATS object store verification in router tests
 nats-py==2.12.0

--- a/container/deps/vllm/install_vllm.sh
+++ b/container/deps/vllm/install_vllm.sh
@@ -12,7 +12,7 @@
 set -euo pipefail
-VLLM_VER="0.17.1"
+VLLM_VER="0.18.0"
 VLLM_REF="v${VLLM_VER}"
 DEVICE="cuda"
@@ -25,8 +25,8 @@ INSTALLATION_DIR=/tmp
 TORCH_CUDA_ARCH_LIST="9.0;10.0" # For EP Kernels -- TODO: check if we need to add 12.0+PTX
 DEEPGEMM_REF=""
 CUDA_VERSION="12.9"
-FLASHINF_REF="v0.6.4"
+FLASHINF_REF="v0.6.6"
-LMCACHE_REF="0.4.1"
+LMCACHE_REF="0.4.2"
 VLLM_OMNI_REF="v0.16.0"
 while [[ $# -gt 0 ]]; do
@@ -208,37 +208,6 @@ if [ "$DEVICE" = "cpu" ]; then
 fi
 echo "✓ vLLM installation completed"
-# Apply hotfix for multi-node TP init ordering (vLLM PR #35892).
-# Only applies to vLLM 0.17.1 — fail loudly on any other version so the
-# patch + this block get cleaned up when vLLM is bumped.
-# Note: In Docker builds the script is copied to /tmp but deps are bind-mounted
-# at /tmp/deps, so resolve the patch relative to BASH_SOURCE first, then fall
-# back to the bind-mount path.
-SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
-VLLM_PATCH="${SCRIPT_DIR}/multinode-tp-init-order.patch"
-if [ ! -f "$VLLM_PATCH" ]; then
-    VLLM_PATCH="/tmp/deps/vllm/multinode-tp-init-order.patch"
-fi
-if [ "$VLLM_VER" = "0.17.1" ]; then
-    # Patch the cloned repo (used by CPU/XPU source builds that build from source)
-    echo "Applying vLLM multi-node TP hotfix to cloned repo..."
-    git -C "${INSTALLATION_DIR}/vllm" apply --ignore-whitespace "$VLLM_PATCH"
-    # Also patch site-packages if vLLM was installed from a wheel (CUDA builds).
-    # Skip if the installed location is the clone itself (already patched above).
-    # Use `uv pip show` instead of importing vllm to avoid pulling in torch/CUDA.
-    VLLM_SITE=$(uv pip show vllm 2>/dev/null | grep -i '^Location:' | awk '{print $2}')
-    if [ -n "$VLLM_SITE" ] && [ "$VLLM_SITE" != "${INSTALLATION_DIR}/vllm" ]; then
-        echo "Applying vLLM multi-node TP hotfix to ${VLLM_SITE}..."
-        patch -d "$VLLM_SITE" -p1 < "$VLLM_PATCH"
-    fi
-    echo "✓ vLLM multi-node TP hotfix applied"
-else
-    echo "❌ ERROR: vLLM version is ${VLLM_VER}, not 0.17.1."
-    echo "   The multi-node TP hotfix patch (multinode-tp-init-order.patch) and"
-    echo "   this block in install_vllm.sh are no longer needed — please remove them."
-    exit 1
-fi
 echo "\n=== Installing LMCache from source ==="
 # LMCache prebuilt wheels are built against PyTorch <=2.8.0 and fail with PyTorch 2.10+
 # (undefined symbol: c10::cuda::c10_cuda_check_implementation).

--- a/container/deps/vllm/multinode-tp-init-order.patch
+++ b/container/deps/vllm/multinode-tp-init-order.patch
-# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
-# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-# SPDX-License-Identifier: Apache-2.0
-#
-# Hotfix for vLLM 0.17.1: multi-node TP init ordering
-# Upstream fix: https://github.com/vllm-project/vllm/commit/86e1060b
-# Upstream PR:  https://github.com/vllm-project/vllm/pull/35892
-# Issue:        https://github.com/vllm-project/vllm/issues/36389
-#
-# WorkerProc.__init__ calls _init_message_queues() before init_device(),
-# but the former needs _INNER_DP_WORLD which the latter creates.
-# Move _init_message_queues() after init_device()+load_model().
-#
-# Remove this patch once vLLM >= 0.17.2 is adopted.
-diff --git a/vllm/v1/executor/multiproc_executor.py b/vllm/v1/executor/multiproc_executor.py
-index e3376ba..39a3646 100644
--- a/vllm/v1/executor/multiproc_executor.py
-+++ b/vllm/v1/executor/multiproc_executor.py
-@@ -586,7 +586,6 @@ class WorkerProc:
-         )
-         # Load model
-        self._init_message_queues(input_shm_handle, vllm_config)
-         is_eep_new_worker = envs.VLLM_ELASTIC_EP_SCALE_UP_LAUNCH
-         if not is_eep_new_worker:
-             self.worker.init_device()
-@@ -596,6 +595,10 @@ class WorkerProc:
-             )
-             self.worker.load_model()
-+        # Initialize message queues after init_device() since multi-node setups
-+        # (nnodes_within_dp > 1) require distributed groups to be initialized
-+        self._init_message_queues(input_shm_handle, vllm_config)
-+
-         # Enable environment variable cache (e.g. assume no more
-         # environment variable overrides after this point)
-         enable_envs_cache()
--- a/docs/features/multimodal/embedding-cache.md
+++ b/docs/features/multimodal/embedding-cache.md
@@ -23,11 +23,11 @@ If your workload consists entirely of unique images, the cache provides no benef
 | Backend | Aggregated | Disaggregated (E/PD) | Notes |
 |---------|------------|----------------------|-------|
-| **vLLM** | ✅* | ✅ | Aggregated uses vLLM-native `ec_both`; disaggregated uses Dynamo `EmbeddingCacheManager` |
+| **vLLM** | ✅ | ✅ | Aggregated uses vLLM-native `ec_both`; disaggregated uses Dynamo `EmbeddingCacheManager` |
 | **TRT-LLM** | ❌ | ✅ | Dynamo `MultimodalEmbeddingCacheManager` in PD worker |
 | **SGLang** | ❌ | ❌ | Not supported yet |
-*Requires an upcoming version of vLLM that has not yet been released. Support will be available once the new vLLM release is published.
+Aggregated vLLM support (vLLM-native `ec_both`) requires vLLM `0.18.0` or newer.
 ## How It Works

--- a/docs/features/multimodal/multimodal-kv-routing.md
+++ b/docs/features/multimodal/multimodal-kv-routing.md
@@ -25,11 +25,11 @@ Without MM-aware routing, the standard router treats image token blocks as opaqu
 | Backend | Supported | Notes |
 |---------|-----------|-------|
-| **vLLM** | ✅* | Requires vLLM with KV events `extra_keys` support ([PR #33304](https://github.com/vllm-project/vllm/pull/33304)) |
+| **vLLM** | ✅ | Requires vLLM with KV events `extra_keys` support ([PR #33304](https://github.com/vllm-project/vllm/pull/33304)) |
 | **TRT-LLM** | ✅ | Requires `--publish-events-and-metrics` on TRT-LLM workers |
 | **SGLang** | ❌ | Not supported yet |
-*Requires an upcoming version of vLLM that has not yet been released. Support will be available once the new vLLM release is published.
+vLLM support requires vLLM `0.18.0` or newer.
 ## How It Works

--- a/docs/reference/support-matrix.md
+++ b/docs/reference/support-matrix.md
@@ -29,7 +29,7 @@ The following table shows the backend framework versions included with each Dyna
 | **Dynamo** | **SGLang** | **TensorRT-LLM** | **vLLM** | **NIXL** |
 | :--- | :--- | :--- | :--- | :--- |
-| **main (ToT)** | `0.5.9` | `1.3.0rc8` | `0.17.1` | `0.10.1` |
+| **main (ToT)** | `0.5.9` | `1.3.0rc8` | `0.18.0` | `0.10.1` |
 | **v1.1.0-dev.1** *(experimental)* | `0.5.9` | `1.3.0rc5.post1` | `0.17.1` | `0.10.1` |
 | **v1.0.1** | `0.5.9` | `1.3.0rc5.post1` | `0.16.0` | `0.10.1` |
 | **v1.0.0** | `0.5.9` | `1.3.0rc5.post1` | `0.16.0` | `0.10.1` |

--- a/pyproject.toml
+++ b/pyproject.toml
@@ -50,7 +50,7 @@ trtllm =[
 vllm = [
    "uvloop",
    "nixl[cu12]<=0.10.1",
-    "vllm[flashinfer,runai,otel]==0.17.1",
+    "vllm[flashinfer,runai,otel]==0.18.0",
    # vllm-omni 0.16.0 is now on PyPI; install only future rc builds from source in container builds
    # (see container/deps/vllm/install_vllm.sh). pip install ai-dynamo[vllm] will
    # not include vllm-omni — install it separately from source if needed.