chore(deps): bump vLLM 0.16.0 → 0.17.1 (#7170)

Signed-off-by: Tzu-Ling <tzulingk@nvidia.com>

chore(deps): bump vLLM 0.16.0 → 0.17.1 (#7170)
Signed-off-by: Tzu-Ling <tzulingk@nvidia.com>
dba69e0f · Tzu-Ling Kan · GitHub · 17db1b6a · dba69e0f · dba69e0f
Unverified commit dba69e0f, authored Mar 12, 2026 by Tzu-Ling Kan; committed by GitHub on Mar 12, 2026
14 changed files
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -26,7 +26,7 @@ repos:
  - id: black
    types_or: [python, cython]
 - repo: https://github.com/PyCQA/flake8
-  rev: 5.0.4
+  rev: 7.3.0  # 5.0.4 crashes on Python 3.12+ (ast.Str removed)
  hooks:
  - id: flake8
    args: [--max-line-length=88, --select=C,E,F,W,B,B950, --extend-ignore = E203,E501]

--- a/components/src/dynamo/frontend/vllm_processor.py
+++ b/components/src/dynamo/frontend/vllm_processor.py
@@ -17,6 +17,7 @@ from vllm.config import CacheConfig, LoadConfig, ModelConfig, VllmConfig
 from vllm.inputs.data import TokensPrompt
 from vllm.reasoning import ReasoningParser, ReasoningParserManager
 from vllm.sampling_params import RequestOutputKind, SamplingParams
+from vllm.tasks import GENERATION_TASKS
 from vllm.tokenizers import TokenizerLike
 from vllm.tool_parsers import ToolParser, ToolParserManager
 from vllm.v1.engine import EngineCoreOutput, EngineCoreRequest, FinishReason
@@ -85,6 +86,21 @@ class VllmProcessor:
        self.tool_parser_class = tool_parser_class
        self.reasoning_parser_class = reasoning_parser_class
+    def _get_eos_token_ids(self) -> list[int]:
+        """Return EOS token ids using tokenizer metadata.
+        vLLM 0.17.0 removed EngineCoreRequest.eos_token_id, so Dynamo can no
+        longer read EOS ids from the preprocessed request object.
+        """
+        eos_token_ids = getattr(self.tokenizer, "eos_token_ids", None)
+        if eos_token_ids is not None and not isinstance(eos_token_ids, int):
+            return list(eos_token_ids)
+        eos_token_id = getattr(self.tokenizer, "eos_token_id", None)
+        if eos_token_id is None:
+            return []
+        return [eos_token_id]
    # Ideally we would map NVCreateChatCompletionRequest into Python so it can be type checked, but
    # it has a lot of fields.
    # request: dynamo.NVCreateChatCompletionRequest
@@ -130,7 +146,11 @@ class VllmProcessor:
            max_tokens=max_tokens,
        )
        # generation_config.json
+        # Skip eos_token_id: vLLM 0.17.0 made SamplingParams.eos_token_id a
+        # read-only property; eos tokens are handled via eos_token_ids below.
        for k, v in self.input_processor.generation_config_fields.items():
+            if k == "eos_token_id":
+                continue
            if hasattr(sampling_params, k):
                setattr(sampling_params, k, v)
@@ -174,17 +194,13 @@ class VllmProcessor:
            request_id,
            prompt_inputs,
            sampling_params,
-            # arrival_time: float | None = None,
+            GENERATION_TASKS,  # vLLM 0.17.0: required supported_tasks arg
-            # lora_request: LoRARequest | None = None,
-            # tokenization_kwargs: dict[str, Any] | None = None,
-            # trace_headers: Mapping[str, str] | None = None,
-            # priority: int = 0,
-            # data_parallel_rank: int | None = None,
        )
        InputProcessor.assign_request_id(vllm_preproc)
-        # Processed: EngineCoreRequest(request_id='a2b76a85cd65e151', prompt_token_ids=[3838, 374, 279, 6722, 315, 28649, 25510, 30], mm_features=None, sampling_params=SamplingParams(n=1, presence_penalty=0.0, frequency_penalty=0.0, repetition_penalty=1.0, temperature=1.0, top_p=1.0, top_k=0, min_p=0.0, seed=None, stop=[], stop_token_ids=[151643], bad_words=[], include_stop_str_in_output=False, ignore_eos=False, max_tokens=16, min_tokens=0, logprobs=None, prompt_logprobs=None, skip_special_tokens=True, spaces_between_special_tokens=True, truncate_prompt_tokens=None, structured_outputs=None, extra_args=None), pooling_params=None, eos_token_id=151645, arrival_time=1769036937.9417946, lora_request=None, cache_salt=None, data_parallel_rank=None, prompt_embeds=None, client_index=0, current_wave=0, priority=0, trace_headers=None)
+        # vLLM 0.17.0 removed EngineCoreRequest.eos_token_id. Dynamo now uses
+        # tokenizer metadata for EOS ids when constructing the router payload.
        # Convert to a Python object that has fields that match our PreprocessedRequest
        sp = vllm_preproc.sampling_params
@@ -229,11 +245,7 @@ class VllmProcessor:
                "prompt_logprobs": sp.prompt_logprobs,
                "skip_special_tokens": sp.skip_special_tokens,
            },
-            "eos_token_ids": (
+            "eos_token_ids": self._get_eos_token_ids(),
-                [vllm_preproc.eos_token_id]
-                if vllm_preproc.eos_token_id is not None
-                else []
-            ),
            "annotations": [],
        }

--- a/components/src/dynamo/vllm/main.py
+++ b/components/src/dynamo/vllm/main.py
@@ -13,7 +13,6 @@ import uvloop
 from prometheus_client import REGISTRY, CollectorRegistry, multiprocess
 from vllm.config import VllmConfig
 from vllm.distributed.kv_events import ZmqEventPublisher
-from vllm.entrypoints.cli.serve import run_headless
 from vllm.usage.usage_lib import UsageContext
 from vllm.v1.engine.async_llm import AsyncLLM
 from vllm.v1.metrics.prometheus import setup_multiprocess_prometheus
@@ -91,6 +90,10 @@ def run_dynamo_headless(config: Config) -> None:
    Secondary nodes spawn vLLM workers only — no engine core, no scheduler,
    no Dynamo endpoints. Bypasses DistributedRuntime entirely (no NATS/etcd).
    """
+    # Keep the upstream CLI import local so tests that only exercise
+    # build_headless_namespace() do not pull in vLLM's full CLI import graph.
+    from vllm.entrypoints.cli.serve import run_headless
    args = build_headless_namespace(config)
    run_headless(args)

--- a/components/src/dynamo/vllm/tests/test_vllm_engine_monitor_stats.py
+++ b/components/src/dynamo/vllm/tests/test_vllm_engine_monitor_stats.py
@@ -14,6 +14,7 @@ pytestmark = [
    pytest.mark.unit,
    pytest.mark.vllm,
    pytest.mark.pre_merge,
+    pytest.mark.gpu_0,
 ]

--- a/components/src/dynamo/vllm/tests/test_vllm_kv_events_api.py
+++ b/components/src/dynamo/vllm/tests/test_vllm_kv_events_api.py
@@ -9,9 +9,18 @@ These tests check that the vLLM KV events classes have the expected fields
 that our Rust deserializers depend on. If vLLM changes their API, these tests
 will fail early, before hitting runtime deserialization errors.
-The Rust code in kv_router/publisher.rs and kv_consolidator/subscriber.rs
+This test is the early warning for vLLM KV-event wire-format changes.
-deserializes vLLM's msgpack-encoded KV events. Since vLLM uses msgspec with
-array_like=True, the field ORDER matters - fields are serialized positionally.
+In the normal case, if this fails, update `lib/kv-router/src/zmq_wire.rs` to
+match the new upstream vLLM event shape, then update this test.
+That file is Dynamo's compatibility layer for vLLM KV events:
+- it decodes vLLM's msgpack `array_like=True` wire format
+- it handles field order changes in `BlockStored` / `BlockRemoved` / `EventBatch`
+- it translates upstream `extra_keys` into Dynamo's internal `block_mm_infos`
+Only touch consolidator files if we explicitly need the consolidator publisher
+to preserve and republish a new upstream field.
 """
 import importlib
@@ -51,6 +60,7 @@ class TestVllmKvEventsApi:
        5. lora_id
        6. medium
        7. lora_name (added in vLLM 0.14.0)
+        8. extra_keys (added in vLLM 0.17.0)
        If vLLM adds/removes/reorders fields, this test will fail.
        """
@@ -62,6 +72,7 @@ class TestVllmKvEventsApi:
            "lora_id",
            "medium",
            "lora_name",
+            "extra_keys",
        )
        actual_fields = BlockStored.__struct_fields__
@@ -69,9 +80,10 @@ class TestVllmKvEventsApi:
            f"BlockStored fields changed!\n"
            f"Expected: {expected_fields}\n"
            f"Actual:   {actual_fields}\n"
-            f"If vLLM changed the API, update the Rust deserializers in:\n"
+            f"Required follow-up:\n"
-            f"  - lib/llm/src/kv_router/publisher.rs (RawKvEvent::BlockStored)\n"
+            f"  - Update lib/kv-router/src/zmq_wire.rs to match the new BlockStored wire format.\n"
-            f"  - lib/llm/src/block_manager/kv_consolidator/subscriber.rs (VllmRawEvent::BlockStored)"
+            f"  - Update this test's expected_fields and msgpack position checks.\n"
+            f"  - If needed, add or update a regression test in lib/llm/src/kv_router/publisher.rs."
        )
    def test_block_removed_fields(self):
@@ -86,7 +98,9 @@ class TestVllmKvEventsApi:
            f"BlockRemoved fields changed!\n"
            f"Expected: {expected_fields}\n"
            f"Actual:   {actual_fields}\n"
-            f"If vLLM changed the API, update the Rust deserializers."
+            f"Required follow-up:\n"
+            f"  - Update lib/kv-router/src/zmq_wire.rs RawKvEvent::BlockRemoved seq deserializer.\n"
+            f"  - Update this test's expected_fields."
        )
    def test_event_batch_fields(self):
@@ -101,7 +115,11 @@ class TestVllmKvEventsApi:
        assert actual_fields == expected_fields, (
            f"EventBatch fields changed!\n"
            f"Expected: {expected_fields}\n"
-            f"Actual:   {actual_fields}"
+            f"Actual:   {actual_fields}\n"
+            f"Required follow-up:\n"
+            f"  - Update lib/kv-router/src/zmq_wire.rs KvEventBatch Deserialize impl.\n"
+            f"  - Update subscriber.rs VllmEventBatch tuple if batch field order changes.\n"
+            f"  - Update this test's expected_fields."
        )
    def test_kv_cache_event_uses_array_like(self):
@@ -148,6 +166,7 @@ class TestVllmKvEventsApi:
            lora_id=None,
            medium="GPU",
            lora_name=None,
+            extra_keys=None,
        )
        encoded = msgspec.msgpack.encode(event)
@@ -159,9 +178,9 @@ class TestVllmKvEventsApi:
            decoded[0] == "BlockStored"
        ), f"Expected tag 'BlockStored', got {decoded[0]}"
-        # Verify field count (tag + 7 fields = 8 elements)
+        # Verify field count (tag + 8 fields = 9 elements)
-        assert len(decoded) == 8, (
+        assert len(decoded) == 9, (
-            f"Expected 8 elements (tag + 7 fields), got {len(decoded)}.\n"
+            f"Expected 9 elements (tag + 8 fields), got {len(decoded)}.\n"
            f"Decoded: {decoded}\n"
            f"If field count changed, update Rust deserializers."
        )
@@ -174,3 +193,4 @@ class TestVllmKvEventsApi:
        assert decoded[5] is None, f"lora_id at wrong position: {decoded[5]}"
        assert decoded[6] == "GPU", f"medium at wrong position: {decoded[6]}"
        assert decoded[7] is None, f"lora_name at wrong position: {decoded[7]}"
+        assert decoded[8] is None, f"extra_keys at wrong position: {decoded[8]}"
--- a/components/src/dynamo/vllm/tests/test_vllm_logging.py
+++ b/components/src/dynamo/vllm/tests/test_vllm_logging.py
@@ -37,6 +37,7 @@ pytestmark = [
    pytest.mark.unit,
    pytest.mark.vllm,
    pytest.mark.pre_merge,
+    pytest.mark.gpu_0,
 ]

--- a/components/src/dynamo/vllm/tests/test_vllm_renderer_api.py
+++ b/components/src/dynamo/vllm/tests/test_vllm_renderer_api.py
@@ -271,8 +271,9 @@ class TestVllmRendererApi:
        input_processor.renderer to preprocess_chat_request.
        VllmProcessor iterates input_processor.generation_config_fields.
        """
-        assert hasattr(InputProcessor, "renderer"), (
+        init_source = inspect.getsource(InputProcessor.__init__)
-            "InputProcessor no longer has 'renderer' attribute/property; "
+        assert "self.renderer" in init_source, (
+            "InputProcessor.__init__ no longer initializes 'renderer'; "
            "update preprocess_chat_request call in "
            "components/src/dynamo/frontend/vllm_processor.py"
        )
@@ -363,7 +364,6 @@ class TestVllmRendererApi:
            "mm_features",
            "sampling_params",
            "pooling_params",
-            "eos_token_id",
            "arrival_time",
            "lora_request",
            "cache_salt",

--- a/container/context.yaml
+++ b/container/context.yaml
@@ -40,22 +40,22 @@ vllm:
    runtime_image: nvcr.io/nvidia/cuda
    base_image_tag: 25.06-cuda12.9-devel-ubuntu24.04
    runtime_image_tag: 12.9.1-runtime-ubuntu24.04
-    vllm_ref: v0.16.0
+    vllm_ref: v0.17.1
  cuda13.0:
    base_image: nvcr.io/nvidia/cuda-dl-base
    runtime_image: nvcr.io/nvidia/cuda
    base_image_tag: 25.11-cuda13.0-devel-ubuntu24.04
    runtime_image_tag: 13.0.2-runtime-ubuntu24.04
-    vllm_ref: v0.16.0
+    vllm_ref: v0.17.1
  xpu:
    base_image: intel/deep-learning-essentials
    runtime_image: intel/deep-learning-essentials
    base_image_tag: 2025.3.2-0-devel-ubuntu24.04
    runtime_image_tag: 2025.3.2-0-devel-ubuntu24.04
    vllm_ref: v0.14.0
-  flashinf_ref: v0.6.3
+  flashinf_ref: v0.6.4
-  lmcache_ref: 0.3.14
+  lmcache_ref: 0.4.1
-  vllm_omni_ref: "v0.16.0rc1"
+  vllm_omni_ref: "v0.16.0"
  max_jobs: "10"
  enable_media_ffmpeg: "false"
  enable_gpu_memory_service: "true"

--- a/container/deps/vllm/install_vllm.sh
+++ b/container/deps/vllm/install_vllm.sh
@@ -4,15 +4,15 @@
 # This script installs vLLM and its dependencies (release versions only): most come from PyPI, but LMCache is built from source on CUDA 12 / amd64.
 # Installation order:
-# 1. LMCache (installed first so vLLM's dependencies take precedence)
+# 1. vLLM
-# 2. vLLM
+# 2. LMCache (built from source AFTER vLLM so c_ops.so is compiled against installed PyTorch)
 # 3. vLLM-Omni
 # 4. DeepGEMM
 # 5. EP kernels
 set -euo pipefail
-VLLM_VER="0.16.0"
+VLLM_VER="0.17.1"
 VLLM_REF="v${VLLM_VER}"
 DEVICE="cuda"
@@ -25,9 +25,9 @@ INSTALLATION_DIR=/tmp
 TORCH_CUDA_ARCH_LIST="9.0;10.0" # For EP Kernels -- TODO: check if we need to add 12.0+PTX
 DEEPGEMM_REF=""
 CUDA_VERSION="12.9"
-FLASHINF_REF="v0.6.3"
+FLASHINF_REF="v0.6.4"
-LMCACHE_REF="0.3.14"
+LMCACHE_REF="0.4.1"
-VLLM_OMNI_REF="v0.16.0rc1"
+VLLM_OMNI_REF="v0.16.0"
 while [[ $# -gt 0 ]]; do
    case $1 in
@@ -133,30 +133,6 @@ elif [ "$DEVICE" = "xpu" ]; then
    echo "  VLLM_REF=$VLLM_REF | ARCH=$ARCH | INSTALLATION_DIR=$INSTALLATION_DIR"
 fi
-if [ "$DEVICE" = "cuda" ]; then
-    if [[ "$CUDA_VERSION_MAJOR" == "12" ]]; then
-        echo "  FLASHINF_REF=$FLASHINF_REF | LMCACHE_REF=$LMCACHE_REF | DEEPGEMM_REF=$DEEPGEMM_REF"
-        echo "\n=== Installing LMCache ==="
-        if [ "$ARCH" = "amd64" ]; then
-            # LMCache installation currently fails on arm64 due to CUDA dependency issues
-            # Install LMCache BEFORE vLLM so vLLM's dependencies take precedence
-            uv pip install lmcache==${LMCACHE_REF} --torch-backend=${TORCH_BACKEND}
-            echo "✓ LMCache ${LMCACHE_REF} installed"
-        else
-            echo "⚠ Skipping LMCache on ARM64 (compatibility issues)"
-        fi
-    else
-        echo "  FLASHINF_REF=$FLASHINF_REF | LMCache will not be installed as it doesn't support CUDA 13 yet | DEEPGEMM_REF=$DEEPGEMM_REF"
-    fi
-elif [ "$DEVICE" = "xpu" ]; then
-    echo " LMCACHE_REF=$LMCACHE_REF "
-    echo "\n=== Installing LMCache ==="
-    if [ "$ARCH" = "amd64" ]; then
-        uv pip install lmcache==${LMCACHE_REF}
-        echo "✓ LMCache ${LMCACHE_REF} installed"
-    fi
-fi
 echo "\n=== Cloning vLLM repository ==="
 # Clone needed for DeepGEMM and EP kernels install scripts
 cd $INSTALLATION_DIR
@@ -217,6 +193,40 @@ if [ "$DEVICE" = "cuda" ]; then
 fi
 echo "✓ vLLM installation completed"
+echo "\n=== Installing LMCache from source ==="
+# LMCache prebuilt wheels are built against PyTorch <=2.8.0 and fail with PyTorch 2.10+
+# (undefined symbol: c10::cuda::c10_cuda_check_implementation).
+# Build from source AFTER vLLM so c_ops.so compiles against the installed PyTorch.
+# Ref: https://docs.lmcache.ai/getting_started/installation.html#install-latest-lmcache-from-source
+if [ "$DEVICE" = "cuda" ] && [[ "$CUDA_VERSION_MAJOR" == "12" ]] && [ "$ARCH" = "amd64" ]; then
+    git clone --depth 1 --branch v${LMCACHE_REF} https://github.com/LMCache/LMCache.git ${INSTALLATION_DIR}/lmcache
+    cd ${INSTALLATION_DIR}/lmcache
+    uv pip install -r requirements/build.txt
+    # Get torch lib dir and embed it as RPATH so c_ops.so finds torch libs at runtime
+    TORCH_LIB=$(python3 -c "import torch, os; print(os.path.dirname(torch.__file__) + '/lib')")
+    # Build from source with --no-build-isolation (uses installed torch) + RPATH for runtime linking
+    TORCH_CUDA_ARCH_LIST="8.0;8.6;8.9;9.0;10.0+PTX" LDFLAGS="-Wl,-rpath,${TORCH_LIB}" \
+        uv pip install --no-build-isolation --no-cache .
+    # Verify c_ops.so was compiled (cannot import at build time without GPU/CUDA driver)
+    # cd to neutral dir so Python finds installed lmcache, not the source checkout
+    cd /tmp
+    LMCACHE_DIR=$(python3 -c "import lmcache, os; print(os.path.dirname(lmcache.__file__))")
+    if ls "${LMCACHE_DIR}"/c_ops*.so > /dev/null 2>&1; then
+        echo "✓ lmcache c_ops.so verified: $(ls ${LMCACHE_DIR}/c_ops*.so | head -1 | xargs basename)"
+    else
+        echo "ERROR: c_ops.so not found in ${LMCACHE_DIR} - CUDA extension was not compiled"
+        exit 1
+    fi
+    rm -rf ${INSTALLATION_DIR}/lmcache
+    echo "✓ LMCache ${LMCACHE_REF} installed from source"
+elif [ "$DEVICE" = "xpu" ] && [ "$ARCH" = "amd64" ]; then
+    uv pip install lmcache==${LMCACHE_REF}
+    echo "✓ LMCache ${LMCACHE_REF} installed from PyPI (XPU)"
+else
+    echo "⚠ Skipping LMCache (ARM64 or CUDA 13 not supported)"
+fi
 echo "\n=== Installing vLLM-Omni ==="
 if [ -n "$VLLM_OMNI_REF" ] && [ "$ARCH" = "amd64" ]; then
    # Save original vllm entrypoint before vllm-omni overwrites it

--- a/pyproject.toml
+++ b/pyproject.toml
@@ -50,11 +50,11 @@ trtllm =[
 vllm = [
    "uvloop",
    "nixl[cu12]<=0.10.1",
-    "vllm[flashinfer,runai]==0.16.0",
+    "vllm[flashinfer,runai]==0.17.1",
-    # vllm-omni 0.16.0rc1 is not on PyPI; installed from source in container builds
+    # vllm-omni 0.16.0 is now on PyPI and is pinned below, so pip install ai-dynamo[vllm]
    # includes it. Only future rc builds need to be installed from source in container
    # builds (see container/deps/vllm/install_vllm.sh).
-    # "vllm-omni==0.16.0rc1",
+    "vllm-omni==0.16.0",
    "blake3>=1.0.0,<2.0.0",
 ]

--- a/tests/frontend/common.py
+++ b/tests/frontend/common.py
@@ -2,6 +2,7 @@
 #  SPDX-License-Identifier: Apache-2.0
 import importlib
+import importlib.util
 def check_module_available(module_name: str) -> bool:

--- a/tests/frontend/test_prepost.py
+++ b/tests/frontend/test_prepost.py
@@ -1553,7 +1553,6 @@ def sampling_params():
        prompt_logprobs=None,
        skip_special_tokens=False,
        spaces_between_special_tokens=True,
-        truncate_prompt_tokens=None,
    )

--- a/tests/frontend/test_prepost_mistral.py
+++ b/tests/frontend/test_prepost_mistral.py
@@ -61,7 +61,15 @@ THINK_END_TOKEN_ID = 8
 class _InnerTokenizer:
    """Mimics the inner ``tokenizer.tokenizer`` accessed by MistralReasoningParser."""
+    def get_special_token(self, token):
+        # vLLM 0.17.0 renamed get_control_token -> get_special_token
+        return self._token_lookup(token)
    def get_control_token(self, token):
+        # kept for older vLLM compat
+        return self._token_lookup(token)
+    def _token_lookup(self, token):
        return {
            SpecialTokens.begin_think: THINK_START_TOKEN_ID,
            SpecialTokens.end_think: THINK_END_TOKEN_ID,
@@ -537,7 +545,6 @@ def sampling_params():
        prompt_logprobs=None,
        skip_special_tokens=True,
        spaces_between_special_tokens=True,
-        truncate_prompt_tokens=None,
    )

--- a/tests/kvbm_integration/test_kvbm_vllm_integration.py
+++ b/tests/kvbm_integration/test_kvbm_vllm_integration.py
@@ -272,7 +272,6 @@ def test_request_interface():
        prompt_token_ids=[1, 2, 3],
        sampling_params=SamplingParams(max_tokens=10),
        pooling_params=None,
-        eos_token_id=100,
        lora_request=LoRARequest(
            lora_name="test_lora", lora_int_id=1, lora_path="test_path"
        ),