chore(deps): bump vLLM to 0.14.0 (#5593)

Signed-off-by: alec-flowers <aflowers@nvidia.com> Signed-off-by: Alec Flowers <aflowers@nvidia.com> Signed-off-by: Alec <35311602+alec-flowers@users.noreply.github.com> Co-authored-by: Claude Opus 4.5 <noreply@anthropic.com>

chore(deps): bump vLLM to 0.14.0 (#5593)
Signed-off-by: alec-flowers <aflowers@nvidia.com> Signed-off-by: Alec Flowers <aflowers@nvidia.com> Signed-off-by: Alec <35311602+alec-flowers@users.noreply.github.com> Co-authored-by: Claude Opus 4.5 <noreply@anthropic.com>
50f1e0e1 · Alec · GitHub · 67c868e7 · 50f1e0e1 · 50f1e0e1
Unverified Commit 50f1e0e1 authored Jan 23, 2026 by Alec Committed by GitHub Jan 24, 2026
9 changed files
--- a/components/src/dynamo/vllm/tests/test_vllm_kv_events_api.py
+++ b/components/src/dynamo/vllm/tests/test_vllm_kv_events_api.py
+#!/usr/bin/env python3
+# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+
+"""
+Unit tests to verify vLLM KV events API compatibility.
+
+These tests check that the vLLM KV events classes have the expected fields
+that our Rust deserializers depend on. If vLLM changes their API, these tests
+will fail early, before hitting runtime deserialization errors.
+
+The Rust code in kv_router/publisher.rs and kv_consolidator/subscriber.rs
+deserializes vLLM's msgpack-encoded KV events. Since vLLM uses msgspec with
+array_like=True, the field ORDER matters - fields are serialized positionally.
+"""
+
+import importlib
+
+import pytest
+
+# Load the top-level ``vllm`` package explicitly before resolving the
+# ``kv_events`` submodule; this sidesteps potential problems with pytest's
+# import machinery.
+_vllm = importlib.import_module("vllm")
+_kv_events = importlib.import_module("vllm.distributed.kv_events")
+
+# Bind the event classes exercised by the tests to module-level names.
+BlockStored, BlockRemoved, EventBatch, KVCacheEvent = (
+    _kv_events.BlockStored,
+    _kv_events.BlockRemoved,
+    _kv_events.EventBatch,
+    _kv_events.KVCacheEvent,
+)
+
+# Module-wide markers: every test in this file is tagged ``vllm`` and ``unit``.
+pytestmark = [pytest.mark.vllm, pytest.mark.unit]
+
+
+class TestVllmKvEventsApi:
+    """Guard the vLLM KV event schema that the Rust deserializers rely on."""
+
+    def test_block_stored_fields(self):
+        """Check the names and the order of the BlockStored struct fields.
+
+        Positional layout expected by the Rust deserializer:
+        1. block_hashes
+        2. parent_block_hash
+        3. token_ids
+        4. block_size
+        5. lora_id
+        6. medium
+        7. lora_name (added in vLLM 0.14.0)
+
+        Any field that vLLM adds, removes, or reorders makes this test fail.
+        """
+        actual_fields = BlockStored.__struct_fields__
+        expected_fields = (
+            "block_hashes",
+            "parent_block_hash",
+            "token_ids",
+            "block_size",
+            "lora_id",
+            "medium",
+            "lora_name",
+        )
+
+        assert expected_fields == actual_fields, (
+            f"BlockStored fields changed!\n"
+            f"Expected: {expected_fields}\n"
+            f"Actual:   {actual_fields}\n"
+            f"If vLLM changed the API, update the Rust deserializers in:\n"
+            f"  - lib/llm/src/kv_router/publisher.rs (RawKvEvent::BlockStored)\n"
+            f"  - lib/llm/src/block_manager/kv_consolidator/subscriber.rs (VllmRawEvent::BlockStored)"
+        )
+
+    def test_block_removed_fields(self):
+        """Check the names and the order of the BlockRemoved struct fields."""
+        actual_fields = BlockRemoved.__struct_fields__
+        expected_fields = ("block_hashes", "medium")
+
+        assert expected_fields == actual_fields, (
+            f"BlockRemoved fields changed!\n"
+            f"Expected: {expected_fields}\n"
+            f"Actual:   {actual_fields}\n"
+            f"If vLLM changed the API, update the Rust deserializers."
+        )
+
+    def test_event_batch_fields(self):
+        """Check the names and the order of the EventBatch struct fields."""
+        actual_fields = EventBatch.__struct_fields__
+        expected_fields = ("ts", "events", "data_parallel_rank")
+
+        assert expected_fields == actual_fields, (
+            f"EventBatch fields changed!\n"
+            f"Expected: {expected_fields}\n"
+            f"Actual:   {actual_fields}"
+        )
+
+    def test_kv_cache_event_uses_array_like(self):
+        """Check that KVCacheEvent is configured with array_like=True.
+
+        The Rust deserializers read msgpack arrays rather than maps, so a
+        change of this setting would break deserialization.
+        """
+        # The config object is looked up defensively so that a missing
+        # attribute produces a readable assertion message.
+        config = getattr(KVCacheEvent, "__struct_config__", None)
+        assert config is not None, "KVCacheEvent is not a msgspec Struct"
+        assert config.array_like is True, (
+            "KVCacheEvent no longer uses array_like=True! "
+            "This will break Rust deserialization."
+        )
+
+    def test_kv_cache_event_uses_tag(self):
+        """Check that KVCacheEvent is configured with a variant tag.
+
+        The tag (for example 'BlockStored') occupies the first slot of the
+        encoded msgpack array and identifies the event variant.
+        """
+        config = getattr(KVCacheEvent, "__struct_config__", None)
+        assert config is not None, "KVCacheEvent is not a msgspec Struct"
+        # With tag=True the config carries the tag string (the class name) or
+        # True; None/False would mean that tagging is disabled.
+        assert config.tag, (
+            "KVCacheEvent no longer uses tag=True! "
+            "This will break Rust deserialization."
+        )
+
+    def test_block_stored_serialization_format(self):
+        """Round-trip a BlockStored event and check its msgpack array layout.
+
+        This exercises the real wire format: if the encoded layout changes,
+        the Rust deserializers will fail.
+        """
+        from msgspec import msgpack
+
+        stored_event = BlockStored(
+            block_hashes=[123, 456],
+            parent_block_hash=789,
+            token_ids=[1, 2, 3, 4],
+            block_size=16,
+            lora_id=None,
+            medium="GPU",
+            lora_name=None,
+        )
+
+        # Decode without a target type to observe the raw msgpack structure.
+        decoded = msgpack.decode(msgpack.encode(stored_event))
+
+        # The payload must be an array whose first slot is the variant tag.
+        assert isinstance(decoded, list), f"Expected list, got {type(decoded)}"
+        assert decoded[0] == "BlockStored", (
+            f"Expected tag 'BlockStored', got {decoded[0]}"
+        )
+
+        # One tag slot plus seven field slots gives eight elements.
+        assert len(decoded) == 8, (
+            f"Expected 8 elements (tag + 7 fields), got {len(decoded)}.\n"
+            f"Decoded: {decoded}\n"
+            f"If field count changed, update Rust deserializers."
+        )
+
+        # Each field must sit at the index the Rust side reads it from.
+        assert decoded[1] == [123, 456], f"block_hashes at wrong position: {decoded[1]}"
+        assert decoded[2] == 789, f"parent_block_hash at wrong position: {decoded[2]}"
+        assert decoded[3] == [1, 2, 3, 4], f"token_ids at wrong position: {decoded[3]}"
+        assert decoded[4] == 16, f"block_size at wrong position: {decoded[4]}"
+        assert decoded[5] is None, f"lora_id at wrong position: {decoded[5]}"
+        assert decoded[6] == "GPU", f"medium at wrong position: {decoded[6]}"
+        assert decoded[7] is None, f"lora_name at wrong position: {decoded[7]}"
--- a/container/Dockerfile.vllm
+++ b/container/Dockerfile.vllm
@@ -75,7 +75,7 @@ ARG RUNTIME_IMAGE_TAG="12.9.1-runtime-ubuntu24.04"
 ARG CUDA_VERSION="12.9"

 # Make sure to update the dependency version in pyproject.toml when updating this
-ARG VLLM_REF="v0.13.0"
+ARG VLLM_REF="v0.14.0"
 # FlashInfer Ref used to install flashinfer-cubin and flashinfer-jit-cache
 ARG FLASHINF_REF="v0.5.3"

@@ -671,6 +671,8 @@ RUN apt-get update && \
        g++ \
        # prometheus dependencies
        ca-certificates \
+        # opencv-python-headless (vLLM dependency) requires libxcb for some functions
+        libxcb1 \
        # DeepGemm uses 'cuobjdump' which does not come with CUDA image
        cuda-command-line-tools-${CUDA_VERSION_MAJOR}-${CUDA_VERSION_MINOR} && \
    rm -rf /var/lib/apt/lists/*

--- a/container/deps/vllm/install_vllm.sh
+++ b/container/deps/vllm/install_vllm.sh
@@ -11,7 +11,7 @@

 set -euo pipefail

-VLLM_VER="0.13.0"
+VLLM_VER="0.14.0"
 VLLM_REF="v${VLLM_VER}"

 # Basic Configurations

--- a/docs/reference/support-matrix.md
+++ b/docs/reference/support-matrix.md
@@ -66,7 +66,7 @@ The following table shows the dependency versions included with each Dynamo rele
 | :------------- | :------------- | :--------- | :--------- | :--------------- | :--------- |
 | SGLang         | 0.5.7          | 0.5.6.post2 | 0.5.3.post4| 0.5.3.post4      | 0.5.3.post4|
 | TensorRT-LLM   | 1.2.0rc6.post2 | 1.2.0rc6.post1 | 1.2.0rc3   | 1.2.0rc3         | 1.2.0rc2   |
-| vLLM           | 0.13.0         | 0.12.0     | 0.11.0     | 0.11.0           | 0.11.0     |
+| vLLM           | 0.14.0         | 0.12.0     | 0.11.0     | 0.11.0           | 0.11.0     |
 | NIXL           | 0.9.0          | 0.8.0      | 0.8.0      | 0.8.0            | 0.8.0      |

 > [!Note]

--- a/lib/bindings/kvbm/python/kvbm/vllm_integration/connector_leader.py
+++ b/lib/bindings/kvbm/python/kvbm/vllm_integration/connector_leader.py
@@ -169,19 +169,21 @@ class KvConnectorLeader:
                req.num_computed_tokens,
            )

+        # In vLLM 0.14.0+, resumed_from_preemption was changed to resumed_req_ids (a set)
+        resumed_req_ids = scheduler_output.scheduled_cached_reqs.resumed_req_ids
+
        for (
            req_id,
-            resumed_from_preemption,
            new_token_ids,
            new_block_ids,
            num_computed_tokens,
        ) in zip(
            scheduler_output.scheduled_cached_reqs.req_ids,
-            scheduler_output.scheduled_cached_reqs.resumed_from_preemption,
            scheduler_output.scheduled_cached_reqs.new_token_ids,
            scheduler_output.scheduled_cached_reqs.new_block_ids,
            scheduler_output.scheduled_cached_reqs.num_computed_tokens,
        ):
+            resumed_from_preemption = req_id in resumed_req_ids
            if new_block_ids is not None:
                output.add_cached_request(
                    request_id=req_id,

--- a/lib/llm/src/block_manager/kv_consolidator/subscriber.rs
+++ b/lib/llm/src/block_manager/kv_consolidator/subscriber.rs
@@ -148,6 +148,10 @@ enum VllmRawEvent {
        lora_id: Option<i32>,
        #[serde(default)]
        medium: Option<String>,
+        #[serde(default)]
+        #[allow(dead_code)]
+        // Reserved for future use, needed for vLLM 0.14.0 deserialization
+        lora_name: Option<String>,
    },
    #[serde(rename = "BlockRemoved")]
    BlockRemoved {
@@ -277,6 +281,7 @@ fn process_event(
            block_size,
            lora_id,
            medium,
+            lora_name: _, // Not used yet, lora_id is still used for backwards compat
        } => {
            let storage_tier = medium
                .as_ref()

--- a/lib/llm/src/kv_router/publisher.rs
+++ b/lib/llm/src/kv_router/publisher.rs
@@ -634,9 +634,13 @@ enum RawKvEvent {
        parent_block_hash: Option<BlockHashValue>,
        token_ids: Vec<u32>,
        block_size: usize,
+        /// Deprecated in vLLM 0.14.0: use `lora_name` instead
        lora_id: Option<u64>,
        #[serde(skip_serializing_if = "Option::is_none")]
        medium: Option<String>,
+        /// LoRA adapter name (added in vLLM 0.14.0, replaces lora_id)
+        #[serde(default, skip_serializing_if = "Option::is_none")]
+        lora_name: Option<String>,
        /// Multimodal extra info for each block (length should match block_hashes)
        #[serde(default, skip_serializing_if = "Option::is_none")]
        block_mm_infos: Option<Vec<Option<BlockExtraInfo>>>,
@@ -685,6 +689,7 @@ impl<'de> Visitor<'de> for RawKvEventVisitor {
        let mut block_size: Option<usize> = None;
        let mut lora_id: Option<Option<u64>> = None;
        let mut medium: Option<Option<String>> = None;
+        let mut lora_name: Option<Option<String>> = None;
        let mut block_mm_infos: Option<Option<Vec<Option<BlockExtraInfo>>>> = None;

        while let Some(key) = map.next_key::<String>()? {
@@ -710,6 +715,9 @@ impl<'de> Visitor<'de> for RawKvEventVisitor {
                "medium" => {
                    medium = Some(map.next_value()?);
                }
+                "lora_name" => {
+                    lora_name = Some(map.next_value()?);
+                }
                "block_mm_infos" => {
                    block_mm_infos = Some(map.next_value()?);
                }
@@ -733,6 +741,7 @@ impl<'de> Visitor<'de> for RawKvEventVisitor {
                    block_size,
                    lora_id: lora_id.unwrap_or(None),
                    medium: medium.unwrap_or(None),
+                    lora_name: lora_name.unwrap_or(None),
                    block_mm_infos: block_mm_infos.unwrap_or(None),
                })
            }
@@ -779,6 +788,7 @@ impl<'de> Visitor<'de> for RawKvEventVisitor {
                    .ok_or_else(|| de::Error::invalid_length(4, &"missing block_size"))?;
                let lora_id: Option<u64> = seq.next_element()?.unwrap_or(None);
                let medium: Option<String> = seq.next_element()?.unwrap_or(None);
+                let lora_name: Option<String> = seq.next_element()?.unwrap_or(None);
                let block_mm_infos: Option<Vec<Option<BlockExtraInfo>>> =
                    seq.next_element()?.unwrap_or(None);

@@ -791,6 +801,7 @@ impl<'de> Visitor<'de> for RawKvEventVisitor {
                    block_size,
                    lora_id,
                    medium,
+                    lora_name,
                    block_mm_infos,
                })
            }
@@ -1030,6 +1041,7 @@ mod test_event_processing {
            block_size: 4,
            lora_id: Some(0),
            medium: None,
+            lora_name: None,
            block_mm_infos: None,
        };

@@ -1494,6 +1506,7 @@ mod tests_startup_helpers {
            block_size: 4,
            lora_id: None,
            medium: None,
+            lora_name: None,
            block_mm_infos: None,
        }];


--- a/pyproject.toml
+++ b/pyproject.toml
@@ -56,7 +56,7 @@ trtllm =[
 vllm = [
    "uvloop",
    "nixl[cu12]<=0.9.0",
-    "vllm[flashinfer,runai]==0.13.0",
+    "vllm[flashinfer,runai]==0.14.0",
 ]

 sglang = [

--- a/tests/kvbm_integration/test_consolidator_router_e2e.py
+++ b/tests/kvbm_integration/test_consolidator_router_e2e.py
@@ -312,6 +312,7 @@ def frontend_server(test_directory, runtime_services):
        working_dir=str(test_directory),
        display_output=False,
        log_dir=str(frontend_log_dir),  # Absolute path keeps logs in test directory
+        terminate_existing=False,  # Don't kill nats-server/etcd started by runtime_services
    ) as frontend_process:
        # Get actual log file path from ManagedProcess (it may modify log_dir to use temp directory)
        log_file = Path(frontend_process._log_path)
@@ -742,6 +743,7 @@ class TestConsolidatorRouterE2E:
            working_dir=str(test_directory),
            display_output=False,
            log_dir=str(frontend_log_dir),  # Absolute path keeps logs in test directory
+            terminate_existing=False,  # Don't kill nats-server/etcd started by runtime_services
        ) as _frontend_process:
            # Get actual log file path from ManagedProcess
            frontend_log = Path(_frontend_process._log_path)