Unverified Commit 50f1e0e1 authored by Alec's avatar Alec Committed by GitHub
Browse files

chore(deps): bump vLLM to 0.14.0 (#5593)


Signed-off-by: alec-flowers <aflowers@nvidia.com>
Signed-off-by: Alec Flowers <aflowers@nvidia.com>
Signed-off-by: Alec <35311602+alec-flowers@users.noreply.github.com>
Co-authored-by: Claude Opus 4.5 <noreply@anthropic.com>
parent 67c868e7
#!/usr/bin/env python3
# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
"""
Unit tests to verify vLLM KV events API compatibility.
These tests check that the vLLM KV events classes have the expected fields
that our Rust deserializers depend on. If vLLM changes their API, these tests
will fail early, before hitting runtime deserialization errors.
The Rust code in kv_router/publisher.rs and kv_consolidator/subscriber.rs
deserializes vLLM's msgpack-encoded KV events. Since vLLM uses msgspec with
array_like=True, the field ORDER matters - fields are serialized positionally.
"""
import importlib
import pytest
# Import vllm first to ensure it's properly loaded before accessing submodules.
# This works around potential issues with pytest's import machinery.
# `_vllm` is bound only so the top-level package import happens first; it is
# not referenced again in the visible part of this file.
_vllm = importlib.import_module("vllm")
_kv_events = importlib.import_module("vllm.distributed.kv_events")
# Re-export the classes we need for tests
BlockStored = _kv_events.BlockStored
BlockRemoved = _kv_events.BlockRemoved
EventBatch = _kv_events.EventBatch
KVCacheEvent = _kv_events.KVCacheEvent
# Module-level pytest markers: pytest applies every mark listed in
# `pytestmark` to all tests collected from this module.
pytestmark = [
pytest.mark.vllm,
pytest.mark.unit,
]
class TestVllmKvEventsApi:
    """Test vLLM KV events API compatibility.

    Each test pins one aspect of the vLLM KV-event structs (field names and
    order, struct configuration, encoded msgpack layout) so that an upstream
    change fails here instead of surfacing as a runtime deserialization error
    in the Rust code.
    """

    def test_block_stored_fields(self):
        """Verify BlockStored has expected fields in expected order.

        The Rust deserializer expects these fields in this exact order:
        1. block_hashes
        2. parent_block_hash
        3. token_ids
        4. block_size
        5. lora_id
        6. medium
        7. lora_name (added in vLLM 0.14.0)

        If vLLM adds/removes/reorders fields, this test will fail.
        """
        expected_fields = (
            "block_hashes",
            "parent_block_hash",
            "token_ids",
            "block_size",
            "lora_id",
            "medium",
            "lora_name",
        )
        actual_fields = BlockStored.__struct_fields__
        assert actual_fields == expected_fields, (
            f"BlockStored fields changed!\n"
            f"Expected: {expected_fields}\n"
            f"Actual: {actual_fields}\n"
            f"If vLLM changed the API, update the Rust deserializers in:\n"
            f"  - lib/llm/src/kv_router/publisher.rs (RawKvEvent::BlockStored)\n"
            f"  - lib/llm/src/block_manager/kv_consolidator/subscriber.rs (VllmRawEvent::BlockStored)"
        )

    def test_block_removed_fields(self):
        """Verify BlockRemoved has expected fields in expected order."""
        expected_fields = (
            "block_hashes",
            "medium",
        )
        actual_fields = BlockRemoved.__struct_fields__
        assert actual_fields == expected_fields, (
            f"BlockRemoved fields changed!\n"
            f"Expected: {expected_fields}\n"
            f"Actual: {actual_fields}\n"
            f"If vLLM changed the API, update the Rust deserializers."
        )

    def test_event_batch_fields(self):
        """Verify EventBatch/KVEventBatch has expected fields."""
        expected_fields = (
            "ts",
            "events",
            "data_parallel_rank",
        )
        actual_fields = EventBatch.__struct_fields__
        assert actual_fields == expected_fields, (
            f"EventBatch fields changed!\n"
            f"Expected: {expected_fields}\n"
            f"Actual: {actual_fields}"
        )

    def test_kv_cache_event_uses_array_like(self):
        """Verify KVCacheEvent uses array_like=True serialization.

        Our Rust deserializers expect msgpack arrays, not objects.
        If this changes, deserialization will break.
        """
        # msgspec structs expose their options via __struct_config__
        struct_config = getattr(KVCacheEvent, "__struct_config__", None)
        assert struct_config is not None, "KVCacheEvent is not a msgspec Struct"
        assert struct_config.array_like is True, (
            "KVCacheEvent no longer uses array_like=True! "
            "This will break Rust deserialization."
        )

    def test_kv_cache_event_uses_tag(self):
        """Verify KVCacheEvent uses tag=True for variant identification.

        The tag (e.g., 'BlockStored') is the first element in the msgpack array.
        """
        struct_config = getattr(KVCacheEvent, "__struct_config__", None)
        assert struct_config is not None, "KVCacheEvent is not a msgspec Struct"
        # When tag=True is set, struct_config.tag contains the tag string (class name)
        # or True. A falsy value (None/False) means no tagging.
        assert struct_config.tag, (
            "KVCacheEvent no longer uses tag=True! "
            "This will break Rust deserialization."
        )

    def test_block_stored_serialization_format(self):
        """Verify BlockStored serializes to expected msgpack array format.

        This is the ultimate test - if the serialized format changes,
        Rust deserialization will fail.

        Two events are encoded: one with both optional LoRA fields set to
        None, and one where every field holds a distinct non-None value.
        With only None values, positions 5 (lora_id) and 7 (lora_name) are
        indistinguishable, so a swap of the two would go unnoticed.
        """
        import msgspec

        lora_cases = (
            # None values; the 8-element check below requires that they are
            # still present in the encoded array.
            (None, None),
            # Distinct values so lora_id and lora_name cannot be confused.
            (7, "test-adapter"),
        )
        for lora_id, lora_name in lora_cases:
            event = BlockStored(
                block_hashes=[123, 456],
                parent_block_hash=789,
                token_ids=[1, 2, 3, 4],
                block_size=16,
                lora_id=lora_id,
                medium="GPU",
                lora_name=lora_name,
            )
            encoded = msgspec.msgpack.encode(event)
            decoded = msgspec.msgpack.decode(encoded)

            # Should be an array with tag as first element
            assert isinstance(decoded, list), f"Expected list, got {type(decoded)}"

            # Verify field count (tag + 7 fields = 8 elements) before indexing,
            # so a short array fails with this message instead of an IndexError.
            assert len(decoded) == 8, (
                f"Expected 8 elements (tag + 7 fields), got {len(decoded)}.\n"
                f"Decoded: {decoded}\n"
                f"If field count changed, update Rust deserializers."
            )
            assert (
                decoded[0] == "BlockStored"
            ), f"Expected tag 'BlockStored', got {decoded[0]}"

            # Verify field positions
            assert decoded[1] == [
                123,
                456,
            ], f"block_hashes at wrong position: {decoded[1]}"
            assert (
                decoded[2] == 789
            ), f"parent_block_hash at wrong position: {decoded[2]}"
            assert decoded[3] == [
                1,
                2,
                3,
                4,
            ], f"token_ids at wrong position: {decoded[3]}"
            assert decoded[4] == 16, f"block_size at wrong position: {decoded[4]}"
            assert decoded[5] == lora_id, f"lora_id at wrong position: {decoded[5]}"
            assert decoded[6] == "GPU", f"medium at wrong position: {decoded[6]}"
            assert (
                decoded[7] == lora_name
            ), f"lora_name at wrong position: {decoded[7]}"
......@@ -75,7 +75,7 @@ ARG RUNTIME_IMAGE_TAG="12.9.1-runtime-ubuntu24.04"
ARG CUDA_VERSION="12.9"
# Make sure to update the dependency version in pyproject.toml when updating this
ARG VLLM_REF="v0.13.0"
ARG VLLM_REF="v0.14.0"
# FlashInfer Ref used to install flashinfer-cubin and flashinfer-jit-cache
ARG FLASHINF_REF="v0.5.3"
......@@ -671,6 +671,8 @@ RUN apt-get update && \
g++ \
# prometheus dependencies
ca-certificates \
# opencv-python-headless (vLLM dependency) requires libxcb for some functions
libxcb1 \
# DeepGemm uses 'cuobjdump' which does not come with CUDA image
cuda-command-line-tools-${CUDA_VERSION_MAJOR}-${CUDA_VERSION_MINOR} && \
rm -rf /var/lib/apt/lists/*
......
......@@ -11,7 +11,7 @@
set -euo pipefail
VLLM_VER="0.13.0"
VLLM_VER="0.14.0"
VLLM_REF="v${VLLM_VER}"
# Basic Configurations
......
......@@ -66,7 +66,7 @@ The following table shows the dependency versions included with each Dynamo rele
| :------------- | :------------- | :--------- | :--------- | :--------------- | :--------- |
| SGLang | 0.5.7 | 0.5.6.post2 | 0.5.3.post4| 0.5.3.post4 | 0.5.3.post4|
| TensorRT-LLM | 1.2.0rc6.post2 | 1.2.0rc6.post1 | 1.2.0rc3 | 1.2.0rc3 | 1.2.0rc2 |
| vLLM | 0.13.0 | 0.12.0 | 0.11.0 | 0.11.0 | 0.11.0 |
| vLLM | 0.14.0 | 0.12.0 | 0.11.0 | 0.11.0 | 0.11.0 |
| NIXL | 0.9.0 | 0.8.0 | 0.8.0 | 0.8.0 | 0.8.0 |
> [!Note]
......
......@@ -169,19 +169,21 @@ class KvConnectorLeader:
req.num_computed_tokens,
)
# In vLLM 0.14.0+, resumed_from_preemption was changed to resumed_req_ids (a set)
resumed_req_ids = scheduler_output.scheduled_cached_reqs.resumed_req_ids
for (
req_id,
resumed_from_preemption,
new_token_ids,
new_block_ids,
num_computed_tokens,
) in zip(
scheduler_output.scheduled_cached_reqs.req_ids,
scheduler_output.scheduled_cached_reqs.resumed_from_preemption,
scheduler_output.scheduled_cached_reqs.new_token_ids,
scheduler_output.scheduled_cached_reqs.new_block_ids,
scheduler_output.scheduled_cached_reqs.num_computed_tokens,
):
resumed_from_preemption = req_id in resumed_req_ids
if new_block_ids is not None:
output.add_cached_request(
request_id=req_id,
......
......@@ -148,6 +148,10 @@ enum VllmRawEvent {
lora_id: Option<i32>,
#[serde(default)]
medium: Option<String>,
#[serde(default)]
#[allow(dead_code)]
// Reserved for future use, needed for vLLM 0.14.0 deserialization
lora_name: Option<String>,
},
#[serde(rename = "BlockRemoved")]
BlockRemoved {
......@@ -277,6 +281,7 @@ fn process_event(
block_size,
lora_id,
medium,
lora_name: _, // Not used yet, lora_id is still used for backwards compat
} => {
let storage_tier = medium
.as_ref()
......
......@@ -634,9 +634,13 @@ enum RawKvEvent {
parent_block_hash: Option<BlockHashValue>,
token_ids: Vec<u32>,
block_size: usize,
/// Deprecated in vLLM 0.14.0: use `lora_name` instead
lora_id: Option<u64>,
#[serde(skip_serializing_if = "Option::is_none")]
medium: Option<String>,
/// LoRA adapter name (added in vLLM 0.14.0, replaces lora_id)
#[serde(default, skip_serializing_if = "Option::is_none")]
lora_name: Option<String>,
/// Multimodal extra info for each block (length should match block_hashes)
#[serde(default, skip_serializing_if = "Option::is_none")]
block_mm_infos: Option<Vec<Option<BlockExtraInfo>>>,
......@@ -685,6 +689,7 @@ impl<'de> Visitor<'de> for RawKvEventVisitor {
let mut block_size: Option<usize> = None;
let mut lora_id: Option<Option<u64>> = None;
let mut medium: Option<Option<String>> = None;
let mut lora_name: Option<Option<String>> = None;
let mut block_mm_infos: Option<Option<Vec<Option<BlockExtraInfo>>>> = None;
while let Some(key) = map.next_key::<String>()? {
......@@ -710,6 +715,9 @@ impl<'de> Visitor<'de> for RawKvEventVisitor {
"medium" => {
medium = Some(map.next_value()?);
}
"lora_name" => {
lora_name = Some(map.next_value()?);
}
"block_mm_infos" => {
block_mm_infos = Some(map.next_value()?);
}
......@@ -733,6 +741,7 @@ impl<'de> Visitor<'de> for RawKvEventVisitor {
block_size,
lora_id: lora_id.unwrap_or(None),
medium: medium.unwrap_or(None),
lora_name: lora_name.unwrap_or(None),
block_mm_infos: block_mm_infos.unwrap_or(None),
})
}
......@@ -779,6 +788,7 @@ impl<'de> Visitor<'de> for RawKvEventVisitor {
.ok_or_else(|| de::Error::invalid_length(4, &"missing block_size"))?;
let lora_id: Option<u64> = seq.next_element()?.unwrap_or(None);
let medium: Option<String> = seq.next_element()?.unwrap_or(None);
let lora_name: Option<String> = seq.next_element()?.unwrap_or(None);
let block_mm_infos: Option<Vec<Option<BlockExtraInfo>>> =
seq.next_element()?.unwrap_or(None);
......@@ -791,6 +801,7 @@ impl<'de> Visitor<'de> for RawKvEventVisitor {
block_size,
lora_id,
medium,
lora_name,
block_mm_infos,
})
}
......@@ -1030,6 +1041,7 @@ mod test_event_processing {
block_size: 4,
lora_id: Some(0),
medium: None,
lora_name: None,
block_mm_infos: None,
};
......@@ -1494,6 +1506,7 @@ mod tests_startup_helpers {
block_size: 4,
lora_id: None,
medium: None,
lora_name: None,
block_mm_infos: None,
}];
......
......@@ -56,7 +56,7 @@ trtllm =[
vllm = [
"uvloop",
"nixl[cu12]<=0.9.0",
"vllm[flashinfer,runai]==0.13.0",
"vllm[flashinfer,runai]==0.14.0",
]
sglang = [
......
......@@ -312,6 +312,7 @@ def frontend_server(test_directory, runtime_services):
working_dir=str(test_directory),
display_output=False,
log_dir=str(frontend_log_dir), # Absolute path keeps logs in test directory
terminate_existing=False, # Don't kill nats-server/etcd started by runtime_services
) as frontend_process:
# Get actual log file path from ManagedProcess (it may modify log_dir to use temp directory)
log_file = Path(frontend_process._log_path)
......@@ -742,6 +743,7 @@ class TestConsolidatorRouterE2E:
working_dir=str(test_directory),
display_output=False,
log_dir=str(frontend_log_dir), # Absolute path keeps logs in test directory
terminate_existing=False, # Don't kill nats-server/etcd started by runtime_services
) as _frontend_process:
# Get actual log file path from ManagedProcess
frontend_log = Path(_frontend_process._log_path)
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment