Unverified Commit dba69e0f authored by Tzu-Ling Kan's avatar Tzu-Ling Kan Committed by GitHub
Browse files

chore(deps): bump vLLM 0.16.0 → 0.17.1 (#7170)


Signed-off-by: default avatarTzu-Ling <tzulingk@nvidia.com>
parent 17db1b6a
......@@ -26,7 +26,7 @@ repos:
- id: black
types_or: [python, cython]
- repo: https://github.com/PyCQA/flake8
rev: 5.0.4
rev: 7.3.0 # 5.0.4 crashes on Python 3.12+ (ast.Str removed)
hooks:
- id: flake8
args: [--max-line-length=88, --select=C,E,F,W,B,B950, --extend-ignore = E203,E501]
......
......@@ -17,6 +17,7 @@ from vllm.config import CacheConfig, LoadConfig, ModelConfig, VllmConfig
from vllm.inputs.data import TokensPrompt
from vllm.reasoning import ReasoningParser, ReasoningParserManager
from vllm.sampling_params import RequestOutputKind, SamplingParams
from vllm.tasks import GENERATION_TASKS
from vllm.tokenizers import TokenizerLike
from vllm.tool_parsers import ToolParser, ToolParserManager
from vllm.v1.engine import EngineCoreOutput, EngineCoreRequest, FinishReason
......@@ -85,6 +86,21 @@ class VllmProcessor:
self.tool_parser_class = tool_parser_class
self.reasoning_parser_class = reasoning_parser_class
def _get_eos_token_ids(self) -> list[int]:
    """Return the EOS token ids exposed by ``self.tokenizer``.

    vLLM 0.17.0 removed ``EngineCoreRequest.eos_token_id``, so Dynamo can no
    longer read EOS ids from the preprocessed request object and takes them
    from tokenizer metadata instead.

    Lookup order:
      1. ``tokenizer.eos_token_ids`` when it is a non-int iterable
         (copied into a new list, even when empty).
      2. ``tokenizer.eos_token_id`` when it is not None.
      3. ``tokenizer.eos_token_ids`` when it is a bare int. Without this
         last step a tokenizer that only exposes a scalar ``eos_token_ids``
         would yield an empty list and its EOS id would be dropped.

    Returns:
        A list of EOS token ids; empty when the tokenizer exposes none.
    """
    tokenizer = self.tokenizer

    plural = getattr(tokenizer, "eos_token_ids", None)
    if plural is not None and not isinstance(plural, int):
        return list(plural)

    singular = getattr(tokenizer, "eos_token_id", None)
    if singular is not None:
        return [singular]

    # Here ``plural`` is either None or a bare int; use the int as a
    # last-resort single EOS id rather than discarding it.
    return [] if plural is None else [plural]
# Ideally we would map NVCreateChatCompletionRequest into Python so it can be type checked, but
# it has a lot of fields.
# request: dynamo.NVCreateChatCompletionRequest
......@@ -130,7 +146,11 @@ class VllmProcessor:
max_tokens=max_tokens,
)
# generation_config.json
# Skip eos_token_id: vLLM 0.17.0 made SamplingParams.eos_token_id a
# read-only property; eos tokens are handled via eos_token_ids below.
for k, v in self.input_processor.generation_config_fields.items():
if k == "eos_token_id":
continue
if hasattr(sampling_params, k):
setattr(sampling_params, k, v)
......@@ -174,17 +194,13 @@ class VllmProcessor:
request_id,
prompt_inputs,
sampling_params,
# arrival_time: float | None = None,
# lora_request: LoRARequest | None = None,
# tokenization_kwargs: dict[str, Any] | None = None,
# trace_headers: Mapping[str, str] | None = None,
# priority: int = 0,
# data_parallel_rank: int | None = None,
GENERATION_TASKS, # vLLM 0.17.0: required supported_tasks arg
)
InputProcessor.assign_request_id(vllm_preproc)
# Processed: EngineCoreRequest(request_id='a2b76a85cd65e151', prompt_token_ids=[3838, 374, 279, 6722, 315, 28649, 25510, 30], mm_features=None, sampling_params=SamplingParams(n=1, presence_penalty=0.0, frequency_penalty=0.0, repetition_penalty=1.0, temperature=1.0, top_p=1.0, top_k=0, min_p=0.0, seed=None, stop=[], stop_token_ids=[151643], bad_words=[], include_stop_str_in_output=False, ignore_eos=False, max_tokens=16, min_tokens=0, logprobs=None, prompt_logprobs=None, skip_special_tokens=True, spaces_between_special_tokens=True, truncate_prompt_tokens=None, structured_outputs=None, extra_args=None), pooling_params=None, eos_token_id=151645, arrival_time=1769036937.9417946, lora_request=None, cache_salt=None, data_parallel_rank=None, prompt_embeds=None, client_index=0, current_wave=0, priority=0, trace_headers=None)
# vLLM 0.17.0 removed EngineCoreRequest.eos_token_id. Dynamo now uses
# tokenizer metadata for EOS ids when constructing the router payload.
# Convert to a Python object that has fields that match our PreprocessedRequest
sp = vllm_preproc.sampling_params
......@@ -229,11 +245,7 @@ class VllmProcessor:
"prompt_logprobs": sp.prompt_logprobs,
"skip_special_tokens": sp.skip_special_tokens,
},
"eos_token_ids": (
[vllm_preproc.eos_token_id]
if vllm_preproc.eos_token_id is not None
else []
),
"eos_token_ids": self._get_eos_token_ids(),
"annotations": [],
}
......
......@@ -13,7 +13,6 @@ import uvloop
from prometheus_client import REGISTRY, CollectorRegistry, multiprocess
from vllm.config import VllmConfig
from vllm.distributed.kv_events import ZmqEventPublisher
from vllm.entrypoints.cli.serve import run_headless
from vllm.usage.usage_lib import UsageContext
from vllm.v1.engine.async_llm import AsyncLLM
from vllm.v1.metrics.prometheus import setup_multiprocess_prometheus
......@@ -91,6 +90,10 @@ def run_dynamo_headless(config: Config) -> None:
Secondary nodes spawn vLLM workers only — no engine core, no scheduler,
no Dynamo endpoints. Bypasses DistributedRuntime entirely (no NATS/etcd).
"""
# Keep the upstream CLI import local so tests that only exercise
# build_headless_namespace() do not pull in vLLM's full CLI import graph.
from vllm.entrypoints.cli.serve import run_headless
args = build_headless_namespace(config)
run_headless(args)
......
......@@ -14,6 +14,7 @@ pytestmark = [
pytest.mark.unit,
pytest.mark.vllm,
pytest.mark.pre_merge,
pytest.mark.gpu_0,
]
......
......@@ -9,9 +9,18 @@ These tests check that the vLLM KV events classes have the expected fields
that our Rust deserializers depend on. If vLLM changes their API, these tests
will fail early, before hitting runtime deserialization errors.
The Rust code in kv_router/publisher.rs and kv_consolidator/subscriber.rs
deserializes vLLM's msgpack-encoded KV events. Since vLLM uses msgspec with
array_like=True, the field ORDER matters - fields are serialized positionally.
This test is the early warning for vLLM KV-event wire-format changes.
In the normal case, if this fails, update `lib/kv-router/src/zmq_wire.rs` to
match the new upstream vLLM event shape, then update this test.
That file is Dynamo's compatibility layer for vLLM KV events:
- it decodes vLLM's msgpack `array_like=True` wire format
- it handles field order changes in `BlockStored` / `BlockRemoved` / `EventBatch`
- it translates upstream `extra_keys` into Dynamo's internal `block_mm_infos`
Only touch consolidator files if we explicitly need the consolidator publisher
to preserve and republish a new upstream field.
"""
import importlib
......@@ -51,6 +60,7 @@ class TestVllmKvEventsApi:
5. lora_id
6. medium
7. lora_name (added in vLLM 0.14.0)
8. extra_keys (added in vLLM 0.17.0)
If vLLM adds/removes/reorders fields, this test will fail.
"""
......@@ -62,6 +72,7 @@ class TestVllmKvEventsApi:
"lora_id",
"medium",
"lora_name",
"extra_keys",
)
actual_fields = BlockStored.__struct_fields__
......@@ -69,9 +80,10 @@ class TestVllmKvEventsApi:
f"BlockStored fields changed!\n"
f"Expected: {expected_fields}\n"
f"Actual: {actual_fields}\n"
f"If vLLM changed the API, update the Rust deserializers in:\n"
f" - lib/llm/src/kv_router/publisher.rs (RawKvEvent::BlockStored)\n"
f" - lib/llm/src/block_manager/kv_consolidator/subscriber.rs (VllmRawEvent::BlockStored)"
f"Required follow-up:\n"
f" - Update lib/kv-router/src/zmq_wire.rs to match the new BlockStored wire format.\n"
f" - Update this test's expected_fields and msgpack position checks.\n"
f" - If needed, add or update a regression test in lib/llm/src/kv_router/publisher.rs."
)
def test_block_removed_fields(self):
......@@ -86,7 +98,9 @@ class TestVllmKvEventsApi:
f"BlockRemoved fields changed!\n"
f"Expected: {expected_fields}\n"
f"Actual: {actual_fields}\n"
f"If vLLM changed the API, update the Rust deserializers."
f"Required follow-up:\n"
f" - Update lib/kv-router/src/zmq_wire.rs RawKvEvent::BlockRemoved seq deserializer.\n"
f" - Update this test's expected_fields."
)
def test_event_batch_fields(self):
......@@ -101,7 +115,11 @@ class TestVllmKvEventsApi:
assert actual_fields == expected_fields, (
f"EventBatch fields changed!\n"
f"Expected: {expected_fields}\n"
f"Actual: {actual_fields}"
f"Actual: {actual_fields}\n"
f"Required follow-up:\n"
f" - Update lib/kv-router/src/zmq_wire.rs KvEventBatch Deserialize impl.\n"
f" - Update subscriber.rs VllmEventBatch tuple if batch field order changes.\n"
f" - Update this test's expected_fields."
)
def test_kv_cache_event_uses_array_like(self):
......@@ -148,6 +166,7 @@ class TestVllmKvEventsApi:
lora_id=None,
medium="GPU",
lora_name=None,
extra_keys=None,
)
encoded = msgspec.msgpack.encode(event)
......@@ -159,9 +178,9 @@ class TestVllmKvEventsApi:
decoded[0] == "BlockStored"
), f"Expected tag 'BlockStored', got {decoded[0]}"
# Verify field count (tag + 7 fields = 8 elements)
assert len(decoded) == 8, (
f"Expected 8 elements (tag + 7 fields), got {len(decoded)}.\n"
# Verify field count (tag + 8 fields = 9 elements)
assert len(decoded) == 9, (
f"Expected 9 elements (tag + 8 fields), got {len(decoded)}.\n"
f"Decoded: {decoded}\n"
f"If field count changed, update Rust deserializers."
)
......@@ -174,3 +193,4 @@ class TestVllmKvEventsApi:
assert decoded[5] is None, f"lora_id at wrong position: {decoded[5]}"
assert decoded[6] == "GPU", f"medium at wrong position: {decoded[6]}"
assert decoded[7] is None, f"lora_name at wrong position: {decoded[7]}"
assert decoded[8] is None, f"extra_keys at wrong position: {decoded[8]}"
......@@ -37,6 +37,7 @@ pytestmark = [
pytest.mark.unit,
pytest.mark.vllm,
pytest.mark.pre_merge,
pytest.mark.gpu_0,
]
......
......@@ -271,8 +271,9 @@ class TestVllmRendererApi:
input_processor.renderer to preprocess_chat_request.
VllmProcessor iterates input_processor.generation_config_fields.
"""
assert hasattr(InputProcessor, "renderer"), (
"InputProcessor no longer has 'renderer' attribute/property; "
init_source = inspect.getsource(InputProcessor.__init__)
assert "self.renderer" in init_source, (
"InputProcessor.__init__ no longer initializes 'renderer'; "
"update preprocess_chat_request call in "
"components/src/dynamo/frontend/vllm_processor.py"
)
......@@ -363,7 +364,6 @@ class TestVllmRendererApi:
"mm_features",
"sampling_params",
"pooling_params",
"eos_token_id",
"arrival_time",
"lora_request",
"cache_salt",
......
......@@ -40,22 +40,22 @@ vllm:
runtime_image: nvcr.io/nvidia/cuda
base_image_tag: 25.06-cuda12.9-devel-ubuntu24.04
runtime_image_tag: 12.9.1-runtime-ubuntu24.04
vllm_ref: v0.16.0
vllm_ref: v0.17.1
cuda13.0:
base_image: nvcr.io/nvidia/cuda-dl-base
runtime_image: nvcr.io/nvidia/cuda
base_image_tag: 25.11-cuda13.0-devel-ubuntu24.04
runtime_image_tag: 13.0.2-runtime-ubuntu24.04
vllm_ref: v0.16.0
vllm_ref: v0.17.1
xpu:
base_image: intel/deep-learning-essentials
runtime_image: intel/deep-learning-essentials
base_image_tag: 2025.3.2-0-devel-ubuntu24.04
runtime_image_tag: 2025.3.2-0-devel-ubuntu24.04
vllm_ref: v0.14.0
flashinf_ref: v0.6.3
lmcache_ref: 0.3.14
vllm_omni_ref: "v0.16.0rc1"
flashinf_ref: v0.6.4
lmcache_ref: 0.4.1
vllm_omni_ref: "v0.16.0"
max_jobs: "10"
enable_media_ffmpeg: "false"
enable_gpu_memory_service: "true"
......
......@@ -4,15 +4,15 @@
# This script installs vLLM and its dependencies from PyPI (release versions only).
# LMCache on CUDA 12 / amd64 is the exception: it is built from source at its release tag.
# Installation order:
# 1. LMCache (installed first so vLLM's dependencies take precedence)
# 2. vLLM
# 1. vLLM
# 2. LMCache (built from source AFTER vLLM so c_ops.so is compiled against installed PyTorch)
# 3. vLLM-Omni
# 4. DeepGEMM
# 5. EP kernels
set -euo pipefail
VLLM_VER="0.16.0"
VLLM_VER="0.17.1"
VLLM_REF="v${VLLM_VER}"
DEVICE="cuda"
......@@ -25,9 +25,9 @@ INSTALLATION_DIR=/tmp
TORCH_CUDA_ARCH_LIST="9.0;10.0" # For EP Kernels -- TODO: check if we need to add 12.0+PTX
DEEPGEMM_REF=""
CUDA_VERSION="12.9"
FLASHINF_REF="v0.6.3"
LMCACHE_REF="0.3.14"
VLLM_OMNI_REF="v0.16.0rc1"
FLASHINF_REF="v0.6.4"
LMCACHE_REF="0.4.1"
VLLM_OMNI_REF="v0.16.0"
while [[ $# -gt 0 ]]; do
case $1 in
......@@ -133,30 +133,6 @@ elif [ "$DEVICE" = "xpu" ]; then
echo " VLLM_REF=$VLLM_REF | ARCH=$ARCH | INSTALLATION_DIR=$INSTALLATION_DIR"
fi
if [ "$DEVICE" = "cuda" ]; then
if [[ "$CUDA_VERSION_MAJOR" == "12" ]]; then
echo " FLASHINF_REF=$FLASHINF_REF | LMCACHE_REF=$LMCACHE_REF | DEEPGEMM_REF=$DEEPGEMM_REF"
echo "\n=== Installing LMCache ==="
if [ "$ARCH" = "amd64" ]; then
# LMCache installation currently fails on arm64 due to CUDA dependency issues
# Install LMCache BEFORE vLLM so vLLM's dependencies take precedence
uv pip install lmcache==${LMCACHE_REF} --torch-backend=${TORCH_BACKEND}
echo "✓ LMCache ${LMCACHE_REF} installed"
else
echo "⚠ Skipping LMCache on ARM64 (compatibility issues)"
fi
else
echo " FLASHINF_REF=$FLASHINF_REF | LMCache will not be installed as it doesn't support CUDA 13 yet | DEEPGEMM_REF=$DEEPGEMM_REF"
fi
elif [ "$DEVICE" = "xpu" ]; then
echo " LMCACHE_REF=$LMCACHE_REF "
echo "\n=== Installing LMCache ==="
if [ "$ARCH" = "amd64" ]; then
uv pip install lmcache==${LMCACHE_REF}
echo "✓ LMCache ${LMCACHE_REF} installed"
fi
fi
echo "\n=== Cloning vLLM repository ==="
# Clone needed for DeepGEMM and EP kernels install scripts
cd $INSTALLATION_DIR
......@@ -217,6 +193,40 @@ if [ "$DEVICE" = "cuda" ]; then
fi
echo "✓ vLLM installation completed"
echo "\n=== Installing LMCache from source ==="
# LMCache prebuilt wheels are built against PyTorch <=2.8.0 and fail with PyTorch 2.10+
# (undefined symbol: c10::cuda::c10_cuda_check_implementation).
# Build from source AFTER vLLM so c_ops.so compiles against the installed PyTorch.
# Ref: https://docs.lmcache.ai/getting_started/installation.html#install-latest-lmcache-from-source
if [ "$DEVICE" = "cuda" ] && [[ "$CUDA_VERSION_MAJOR" == "12" ]] && [ "$ARCH" = "amd64" ]; then
git clone --depth 1 --branch v${LMCACHE_REF} https://github.com/LMCache/LMCache.git ${INSTALLATION_DIR}/lmcache
cd ${INSTALLATION_DIR}/lmcache
uv pip install -r requirements/build.txt
# Get torch lib dir and embed it as RPATH so c_ops.so finds torch libs at runtime
TORCH_LIB=$(python3 -c "import torch, os; print(os.path.dirname(torch.__file__) + '/lib')")
# Build from source with --no-build-isolation (uses installed torch) + RPATH for runtime linking
TORCH_CUDA_ARCH_LIST="8.0;8.6;8.9;9.0;10.0+PTX" LDFLAGS="-Wl,-rpath,${TORCH_LIB}" \
uv pip install --no-build-isolation --no-cache .
# Verify c_ops.so was compiled (cannot import at build time without GPU/CUDA driver)
# cd to neutral dir so Python finds installed lmcache, not the source checkout
cd /tmp
LMCACHE_DIR=$(python3 -c "import lmcache, os; print(os.path.dirname(lmcache.__file__))")
if ls "${LMCACHE_DIR}"/c_ops*.so > /dev/null 2>&1; then
echo "✓ lmcache c_ops.so verified: $(ls ${LMCACHE_DIR}/c_ops*.so | head -1 | xargs basename)"
else
echo "ERROR: c_ops.so not found in ${LMCACHE_DIR} - CUDA extension was not compiled"
exit 1
fi
rm -rf ${INSTALLATION_DIR}/lmcache
echo "✓ LMCache ${LMCACHE_REF} installed from source"
elif [ "$DEVICE" = "xpu" ] && [ "$ARCH" = "amd64" ]; then
uv pip install lmcache==${LMCACHE_REF}
echo "✓ LMCache ${LMCACHE_REF} installed from PyPI (XPU)"
else
echo "⚠ Skipping LMCache (ARM64 or CUDA 13 not supported)"
fi
echo "\n=== Installing vLLM-Omni ==="
if [ -n "$VLLM_OMNI_REF" ] && [ "$ARCH" = "amd64" ]; then
# Save original vllm entrypoint before vllm-omni overwrites it
......
......@@ -50,11 +50,11 @@ trtllm =[
vllm = [
"uvloop",
"nixl[cu12]<=0.10.1",
"vllm[flashinfer,runai]==0.16.0",
# vllm-omni 0.16.0rc1 is not on PyPI; installed from source in container builds
"vllm[flashinfer,runai]==0.17.1",
# vllm-omni 0.16.0 is now on PyPI; install only future rc builds from source in container builds
# (see container/deps/vllm/install_vllm.sh). pip install ai-dynamo[vllm] now
# pulls the vllm-omni release pinned below from PyPI.
# "vllm-omni==0.16.0rc1",
"vllm-omni==0.16.0",
"blake3>=1.0.0,<2.0.0",
]
......
......@@ -2,6 +2,7 @@
# SPDX-License-Identifier: Apache-2.0
import importlib
import importlib.util
def check_module_available(module_name: str) -> bool:
......
......@@ -1553,7 +1553,6 @@ def sampling_params():
prompt_logprobs=None,
skip_special_tokens=False,
spaces_between_special_tokens=True,
truncate_prompt_tokens=None,
)
......
......@@ -61,7 +61,15 @@ THINK_END_TOKEN_ID = 8
class _InnerTokenizer:
"""Mimics the inner ``tokenizer.tokenizer`` accessed by MistralReasoningParser."""
def get_special_token(self, token):
    """Look up ``token`` through the shared ``_token_lookup`` helper.

    vLLM 0.17.0 renamed ``get_control_token`` to ``get_special_token``; this
    is the accessor under the new name.
    """
    resolved = self._token_lookup(token)
    return resolved
def get_control_token(self, token):
    """Look up ``token`` through the shared ``_token_lookup`` helper.

    Kept so that older vLLM versions, which call the accessor by this name,
    keep working.
    """
    resolved = self._token_lookup(token)
    return resolved
def _token_lookup(self, token):
return {
SpecialTokens.begin_think: THINK_START_TOKEN_ID,
SpecialTokens.end_think: THINK_END_TOKEN_ID,
......@@ -537,7 +545,6 @@ def sampling_params():
prompt_logprobs=None,
skip_special_tokens=True,
spaces_between_special_tokens=True,
truncate_prompt_tokens=None,
)
......
......@@ -272,7 +272,6 @@ def test_request_interface():
prompt_token_ids=[1, 2, 3],
sampling_params=SamplingParams(max_tokens=10),
pooling_params=None,
eos_token_id=100,
lora_request=LoRARequest(
lora_name="test_lora", lora_int_id=1, lora_path="test_path"
),
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment