Unverified commit dba69e0f, authored by Tzu-Ling Kan, committed by GitHub
Browse files

chore(deps): bump vLLM 0.16.0 → 0.17.1 (#7170)


Signed-off-by: Tzu-Ling <tzulingk@nvidia.com>
parent 17db1b6a
...@@ -26,7 +26,7 @@ repos: ...@@ -26,7 +26,7 @@ repos:
- id: black - id: black
types_or: [python, cython] types_or: [python, cython]
- repo: https://github.com/PyCQA/flake8 - repo: https://github.com/PyCQA/flake8
rev: 5.0.4 rev: 7.3.0 # 5.0.4 crashes on Python 3.12+ (ast.Str removed)
hooks: hooks:
- id: flake8 - id: flake8
args: [--max-line-length=88, --select=C,E,F,W,B,B950, --extend-ignore = E203,E501] args: [--max-line-length=88, --select=C,E,F,W,B,B950, --extend-ignore = E203,E501]
......
...@@ -17,6 +17,7 @@ from vllm.config import CacheConfig, LoadConfig, ModelConfig, VllmConfig ...@@ -17,6 +17,7 @@ from vllm.config import CacheConfig, LoadConfig, ModelConfig, VllmConfig
from vllm.inputs.data import TokensPrompt from vllm.inputs.data import TokensPrompt
from vllm.reasoning import ReasoningParser, ReasoningParserManager from vllm.reasoning import ReasoningParser, ReasoningParserManager
from vllm.sampling_params import RequestOutputKind, SamplingParams from vllm.sampling_params import RequestOutputKind, SamplingParams
from vllm.tasks import GENERATION_TASKS
from vllm.tokenizers import TokenizerLike from vllm.tokenizers import TokenizerLike
from vllm.tool_parsers import ToolParser, ToolParserManager from vllm.tool_parsers import ToolParser, ToolParserManager
from vllm.v1.engine import EngineCoreOutput, EngineCoreRequest, FinishReason from vllm.v1.engine import EngineCoreOutput, EngineCoreRequest, FinishReason
...@@ -85,6 +86,21 @@ class VllmProcessor: ...@@ -85,6 +86,21 @@ class VllmProcessor:
self.tool_parser_class = tool_parser_class self.tool_parser_class = tool_parser_class
self.reasoning_parser_class = reasoning_parser_class self.reasoning_parser_class = reasoning_parser_class
def _get_eos_token_ids(self) -> list[int]:
    """Return EOS token ids using tokenizer metadata.

    vLLM 0.17.0 removed EngineCoreRequest.eos_token_id, so Dynamo can no
    longer read EOS ids from the preprocessed request object.

    Lookup order on ``self.tokenizer``:
      1. ``eos_token_ids`` when it is a non-int iterable (copied to a list).
      2. ``eos_token_id`` when it is set.
      3. ``eos_token_ids`` when it is a bare int, wrapped in a list so a
         single id stored under the plural attribute is not dropped.

    Returns:
        A new list of EOS token ids; empty when the tokenizer exposes none.
    """
    eos_token_ids = getattr(self.tokenizer, "eos_token_ids", None)
    # An int is not iterable, so list(...) would raise; handle it further down.
    if eos_token_ids is not None and not isinstance(eos_token_ids, int):
        return list(eos_token_ids)

    eos_token_id = getattr(self.tokenizer, "eos_token_id", None)
    if eos_token_id is not None:
        return [eos_token_id]

    # Last resort: only a bare-int plural attribute is available.
    if eos_token_ids is not None:
        return [eos_token_ids]
    return []
# Ideally we would map NVCreateChatCompletionRequest into Python so it can be type checked, but # Ideally we would map NVCreateChatCompletionRequest into Python so it can be type checked, but
# it has a lot of fields. # it has a lot of fields.
# request: dynamo.NVCreateChatCompletionRequest # request: dynamo.NVCreateChatCompletionRequest
...@@ -130,7 +146,11 @@ class VllmProcessor: ...@@ -130,7 +146,11 @@ class VllmProcessor:
max_tokens=max_tokens, max_tokens=max_tokens,
) )
# generation_config.json # generation_config.json
# Skip eos_token_id: vLLM 0.17.0 made SamplingParams.eos_token_id a
# read-only property; eos tokens are handled via eos_token_ids below.
for k, v in self.input_processor.generation_config_fields.items(): for k, v in self.input_processor.generation_config_fields.items():
if k == "eos_token_id":
continue
if hasattr(sampling_params, k): if hasattr(sampling_params, k):
setattr(sampling_params, k, v) setattr(sampling_params, k, v)
...@@ -174,17 +194,13 @@ class VllmProcessor: ...@@ -174,17 +194,13 @@ class VllmProcessor:
request_id, request_id,
prompt_inputs, prompt_inputs,
sampling_params, sampling_params,
# arrival_time: float | None = None, GENERATION_TASKS, # vLLM 0.17.0: required supported_tasks arg
# lora_request: LoRARequest | None = None,
# tokenization_kwargs: dict[str, Any] | None = None,
# trace_headers: Mapping[str, str] | None = None,
# priority: int = 0,
# data_parallel_rank: int | None = None,
) )
InputProcessor.assign_request_id(vllm_preproc) InputProcessor.assign_request_id(vllm_preproc)
# Processed: EngineCoreRequest(request_id='a2b76a85cd65e151', prompt_token_ids=[3838, 374, 279, 6722, 315, 28649, 25510, 30], mm_features=None, sampling_params=SamplingParams(n=1, presence_penalty=0.0, frequency_penalty=0.0, repetition_penalty=1.0, temperature=1.0, top_p=1.0, top_k=0, min_p=0.0, seed=None, stop=[], stop_token_ids=[151643], bad_words=[], include_stop_str_in_output=False, ignore_eos=False, max_tokens=16, min_tokens=0, logprobs=None, prompt_logprobs=None, skip_special_tokens=True, spaces_between_special_tokens=True, truncate_prompt_tokens=None, structured_outputs=None, extra_args=None), pooling_params=None, eos_token_id=151645, arrival_time=1769036937.9417946, lora_request=None, cache_salt=None, data_parallel_rank=None, prompt_embeds=None, client_index=0, current_wave=0, priority=0, trace_headers=None) # vLLM 0.17.0 removed EngineCoreRequest.eos_token_id. Dynamo now uses
# tokenizer metadata for EOS ids when constructing the router payload.
# Convert to a Python object that has fields that match our PreprocessedRequest # Convert to a Python object that has fields that match our PreprocessedRequest
sp = vllm_preproc.sampling_params sp = vllm_preproc.sampling_params
...@@ -229,11 +245,7 @@ class VllmProcessor: ...@@ -229,11 +245,7 @@ class VllmProcessor:
"prompt_logprobs": sp.prompt_logprobs, "prompt_logprobs": sp.prompt_logprobs,
"skip_special_tokens": sp.skip_special_tokens, "skip_special_tokens": sp.skip_special_tokens,
}, },
"eos_token_ids": ( "eos_token_ids": self._get_eos_token_ids(),
[vllm_preproc.eos_token_id]
if vllm_preproc.eos_token_id is not None
else []
),
"annotations": [], "annotations": [],
} }
......
...@@ -13,7 +13,6 @@ import uvloop ...@@ -13,7 +13,6 @@ import uvloop
from prometheus_client import REGISTRY, CollectorRegistry, multiprocess from prometheus_client import REGISTRY, CollectorRegistry, multiprocess
from vllm.config import VllmConfig from vllm.config import VllmConfig
from vllm.distributed.kv_events import ZmqEventPublisher from vllm.distributed.kv_events import ZmqEventPublisher
from vllm.entrypoints.cli.serve import run_headless
from vllm.usage.usage_lib import UsageContext from vllm.usage.usage_lib import UsageContext
from vllm.v1.engine.async_llm import AsyncLLM from vllm.v1.engine.async_llm import AsyncLLM
from vllm.v1.metrics.prometheus import setup_multiprocess_prometheus from vllm.v1.metrics.prometheus import setup_multiprocess_prometheus
...@@ -91,6 +90,10 @@ def run_dynamo_headless(config: Config) -> None: ...@@ -91,6 +90,10 @@ def run_dynamo_headless(config: Config) -> None:
Secondary nodes spawn vLLM workers only — no engine core, no scheduler, Secondary nodes spawn vLLM workers only — no engine core, no scheduler,
no Dynamo endpoints. Bypasses DistributedRuntime entirely (no NATS/etcd). no Dynamo endpoints. Bypasses DistributedRuntime entirely (no NATS/etcd).
""" """
# Keep the upstream CLI import local so tests that only exercise
# build_headless_namespace() do not pull in vLLM's full CLI import graph.
from vllm.entrypoints.cli.serve import run_headless
args = build_headless_namespace(config) args = build_headless_namespace(config)
run_headless(args) run_headless(args)
......
...@@ -14,6 +14,7 @@ pytestmark = [ ...@@ -14,6 +14,7 @@ pytestmark = [
pytest.mark.unit, pytest.mark.unit,
pytest.mark.vllm, pytest.mark.vllm,
pytest.mark.pre_merge, pytest.mark.pre_merge,
pytest.mark.gpu_0,
] ]
......
...@@ -9,9 +9,18 @@ These tests check that the vLLM KV events classes have the expected fields ...@@ -9,9 +9,18 @@ These tests check that the vLLM KV events classes have the expected fields
that our Rust deserializers depend on. If vLLM changes their API, these tests that our Rust deserializers depend on. If vLLM changes their API, these tests
will fail early, before hitting runtime deserialization errors. will fail early, before hitting runtime deserialization errors.
The Rust code in kv_router/publisher.rs and kv_consolidator/subscriber.rs This test is the early warning for vLLM KV-event wire-format changes.
deserializes vLLM's msgpack-encoded KV events. Since vLLM uses msgspec with
array_like=True, the field ORDER matters - fields are serialized positionally. In the normal case, if this fails, update `lib/kv-router/src/zmq_wire.rs` to
match the new upstream vLLM event shape, then update this test.
That file is Dynamo's compatibility layer for vLLM KV events:
- it decodes vLLM's msgpack `array_like=True` wire format
- it handles field order changes in `BlockStored` / `BlockRemoved` / `EventBatch`
- it translates upstream `extra_keys` into Dynamo's internal `block_mm_infos`
Only touch consolidator files if we explicitly need the consolidator publisher
to preserve and republish a new upstream field.
""" """
import importlib import importlib
...@@ -51,6 +60,7 @@ class TestVllmKvEventsApi: ...@@ -51,6 +60,7 @@ class TestVllmKvEventsApi:
5. lora_id 5. lora_id
6. medium 6. medium
7. lora_name (added in vLLM 0.14.0) 7. lora_name (added in vLLM 0.14.0)
8. extra_keys (added in vLLM 0.17.0)
If vLLM adds/removes/reorders fields, this test will fail. If vLLM adds/removes/reorders fields, this test will fail.
""" """
...@@ -62,6 +72,7 @@ class TestVllmKvEventsApi: ...@@ -62,6 +72,7 @@ class TestVllmKvEventsApi:
"lora_id", "lora_id",
"medium", "medium",
"lora_name", "lora_name",
"extra_keys",
) )
actual_fields = BlockStored.__struct_fields__ actual_fields = BlockStored.__struct_fields__
...@@ -69,9 +80,10 @@ class TestVllmKvEventsApi: ...@@ -69,9 +80,10 @@ class TestVllmKvEventsApi:
f"BlockStored fields changed!\n" f"BlockStored fields changed!\n"
f"Expected: {expected_fields}\n" f"Expected: {expected_fields}\n"
f"Actual: {actual_fields}\n" f"Actual: {actual_fields}\n"
f"If vLLM changed the API, update the Rust deserializers in:\n" f"Required follow-up:\n"
f" - lib/llm/src/kv_router/publisher.rs (RawKvEvent::BlockStored)\n" f" - Update lib/kv-router/src/zmq_wire.rs to match the new BlockStored wire format.\n"
f" - lib/llm/src/block_manager/kv_consolidator/subscriber.rs (VllmRawEvent::BlockStored)" f" - Update this test's expected_fields and msgpack position checks.\n"
f" - If needed, add or update a regression test in lib/llm/src/kv_router/publisher.rs."
) )
def test_block_removed_fields(self): def test_block_removed_fields(self):
...@@ -86,7 +98,9 @@ class TestVllmKvEventsApi: ...@@ -86,7 +98,9 @@ class TestVllmKvEventsApi:
f"BlockRemoved fields changed!\n" f"BlockRemoved fields changed!\n"
f"Expected: {expected_fields}\n" f"Expected: {expected_fields}\n"
f"Actual: {actual_fields}\n" f"Actual: {actual_fields}\n"
f"If vLLM changed the API, update the Rust deserializers." f"Required follow-up:\n"
f" - Update lib/kv-router/src/zmq_wire.rs RawKvEvent::BlockRemoved seq deserializer.\n"
f" - Update this test's expected_fields."
) )
def test_event_batch_fields(self): def test_event_batch_fields(self):
...@@ -101,7 +115,11 @@ class TestVllmKvEventsApi: ...@@ -101,7 +115,11 @@ class TestVllmKvEventsApi:
assert actual_fields == expected_fields, ( assert actual_fields == expected_fields, (
f"EventBatch fields changed!\n" f"EventBatch fields changed!\n"
f"Expected: {expected_fields}\n" f"Expected: {expected_fields}\n"
f"Actual: {actual_fields}" f"Actual: {actual_fields}\n"
f"Required follow-up:\n"
f" - Update lib/kv-router/src/zmq_wire.rs KvEventBatch Deserialize impl.\n"
f" - Update subscriber.rs VllmEventBatch tuple if batch field order changes.\n"
f" - Update this test's expected_fields."
) )
def test_kv_cache_event_uses_array_like(self): def test_kv_cache_event_uses_array_like(self):
...@@ -148,6 +166,7 @@ class TestVllmKvEventsApi: ...@@ -148,6 +166,7 @@ class TestVllmKvEventsApi:
lora_id=None, lora_id=None,
medium="GPU", medium="GPU",
lora_name=None, lora_name=None,
extra_keys=None,
) )
encoded = msgspec.msgpack.encode(event) encoded = msgspec.msgpack.encode(event)
...@@ -159,9 +178,9 @@ class TestVllmKvEventsApi: ...@@ -159,9 +178,9 @@ class TestVllmKvEventsApi:
decoded[0] == "BlockStored" decoded[0] == "BlockStored"
), f"Expected tag 'BlockStored', got {decoded[0]}" ), f"Expected tag 'BlockStored', got {decoded[0]}"
# Verify field count (tag + 7 fields = 8 elements) # Verify field count (tag + 8 fields = 9 elements)
assert len(decoded) == 8, ( assert len(decoded) == 9, (
f"Expected 8 elements (tag + 7 fields), got {len(decoded)}.\n" f"Expected 9 elements (tag + 8 fields), got {len(decoded)}.\n"
f"Decoded: {decoded}\n" f"Decoded: {decoded}\n"
f"If field count changed, update Rust deserializers." f"If field count changed, update Rust deserializers."
) )
...@@ -174,3 +193,4 @@ class TestVllmKvEventsApi: ...@@ -174,3 +193,4 @@ class TestVllmKvEventsApi:
assert decoded[5] is None, f"lora_id at wrong position: {decoded[5]}" assert decoded[5] is None, f"lora_id at wrong position: {decoded[5]}"
assert decoded[6] == "GPU", f"medium at wrong position: {decoded[6]}" assert decoded[6] == "GPU", f"medium at wrong position: {decoded[6]}"
assert decoded[7] is None, f"lora_name at wrong position: {decoded[7]}" assert decoded[7] is None, f"lora_name at wrong position: {decoded[7]}"
assert decoded[8] is None, f"extra_keys at wrong position: {decoded[8]}"
...@@ -37,6 +37,7 @@ pytestmark = [ ...@@ -37,6 +37,7 @@ pytestmark = [
pytest.mark.unit, pytest.mark.unit,
pytest.mark.vllm, pytest.mark.vllm,
pytest.mark.pre_merge, pytest.mark.pre_merge,
pytest.mark.gpu_0,
] ]
......
...@@ -271,8 +271,9 @@ class TestVllmRendererApi: ...@@ -271,8 +271,9 @@ class TestVllmRendererApi:
input_processor.renderer to preprocess_chat_request. input_processor.renderer to preprocess_chat_request.
VllmProcessor iterates input_processor.generation_config_fields. VllmProcessor iterates input_processor.generation_config_fields.
""" """
assert hasattr(InputProcessor, "renderer"), ( init_source = inspect.getsource(InputProcessor.__init__)
"InputProcessor no longer has 'renderer' attribute/property; " assert "self.renderer" in init_source, (
"InputProcessor.__init__ no longer initializes 'renderer'; "
"update preprocess_chat_request call in " "update preprocess_chat_request call in "
"components/src/dynamo/frontend/vllm_processor.py" "components/src/dynamo/frontend/vllm_processor.py"
) )
...@@ -363,7 +364,6 @@ class TestVllmRendererApi: ...@@ -363,7 +364,6 @@ class TestVllmRendererApi:
"mm_features", "mm_features",
"sampling_params", "sampling_params",
"pooling_params", "pooling_params",
"eos_token_id",
"arrival_time", "arrival_time",
"lora_request", "lora_request",
"cache_salt", "cache_salt",
......
...@@ -40,22 +40,22 @@ vllm: ...@@ -40,22 +40,22 @@ vllm:
runtime_image: nvcr.io/nvidia/cuda runtime_image: nvcr.io/nvidia/cuda
base_image_tag: 25.06-cuda12.9-devel-ubuntu24.04 base_image_tag: 25.06-cuda12.9-devel-ubuntu24.04
runtime_image_tag: 12.9.1-runtime-ubuntu24.04 runtime_image_tag: 12.9.1-runtime-ubuntu24.04
vllm_ref: v0.16.0 vllm_ref: v0.17.1
cuda13.0: cuda13.0:
base_image: nvcr.io/nvidia/cuda-dl-base base_image: nvcr.io/nvidia/cuda-dl-base
runtime_image: nvcr.io/nvidia/cuda runtime_image: nvcr.io/nvidia/cuda
base_image_tag: 25.11-cuda13.0-devel-ubuntu24.04 base_image_tag: 25.11-cuda13.0-devel-ubuntu24.04
runtime_image_tag: 13.0.2-runtime-ubuntu24.04 runtime_image_tag: 13.0.2-runtime-ubuntu24.04
vllm_ref: v0.16.0 vllm_ref: v0.17.1
xpu: xpu:
base_image: intel/deep-learning-essentials base_image: intel/deep-learning-essentials
runtime_image: intel/deep-learning-essentials runtime_image: intel/deep-learning-essentials
base_image_tag: 2025.3.2-0-devel-ubuntu24.04 base_image_tag: 2025.3.2-0-devel-ubuntu24.04
runtime_image_tag: 2025.3.2-0-devel-ubuntu24.04 runtime_image_tag: 2025.3.2-0-devel-ubuntu24.04
vllm_ref: v0.14.0 vllm_ref: v0.14.0
flashinf_ref: v0.6.3 flashinf_ref: v0.6.4
lmcache_ref: 0.3.14 lmcache_ref: 0.4.1
vllm_omni_ref: "v0.16.0rc1" vllm_omni_ref: "v0.16.0"
max_jobs: "10" max_jobs: "10"
enable_media_ffmpeg: "false" enable_media_ffmpeg: "false"
enable_gpu_memory_service: "true" enable_gpu_memory_service: "true"
......
...@@ -4,15 +4,15 @@ ...@@ -4,15 +4,15 @@
# This script installs vLLM and its dependencies from PyPI (release versions only). # This script installs vLLM and its dependencies from PyPI (release versions only).
# Installation order: # Installation order:
# 1. LMCache (installed first so vLLM's dependencies take precedence) # 1. vLLM
# 2. vLLM # 2. LMCache (built from source AFTER vLLM so c_ops.so is compiled against installed PyTorch)
# 3. vLLM-Omni # 3. vLLM-Omni
# 4. DeepGEMM # 4. DeepGEMM
# 5. EP kernels # 5. EP kernels
set -euo pipefail set -euo pipefail
VLLM_VER="0.16.0" VLLM_VER="0.17.1"
VLLM_REF="v${VLLM_VER}" VLLM_REF="v${VLLM_VER}"
DEVICE="cuda" DEVICE="cuda"
...@@ -25,9 +25,9 @@ INSTALLATION_DIR=/tmp ...@@ -25,9 +25,9 @@ INSTALLATION_DIR=/tmp
TORCH_CUDA_ARCH_LIST="9.0;10.0" # For EP Kernels -- TODO: check if we need to add 12.0+PTX TORCH_CUDA_ARCH_LIST="9.0;10.0" # For EP Kernels -- TODO: check if we need to add 12.0+PTX
DEEPGEMM_REF="" DEEPGEMM_REF=""
CUDA_VERSION="12.9" CUDA_VERSION="12.9"
FLASHINF_REF="v0.6.3" FLASHINF_REF="v0.6.4"
LMCACHE_REF="0.3.14" LMCACHE_REF="0.4.1"
VLLM_OMNI_REF="v0.16.0rc1" VLLM_OMNI_REF="v0.16.0"
while [[ $# -gt 0 ]]; do while [[ $# -gt 0 ]]; do
case $1 in case $1 in
...@@ -133,30 +133,6 @@ elif [ "$DEVICE" = "xpu" ]; then ...@@ -133,30 +133,6 @@ elif [ "$DEVICE" = "xpu" ]; then
echo " VLLM_REF=$VLLM_REF | ARCH=$ARCH | INSTALLATION_DIR=$INSTALLATION_DIR" echo " VLLM_REF=$VLLM_REF | ARCH=$ARCH | INSTALLATION_DIR=$INSTALLATION_DIR"
fi fi
if [ "$DEVICE" = "cuda" ]; then
if [[ "$CUDA_VERSION_MAJOR" == "12" ]]; then
echo " FLASHINF_REF=$FLASHINF_REF | LMCACHE_REF=$LMCACHE_REF | DEEPGEMM_REF=$DEEPGEMM_REF"
echo "\n=== Installing LMCache ==="
if [ "$ARCH" = "amd64" ]; then
# LMCache installation currently fails on arm64 due to CUDA dependency issues
# Install LMCache BEFORE vLLM so vLLM's dependencies take precedence
uv pip install lmcache==${LMCACHE_REF} --torch-backend=${TORCH_BACKEND}
echo "✓ LMCache ${LMCACHE_REF} installed"
else
echo "⚠ Skipping LMCache on ARM64 (compatibility issues)"
fi
else
echo " FLASHINF_REF=$FLASHINF_REF | LMCache will not be installed as it doesn't support CUDA 13 yet | DEEPGEMM_REF=$DEEPGEMM_REF"
fi
elif [ "$DEVICE" = "xpu" ]; then
echo " LMCACHE_REF=$LMCACHE_REF "
echo "\n=== Installing LMCache ==="
if [ "$ARCH" = "amd64" ]; then
uv pip install lmcache==${LMCACHE_REF}
echo "✓ LMCache ${LMCACHE_REF} installed"
fi
fi
echo "\n=== Cloning vLLM repository ===" echo "\n=== Cloning vLLM repository ==="
# Clone needed for DeepGEMM and EP kernels install scripts # Clone needed for DeepGEMM and EP kernels install scripts
cd $INSTALLATION_DIR cd $INSTALLATION_DIR
...@@ -217,6 +193,40 @@ if [ "$DEVICE" = "cuda" ]; then ...@@ -217,6 +193,40 @@ if [ "$DEVICE" = "cuda" ]; then
fi fi
echo "✓ vLLM installation completed" echo "✓ vLLM installation completed"
echo "\n=== Installing LMCache from source ==="
# LMCache prebuilt wheels are built against PyTorch <=2.8.0 and fail with PyTorch 2.10+
# (undefined symbol: c10::cuda::c10_cuda_check_implementation).
# Build from source AFTER vLLM so c_ops.so compiles against the installed PyTorch.
# Ref: https://docs.lmcache.ai/getting_started/installation.html#install-latest-lmcache-from-source
if [ "$DEVICE" = "cuda" ] && [[ "$CUDA_VERSION_MAJOR" == "12" ]] && [ "$ARCH" = "amd64" ]; then
    LMCACHE_SRC_DIR="${INSTALLATION_DIR}/lmcache"
    # git clone refuses a non-empty target, so drop any checkout left by an earlier failed run.
    rm -rf "${LMCACHE_SRC_DIR}"
    git clone --depth 1 --branch "v${LMCACHE_REF}" https://github.com/LMCache/LMCache.git "${LMCACHE_SRC_DIR}"
    cd "${LMCACHE_SRC_DIR}"
    uv pip install -r requirements/build.txt

    # Get torch lib dir and embed it as RPATH so c_ops.so finds torch libs at runtime
    TORCH_LIB=$(python3 -c "import torch, os; print(os.path.dirname(torch.__file__) + '/lib')")

    # Build from source with --no-build-isolation (uses installed torch) + RPATH for runtime linking
    TORCH_CUDA_ARCH_LIST="8.0;8.6;8.9;9.0;10.0+PTX" LDFLAGS="-Wl,-rpath,${TORCH_LIB}" \
        uv pip install --no-build-isolation --no-cache .

    # Verify c_ops.so was compiled (cannot import at build time without GPU/CUDA driver)
    # cd to neutral dir so Python finds installed lmcache, not the source checkout
    cd /tmp
    LMCACHE_DIR=$(python3 -c "import lmcache, os; print(os.path.dirname(lmcache.__file__))")

    # Expand the glob into an array instead of parsing `ls` output. Without nullglob an
    # unmatched pattern stays literal, so the -e test below fails and reports the error.
    C_OPS_FILES=("${LMCACHE_DIR}"/c_ops*.so)
    if [ -e "${C_OPS_FILES[0]}" ]; then
        echo "✓ lmcache c_ops.so verified: $(basename "${C_OPS_FILES[0]}")"
    else
        echo "ERROR: c_ops.so not found in ${LMCACHE_DIR} - CUDA extension was not compiled"
        exit 1
    fi

    rm -rf "${LMCACHE_SRC_DIR}"
    echo "✓ LMCache ${LMCACHE_REF} installed from source"
elif [ "$DEVICE" = "xpu" ] && [ "$ARCH" = "amd64" ]; then
    uv pip install "lmcache==${LMCACHE_REF}"
    echo "✓ LMCache ${LMCACHE_REF} installed from PyPI (XPU)"
else
    echo "⚠ Skipping LMCache (ARM64 or CUDA 13 not supported)"
fi
echo "\n=== Installing vLLM-Omni ===" echo "\n=== Installing vLLM-Omni ==="
if [ -n "$VLLM_OMNI_REF" ] && [ "$ARCH" = "amd64" ]; then if [ -n "$VLLM_OMNI_REF" ] && [ "$ARCH" = "amd64" ]; then
# Save original vllm entrypoint before vllm-omni overwrites it # Save original vllm entrypoint before vllm-omni overwrites it
......
...@@ -50,11 +50,11 @@ trtllm =[ ...@@ -50,11 +50,11 @@ trtllm =[
vllm = [ vllm = [
"uvloop", "uvloop",
"nixl[cu12]<=0.10.1", "nixl[cu12]<=0.10.1",
"vllm[flashinfer,runai]==0.16.0", "vllm[flashinfer,runai]==0.17.1",
# vllm-omni 0.16.0rc1 is not on PyPI; installed from source in container builds # vllm-omni 0.16.0 is now on PyPI; install only future rc builds from source in container builds
# (see container/deps/vllm/install_vllm.sh). pip install ai-dynamo[vllm] will # (see container/deps/vllm/install_vllm.sh). pip install ai-dynamo[vllm] will
# not include vllm-omni — install it separately from source if needed. # not include vllm-omni — install it separately from source if needed.
# "vllm-omni==0.16.0rc1", "vllm-omni==0.16.0",
"blake3>=1.0.0,<2.0.0", "blake3>=1.0.0,<2.0.0",
] ]
......
...@@ -2,6 +2,7 @@ ...@@ -2,6 +2,7 @@
# SPDX-License-Identifier: Apache-2.0 # SPDX-License-Identifier: Apache-2.0
import importlib import importlib
import importlib.util
def check_module_available(module_name: str) -> bool: def check_module_available(module_name: str) -> bool:
......
...@@ -1553,7 +1553,6 @@ def sampling_params(): ...@@ -1553,7 +1553,6 @@ def sampling_params():
prompt_logprobs=None, prompt_logprobs=None,
skip_special_tokens=False, skip_special_tokens=False,
spaces_between_special_tokens=True, spaces_between_special_tokens=True,
truncate_prompt_tokens=None,
) )
......
...@@ -61,7 +61,15 @@ THINK_END_TOKEN_ID = 8 ...@@ -61,7 +61,15 @@ THINK_END_TOKEN_ID = 8
class _InnerTokenizer: class _InnerTokenizer:
"""Mimics the inner ``tokenizer.tokenizer`` accessed by MistralReasoningParser.""" """Mimics the inner ``tokenizer.tokenizer`` accessed by MistralReasoningParser."""
def get_special_token(self, token):
    """Return the id for *token* by delegating to ``self._token_lookup``.

    vLLM 0.17.0 renamed get_control_token -> get_special_token, so this is
    the name newer vLLM calls on the stub tokenizer.
    """
    return self._token_lookup(token)
def get_control_token(self, token): def get_control_token(self, token):
# kept for older vLLM compat
return self._token_lookup(token)
def _token_lookup(self, token):
return { return {
SpecialTokens.begin_think: THINK_START_TOKEN_ID, SpecialTokens.begin_think: THINK_START_TOKEN_ID,
SpecialTokens.end_think: THINK_END_TOKEN_ID, SpecialTokens.end_think: THINK_END_TOKEN_ID,
...@@ -537,7 +545,6 @@ def sampling_params(): ...@@ -537,7 +545,6 @@ def sampling_params():
prompt_logprobs=None, prompt_logprobs=None,
skip_special_tokens=True, skip_special_tokens=True,
spaces_between_special_tokens=True, spaces_between_special_tokens=True,
truncate_prompt_tokens=None,
) )
......
...@@ -272,7 +272,6 @@ def test_request_interface(): ...@@ -272,7 +272,6 @@ def test_request_interface():
prompt_token_ids=[1, 2, 3], prompt_token_ids=[1, 2, 3],
sampling_params=SamplingParams(max_tokens=10), sampling_params=SamplingParams(max_tokens=10),
pooling_params=None, pooling_params=None,
eos_token_id=100,
lora_request=LoRARequest( lora_request=LoRARequest(
lora_name="test_lora", lora_int_id=1, lora_path="test_path" lora_name="test_lora", lora_int_id=1, lora_path="test_path"
), ),
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment