Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
dynamo
Commits
dba69e0f
Unverified
Commit
dba69e0f
authored
Mar 12, 2026
by
Tzu-Ling Kan
Committed by
GitHub
Mar 12, 2026
Browse files
chore(deps): bump vLLM 0.16.0 → 0.17.1 (#7170)
Signed-off-by:
Tzu-Ling
<
tzulingk@nvidia.com
>
parent
17db1b6a
Changes
14
Show whitespace changes
Inline
Side-by-side
Showing
14 changed files
with
122 additions
and
69 deletions
+122
-69
.pre-commit-config.yaml
.pre-commit-config.yaml
+1
-1
components/src/dynamo/frontend/vllm_processor.py
components/src/dynamo/frontend/vllm_processor.py
+24
-12
components/src/dynamo/vllm/main.py
components/src/dynamo/vllm/main.py
+4
-1
components/src/dynamo/vllm/tests/test_vllm_engine_monitor_stats.py
...s/src/dynamo/vllm/tests/test_vllm_engine_monitor_stats.py
+1
-0
components/src/dynamo/vllm/tests/test_vllm_kv_events_api.py
components/src/dynamo/vllm/tests/test_vllm_kv_events_api.py
+31
-11
components/src/dynamo/vllm/tests/test_vllm_logging.py
components/src/dynamo/vllm/tests/test_vllm_logging.py
+1
-0
components/src/dynamo/vllm/tests/test_vllm_renderer_api.py
components/src/dynamo/vllm/tests/test_vllm_renderer_api.py
+3
-3
container/context.yaml
container/context.yaml
+5
-5
container/deps/vllm/install_vllm.sh
container/deps/vllm/install_vllm.sh
+40
-30
pyproject.toml
pyproject.toml
+3
-3
tests/frontend/common.py
tests/frontend/common.py
+1
-0
tests/frontend/test_prepost.py
tests/frontend/test_prepost.py
+0
-1
tests/frontend/test_prepost_mistral.py
tests/frontend/test_prepost_mistral.py
+8
-1
tests/kvbm_integration/test_kvbm_vllm_integration.py
tests/kvbm_integration/test_kvbm_vllm_integration.py
+0
-1
No files found.
.pre-commit-config.yaml
View file @
dba69e0f
...
@@ -26,7 +26,7 @@ repos:
...
@@ -26,7 +26,7 @@ repos:
-
id
:
black
-
id
:
black
types_or
:
[
python
,
cython
]
types_or
:
[
python
,
cython
]
-
repo
:
https://github.com/PyCQA/flake8
-
repo
:
https://github.com/PyCQA/flake8
rev
:
5.0.4
rev
:
7.3.0
# 5.0.4 crashes on Python 3.12+ (ast.Str removed)
hooks
:
hooks
:
-
id
:
flake8
-
id
:
flake8
args
:
[
--max-line-length=88
,
--select=C
,
E
,
F
,
W
,
B
,
B950
,
--extend-ignore = E203
,
E501
]
args
:
[
--max-line-length=88
,
--select=C
,
E
,
F
,
W
,
B
,
B950
,
--extend-ignore = E203
,
E501
]
...
...
components/src/dynamo/frontend/vllm_processor.py
View file @
dba69e0f
...
@@ -17,6 +17,7 @@ from vllm.config import CacheConfig, LoadConfig, ModelConfig, VllmConfig
...
@@ -17,6 +17,7 @@ from vllm.config import CacheConfig, LoadConfig, ModelConfig, VllmConfig
from
vllm.inputs.data
import
TokensPrompt
from
vllm.inputs.data
import
TokensPrompt
from
vllm.reasoning
import
ReasoningParser
,
ReasoningParserManager
from
vllm.reasoning
import
ReasoningParser
,
ReasoningParserManager
from
vllm.sampling_params
import
RequestOutputKind
,
SamplingParams
from
vllm.sampling_params
import
RequestOutputKind
,
SamplingParams
from
vllm.tasks
import
GENERATION_TASKS
from
vllm.tokenizers
import
TokenizerLike
from
vllm.tokenizers
import
TokenizerLike
from
vllm.tool_parsers
import
ToolParser
,
ToolParserManager
from
vllm.tool_parsers
import
ToolParser
,
ToolParserManager
from
vllm.v1.engine
import
EngineCoreOutput
,
EngineCoreRequest
,
FinishReason
from
vllm.v1.engine
import
EngineCoreOutput
,
EngineCoreRequest
,
FinishReason
...
@@ -85,6 +86,21 @@ class VllmProcessor:
...
@@ -85,6 +86,21 @@ class VllmProcessor:
self
.
tool_parser_class
=
tool_parser_class
self
.
tool_parser_class
=
tool_parser_class
self
.
reasoning_parser_class
=
reasoning_parser_class
self
.
reasoning_parser_class
=
reasoning_parser_class
def
_get_eos_token_ids
(
self
)
->
list
[
int
]:
"""Return EOS token ids using tokenizer metadata.
vLLM 0.17.0 removed EngineCoreRequest.eos_token_id, so Dynamo can no
longer read EOS ids from the preprocessed request object.
"""
eos_token_ids
=
getattr
(
self
.
tokenizer
,
"eos_token_ids"
,
None
)
if
eos_token_ids
is
not
None
and
not
isinstance
(
eos_token_ids
,
int
):
return
list
(
eos_token_ids
)
eos_token_id
=
getattr
(
self
.
tokenizer
,
"eos_token_id"
,
None
)
if
eos_token_id
is
None
:
return
[]
return
[
eos_token_id
]
# Ideally we would map NVCreateChatCompletionRequest into Python so it can be type checked, but
# Ideally we would map NVCreateChatCompletionRequest into Python so it can be type checked, but
# it has a lot of fields.
# it has a lot of fields.
# request: dynamo.NVCreateChatCompletionRequest
# request: dynamo.NVCreateChatCompletionRequest
...
@@ -130,7 +146,11 @@ class VllmProcessor:
...
@@ -130,7 +146,11 @@ class VllmProcessor:
max_tokens
=
max_tokens
,
max_tokens
=
max_tokens
,
)
)
# generation_config.json
# generation_config.json
# Skip eos_token_id: vLLM 0.17.0 made SamplingParams.eos_token_id a
# read-only property; eos tokens are handled via eos_token_ids below.
for
k
,
v
in
self
.
input_processor
.
generation_config_fields
.
items
():
for
k
,
v
in
self
.
input_processor
.
generation_config_fields
.
items
():
if
k
==
"eos_token_id"
:
continue
if
hasattr
(
sampling_params
,
k
):
if
hasattr
(
sampling_params
,
k
):
setattr
(
sampling_params
,
k
,
v
)
setattr
(
sampling_params
,
k
,
v
)
...
@@ -174,17 +194,13 @@ class VllmProcessor:
...
@@ -174,17 +194,13 @@ class VllmProcessor:
request_id
,
request_id
,
prompt_inputs
,
prompt_inputs
,
sampling_params
,
sampling_params
,
# arrival_time: float | None = None,
GENERATION_TASKS
,
# vLLM 0.17.0: required supported_tasks arg
# lora_request: LoRARequest | None = None,
# tokenization_kwargs: dict[str, Any] | None = None,
# trace_headers: Mapping[str, str] | None = None,
# priority: int = 0,
# data_parallel_rank: int | None = None,
)
)
InputProcessor
.
assign_request_id
(
vllm_preproc
)
InputProcessor
.
assign_request_id
(
vllm_preproc
)
# Processed: EngineCoreRequest(request_id='a2b76a85cd65e151', prompt_token_ids=[3838, 374, 279, 6722, 315, 28649, 25510, 30], mm_features=None, sampling_params=SamplingParams(n=1, presence_penalty=0.0, frequency_penalty=0.0, repetition_penalty=1.0, temperature=1.0, top_p=1.0, top_k=0, min_p=0.0, seed=None, stop=[], stop_token_ids=[151643], bad_words=[], include_stop_str_in_output=False, ignore_eos=False, max_tokens=16, min_tokens=0, logprobs=None, prompt_logprobs=None, skip_special_tokens=True, spaces_between_special_tokens=True, truncate_prompt_tokens=None, structured_outputs=None, extra_args=None), pooling_params=None, eos_token_id=151645, arrival_time=1769036937.9417946, lora_request=None, cache_salt=None, data_parallel_rank=None, prompt_embeds=None, client_index=0, current_wave=0, priority=0, trace_headers=None)
# vLLM 0.17.0 removed EngineCoreRequest.eos_token_id. Dynamo now uses
# tokenizer metadata for EOS ids when constructing the router payload.
# Convert to a Python object that has fields that match our PreprocessedRequest
# Convert to a Python object that has fields that match our PreprocessedRequest
sp
=
vllm_preproc
.
sampling_params
sp
=
vllm_preproc
.
sampling_params
...
@@ -229,11 +245,7 @@ class VllmProcessor:
...
@@ -229,11 +245,7 @@ class VllmProcessor:
"prompt_logprobs"
:
sp
.
prompt_logprobs
,
"prompt_logprobs"
:
sp
.
prompt_logprobs
,
"skip_special_tokens"
:
sp
.
skip_special_tokens
,
"skip_special_tokens"
:
sp
.
skip_special_tokens
,
},
},
"eos_token_ids"
:
(
"eos_token_ids"
:
self
.
_get_eos_token_ids
(),
[
vllm_preproc
.
eos_token_id
]
if
vllm_preproc
.
eos_token_id
is
not
None
else
[]
),
"annotations"
:
[],
"annotations"
:
[],
}
}
...
...
components/src/dynamo/vllm/main.py
View file @
dba69e0f
...
@@ -13,7 +13,6 @@ import uvloop
...
@@ -13,7 +13,6 @@ import uvloop
from
prometheus_client
import
REGISTRY
,
CollectorRegistry
,
multiprocess
from
prometheus_client
import
REGISTRY
,
CollectorRegistry
,
multiprocess
from
vllm.config
import
VllmConfig
from
vllm.config
import
VllmConfig
from
vllm.distributed.kv_events
import
ZmqEventPublisher
from
vllm.distributed.kv_events
import
ZmqEventPublisher
from
vllm.entrypoints.cli.serve
import
run_headless
from
vllm.usage.usage_lib
import
UsageContext
from
vllm.usage.usage_lib
import
UsageContext
from
vllm.v1.engine.async_llm
import
AsyncLLM
from
vllm.v1.engine.async_llm
import
AsyncLLM
from
vllm.v1.metrics.prometheus
import
setup_multiprocess_prometheus
from
vllm.v1.metrics.prometheus
import
setup_multiprocess_prometheus
...
@@ -91,6 +90,10 @@ def run_dynamo_headless(config: Config) -> None:
...
@@ -91,6 +90,10 @@ def run_dynamo_headless(config: Config) -> None:
Secondary nodes spawn vLLM workers only — no engine core, no scheduler,
Secondary nodes spawn vLLM workers only — no engine core, no scheduler,
no Dynamo endpoints. Bypasses DistributedRuntime entirely (no NATS/etcd).
no Dynamo endpoints. Bypasses DistributedRuntime entirely (no NATS/etcd).
"""
"""
# Keep the upstream CLI import local so tests that only exercise
# build_headless_namespace() do not pull in vLLM's full CLI import graph.
from
vllm.entrypoints.cli.serve
import
run_headless
args
=
build_headless_namespace
(
config
)
args
=
build_headless_namespace
(
config
)
run_headless
(
args
)
run_headless
(
args
)
...
...
components/src/dynamo/vllm/tests/test_vllm_engine_monitor_stats.py
View file @
dba69e0f
...
@@ -14,6 +14,7 @@ pytestmark = [
...
@@ -14,6 +14,7 @@ pytestmark = [
pytest
.
mark
.
unit
,
pytest
.
mark
.
unit
,
pytest
.
mark
.
vllm
,
pytest
.
mark
.
vllm
,
pytest
.
mark
.
pre_merge
,
pytest
.
mark
.
pre_merge
,
pytest
.
mark
.
gpu_0
,
]
]
...
...
components/src/dynamo/vllm/tests/test_vllm_kv_events_api.py
View file @
dba69e0f
...
@@ -9,9 +9,18 @@ These tests check that the vLLM KV events classes have the expected fields
...
@@ -9,9 +9,18 @@ These tests check that the vLLM KV events classes have the expected fields
that our Rust deserializers depend on. If vLLM changes their API, these tests
that our Rust deserializers depend on. If vLLM changes their API, these tests
will fail early, before hitting runtime deserialization errors.
will fail early, before hitting runtime deserialization errors.
The Rust code in kv_router/publisher.rs and kv_consolidator/subscriber.rs
This test is the early warning for vLLM KV-event wire-format changes.
deserializes vLLM's msgpack-encoded KV events. Since vLLM uses msgspec with
array_like=True, the field ORDER matters - fields are serialized positionally.
In the normal case, if this fails, update `lib/kv-router/src/zmq_wire.rs` to
match the new upstream vLLM event shape, then update this test.
That file is Dynamo's compatibility layer for vLLM KV events:
- it decodes vLLM's msgpack `array_like=True` wire format
- it handles field order changes in `BlockStored` / `BlockRemoved` / `EventBatch`
- it translates upstream `extra_keys` into Dynamo's internal `block_mm_infos`
Only touch consolidator files if we explicitly need the consolidator publisher
to preserve and republish a new upstream field.
"""
"""
import
importlib
import
importlib
...
@@ -51,6 +60,7 @@ class TestVllmKvEventsApi:
...
@@ -51,6 +60,7 @@ class TestVllmKvEventsApi:
5. lora_id
5. lora_id
6. medium
6. medium
7. lora_name (added in vLLM 0.14.0)
7. lora_name (added in vLLM 0.14.0)
8. extra_keys (added in vLLM 0.17.0)
If vLLM adds/removes/reorders fields, this test will fail.
If vLLM adds/removes/reorders fields, this test will fail.
"""
"""
...
@@ -62,6 +72,7 @@ class TestVllmKvEventsApi:
...
@@ -62,6 +72,7 @@ class TestVllmKvEventsApi:
"lora_id"
,
"lora_id"
,
"medium"
,
"medium"
,
"lora_name"
,
"lora_name"
,
"extra_keys"
,
)
)
actual_fields
=
BlockStored
.
__struct_fields__
actual_fields
=
BlockStored
.
__struct_fields__
...
@@ -69,9 +80,10 @@ class TestVllmKvEventsApi:
...
@@ -69,9 +80,10 @@ class TestVllmKvEventsApi:
f
"BlockStored fields changed!
\n
"
f
"BlockStored fields changed!
\n
"
f
"Expected:
{
expected_fields
}
\n
"
f
"Expected:
{
expected_fields
}
\n
"
f
"Actual:
{
actual_fields
}
\n
"
f
"Actual:
{
actual_fields
}
\n
"
f
"If vLLM changed the API, update the Rust deserializers in:
\n
"
f
"Required follow-up:
\n
"
f
" - lib/llm/src/kv_router/publisher.rs (RawKvEvent::BlockStored)
\n
"
f
" - Update lib/kv-router/src/zmq_wire.rs to match the new BlockStored wire format.
\n
"
f
" - lib/llm/src/block_manager/kv_consolidator/subscriber.rs (VllmRawEvent::BlockStored)"
f
" - Update this test's expected_fields and msgpack position checks.
\n
"
f
" - If needed, add or update a regression test in lib/llm/src/kv_router/publisher.rs."
)
)
def
test_block_removed_fields
(
self
):
def
test_block_removed_fields
(
self
):
...
@@ -86,7 +98,9 @@ class TestVllmKvEventsApi:
...
@@ -86,7 +98,9 @@ class TestVllmKvEventsApi:
f
"BlockRemoved fields changed!
\n
"
f
"BlockRemoved fields changed!
\n
"
f
"Expected:
{
expected_fields
}
\n
"
f
"Expected:
{
expected_fields
}
\n
"
f
"Actual:
{
actual_fields
}
\n
"
f
"Actual:
{
actual_fields
}
\n
"
f
"If vLLM changed the API, update the Rust deserializers."
f
"Required follow-up:
\n
"
f
" - Update lib/kv-router/src/zmq_wire.rs RawKvEvent::BlockRemoved seq deserializer.
\n
"
f
" - Update this test's expected_fields."
)
)
def
test_event_batch_fields
(
self
):
def
test_event_batch_fields
(
self
):
...
@@ -101,7 +115,11 @@ class TestVllmKvEventsApi:
...
@@ -101,7 +115,11 @@ class TestVllmKvEventsApi:
assert
actual_fields
==
expected_fields
,
(
assert
actual_fields
==
expected_fields
,
(
f
"EventBatch fields changed!
\n
"
f
"EventBatch fields changed!
\n
"
f
"Expected:
{
expected_fields
}
\n
"
f
"Expected:
{
expected_fields
}
\n
"
f
"Actual:
{
actual_fields
}
"
f
"Actual:
{
actual_fields
}
\n
"
f
"Required follow-up:
\n
"
f
" - Update lib/kv-router/src/zmq_wire.rs KvEventBatch Deserialize impl.
\n
"
f
" - Update subscriber.rs VllmEventBatch tuple if batch field order changes.
\n
"
f
" - Update this test's expected_fields."
)
)
def
test_kv_cache_event_uses_array_like
(
self
):
def
test_kv_cache_event_uses_array_like
(
self
):
...
@@ -148,6 +166,7 @@ class TestVllmKvEventsApi:
...
@@ -148,6 +166,7 @@ class TestVllmKvEventsApi:
lora_id
=
None
,
lora_id
=
None
,
medium
=
"GPU"
,
medium
=
"GPU"
,
lora_name
=
None
,
lora_name
=
None
,
extra_keys
=
None
,
)
)
encoded
=
msgspec
.
msgpack
.
encode
(
event
)
encoded
=
msgspec
.
msgpack
.
encode
(
event
)
...
@@ -159,9 +178,9 @@ class TestVllmKvEventsApi:
...
@@ -159,9 +178,9 @@ class TestVllmKvEventsApi:
decoded
[
0
]
==
"BlockStored"
decoded
[
0
]
==
"BlockStored"
),
f
"Expected tag 'BlockStored', got
{
decoded
[
0
]
}
"
),
f
"Expected tag 'BlockStored', got
{
decoded
[
0
]
}
"
# Verify field count (tag +
7
fields =
8
elements)
# Verify field count (tag +
8
fields =
9
elements)
assert
len
(
decoded
)
==
8
,
(
assert
len
(
decoded
)
==
9
,
(
f
"Expected
8
elements (tag +
7
fields), got
{
len
(
decoded
)
}
.
\n
"
f
"Expected
9
elements (tag +
8
fields), got
{
len
(
decoded
)
}
.
\n
"
f
"Decoded:
{
decoded
}
\n
"
f
"Decoded:
{
decoded
}
\n
"
f
"If field count changed, update Rust deserializers."
f
"If field count changed, update Rust deserializers."
)
)
...
@@ -174,3 +193,4 @@ class TestVllmKvEventsApi:
...
@@ -174,3 +193,4 @@ class TestVllmKvEventsApi:
assert
decoded
[
5
]
is
None
,
f
"lora_id at wrong position:
{
decoded
[
5
]
}
"
assert
decoded
[
5
]
is
None
,
f
"lora_id at wrong position:
{
decoded
[
5
]
}
"
assert
decoded
[
6
]
==
"GPU"
,
f
"medium at wrong position:
{
decoded
[
6
]
}
"
assert
decoded
[
6
]
==
"GPU"
,
f
"medium at wrong position:
{
decoded
[
6
]
}
"
assert
decoded
[
7
]
is
None
,
f
"lora_name at wrong position:
{
decoded
[
7
]
}
"
assert
decoded
[
7
]
is
None
,
f
"lora_name at wrong position:
{
decoded
[
7
]
}
"
assert
decoded
[
8
]
is
None
,
f
"extra_keys at wrong position:
{
decoded
[
8
]
}
"
components/src/dynamo/vllm/tests/test_vllm_logging.py
View file @
dba69e0f
...
@@ -37,6 +37,7 @@ pytestmark = [
...
@@ -37,6 +37,7 @@ pytestmark = [
pytest
.
mark
.
unit
,
pytest
.
mark
.
unit
,
pytest
.
mark
.
vllm
,
pytest
.
mark
.
vllm
,
pytest
.
mark
.
pre_merge
,
pytest
.
mark
.
pre_merge
,
pytest
.
mark
.
gpu_0
,
]
]
...
...
components/src/dynamo/vllm/tests/test_vllm_renderer_api.py
View file @
dba69e0f
...
@@ -271,8 +271,9 @@ class TestVllmRendererApi:
...
@@ -271,8 +271,9 @@ class TestVllmRendererApi:
input_processor.renderer to preprocess_chat_request.
input_processor.renderer to preprocess_chat_request.
VllmProcessor iterates input_processor.generation_config_fields.
VllmProcessor iterates input_processor.generation_config_fields.
"""
"""
assert
hasattr
(
InputProcessor
,
"renderer"
),
(
init_source
=
inspect
.
getsource
(
InputProcessor
.
__init__
)
"InputProcessor no longer has 'renderer' attribute/property; "
assert
"self.renderer"
in
init_source
,
(
"InputProcessor.__init__ no longer initializes 'renderer'; "
"update preprocess_chat_request call in "
"update preprocess_chat_request call in "
"components/src/dynamo/frontend/vllm_processor.py"
"components/src/dynamo/frontend/vllm_processor.py"
)
)
...
@@ -363,7 +364,6 @@ class TestVllmRendererApi:
...
@@ -363,7 +364,6 @@ class TestVllmRendererApi:
"mm_features"
,
"mm_features"
,
"sampling_params"
,
"sampling_params"
,
"pooling_params"
,
"pooling_params"
,
"eos_token_id"
,
"arrival_time"
,
"arrival_time"
,
"lora_request"
,
"lora_request"
,
"cache_salt"
,
"cache_salt"
,
...
...
container/context.yaml
View file @
dba69e0f
...
@@ -40,22 +40,22 @@ vllm:
...
@@ -40,22 +40,22 @@ vllm:
runtime_image
:
nvcr.io/nvidia/cuda
runtime_image
:
nvcr.io/nvidia/cuda
base_image_tag
:
25.06-cuda12.9-devel-ubuntu24.04
base_image_tag
:
25.06-cuda12.9-devel-ubuntu24.04
runtime_image_tag
:
12.9.1-runtime-ubuntu24.04
runtime_image_tag
:
12.9.1-runtime-ubuntu24.04
vllm_ref
:
v0.1
6.0
vllm_ref
:
v0.1
7.1
cuda13.0
:
cuda13.0
:
base_image
:
nvcr.io/nvidia/cuda-dl-base
base_image
:
nvcr.io/nvidia/cuda-dl-base
runtime_image
:
nvcr.io/nvidia/cuda
runtime_image
:
nvcr.io/nvidia/cuda
base_image_tag
:
25.11-cuda13.0-devel-ubuntu24.04
base_image_tag
:
25.11-cuda13.0-devel-ubuntu24.04
runtime_image_tag
:
13.0.2-runtime-ubuntu24.04
runtime_image_tag
:
13.0.2-runtime-ubuntu24.04
vllm_ref
:
v0.1
6.0
vllm_ref
:
v0.1
7.1
xpu
:
xpu
:
base_image
:
intel/deep-learning-essentials
base_image
:
intel/deep-learning-essentials
runtime_image
:
intel/deep-learning-essentials
runtime_image
:
intel/deep-learning-essentials
base_image_tag
:
2025.3.2-0-devel-ubuntu24.04
base_image_tag
:
2025.3.2-0-devel-ubuntu24.04
runtime_image_tag
:
2025.3.2-0-devel-ubuntu24.04
runtime_image_tag
:
2025.3.2-0-devel-ubuntu24.04
vllm_ref
:
v0.14.0
vllm_ref
:
v0.14.0
flashinf_ref
:
v0.6.
3
flashinf_ref
:
v0.6.
4
lmcache_ref
:
0.
3
.1
4
lmcache_ref
:
0.
4
.1
vllm_omni_ref
:
"
v0.16.0
rc1
"
vllm_omni_ref
:
"
v0.16.0"
max_jobs
:
"
10"
max_jobs
:
"
10"
enable_media_ffmpeg
:
"
false"
enable_media_ffmpeg
:
"
false"
enable_gpu_memory_service
:
"
true"
enable_gpu_memory_service
:
"
true"
...
...
container/deps/vllm/install_vllm.sh
View file @
dba69e0f
...
@@ -4,15 +4,15 @@
...
@@ -4,15 +4,15 @@
# This script installs vLLM and its dependencies from PyPI (release versions only).
# This script installs vLLM and its dependencies from PyPI (release versions only).
# Installation order:
# Installation order:
# 1. LM
Cache (installed first so vLLM's dependencies take precedence)
# 1.
vL
LM
# 2.
vL
LM
# 2. LM
Cache (built from source AFTER vLLM so c_ops.so is compiled against installed PyTorch)
# 3. vLLM-Omni
# 3. vLLM-Omni
# 4. DeepGEMM
# 4. DeepGEMM
# 5. EP kernels
# 5. EP kernels
set
-euo
pipefail
set
-euo
pipefail
VLLM_VER
=
"0.1
6.0
"
VLLM_VER
=
"0.1
7.1
"
VLLM_REF
=
"v
${
VLLM_VER
}
"
VLLM_REF
=
"v
${
VLLM_VER
}
"
DEVICE
=
"cuda"
DEVICE
=
"cuda"
...
@@ -25,9 +25,9 @@ INSTALLATION_DIR=/tmp
...
@@ -25,9 +25,9 @@ INSTALLATION_DIR=/tmp
TORCH_CUDA_ARCH_LIST
=
"9.0;10.0"
# For EP Kernels -- TODO: check if we need to add 12.0+PTX
TORCH_CUDA_ARCH_LIST
=
"9.0;10.0"
# For EP Kernels -- TODO: check if we need to add 12.0+PTX
DEEPGEMM_REF
=
""
DEEPGEMM_REF
=
""
CUDA_VERSION
=
"12.9"
CUDA_VERSION
=
"12.9"
FLASHINF_REF
=
"v0.6.
3
"
FLASHINF_REF
=
"v0.6.
4
"
LMCACHE_REF
=
"0.
3
.1
4
"
LMCACHE_REF
=
"0.
4
.1"
VLLM_OMNI_REF
=
"v0.16.0
rc1
"
VLLM_OMNI_REF
=
"v0.16.0"
while
[[
$#
-gt
0
]]
;
do
while
[[
$#
-gt
0
]]
;
do
case
$1
in
case
$1
in
...
@@ -133,30 +133,6 @@ elif [ "$DEVICE" = "xpu" ]; then
...
@@ -133,30 +133,6 @@ elif [ "$DEVICE" = "xpu" ]; then
echo
" VLLM_REF=
$VLLM_REF
| ARCH=
$ARCH
| INSTALLATION_DIR=
$INSTALLATION_DIR
"
echo
" VLLM_REF=
$VLLM_REF
| ARCH=
$ARCH
| INSTALLATION_DIR=
$INSTALLATION_DIR
"
fi
fi
if
[
"
$DEVICE
"
=
"cuda"
]
;
then
if
[[
"
$CUDA_VERSION_MAJOR
"
==
"12"
]]
;
then
echo
" FLASHINF_REF=
$FLASHINF_REF
| LMCACHE_REF=
$LMCACHE_REF
| DEEPGEMM_REF=
$DEEPGEMM_REF
"
echo
"
\n
=== Installing LMCache ==="
if
[
"
$ARCH
"
=
"amd64"
]
;
then
# LMCache installation currently fails on arm64 due to CUDA dependency issues
# Install LMCache BEFORE vLLM so vLLM's dependencies take precedence
uv pip
install
lmcache
==
${
LMCACHE_REF
}
--torch-backend
=
${
TORCH_BACKEND
}
echo
"✓ LMCache
${
LMCACHE_REF
}
installed"
else
echo
"⚠ Skipping LMCache on ARM64 (compatibility issues)"
fi
else
echo
" FLASHINF_REF=
$FLASHINF_REF
| LMCache will not be installed as it doesn't support CUDA 13 yet | DEEPGEMM_REF=
$DEEPGEMM_REF
"
fi
elif
[
"
$DEVICE
"
=
"xpu"
]
;
then
echo
" LMCACHE_REF=
$LMCACHE_REF
"
echo
"
\n
=== Installing LMCache ==="
if
[
"
$ARCH
"
=
"amd64"
]
;
then
uv pip
install
lmcache
==
${
LMCACHE_REF
}
echo
"✓ LMCache
${
LMCACHE_REF
}
installed"
fi
fi
echo
"
\n
=== Cloning vLLM repository ==="
echo
"
\n
=== Cloning vLLM repository ==="
# Clone needed for DeepGEMM and EP kernels install scripts
# Clone needed for DeepGEMM and EP kernels install scripts
cd
$INSTALLATION_DIR
cd
$INSTALLATION_DIR
...
@@ -217,6 +193,40 @@ if [ "$DEVICE" = "cuda" ]; then
...
@@ -217,6 +193,40 @@ if [ "$DEVICE" = "cuda" ]; then
fi
fi
echo
"✓ vLLM installation completed"
echo
"✓ vLLM installation completed"
echo
"
\n
=== Installing LMCache from source ==="
# LMCache prebuilt wheels are built against PyTorch <=2.8.0 and fail with PyTorch 2.10+
# (undefined symbol: c10::cuda::c10_cuda_check_implementation).
# Build from source AFTER vLLM so c_ops.so compiles against the installed PyTorch.
# Ref: https://docs.lmcache.ai/getting_started/installation.html#install-latest-lmcache-from-source
if
[
"
$DEVICE
"
=
"cuda"
]
&&
[[
"
$CUDA_VERSION_MAJOR
"
==
"12"
]]
&&
[
"
$ARCH
"
=
"amd64"
]
;
then
git clone
--depth
1
--branch
v
${
LMCACHE_REF
}
https://github.com/LMCache/LMCache.git
${
INSTALLATION_DIR
}
/lmcache
cd
${
INSTALLATION_DIR
}
/lmcache
uv pip
install
-r
requirements/build.txt
# Get torch lib dir and embed it as RPATH so c_ops.so finds torch libs at runtime
TORCH_LIB
=
$(
python3
-c
"import torch, os; print(os.path.dirname(torch.__file__) + '/lib')"
)
# Build from source with --no-build-isolation (uses installed torch) + RPATH for runtime linking
TORCH_CUDA_ARCH_LIST
=
"8.0;8.6;8.9;9.0;10.0+PTX"
LDFLAGS
=
"-Wl,-rpath,
${
TORCH_LIB
}
"
\
uv pip
install
--no-build-isolation
--no-cache
.
# Verify c_ops.so was compiled (cannot import at build time without GPU/CUDA driver)
# cd to neutral dir so Python finds installed lmcache, not the source checkout
cd
/tmp
LMCACHE_DIR
=
$(
python3
-c
"import lmcache, os; print(os.path.dirname(lmcache.__file__))"
)
if
ls
"
${
LMCACHE_DIR
}
"
/c_ops
*
.so
>
/dev/null 2>&1
;
then
echo
"✓ lmcache c_ops.so verified:
$(
ls
${
LMCACHE_DIR
}
/c_ops
*
.so |
head
-1
| xargs
basename
)
"
else
echo
"ERROR: c_ops.so not found in
${
LMCACHE_DIR
}
- CUDA extension was not compiled"
exit
1
fi
rm
-rf
${
INSTALLATION_DIR
}
/lmcache
echo
"✓ LMCache
${
LMCACHE_REF
}
installed from source"
elif
[
"
$DEVICE
"
=
"xpu"
]
&&
[
"
$ARCH
"
=
"amd64"
]
;
then
uv pip
install
lmcache
==
${
LMCACHE_REF
}
echo
"✓ LMCache
${
LMCACHE_REF
}
installed from PyPI (XPU)"
else
echo
"⚠ Skipping LMCache (ARM64 or CUDA 13 not supported)"
fi
echo
"
\n
=== Installing vLLM-Omni ==="
echo
"
\n
=== Installing vLLM-Omni ==="
if
[
-n
"
$VLLM_OMNI_REF
"
]
&&
[
"
$ARCH
"
=
"amd64"
]
;
then
if
[
-n
"
$VLLM_OMNI_REF
"
]
&&
[
"
$ARCH
"
=
"amd64"
]
;
then
# Save original vllm entrypoint before vllm-omni overwrites it
# Save original vllm entrypoint before vllm-omni overwrites it
...
...
pyproject.toml
View file @
dba69e0f
...
@@ -50,11 +50,11 @@ trtllm =[
...
@@ -50,11 +50,11 @@ trtllm =[
vllm
=
[
vllm
=
[
"uvloop"
,
"uvloop"
,
"nixl[cu12]<=0.10.1"
,
"nixl[cu12]<=0.10.1"
,
"vllm[flashinfer,runai]==0.1
6.0
"
,
"vllm[flashinfer,runai]==0.1
7.1
"
,
# vllm-omni 0.16.0
rc1
is no
t
on PyPI; install
ed
from source in container builds
# vllm-omni 0.16.0 is no
w
on PyPI; install
only future rc builds
from source in container builds
# (see container/deps/vllm/install_vllm.sh). pip install ai-dynamo[vllm] will
# (see container/deps/vllm/install_vllm.sh). pip install ai-dynamo[vllm] will
# not include vllm-omni — install it separately from source if needed.
# not include vllm-omni — install it separately from source if needed.
#
"vllm-omni==0.16.0
rc1
",
"vllm-omni==0.16.0"
,
"blake3>=1.0.0,<2.0.0"
,
"blake3>=1.0.0,<2.0.0"
,
]
]
...
...
tests/frontend/common.py
View file @
dba69e0f
...
@@ -2,6 +2,7 @@
...
@@ -2,6 +2,7 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-License-Identifier: Apache-2.0
import
importlib
import
importlib
import
importlib.util
def
check_module_available
(
module_name
:
str
)
->
bool
:
def
check_module_available
(
module_name
:
str
)
->
bool
:
...
...
tests/frontend/test_prepost.py
View file @
dba69e0f
...
@@ -1553,7 +1553,6 @@ def sampling_params():
...
@@ -1553,7 +1553,6 @@ def sampling_params():
prompt_logprobs
=
None
,
prompt_logprobs
=
None
,
skip_special_tokens
=
False
,
skip_special_tokens
=
False
,
spaces_between_special_tokens
=
True
,
spaces_between_special_tokens
=
True
,
truncate_prompt_tokens
=
None
,
)
)
...
...
tests/frontend/test_prepost_mistral.py
View file @
dba69e0f
...
@@ -61,7 +61,15 @@ THINK_END_TOKEN_ID = 8
...
@@ -61,7 +61,15 @@ THINK_END_TOKEN_ID = 8
class
_InnerTokenizer
:
class
_InnerTokenizer
:
"""Mimics the inner ``tokenizer.tokenizer`` accessed by MistralReasoningParser."""
"""Mimics the inner ``tokenizer.tokenizer`` accessed by MistralReasoningParser."""
def
get_special_token
(
self
,
token
):
# vLLM 0.17.0 renamed get_control_token -> get_special_token
return
self
.
_token_lookup
(
token
)
def
get_control_token
(
self
,
token
):
def
get_control_token
(
self
,
token
):
# kept for older vLLM compat
return
self
.
_token_lookup
(
token
)
def
_token_lookup
(
self
,
token
):
return
{
return
{
SpecialTokens
.
begin_think
:
THINK_START_TOKEN_ID
,
SpecialTokens
.
begin_think
:
THINK_START_TOKEN_ID
,
SpecialTokens
.
end_think
:
THINK_END_TOKEN_ID
,
SpecialTokens
.
end_think
:
THINK_END_TOKEN_ID
,
...
@@ -537,7 +545,6 @@ def sampling_params():
...
@@ -537,7 +545,6 @@ def sampling_params():
prompt_logprobs
=
None
,
prompt_logprobs
=
None
,
skip_special_tokens
=
True
,
skip_special_tokens
=
True
,
spaces_between_special_tokens
=
True
,
spaces_between_special_tokens
=
True
,
truncate_prompt_tokens
=
None
,
)
)
...
...
tests/kvbm_integration/test_kvbm_vllm_integration.py
View file @
dba69e0f
...
@@ -272,7 +272,6 @@ def test_request_interface():
...
@@ -272,7 +272,6 @@ def test_request_interface():
prompt_token_ids
=
[
1
,
2
,
3
],
prompt_token_ids
=
[
1
,
2
,
3
],
sampling_params
=
SamplingParams
(
max_tokens
=
10
),
sampling_params
=
SamplingParams
(
max_tokens
=
10
),
pooling_params
=
None
,
pooling_params
=
None
,
eos_token_id
=
100
,
lora_request
=
LoRARequest
(
lora_request
=
LoRARequest
(
lora_name
=
"test_lora"
,
lora_int_id
=
1
,
lora_path
=
"test_path"
lora_name
=
"test_lora"
,
lora_int_id
=
1
,
lora_path
=
"test_path"
),
),
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment