Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
dynamo
Commits
f79b3d62
Unverified
Commit
f79b3d62
authored
Apr 01, 2026
by
Richard Huo
Committed by
GitHub
Apr 01, 2026
Browse files
fix: remove forcing the kv block size as 16 in vLLM backend (#7690)
parent
be5b8a58
Changes
3
Hide whitespace changes
Inline
Side-by-side
Showing
3 changed files
with
40 additions
and
13 deletions
+40
-13
components/src/dynamo/vllm/args.py
components/src/dynamo/vllm/args.py
+1
-2
components/src/dynamo/vllm/main.py
components/src/dynamo/vllm/main.py
+4
-0
tests/router/test_router_e2e_with_vllm.py
tests/router/test_router_e2e_with_vllm.py
+35
-11
No files found.
components/src/dynamo/vllm/args.py
View file @
f79b3d62
...
...
@@ -227,9 +227,8 @@ def update_engine_config_with_dynamo(
engine_config
.
enable_prefix_caching
=
True
if
getattr
(
engine_config
,
"block_size"
,
None
)
is
None
:
engine_config
.
block_size
=
16
logger
.
debug
(
f
"Setting reasonable default of
{
engine
_
config
.
block_size
}
for block_size
"
"block_size is not set in
engine
config.
vLLM engine block_size will be determined at runtime based on the model and attention backend.
"
)
if
_uses_nixl_connector
(
engine_config
):
...
...
components/src/dynamo/vllm/main.py
View file @
f79b3d62
...
...
@@ -576,6 +576,10 @@ def setup_vllm_engine(
logger
.
info
(
f
"VllmWorker for
{
config
.
served_model_name
}
has been initialized"
)
# update block_size in vllm_config based on final engine cache info for later use
runtime_values
=
get_engine_cache_info
(
engine_client
)
vllm_config
.
cache_config
.
block_size
=
runtime_values
[
"block_size"
]
return
(
engine_client
,
vllm_config
,
...
...
tests/router/test_router_e2e_with_vllm.py
View file @
f79b3d62
...
...
@@ -46,6 +46,13 @@ VLLM_ARGS: Dict[str, Any] = {
"enforce_eager"
:
True
,
# Disable CUDA graphs for faster startup & lower memory
}
VLLM_ARGS_NO_BLOCK_SIZE
:
Dict
[
str
,
Any
]
=
{
"model"
:
MODEL_NAME
,
"gpu_memory_utilization"
:
0.4
,
# Limit VRAM allocation per worker
"max_model_len"
:
1024
,
# Limit context length to reduce KV cache size
"enforce_eager"
:
True
,
# Disable CUDA graphs for faster startup & lower memory
}
class
VLLMProcess
(
ManagedEngineProcessMixin
):
"""Manages vLLM workers using dynamo.vllm (HTTP API + KV events).
...
...
@@ -73,7 +80,6 @@ class VLLMProcess(ManagedEngineProcessMixin):
Args:
request: pytest request fixture for log directory
vllm_args: Configuration dict with keys:
- block_size: KV cache block size (default: 16)
- model: Model name/path (default: TinyLlama-1.1B)
- gpu_memory_utilization: Fraction of GPU memory to allocate (optional)
- num_gpu_blocks_override: Cap on number of KV cache blocks (optional)
...
...
@@ -110,7 +116,6 @@ class VLLMProcess(ManagedEngineProcessMixin):
if
vllm_args
is
None
:
vllm_args
=
{}
block_size
=
vllm_args
.
get
(
"block_size"
,
BLOCK_SIZE
)
model
=
vllm_args
.
get
(
"model"
,
MODEL_NAME
)
gpu_memory_utilization
=
vllm_args
.
get
(
"gpu_memory_utilization"
)
num_gpu_blocks_override
=
vllm_args
.
get
(
"num_gpu_blocks_override"
)
...
...
@@ -144,15 +149,10 @@ class VLLMProcess(ManagedEngineProcessMixin):
# No DP; worker sees one GPU
gpu_device
=
str
(
worker_idx
)
command
=
[
"python3"
,
"-m"
,
"dynamo.vllm"
,
"--model"
,
model
,
"--block-size"
,
str
(
block_size
),
]
command
=
[
"python3"
,
"-m"
,
"dynamo.vllm"
,
"--model"
,
model
]
if
"block_size"
in
vllm_args
:
command
.
extend
([
"--block-size"
,
str
(
vllm_args
[
"block_size"
])])
# Disable CUDA graphs for faster startup & lower memory
if
enforce_eager
:
...
...
@@ -277,6 +277,30 @@ def test_vllm_kv_router_basic(
)
@
pytest
.
mark
.
pre_merge
@
pytest
.
mark
.
gpu_1
@
pytest
.
mark
.
timeout
(
150
)
# ~3x average (~43s/test), rounded up
@
pytest
.
mark
.
parametrize
(
"request_plane"
,
[
"tcp"
],
indirect
=
True
)
def
test_vllm_kv_router_without_block_size_specified_in_vllm_args
(
request
,
runtime_services_dynamic_ports
,
predownload_models
,
set_ucx_tls_no_mm
,
request_plane
,
):
run_basic_router_test
(
engine_process_cls
=
VLLMProcess
,
engine_args_name
=
"vllm_args"
,
engine_args
=
VLLM_ARGS_NO_BLOCK_SIZE
,
num_workers
=
2
,
single_gpu
=
True
,
request
=
request
,
request_plane
=
request_plane
,
block_size
=
BLOCK_SIZE
,
model_name
=
MODEL_NAME
,
)
@
pytest
.
mark
.
pre_merge
@
pytest
.
mark
.
gpu_1
@
pytest
.
mark
.
timeout
(
150
)
# ~3x average (~43s/test), rounded up
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment