Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
dynamo
Commits
f79b3d62
"lib/bindings/vscode:/vscode.git/clone" did not exist on "3c500ae7a9e8b8bc9dae8b558342eec79dc86106"
Unverified
Commit
f79b3d62
authored
Apr 01, 2026
by
Richard Huo
Committed by
GitHub
Apr 01, 2026
Browse files
fix: remove forcing the kv block size as 16 in vLLM backend (#7690)
parent
be5b8a58
Changes
3
Show whitespace changes
Inline
Side-by-side
Showing
3 changed files
with
40 additions
and
13 deletions
+40
-13
components/src/dynamo/vllm/args.py
components/src/dynamo/vllm/args.py
+1
-2
components/src/dynamo/vllm/main.py
components/src/dynamo/vllm/main.py
+4
-0
tests/router/test_router_e2e_with_vllm.py
tests/router/test_router_e2e_with_vllm.py
+35
-11
No files found.
components/src/dynamo/vllm/args.py
View file @
f79b3d62
...
@@ -227,9 +227,8 @@ def update_engine_config_with_dynamo(
...
@@ -227,9 +227,8 @@ def update_engine_config_with_dynamo(
engine_config
.
enable_prefix_caching
=
True
engine_config
.
enable_prefix_caching
=
True
if
getattr
(
engine_config
,
"block_size"
,
None
)
is
None
:
if
getattr
(
engine_config
,
"block_size"
,
None
)
is
None
:
engine_config
.
block_size
=
16
logger
.
debug
(
logger
.
debug
(
f
"Setting reasonable default of
{
engine
_
config
.
block_size
}
for block_size
"
"block_size is not set in
engine
config.
vLLM engine block_size will be determined at runtime based on the model and attention backend.
"
)
)
if
_uses_nixl_connector
(
engine_config
):
if
_uses_nixl_connector
(
engine_config
):
...
...
components/src/dynamo/vllm/main.py
View file @
f79b3d62
...
@@ -576,6 +576,10 @@ def setup_vllm_engine(
...
@@ -576,6 +576,10 @@ def setup_vllm_engine(
logger
.
info
(
f
"VllmWorker for
{
config
.
served_model_name
}
has been initialized"
)
logger
.
info
(
f
"VllmWorker for
{
config
.
served_model_name
}
has been initialized"
)
# update block_size in vllm_config based on final engine cache info for later use
runtime_values
=
get_engine_cache_info
(
engine_client
)
vllm_config
.
cache_config
.
block_size
=
runtime_values
[
"block_size"
]
return
(
return
(
engine_client
,
engine_client
,
vllm_config
,
vllm_config
,
...
...
tests/router/test_router_e2e_with_vllm.py
View file @
f79b3d62
...
@@ -46,6 +46,13 @@ VLLM_ARGS: Dict[str, Any] = {
...
@@ -46,6 +46,13 @@ VLLM_ARGS: Dict[str, Any] = {
"enforce_eager"
:
True
,
# Disable CUDA graphs for faster startup & lower memory
"enforce_eager"
:
True
,
# Disable CUDA graphs for faster startup & lower memory
}
}
VLLM_ARGS_NO_BLOCK_SIZE
:
Dict
[
str
,
Any
]
=
{
"model"
:
MODEL_NAME
,
"gpu_memory_utilization"
:
0.4
,
# Limit VRAM allocation per worker
"max_model_len"
:
1024
,
# Limit context length to reduce KV cache size
"enforce_eager"
:
True
,
# Disable CUDA graphs for faster startup & lower memory
}
class
VLLMProcess
(
ManagedEngineProcessMixin
):
class
VLLMProcess
(
ManagedEngineProcessMixin
):
"""Manages vLLM workers using dynamo.vllm (HTTP API + KV events).
"""Manages vLLM workers using dynamo.vllm (HTTP API + KV events).
...
@@ -73,7 +80,6 @@ class VLLMProcess(ManagedEngineProcessMixin):
...
@@ -73,7 +80,6 @@ class VLLMProcess(ManagedEngineProcessMixin):
Args:
Args:
request: pytest request fixture for log directory
request: pytest request fixture for log directory
vllm_args: Configuration dict with keys:
vllm_args: Configuration dict with keys:
- block_size: KV cache block size (default: 16)
- model: Model name/path (default: TinyLlama-1.1B)
- model: Model name/path (default: TinyLlama-1.1B)
- gpu_memory_utilization: Fraction of GPU memory to allocate (optional)
- gpu_memory_utilization: Fraction of GPU memory to allocate (optional)
- num_gpu_blocks_override: Cap on number of KV cache blocks (optional)
- num_gpu_blocks_override: Cap on number of KV cache blocks (optional)
...
@@ -110,7 +116,6 @@ class VLLMProcess(ManagedEngineProcessMixin):
...
@@ -110,7 +116,6 @@ class VLLMProcess(ManagedEngineProcessMixin):
if
vllm_args
is
None
:
if
vllm_args
is
None
:
vllm_args
=
{}
vllm_args
=
{}
block_size
=
vllm_args
.
get
(
"block_size"
,
BLOCK_SIZE
)
model
=
vllm_args
.
get
(
"model"
,
MODEL_NAME
)
model
=
vllm_args
.
get
(
"model"
,
MODEL_NAME
)
gpu_memory_utilization
=
vllm_args
.
get
(
"gpu_memory_utilization"
)
gpu_memory_utilization
=
vllm_args
.
get
(
"gpu_memory_utilization"
)
num_gpu_blocks_override
=
vllm_args
.
get
(
"num_gpu_blocks_override"
)
num_gpu_blocks_override
=
vllm_args
.
get
(
"num_gpu_blocks_override"
)
...
@@ -144,15 +149,10 @@ class VLLMProcess(ManagedEngineProcessMixin):
...
@@ -144,15 +149,10 @@ class VLLMProcess(ManagedEngineProcessMixin):
# No DP; worker sees one GPU
# No DP; worker sees one GPU
gpu_device
=
str
(
worker_idx
)
gpu_device
=
str
(
worker_idx
)
command
=
[
command
=
[
"python3"
,
"-m"
,
"dynamo.vllm"
,
"--model"
,
model
]
"python3"
,
"-m"
,
if
"block_size"
in
vllm_args
:
"dynamo.vllm"
,
command
.
extend
([
"--block-size"
,
str
(
vllm_args
[
"block_size"
])])
"--model"
,
model
,
"--block-size"
,
str
(
block_size
),
]
# Disable CUDA graphs for faster startup & lower memory
# Disable CUDA graphs for faster startup & lower memory
if
enforce_eager
:
if
enforce_eager
:
...
@@ -277,6 +277,30 @@ def test_vllm_kv_router_basic(
...
@@ -277,6 +277,30 @@ def test_vllm_kv_router_basic(
)
)
@
pytest
.
mark
.
pre_merge
@
pytest
.
mark
.
gpu_1
@
pytest
.
mark
.
timeout
(
150
)
# ~3x average (~43s/test), rounded up
@
pytest
.
mark
.
parametrize
(
"request_plane"
,
[
"tcp"
],
indirect
=
True
)
def
test_vllm_kv_router_without_block_size_specified_in_vllm_args
(
request
,
runtime_services_dynamic_ports
,
predownload_models
,
set_ucx_tls_no_mm
,
request_plane
,
):
run_basic_router_test
(
engine_process_cls
=
VLLMProcess
,
engine_args_name
=
"vllm_args"
,
engine_args
=
VLLM_ARGS_NO_BLOCK_SIZE
,
num_workers
=
2
,
single_gpu
=
True
,
request
=
request
,
request_plane
=
request_plane
,
block_size
=
BLOCK_SIZE
,
model_name
=
MODEL_NAME
,
)
@
pytest
.
mark
.
pre_merge
@
pytest
.
mark
.
pre_merge
@
pytest
.
mark
.
gpu_1
@
pytest
.
mark
.
gpu_1
@
pytest
.
mark
.
timeout
(
150
)
# ~3x average (~43s/test), rounded up
@
pytest
.
mark
.
timeout
(
150
)
# ~3x average (~43s/test), rounded up
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment