Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
1be5a735
Unverified
Commit
1be5a735
authored
Jan 15, 2026
by
Michael Goin
Committed by
GitHub
Jan 15, 2026
Browse files
[UX] Use kv_offloading_backend=native by default (#32421)
Signed-off-by:
mgoin
<
mgoin64@gmail.com
>
parent
c36ba69b
Changes
4
Hide whitespace changes
Inline
Side-by-side
Showing
4 changed files
with
28 additions
and
15 deletions
+28
-15
tests/v1/kv_connector/unit/test_config.py
tests/v1/kv_connector/unit/test_config.py
+18
-1
vllm/config/cache.py
vllm/config/cache.py
+5
-5
vllm/config/vllm.py
vllm/config/vllm.py
+4
-6
vllm/engine/arg_utils.py
vllm/engine/arg_utils.py
+1
-3
No files found.
tests/v1/kv_connector/unit/test_config.py
View file @
1be5a735
...
...
@@ -19,7 +19,8 @@ pytestmark = pytest.mark.cpu_test
(
"lmcache"
,
4.0
,
1
,
1
,
"LMCacheConnectorV1"
,
4.0
),
# size per rank: 8.0 GiB / (2 * 2) = 2.0 GiB
(
"lmcache"
,
8.0
,
2
,
2
,
"LMCacheConnectorV1"
,
2.0
),
(
None
,
None
,
1
,
1
,
None
,
None
),
# When kv_offloading_size is None, offloading is disabled (backend is ignored)
(
"native"
,
None
,
1
,
1
,
None
,
None
),
],
)
def
test_kv_connector
(
...
...
@@ -62,3 +63,19 @@ def test_kv_connector(
assert
kv_connector_extra_config
[
"lmcache.max_local_cpu_size"
]
==
expected_bytes
# Existing config should be replaced
assert
"existing_key"
not
in
kv_connector_extra_config
def
test_kv_offloading_size_only_uses_native_default
():
"""Test that setting only kv_offloading_size enables native offloading."""
vllm_config
=
VllmConfig
(
cache_config
=
CacheConfig
(
kv_offloading_size
=
4.0
,
# kv_offloading_backend not set, should default to "native"
),
)
kv_transfer_config
=
vllm_config
.
kv_transfer_config
kv_connector_extra_config
=
kv_transfer_config
.
kv_connector_extra_config
assert
kv_transfer_config
.
kv_connector
==
"OffloadingConnector"
assert
kv_transfer_config
.
kv_role
==
"kv_both"
assert
kv_connector_extra_config
[
"cpu_bytes_to_use"
]
==
4.0
*
(
1
<<
30
)
vllm/config/cache.py
View file @
1be5a735
...
...
@@ -152,13 +152,13 @@ class CacheConfig:
kv_offloading_size
:
float
|
None
=
None
"""Size of the KV cache offloading buffer in GiB. When TP > 1, this is
the total buffer size summed across all TP ranks. By default, this is set
to None, which means no KV offloading is enabled. When set wi
th
kv_offloading_backend, vLLM will
enable KV cache offloading to CPU"""
to None, which means no KV offloading is enabled. When set
, vLLM
wi
ll
enable KV cache offloading to CPU
using the kv_offloading_backend.
"""
kv_offloading_backend
:
KVOffloadingBackend
|
None
=
None
kv_offloading_backend
:
KVOffloadingBackend
=
"native"
"""The backend to use for KV cache offloading. Supported backends include
'native' (vLLM native CPU offloading), 'lmcache'
This option must be used
together with
kv_offloading_size."""
'native' (vLLM native CPU offloading), 'lmcache'
.
KV offloading is only activated when
kv_offloading_size
is set
."""
def
compute_hash
(
self
)
->
str
:
"""
...
...
vllm/config/vllm.py
View file @
1be5a735
...
...
@@ -498,17 +498,15 @@ class VllmConfig:
Right now, this function reads the offloading settings from
CacheConfig and configures the KVTransferConfig accordingly.
"""
if
(
kv_offloading_backend
:
=
self
.
cache_config
.
kv_offloading_backend
)
is
None
:
# KV offloading is only activated when kv_offloading_size is set.
if
(
kv_offloading_size
:
=
self
.
cache_config
.
kv_offloading_size
)
is
None
:
return
kv_offloading_backend
=
self
.
cache_config
.
kv_offloading_backend
# If no KVTransferConfig is provided, create a default one.
if
self
.
kv_transfer_config
is
None
:
self
.
kv_transfer_config
=
KVTransferConfig
()
if
(
kv_offloading_size
:
=
self
.
cache_config
.
kv_offloading_size
)
is
None
:
raise
ValueError
(
"You must set kv_offloading_size when kv_offloading_backend is set."
)
num_kv_ranks
=
(
self
.
parallel_config
.
tensor_parallel_size
*
self
.
parallel_config
.
pipeline_parallel_size
...
...
vllm/engine/arg_utils.py
View file @
1be5a735
...
...
@@ -574,9 +574,7 @@ class EngineArgs:
optimization_level
:
OptimizationLevel
=
VllmConfig
.
optimization_level
kv_offloading_size
:
float
|
None
=
CacheConfig
.
kv_offloading_size
kv_offloading_backend
:
KVOffloadingBackend
|
None
=
(
CacheConfig
.
kv_offloading_backend
)
kv_offloading_backend
:
KVOffloadingBackend
=
CacheConfig
.
kv_offloading_backend
tokens_only
:
bool
=
False
def
__post_init__
(
self
):
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment