Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
4ffd963f
Unverified
Commit
4ffd963f
Authored Jul 15, 2025 by Christian Pinto
Committed by GitHub on Jul 15, 2025
Browse files
[v1][core] Support for attention free models (#20811)
Signed-off-by: Christian Pinto <christian.pinto@ibm.com>
parent
56fe4bed
Changes
3
Hide whitespace changes
Inline
Side-by-side
Showing 3 changed files with 33 additions and 3 deletions
+33
-3
vllm/v1/core/kv_cache_manager.py
vllm/v1/core/kv_cache_manager.py
+6
-1
vllm/v1/core/kv_cache_utils.py
vllm/v1/core/kv_cache_utils.py
+20
-1
vllm/v1/engine/core.py
vllm/v1/engine/core.py
+7
-1
No files found.
vllm/v1/core/kv_cache_manager.py
View file @
4ffd963f
...
...
@@ -78,7 +78,12 @@ class KVCacheManager:
)
->
None
:
self
.
max_model_len
=
max_model_len
if
len
(
kv_cache_config
.
kv_cache_groups
)
==
0
:
# Attention free models don't have kv cache,
# thus don't need prefix caching.
enable_caching
=
False
self
.
enable_caching
=
enable_caching
self
.
caching_hash_fn
=
(
sha256_cbor_64bit
if
caching_hash_algo
==
"sha256_cbor_64bit"
else
sha256
if
caching_hash_algo
==
"sha256"
else
hash
)
...
...
@@ -101,7 +106,7 @@ class KVCacheManager:
kv_cache_config
=
kv_cache_config
,
max_model_len
=
self
.
max_model_len
,
use_eagle
=
self
.
use_eagle
,
enable_caching
=
enable_caching
,
enable_caching
=
self
.
enable_caching
,
caching_hash_fn
=
self
.
caching_hash_fn
,
enable_kv_cache_events
=
enable_kv_cache_events
,
)
...
...
vllm/v1/core/kv_cache_utils.py
View file @
4ffd963f
...
...
@@ -563,6 +563,10 @@ def check_enough_kv_cache_memory(vllm_config: VllmConfig,
ValueError: If there is not enough memory available for the KV cache.
"""
# No need to check for available memory if the kv_cache_spec is empty
if
not
kv_cache_spec
:
return
if
available_memory
<=
0
:
raise
ValueError
(
"No available memory for the cache blocks. "
"Try increasing `gpu_memory_utilization` when "
...
...
@@ -749,6 +753,13 @@ def is_kv_cache_page_size_uniform(
return
len
(
page_sizes
)
==
1
def
is_kv_cache_type_attention_free
(
kv_cache_spec
:
dict
[
str
,
KVCacheSpec
])
->
bool
:
# kv_cache_spec is an empty dict for attention free models
return
not
kv_cache_spec
def
_get_kv_cache_config_uniform_page_size
(
vllm_config
:
VllmConfig
,
kv_cache_spec
:
dict
[
str
,
KVCacheSpec
],
available_memory
:
int
)
->
KVCacheConfig
:
...
...
@@ -891,6 +902,10 @@ def _get_kv_cache_config_uniform_page_size(
return
kv_cache_config
def
_get_kv_cache_config_attention_free
()
->
KVCacheConfig
:
return
KVCacheConfig
(
num_blocks
=
1
,
kv_cache_tensors
=
[],
kv_cache_groups
=
[])
def
unify_hybrid_kv_cache_specs
(
kv_cache_spec
:
dict
[
str
,
KVCacheSpec
]):
"""
This function tries to convert the KV cache specs to one type if the model
...
...
@@ -957,7 +972,11 @@ def get_kv_cache_config(
if
vllm_config
.
scheduler_config
.
disable_hybrid_kv_cache_manager
:
unify_hybrid_kv_cache_specs
(
kv_cache_spec
)
if
is_kv_cache_type_uniform
(
kv_cache_spec
):
if
is_kv_cache_type_attention_free
(
kv_cache_spec
):
# This returns a kv_cache config with 0 kv_cache groups and 1 block
# to allow for the KVCache manager to handle attention free models.
return
_get_kv_cache_config_attention_free
()
elif
is_kv_cache_type_uniform
(
kv_cache_spec
):
# KV cache of all layers are the same, which is true for
# most models. Allocate the same amount of memory for
# each layer.
...
...
vllm/v1/engine/core.py
View file @
4ffd963f
...
...
@@ -139,7 +139,13 @@ class EngineCore:
# Profiles the peak memory usage of the model to determine how much
# memory can be allocated for kv cache.
available_gpu_memory
=
self
.
model_executor
.
determine_available_memory
()
has_kv_cache
=
any
(
kv_cache_spec
for
kv_cache_spec
in
kv_cache_specs
)
if
has_kv_cache
:
available_gpu_memory
=
\
self
.
model_executor
.
determine_available_memory
()
else
:
# Attention free models don't need memory for kv cache
available_gpu_memory
=
[
0
]
*
len
(
kv_cache_specs
)
assert
len
(
kv_cache_specs
)
==
len
(
available_gpu_memory
)
# Get the kv cache tensor size
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please register or sign in to comment.