Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
4ffd963f
Unverified
Commit
4ffd963f
authored
Jul 15, 2025
by
Christian Pinto
Committed by
GitHub
Jul 15, 2025
Browse files
[v1][core] Support for attention free models (#20811)
Signed-off-by:
Christian Pinto
<
christian.pinto@ibm.com
>
parent
56fe4bed
Changes
3
Hide whitespace changes
Inline
Side-by-side
Showing
3 changed files
with
33 additions
and
3 deletions
+33
-3
vllm/v1/core/kv_cache_manager.py
vllm/v1/core/kv_cache_manager.py
+6
-1
vllm/v1/core/kv_cache_utils.py
vllm/v1/core/kv_cache_utils.py
+20
-1
vllm/v1/engine/core.py
vllm/v1/engine/core.py
+7
-1
No files found.
vllm/v1/core/kv_cache_manager.py
View file @
4ffd963f
...
@@ -78,7 +78,12 @@ class KVCacheManager:
...
@@ -78,7 +78,12 @@ class KVCacheManager:
)
->
None
:
)
->
None
:
self
.
max_model_len
=
max_model_len
self
.
max_model_len
=
max_model_len
if
len
(
kv_cache_config
.
kv_cache_groups
)
==
0
:
# Attention free models don't have kv cache,
# thus don't need prefix caching.
enable_caching
=
False
self
.
enable_caching
=
enable_caching
self
.
enable_caching
=
enable_caching
self
.
caching_hash_fn
=
(
self
.
caching_hash_fn
=
(
sha256_cbor_64bit
if
caching_hash_algo
==
"sha256_cbor_64bit"
else
sha256_cbor_64bit
if
caching_hash_algo
==
"sha256_cbor_64bit"
else
sha256
if
caching_hash_algo
==
"sha256"
else
hash
)
sha256
if
caching_hash_algo
==
"sha256"
else
hash
)
...
@@ -101,7 +106,7 @@ class KVCacheManager:
...
@@ -101,7 +106,7 @@ class KVCacheManager:
kv_cache_config
=
kv_cache_config
,
kv_cache_config
=
kv_cache_config
,
max_model_len
=
self
.
max_model_len
,
max_model_len
=
self
.
max_model_len
,
use_eagle
=
self
.
use_eagle
,
use_eagle
=
self
.
use_eagle
,
enable_caching
=
enable_caching
,
enable_caching
=
self
.
enable_caching
,
caching_hash_fn
=
self
.
caching_hash_fn
,
caching_hash_fn
=
self
.
caching_hash_fn
,
enable_kv_cache_events
=
enable_kv_cache_events
,
enable_kv_cache_events
=
enable_kv_cache_events
,
)
)
...
...
vllm/v1/core/kv_cache_utils.py
View file @
4ffd963f
...
@@ -563,6 +563,10 @@ def check_enough_kv_cache_memory(vllm_config: VllmConfig,
...
@@ -563,6 +563,10 @@ def check_enough_kv_cache_memory(vllm_config: VllmConfig,
ValueError: If there is not enough memory available for the KV cache.
ValueError: If there is not enough memory available for the KV cache.
"""
"""
# No need to check for available memory if the kv_cache_spec is empty
if
not
kv_cache_spec
:
return
if
available_memory
<=
0
:
if
available_memory
<=
0
:
raise
ValueError
(
"No available memory for the cache blocks. "
raise
ValueError
(
"No available memory for the cache blocks. "
"Try increasing `gpu_memory_utilization` when "
"Try increasing `gpu_memory_utilization` when "
...
@@ -749,6 +753,13 @@ def is_kv_cache_page_size_uniform(
...
@@ -749,6 +753,13 @@ def is_kv_cache_page_size_uniform(
return
len
(
page_sizes
)
==
1
return
len
(
page_sizes
)
==
1
def
is_kv_cache_type_attention_free
(
kv_cache_spec
:
dict
[
str
,
KVCacheSpec
])
->
bool
:
# kv_cache_spec is an empty dict for attention free models
return
not
kv_cache_spec
def
_get_kv_cache_config_uniform_page_size
(
def
_get_kv_cache_config_uniform_page_size
(
vllm_config
:
VllmConfig
,
kv_cache_spec
:
dict
[
str
,
KVCacheSpec
],
vllm_config
:
VllmConfig
,
kv_cache_spec
:
dict
[
str
,
KVCacheSpec
],
available_memory
:
int
)
->
KVCacheConfig
:
available_memory
:
int
)
->
KVCacheConfig
:
...
@@ -891,6 +902,10 @@ def _get_kv_cache_config_uniform_page_size(
...
@@ -891,6 +902,10 @@ def _get_kv_cache_config_uniform_page_size(
return
kv_cache_config
return
kv_cache_config
def
_get_kv_cache_config_attention_free
()
->
KVCacheConfig
:
return
KVCacheConfig
(
num_blocks
=
1
,
kv_cache_tensors
=
[],
kv_cache_groups
=
[])
def
unify_hybrid_kv_cache_specs
(
kv_cache_spec
:
dict
[
str
,
KVCacheSpec
]):
def
unify_hybrid_kv_cache_specs
(
kv_cache_spec
:
dict
[
str
,
KVCacheSpec
]):
"""
"""
This function tries to convert the KV cache specs to one type if the model
This function tries to convert the KV cache specs to one type if the model
...
@@ -957,7 +972,11 @@ def get_kv_cache_config(
...
@@ -957,7 +972,11 @@ def get_kv_cache_config(
if
vllm_config
.
scheduler_config
.
disable_hybrid_kv_cache_manager
:
if
vllm_config
.
scheduler_config
.
disable_hybrid_kv_cache_manager
:
unify_hybrid_kv_cache_specs
(
kv_cache_spec
)
unify_hybrid_kv_cache_specs
(
kv_cache_spec
)
if
is_kv_cache_type_uniform
(
kv_cache_spec
):
if
is_kv_cache_type_attention_free
(
kv_cache_spec
):
# This returns a kv_cache config with 0 kv_cache groups and 1 block
# to allow for the KVCache manager to handle attention free models.
return
_get_kv_cache_config_attention_free
()
elif
is_kv_cache_type_uniform
(
kv_cache_spec
):
# KV cache of all layers are the same, which is true for
# KV cache of all layers are the same, which is true for
# most models. Allocate the same amount of memory for
# most models. Allocate the same amount of memory for
# each layer.
# each layer.
...
...
vllm/v1/engine/core.py
View file @
4ffd963f
...
@@ -139,7 +139,13 @@ class EngineCore:
...
@@ -139,7 +139,13 @@ class EngineCore:
# Profiles the peak memory usage of the model to determine how much
# Profiles the peak memory usage of the model to determine how much
# memory can be allocated for kv cache.
# memory can be allocated for kv cache.
available_gpu_memory
=
self
.
model_executor
.
determine_available_memory
()
has_kv_cache
=
any
(
kv_cache_spec
for
kv_cache_spec
in
kv_cache_specs
)
if
has_kv_cache
:
available_gpu_memory
=
\
self
.
model_executor
.
determine_available_memory
()
else
:
# Attention free models don't need memory for kv cache
available_gpu_memory
=
[
0
]
*
len
(
kv_cache_specs
)
assert
len
(
kv_cache_specs
)
==
len
(
available_gpu_memory
)
assert
len
(
kv_cache_specs
)
==
len
(
available_gpu_memory
)
# Get the kv cache tensor size
# Get the kv cache tensor size
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment