Unverified commit 4ffd963f, authored by Christian Pinto, committed by GitHub
Browse files

[v1][core] Support for attention free models (#20811)


Signed-off-by: Christian Pinto <christian.pinto@ibm.com>
parent 56fe4bed
...@@ -78,7 +78,12 @@ class KVCacheManager: ...@@ -78,7 +78,12 @@ class KVCacheManager:
) -> None: ) -> None:
self.max_model_len = max_model_len self.max_model_len = max_model_len
if len(kv_cache_config.kv_cache_groups) == 0:
# Attention free models don't have kv cache,
# thus don't need prefix caching.
enable_caching = False
self.enable_caching = enable_caching self.enable_caching = enable_caching
self.caching_hash_fn = ( self.caching_hash_fn = (
sha256_cbor_64bit if caching_hash_algo == "sha256_cbor_64bit" else sha256_cbor_64bit if caching_hash_algo == "sha256_cbor_64bit" else
sha256 if caching_hash_algo == "sha256" else hash) sha256 if caching_hash_algo == "sha256" else hash)
...@@ -101,7 +106,7 @@ class KVCacheManager: ...@@ -101,7 +106,7 @@ class KVCacheManager:
kv_cache_config=kv_cache_config, kv_cache_config=kv_cache_config,
max_model_len=self.max_model_len, max_model_len=self.max_model_len,
use_eagle=self.use_eagle, use_eagle=self.use_eagle,
enable_caching=enable_caching, enable_caching=self.enable_caching,
caching_hash_fn=self.caching_hash_fn, caching_hash_fn=self.caching_hash_fn,
enable_kv_cache_events=enable_kv_cache_events, enable_kv_cache_events=enable_kv_cache_events,
) )
......
...@@ -563,6 +563,10 @@ def check_enough_kv_cache_memory(vllm_config: VllmConfig, ...@@ -563,6 +563,10 @@ def check_enough_kv_cache_memory(vllm_config: VllmConfig,
ValueError: If there is not enough memory available for the KV cache. ValueError: If there is not enough memory available for the KV cache.
""" """
# No need to check for available memory if the kv_cache_spec is empty
if not kv_cache_spec:
return
if available_memory <= 0: if available_memory <= 0:
raise ValueError("No available memory for the cache blocks. " raise ValueError("No available memory for the cache blocks. "
"Try increasing `gpu_memory_utilization` when " "Try increasing `gpu_memory_utilization` when "
...@@ -749,6 +753,13 @@ def is_kv_cache_page_size_uniform( ...@@ -749,6 +753,13 @@ def is_kv_cache_page_size_uniform(
return len(page_sizes) == 1 return len(page_sizes) == 1
def is_kv_cache_type_attention_free(
        kv_cache_spec: dict[str, KVCacheSpec]) -> bool:
    """Report whether the model has no KV cache layers at all.

    Attention-free models are represented by an empty ``kv_cache_spec``
    mapping, so emptiness of the mapping is the only thing checked.

    Args:
        kv_cache_spec: Mapping from layer name to its KV cache spec.

    Returns:
        True when the mapping is empty, False otherwise.
    """
    has_any_layer = bool(kv_cache_spec)
    return not has_any_layer
def _get_kv_cache_config_uniform_page_size( def _get_kv_cache_config_uniform_page_size(
vllm_config: VllmConfig, kv_cache_spec: dict[str, KVCacheSpec], vllm_config: VllmConfig, kv_cache_spec: dict[str, KVCacheSpec],
available_memory: int) -> KVCacheConfig: available_memory: int) -> KVCacheConfig:
...@@ -891,6 +902,10 @@ def _get_kv_cache_config_uniform_page_size( ...@@ -891,6 +902,10 @@ def _get_kv_cache_config_uniform_page_size(
return kv_cache_config return kv_cache_config
def _get_kv_cache_config_attention_free() -> KVCacheConfig:
    """Build the KV cache config used for attention-free models.

    The config has no tensors and no cache groups, and reports a single
    block.

    Returns:
        A ``KVCacheConfig`` with ``num_blocks=1`` and empty
        ``kv_cache_tensors`` / ``kv_cache_groups`` lists.
    """
    # Fresh list objects on every call, so callers never share state.
    no_tensors: list = []
    no_groups: list = []
    return KVCacheConfig(
        num_blocks=1,
        kv_cache_tensors=no_tensors,
        kv_cache_groups=no_groups,
    )
def unify_hybrid_kv_cache_specs(kv_cache_spec: dict[str, KVCacheSpec]): def unify_hybrid_kv_cache_specs(kv_cache_spec: dict[str, KVCacheSpec]):
""" """
This function tries to convert the KV cache specs to one type if the model This function tries to convert the KV cache specs to one type if the model
...@@ -957,7 +972,11 @@ def get_kv_cache_config( ...@@ -957,7 +972,11 @@ def get_kv_cache_config(
if vllm_config.scheduler_config.disable_hybrid_kv_cache_manager: if vllm_config.scheduler_config.disable_hybrid_kv_cache_manager:
unify_hybrid_kv_cache_specs(kv_cache_spec) unify_hybrid_kv_cache_specs(kv_cache_spec)
if is_kv_cache_type_uniform(kv_cache_spec): if is_kv_cache_type_attention_free(kv_cache_spec):
# This returns a kv_cache config with 0 kv_cache groups and 1 block
# to allow for the KVCache manager to handle attention free models.
return _get_kv_cache_config_attention_free()
elif is_kv_cache_type_uniform(kv_cache_spec):
# KV cache of all layers are the same, which is true for # KV cache of all layers are the same, which is true for
# most models. Allocate the same amount of memory for # most models. Allocate the same amount of memory for
# each layer. # each layer.
......
...@@ -139,7 +139,13 @@ class EngineCore: ...@@ -139,7 +139,13 @@ class EngineCore:
# Profiles the peak memory usage of the model to determine how much # Profiles the peak memory usage of the model to determine how much
# memory can be allocated for kv cache. # memory can be allocated for kv cache.
available_gpu_memory = self.model_executor.determine_available_memory() has_kv_cache = any(kv_cache_spec for kv_cache_spec in kv_cache_specs)
if has_kv_cache:
available_gpu_memory = \
self.model_executor.determine_available_memory()
else:
# Attention free models don't need memory for kv cache
available_gpu_memory = [0] * len(kv_cache_specs)
assert len(kv_cache_specs) == len(available_gpu_memory) assert len(kv_cache_specs) == len(available_gpu_memory)
# Get the kv cache tensor size # Get the kv cache tensor size
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment