[v1][core] Support for attention free models (#20811)

Signed-off-by: Christian Pinto <christian.pinto@ibm.com>

[v1][core] Support for attention free models (#20811)
Signed-off-by: Christian Pinto <christian.pinto@ibm.com>
4ffd963f · Christian Pinto · GitHub · 56fe4bed · 4ffd963f · 4ffd963f
Unverified commit 4ffd963f, authored Jul 15, 2025 by Christian Pinto; committed by GitHub on Jul 15, 2025.
Showing with 33 additions and 3 deletions

vllm/v1/core/kv_cache_manager.py: +6 -1

vllm/v1/core/kv_cache_utils.py: +20 -1

vllm/v1/engine/core.py: +7 -1

No files found.
--- a/vllm/v1/core/kv_cache_manager.py
+++ b/vllm/v1/core/kv_cache_manager.py
@@ -78,7 +78,12 @@ class KVCacheManager:
    ) -> None:
        self.max_model_len = max_model_len

+        if len(kv_cache_config.kv_cache_groups) == 0:
+            # Attention free models don't have kv cache,
+            # thus don't need prefix caching.
+            enable_caching = False
        self.enable_caching = enable_caching
+
        self.caching_hash_fn = (
            sha256_cbor_64bit if caching_hash_algo == "sha256_cbor_64bit" else
            sha256 if caching_hash_algo == "sha256" else hash)
@@ -101,7 +106,7 @@ class KVCacheManager:
            kv_cache_config=kv_cache_config,
            max_model_len=self.max_model_len,
            use_eagle=self.use_eagle,
-            enable_caching=enable_caching,
+            enable_caching=self.enable_caching,
            caching_hash_fn=self.caching_hash_fn,
            enable_kv_cache_events=enable_kv_cache_events,
        )

--- a/vllm/v1/core/kv_cache_utils.py
+++ b/vllm/v1/core/kv_cache_utils.py
@@ -563,6 +563,10 @@ def check_enough_kv_cache_memory(vllm_config: VllmConfig,
        ValueError: If there is not enough memory available for the KV cache.
    """

+    # No need to check for available memory if the kv_cache_spec is empty
+    if not kv_cache_spec:
+        return
+
    if available_memory <= 0:
        raise ValueError("No available memory for the cache blocks. "
                         "Try increasing `gpu_memory_utilization` when "
@@ -749,6 +753,13 @@ def is_kv_cache_page_size_uniform(
    return len(page_sizes) == 1


+def is_kv_cache_type_attention_free(
+        kv_cache_spec: dict[str, KVCacheSpec]) -> bool:
+
+    # kv_cache_spec is an empty dict for attention free models
+    return not kv_cache_spec
+
+
 def _get_kv_cache_config_uniform_page_size(
        vllm_config: VllmConfig, kv_cache_spec: dict[str, KVCacheSpec],
        available_memory: int) -> KVCacheConfig:
@@ -891,6 +902,10 @@ def _get_kv_cache_config_uniform_page_size(
    return kv_cache_config


+def _get_kv_cache_config_attention_free() -> KVCacheConfig:
+    return KVCacheConfig(num_blocks=1, kv_cache_tensors=[], kv_cache_groups=[])
+
+
 def unify_hybrid_kv_cache_specs(kv_cache_spec: dict[str, KVCacheSpec]):
    """
    This function tries to convert the KV cache specs to one type if the model
@@ -957,7 +972,11 @@ def get_kv_cache_config(
    if vllm_config.scheduler_config.disable_hybrid_kv_cache_manager:
        unify_hybrid_kv_cache_specs(kv_cache_spec)

-    if is_kv_cache_type_uniform(kv_cache_spec):
+    if is_kv_cache_type_attention_free(kv_cache_spec):
+        # This returns a kv_cache config with 0 kv_cache groups and 1 block
+        # to allow for the KVCache manager to handle attention free models.
+        return _get_kv_cache_config_attention_free()
+    elif is_kv_cache_type_uniform(kv_cache_spec):
        # KV cache of all layers are the same, which is true for
        # most models. Allocate the same amount of memory for
        # each layer.

--- a/vllm/v1/engine/core.py
+++ b/vllm/v1/engine/core.py
@@ -139,7 +139,13 @@ class EngineCore:

        # Profiles the peak memory usage of the model to determine how much
        # memory can be allocated for kv cache.
-        available_gpu_memory = self.model_executor.determine_available_memory()
+        has_kv_cache = any(kv_cache_spec for kv_cache_spec in kv_cache_specs)
+        if has_kv_cache:
+            available_gpu_memory = \
+                self.model_executor.determine_available_memory()
+        else:
+            # Attention free models don't need memory for kv cache
+            available_gpu_memory = [0] * len(kv_cache_specs)

        assert len(kv_cache_specs) == len(available_gpu_memory)
        # Get the kv cache tensor size