feat(kvpress): KVCacheManager 按 num_kv_tokens 分配 slots

3d4f8753 · laibao · b0911b24 · 3d4f8753
Commit 3d4f8753 authored Feb 24, 2026 by laibao
Hide whitespace changes
Inline Side-by-side

Showing with 16 additions and 1 deletion

vllm/v1/core/kv_cache_manager.py vllm/v1/core/kv_cache_manager.py +16 -1

No files found.
--- a/vllm/v1/core/kv_cache_manager.py
+++ b/vllm/v1/core/kv_cache_manager.py
@@ -6,6 +6,8 @@ from collections.abc import Sequence
 from dataclasses import dataclass
 from typing import Literal, overload

+import vllm.envs as envs
+from vllm.platforms import current_platform
 from vllm.distributed.kv_events import KVCacheEvent
 from vllm.logger import init_logger
 from vllm.v1.core.kv_cache_coordinator import get_kv_cache_coordinator
@@ -307,7 +309,20 @@ class KVCacheManager:
            num_local_computed_tokens + num_external_computed_tokens,
            self.max_model_len,
        )
-        num_tokens_main_model = total_computed_tokens + num_new_tokens
+        if envs.VLLM_ENABLE_KV_COMPRESSION and current_platform.is_cuda_alike():
+            # KV compression decouples logical token positions from KV cache
+            # positions. Allocate based on the KV cache length (plus the tokens
+            # scheduled for this step, which are temporarily written to cache).
+            #
+            # NOTE: For new requests, request.num_kv_tokens may not have been
+            # initialized yet (e.g., cache-hit / connector paths). Fall back
+            # to the computed-token count in that case.
+            kv_computed_tokens = int(request.num_kv_tokens)
+            if kv_computed_tokens == 0 and total_computed_tokens > 0:
+                kv_computed_tokens = int(total_computed_tokens)
+            num_tokens_main_model = kv_computed_tokens + num_new_tokens
+        else:
+            num_tokens_main_model = total_computed_tokens + num_new_tokens
        num_tokens_need_slot = min(
            num_tokens_main_model + num_lookahead_tokens,
            self.max_model_len,