Commit 3d4f8753 authored by laibao
Browse files

feat(kvpress): KVCacheManager 按 num_kv_tokens 分配 slots

parent b0911b24
......@@ -6,6 +6,8 @@ from collections.abc import Sequence
from dataclasses import dataclass
from typing import Literal, overload
import vllm.envs as envs
from vllm.platforms import current_platform
from vllm.distributed.kv_events import KVCacheEvent
from vllm.logger import init_logger
from vllm.v1.core.kv_cache_coordinator import get_kv_cache_coordinator
......@@ -307,6 +309,19 @@ class KVCacheManager:
num_local_computed_tokens + num_external_computed_tokens,
self.max_model_len,
)
if envs.VLLM_ENABLE_KV_COMPRESSION and current_platform.is_cuda_alike():
# KV compression decouples logical token positions from KV cache
# positions. Allocate based on the KV cache length (plus the tokens
# scheduled for this step, which are temporarily written to cache).
#
# NOTE: For new requests, request.num_kv_tokens may not have been
# initialized yet (e.g., cache-hit / connector paths). Fall back
# to the computed-token count in that case.
kv_computed_tokens = int(request.num_kv_tokens)
if kv_computed_tokens == 0 and total_computed_tokens > 0:
kv_computed_tokens = int(total_computed_tokens)
num_tokens_main_model = kv_computed_tokens + num_new_tokens
else:
num_tokens_main_model = total_computed_tokens + num_new_tokens
num_tokens_need_slot = min(
num_tokens_main_model + num_lookahead_tokens,
......
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment