feat(kvpress): 新增 KV 压缩环境变量开关与参数

cafabeeb · laibao · 2799735a · cafabeeb
Commit cafabeeb authored Feb 24, 2026 by laibao
Show whitespace changes
Inline Side-by-side

Showing with 81 additions and 0 deletions

vllm/envs.py vllm/envs.py +81 -0

No files found.
--- a/vllm/envs.py
+++ b/vllm/envs.py
@@ -253,6 +253,35 @@ if TYPE_CHECKING:
    VLLM_DISABLE_LOG_LOGO: bool = False
    VLLM_LORA_DISABLE_PDL: bool = False
+    # KV compression (token-shared) for v1 paged attention (experimental).
+    # When enabled, vLLM decouples logical positions from KV cache positions
+    # and keeps only a subset of prompt tokens in KV cache during prefill.
+    VLLM_ENABLE_KV_COMPRESSION: bool = False
+    # KV compression policy for selecting which prompt KV entries to retain.
+    # Currently only "topk" is supported.
+    VLLM_KV_COMPRESSION_POLICY: str = "topk"
+    # Target prompt KV budget for token-shared compression.
+    # If PROMPT_BUDGET >= 0, it takes precedence over PROMPT_RATIO.
+    # The budget/ratio applies to non-protected prompt tokens only.
+    VLLM_KV_COMPRESSION_PROMPT_RATIO: float = 1.0
+    VLLM_KV_COMPRESSION_PROMPT_BUDGET: int = -1
+    VLLM_KV_COMPRESSION_PROTECTED_PREFIX: int = 0
+    VLLM_KV_COMPRESSION_PROTECTED_SUFFIX: int = 0
+    VLLM_KV_COMPRESSION_KEEP_LAST_TOKEN: bool = True
+    # SnapKV-like scoring window used by the "topk" policy.
+    VLLM_KV_COMPRESSION_SNAPKV_WINDOW: int = 32
+    # Use Triton SnapKV scoring on ROCm (experimental). Set to 0 to force the
+    # PyTorch reference implementation.
+    VLLM_KV_COMPRESSION_SNAPKV_USE_TRITON_ROCM: bool = True
+    # If set, compute token-shared Top-K selection per attention layer instead
+    # of sharing a single selection across all layers in a forward pass.
+    VLLM_KV_COMPRESSION_TOPK_PER_LAYER: bool = False
+    # Run KV compaction writeback (reshape_and_cache_*) on a separate CUDA
+    # stream to overlap with compute (experimental).
+    VLLM_KV_COMPRESSION_ASYNC_WRITEBACK: bool = False
+    # Free unused tail KV cache blocks after prompt compaction (experimental).
+    VLLM_KV_COMPRESSION_FREE_TAIL_BLOCKS: bool = False
    # add envs
    VLLM_OPTEST_URLS_PORT: int | None = None
    VLLM_OPTEST_MODELS_PATH: str = ""
@@ -1847,6 +1876,58 @@ environment_variables: dict[str, Callable[[], Any]] = {
    "VLLM_USE_MOE_W16A16_TRITON":
        lambda: (os.environ.get("VLLM_USE_MOE_W16A16_TRITON", "0").lower() in
                 ("true", "1")),
+    # Enable token-shared KV compression for v1 paged attention (experimental).
+    "VLLM_ENABLE_KV_COMPRESSION": lambda: bool(
+        int(os.getenv("VLLM_ENABLE_KV_COMPRESSION", "0"))
+    ),
+    # KV compression policy, lowercased; currently only "topk" is supported.
+    "VLLM_KV_COMPRESSION_POLICY": lambda: os.getenv(
+        "VLLM_KV_COMPRESSION_POLICY", "topk"
+    ).lower(),
+    # Target fraction of non-protected prompt tokens to keep in KV cache.
+    "VLLM_KV_COMPRESSION_PROMPT_RATIO": lambda: float(
+        os.getenv("VLLM_KV_COMPRESSION_PROMPT_RATIO", "1.0")
+    ),
+    # Target number of non-protected prompt tokens to keep in KV cache.
+    # If >= 0, this takes precedence over VLLM_KV_COMPRESSION_PROMPT_RATIO.
+    "VLLM_KV_COMPRESSION_PROMPT_BUDGET": lambda: int(
+        os.getenv("VLLM_KV_COMPRESSION_PROMPT_BUDGET", "-1")
+    ),
+    # Always keep the first N prompt tokens in KV cache (e.g. BOS/system).
+    "VLLM_KV_COMPRESSION_PROTECTED_PREFIX": lambda: int(
+        os.getenv("VLLM_KV_COMPRESSION_PROTECTED_PREFIX", "0")
+    ),
+    # Always keep the last N prompt tokens in KV cache.
+    "VLLM_KV_COMPRESSION_PROTECTED_SUFFIX": lambda: int(
+        os.getenv("VLLM_KV_COMPRESSION_PROTECTED_SUFFIX", "0")
+    ),
+    # Always keep the last prompt token (prompt_len - 1) when it is scheduled.
+    "VLLM_KV_COMPRESSION_KEEP_LAST_TOKEN": lambda: bool(
+        int(os.getenv("VLLM_KV_COMPRESSION_KEEP_LAST_TOKEN", "1"))
+    ),
+    # SnapKV-like scoring window size for the "topk" policy.
+    "VLLM_KV_COMPRESSION_SNAPKV_WINDOW": lambda: int(
+        os.getenv("VLLM_KV_COMPRESSION_SNAPKV_WINDOW", "32")
+    ),
+    # Triton SnapKV scoring on ROCm (experimental); on by default, 0 disables.
+    "VLLM_KV_COMPRESSION_SNAPKV_USE_TRITON_ROCM": lambda: bool(
+        int(os.getenv("VLLM_KV_COMPRESSION_SNAPKV_USE_TRITON_ROCM", "1"))
+    ),
+    # If set, compute token-shared Top-K selection per attention layer instead
+    # of sharing one selection across layers in a forward pass.
+    "VLLM_KV_COMPRESSION_TOPK_PER_LAYER": lambda: bool(
+        int(os.getenv("VLLM_KV_COMPRESSION_TOPK_PER_LAYER", "0"))
+    ),
+    # If set, run KV compaction writeback on a separate CUDA stream to overlap
+    # cache writes with compute (experimental).
+    "VLLM_KV_COMPRESSION_ASYNC_WRITEBACK": lambda: bool(
+        int(os.getenv("VLLM_KV_COMPRESSION_ASYNC_WRITEBACK", "0"))
+    ),
+    # If set, free unused tail KV cache blocks after prompt compaction.
+    "VLLM_KV_COMPRESSION_FREE_TAIL_BLOCKS": lambda: bool(
+        int(os.getenv("VLLM_KV_COMPRESSION_FREE_TAIL_BLOCKS", "0"))
+    ),
 }
 # --8<-- [end:env-vars-definition]