Commit ade2749c authored by laibao's avatar laibao
Browse files

feat: kvpress新增 KV 压缩配置开关(默认关闭)

parent 155c8a13
...@@ -141,6 +141,34 @@ if TYPE_CHECKING: ...@@ -141,6 +141,34 @@ if TYPE_CHECKING:
VLLM_USE_NVFP4_CT_EMULATIONS: bool = False VLLM_USE_NVFP4_CT_EMULATIONS: bool = False
VLLM_ROCM_QUICK_REDUCE_QUANTIZATION: str = "NONE" VLLM_ROCM_QUICK_REDUCE_QUANTIZATION: str = "NONE"
VLLM_ROCM_QUICK_REDUCE_CAST_BF16_TO_FP16: bool = True VLLM_ROCM_QUICK_REDUCE_CAST_BF16_TO_FP16: bool = True
# KV compression (token-shared) for v1 paged attention.
# When enabled, vLLM decouples logical positions from KV cache positions
# and keeps only a subset of prompt tokens in KV cache during prefill.
VLLM_ENABLE_KV_COMPRESSION: bool = False
# KV compression policy for selecting which prompt KV entries to retain.
# Currently only "topk" is supported.
VLLM_KV_COMPRESSION_POLICY: str = "topk"
# Target prompt KV budget for token-shared compression.
# If PROMPT_BUDGET >= 0, it takes precedence over PROMPT_RATIO.
# The budget/ratio applies to non-protected prompt tokens only.
VLLM_KV_COMPRESSION_PROMPT_RATIO: float = 1.0
VLLM_KV_COMPRESSION_PROMPT_BUDGET: int = -1
VLLM_KV_COMPRESSION_PROTECTED_PREFIX: int = 0
VLLM_KV_COMPRESSION_PROTECTED_SUFFIX: int = 0
VLLM_KV_COMPRESSION_KEEP_LAST_TOKEN: bool = True
# SnapKV-like scoring window used by the "topk" policy.
VLLM_KV_COMPRESSION_SNAPKV_WINDOW: int = 32
# Use Triton SnapKV scoring on ROCm (experimental). Set to 0 to force the
# PyTorch reference implementation.
VLLM_KV_COMPRESSION_SNAPKV_USE_TRITON_ROCM: bool = True
# If set, compute token-shared Top-K selection per attention layer instead
# of sharing a single selection across all layers in a forward pass.
VLLM_KV_COMPRESSION_TOPK_PER_LAYER: bool = False
# Run KV compaction writeback (reshape_and_cache_*) on a separate CUDA
# stream to overlap with compute (experimental).
VLLM_KV_COMPRESSION_ASYNC_WRITEBACK: bool = False
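# Free unused tail KV cache blocks after prompt compaction (experimental).
# NOTE(review): the default declared here (True) differs from the runtime
# default in environment_variables ("0", i.e. disabled) -- confirm which
# one is intended and align the two.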
VLLM_KV_COMPRESSION_FREE_TAIL_BLOCKS: bool = True
VLLM_ROCM_QUICK_REDUCE_MAX_SIZE_BYTES_MB: Optional[int] = None VLLM_ROCM_QUICK_REDUCE_MAX_SIZE_BYTES_MB: Optional[int] = None
# add envs # add envs
...@@ -1055,6 +1083,50 @@ environment_variables: dict[str, Callable[[], Any]] = { ...@@ -1055,6 +1083,50 @@ environment_variables: dict[str, Callable[[], Any]] = {
lambda: (os.environ.get("VLLM_USE_TRITON_PREFIX_FLASH_ATTN", "False").lower() in lambda: (os.environ.get("VLLM_USE_TRITON_PREFIX_FLASH_ATTN", "False").lower() in
("true", "1")), ("true", "1")),
# Enable token-shared KV compression for v1 paged attention (experimental).
# This feature currently targets long-prompt prefill memory reduction.
"VLLM_ENABLE_KV_COMPRESSION":
lambda: bool(int(os.getenv("VLLM_ENABLE_KV_COMPRESSION", "0"))),
# KV compression policy ("topk").
"VLLM_KV_COMPRESSION_POLICY":
lambda: os.getenv("VLLM_KV_COMPRESSION_POLICY", "topk").lower(),
# Target fraction of non-protected prompt tokens to keep in KV cache.
"VLLM_KV_COMPRESSION_PROMPT_RATIO":
lambda: float(os.getenv("VLLM_KV_COMPRESSION_PROMPT_RATIO", "1.0")),
# Target number of non-protected prompt tokens to keep in KV cache.
# If >= 0, this takes precedence over VLLM_KV_COMPRESSION_PROMPT_RATIO.
"VLLM_KV_COMPRESSION_PROMPT_BUDGET":
lambda: int(os.getenv("VLLM_KV_COMPRESSION_PROMPT_BUDGET", "-1")),
# Always keep the first N prompt tokens in KV cache (e.g. BOS/system).
"VLLM_KV_COMPRESSION_PROTECTED_PREFIX":
lambda: int(os.getenv("VLLM_KV_COMPRESSION_PROTECTED_PREFIX", "0")),
# Always keep the last N prompt tokens in KV cache.
"VLLM_KV_COMPRESSION_PROTECTED_SUFFIX":
lambda: int(os.getenv("VLLM_KV_COMPRESSION_PROTECTED_SUFFIX", "0")),
# Always keep the last prompt token (prompt_len - 1) when it is scheduled.
"VLLM_KV_COMPRESSION_KEEP_LAST_TOKEN":
lambda: bool(int(os.getenv("VLLM_KV_COMPRESSION_KEEP_LAST_TOKEN", "1"))),
# SnapKV-like scoring window size for the "topk" policy.
"VLLM_KV_COMPRESSION_SNAPKV_WINDOW":
lambda: int(os.getenv("VLLM_KV_COMPRESSION_SNAPKV_WINDOW", "32")),
# Enable Triton SnapKV scoring on ROCm (experimental).
"VLLM_KV_COMPRESSION_SNAPKV_USE_TRITON_ROCM":
lambda: bool(
int(os.getenv("VLLM_KV_COMPRESSION_SNAPKV_USE_TRITON_ROCM", "1"))),
# If set, compute token-shared Top-K selection per attention layer instead
# of sharing one selection across layers in a forward pass.
"VLLM_KV_COMPRESSION_TOPK_PER_LAYER":
lambda: bool(int(os.getenv("VLLM_KV_COMPRESSION_TOPK_PER_LAYER", "0"))),
# If set, run KV compaction writeback on a separate CUDA stream to overlap
# cache writes with compute (experimental).
"VLLM_KV_COMPRESSION_ASYNC_WRITEBACK":
lambda: bool(
int(os.getenv("VLLM_KV_COMPRESSION_ASYNC_WRITEBACK", "0"))),
# If set, free unused tail KV cache blocks after prompt compaction.
"VLLM_KV_COMPRESSION_FREE_TAIL_BLOCKS":
lambda: bool(
int(os.getenv("VLLM_KV_COMPRESSION_FREE_TAIL_BLOCKS", "0"))),
# If set, vLLM will use optimized MLA attention optimizations. # If set, vLLM will use optimized MLA attention optimizations.
"VLLM_USE_TRITON_OPT_MLA": "VLLM_USE_TRITON_OPT_MLA":
lambda: bool(int(os.getenv("VLLM_USE_TRITON_OPT_MLA", "0"))), lambda: bool(int(os.getenv("VLLM_USE_TRITON_OPT_MLA", "0"))),
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment