Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
ade2749c
Commit
ade2749c
authored
Jan 20, 2026
by
laibao
Browse files
feat: kvpress新增 KV 压缩配置开关(默认关闭)
parent
155c8a13
Changes
1
Show whitespace changes
Inline
Side-by-side
Showing
1 changed file
with
72 additions
and
0 deletions
+72
-0
vllm/envs.py
vllm/envs.py
+72
-0
No files found.
vllm/envs.py
View file @
ade2749c
...
@@ -141,6 +141,34 @@ if TYPE_CHECKING:
...
@@ -141,6 +141,34 @@ if TYPE_CHECKING:
VLLM_USE_NVFP4_CT_EMULATIONS
:
bool
=
False
VLLM_USE_NVFP4_CT_EMULATIONS
:
bool
=
False
VLLM_ROCM_QUICK_REDUCE_QUANTIZATION
:
str
=
"NONE"
VLLM_ROCM_QUICK_REDUCE_QUANTIZATION
:
str
=
"NONE"
VLLM_ROCM_QUICK_REDUCE_CAST_BF16_TO_FP16
:
bool
=
True
VLLM_ROCM_QUICK_REDUCE_CAST_BF16_TO_FP16
:
bool
=
True
# KV compression (token-shared) for v1 paged attention.
# When enabled, vLLM decouples logical positions from KV cache positions
# and keeps only a subset of prompt tokens in KV cache during prefill.
VLLM_ENABLE_KV_COMPRESSION
:
bool
=
False
# KV compression policy for selecting which prompt KV entries to retain.
# Currently only "topk" is supported.
VLLM_KV_COMPRESSION_POLICY
:
str
=
"topk"
# Target prompt KV budget for token-shared compression.
# If PROMPT_BUDGET >= 0, it takes precedence over PROMPT_RATIO.
# The budget/ratio applies to non-protected prompt tokens only.
VLLM_KV_COMPRESSION_PROMPT_RATIO
:
float
=
1.0
VLLM_KV_COMPRESSION_PROMPT_BUDGET
:
int
=
-
1
VLLM_KV_COMPRESSION_PROTECTED_PREFIX
:
int
=
0
VLLM_KV_COMPRESSION_PROTECTED_SUFFIX
:
int
=
0
VLLM_KV_COMPRESSION_KEEP_LAST_TOKEN
:
bool
=
True
# SnapKV-like scoring window used by the "topk" policy.
VLLM_KV_COMPRESSION_SNAPKV_WINDOW
:
int
=
32
# Use Triton SnapKV scoring on ROCm (experimental). Set to 0 to force the
# PyTorch reference implementation.
VLLM_KV_COMPRESSION_SNAPKV_USE_TRITON_ROCM
:
bool
=
True
# If set, compute token-shared Top-K selection per attention layer instead
# of sharing a single selection across all layers in a forward pass.
VLLM_KV_COMPRESSION_TOPK_PER_LAYER
:
bool
=
False
# Run KV compaction writeback (reshape_and_cache_*) on a separate CUDA
# stream to overlap with compute (experimental).
VLLM_KV_COMPRESSION_ASYNC_WRITEBACK
:
bool
=
False
# Free unused tail KV cache blocks after prompt compaction (experimental).
VLLM_KV_COMPRESSION_FREE_TAIL_BLOCKS
:
bool
=
True
VLLM_ROCM_QUICK_REDUCE_MAX_SIZE_BYTES_MB
:
Optional
[
int
]
=
None
VLLM_ROCM_QUICK_REDUCE_MAX_SIZE_BYTES_MB
:
Optional
[
int
]
=
None
# add envs
# add envs
...
@@ -1055,6 +1083,50 @@ environment_variables: dict[str, Callable[[], Any]] = {
...
@@ -1055,6 +1083,50 @@ environment_variables: dict[str, Callable[[], Any]] = {
lambda
:
(
os
.
environ
.
get
(
"VLLM_USE_TRITON_PREFIX_FLASH_ATTN"
,
"False"
).
lower
()
in
lambda
:
(
os
.
environ
.
get
(
"VLLM_USE_TRITON_PREFIX_FLASH_ATTN"
,
"False"
).
lower
()
in
(
"true"
,
"1"
)),
(
"true"
,
"1"
)),
# Enable token-shared KV compression for v1 paged attention (experimental).
# This feature currently targets long-prompt prefill memory reduction.
"VLLM_ENABLE_KV_COMPRESSION"
:
lambda
:
bool
(
int
(
os
.
getenv
(
"VLLM_ENABLE_KV_COMPRESSION"
,
"0"
))),
# KV compression policy ("topk").
"VLLM_KV_COMPRESSION_POLICY"
:
lambda
:
os
.
getenv
(
"VLLM_KV_COMPRESSION_POLICY"
,
"topk"
).
lower
(),
# Target fraction of non-protected prompt tokens to keep in KV cache.
"VLLM_KV_COMPRESSION_PROMPT_RATIO"
:
lambda
:
float
(
os
.
getenv
(
"VLLM_KV_COMPRESSION_PROMPT_RATIO"
,
"1.0"
)),
# Target number of non-protected prompt tokens to keep in KV cache.
# If >= 0, this takes precedence over VLLM_KV_COMPRESSION_PROMPT_RATIO.
"VLLM_KV_COMPRESSION_PROMPT_BUDGET"
:
lambda
:
int
(
os
.
getenv
(
"VLLM_KV_COMPRESSION_PROMPT_BUDGET"
,
"-1"
)),
# Always keep the first N prompt tokens in KV cache (e.g. BOS/system).
"VLLM_KV_COMPRESSION_PROTECTED_PREFIX"
:
lambda
:
int
(
os
.
getenv
(
"VLLM_KV_COMPRESSION_PROTECTED_PREFIX"
,
"0"
)),
# Always keep the last N prompt tokens in KV cache.
"VLLM_KV_COMPRESSION_PROTECTED_SUFFIX"
:
lambda
:
int
(
os
.
getenv
(
"VLLM_KV_COMPRESSION_PROTECTED_SUFFIX"
,
"0"
)),
# Always keep the last prompt token (prompt_len - 1) when it is scheduled.
"VLLM_KV_COMPRESSION_KEEP_LAST_TOKEN"
:
lambda
:
bool
(
int
(
os
.
getenv
(
"VLLM_KV_COMPRESSION_KEEP_LAST_TOKEN"
,
"1"
))),
# SnapKV-like scoring window size for the "topk" policy.
"VLLM_KV_COMPRESSION_SNAPKV_WINDOW"
:
lambda
:
int
(
os
.
getenv
(
"VLLM_KV_COMPRESSION_SNAPKV_WINDOW"
,
"32"
)),
# Enable Triton SnapKV scoring on ROCm (experimental).
"VLLM_KV_COMPRESSION_SNAPKV_USE_TRITON_ROCM"
:
lambda
:
bool
(
int
(
os
.
getenv
(
"VLLM_KV_COMPRESSION_SNAPKV_USE_TRITON_ROCM"
,
"1"
))),
# If set, compute token-shared Top-K selection per attention layer instead
# of sharing one selection across layers in a forward pass.
"VLLM_KV_COMPRESSION_TOPK_PER_LAYER"
:
lambda
:
bool
(
int
(
os
.
getenv
(
"VLLM_KV_COMPRESSION_TOPK_PER_LAYER"
,
"0"
))),
# If set, run KV compaction writeback on a separate CUDA stream to overlap
# cache writes with compute (experimental).
"VLLM_KV_COMPRESSION_ASYNC_WRITEBACK"
:
lambda
:
bool
(
int
(
os
.
getenv
(
"VLLM_KV_COMPRESSION_ASYNC_WRITEBACK"
,
"0"
))),
# If set, free unused tail KV cache blocks after prompt compaction.
"VLLM_KV_COMPRESSION_FREE_TAIL_BLOCKS"
:
lambda
:
bool
(
int
(
os
.
getenv
(
"VLLM_KV_COMPRESSION_FREE_TAIL_BLOCKS"
,
"0"
))),
# If set, vLLM will use optimized MLA attention optimizations.
# If set, vLLM will use optimized MLA attention optimizations.
"VLLM_USE_TRITON_OPT_MLA"
:
"VLLM_USE_TRITON_OPT_MLA"
:
lambda
:
bool
(
int
(
os
.
getenv
(
"VLLM_USE_TRITON_OPT_MLA"
,
"0"
))),
lambda
:
bool
(
int
(
os
.
getenv
(
"VLLM_USE_TRITON_OPT_MLA"
,
"0"
))),
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment