Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
8aa1485f
Unverified
Commit
8aa1485f
authored
Jul 28, 2025
by
Lucas Wilkinson
Committed by
GitHub
Jul 28, 2025
Browse files
[Perf] Disable chunked local attention by default with llama4 (#21761)
Signed-off-by:
Lucas Wilkinson
<
lwilkins@redhat.com
>
parent
89ac266b
Changes
2
Show whitespace changes
Inline
Side-by-side
Showing
2 changed files
with
29 additions
and
6 deletions
+29
-6
vllm/config.py
vllm/config.py
+17
-6
vllm/envs.py
vllm/envs.py
+12
-0
No files found.
vllm/config.py
View file @
8aa1485f
...
...
@@ -4769,12 +4769,23 @@ class VllmConfig:
# Hybrid KV cache manager is not compatible with KV events.
self
.
scheduler_config
.
disable_hybrid_kv_cache_manager
=
True
if
self
.
model_config
is
not
None
and
\
self
.
model_config
.
attention_chunk_size
is
not
None
and
\
self
.
speculative_config
is
not
None
and
\
self
.
model_config
.
attention_chunk_size
is
not
None
:
if
self
.
speculative_config
is
not
None
and
\
self
.
speculative_config
.
use_eagle
():
# Hybrid KV cache manager is not yet supported with chunked
# local attention + eagle.
self
.
scheduler_config
.
disable_hybrid_kv_cache_manager
=
True
elif
\
not
envs
.
VLLM_ALLOW_CHUNKED_LOCAL_ATTN_WITH_HYBRID_KV_CACHE
:
logger
.
warning
(
"There is a latency regression when using chunked local"
" attention with the hybrid KV cache manager. Disabling"
" it, by default. To enable it, set the environment "
"VLLM_ALLOW_CHUNKED_LOCAL_ATTN_WITH_HYBRID_KV_CACHE=1."
)
# Hybrid KV cache manager is disabled by default with chunked local
# attention because it currently causes a latency regression.
self
.
scheduler_config
.
disable_hybrid_kv_cache_manager
=
True
def
update_sizes_for_sequence_parallelism
(
self
,
possible_sizes
:
list
)
->
list
:
...
...
vllm/envs.py
View file @
8aa1485f
...
...
@@ -143,6 +143,7 @@ if TYPE_CHECKING:
VLLM_USE_CUDNN_PREFILL
:
bool
=
False
VLLM_ENABLE_CUDAGRAPH_GC
:
bool
=
False
VLLM_LOOPBACK_IP
:
str
=
""
VLLM_ALLOW_CHUNKED_LOCAL_ATTN_WITH_HYBRID_KV_CACHE
:
bool
=
False
def
get_default_cache_root
():
...
...
@@ -991,6 +992,17 @@ environment_variables: dict[str, Callable[[], Any]] = {
# The default value is "VLLM".
"VLLM_PROCESS_NAME_PREFIX"
:
lambda
:
os
.
getenv
(
"VLLM_PROCESS_NAME_PREFIX"
,
"VLLM"
),
# Allow chunked local attention with hybrid kv cache manager.
# Currently using the Hybrid KV cache manager with chunked local attention
# in the Llama4 models (the only models currently using chunked local attn)
# causes a latency regression. For this reason, we disable it by default.
# This flag lets users opt back in if they want to (to save on
# kv-cache memory usage and enable longer contexts).
# TODO(lucas): Remove this flag once latency regression is resolved.
"VLLM_ALLOW_CHUNKED_LOCAL_ATTN_WITH_HYBRID_KV_CACHE"
:
lambda
:
bool
(
int
(
os
.
getenv
(
\
"VLLM_ALLOW_CHUNKED_LOCAL_ATTN_WITH_HYBRID_KV_CACHE"
,
"0"
))),
}
# --8<-- [end:env-vars-definition]
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment