Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
3c529fc9
Unverified
Commit
3c529fc9
authored
Sep 05, 2025
by
Yong Hoon Shin
Committed by
GitHub
Sep 05, 2025
Browse files
[KV Sharing] Raise error if using eagle with fast prefill (#24350)
Signed-off-by:
Yong Hoon Shin
<
yhshin@meta.com
>
parent
35bf1938
Changes
2
Hide whitespace changes
Inline
Side-by-side
Showing
2 changed files
with
18 additions
and
7 deletions
+18
-7
vllm/config/__init__.py
vllm/config/__init__.py
+18
-0
vllm/config/cache.py
vllm/config/cache.py
+0
-7
No files found.
vllm/config/__init__.py
View file @
3c529fc9
...
...
@@ -3665,6 +3665,24 @@ class VllmConfig:
" Disabling `torch.compile`."
)
self
.
compilation_config
.
level
=
CompilationLevel
.
NO_COMPILATION
if
self
.
cache_config
.
kv_sharing_fast_prefill
:
if
not
envs
.
VLLM_USE_V1
:
raise
NotImplementedError
(
"Fast prefill optimization for KV sharing is not supported "
"in V0 currently."
)
if
self
.
speculative_config
is
not
None
and
\
self
.
speculative_config
.
use_eagle
():
raise
NotImplementedError
(
"Fast prefill optimization for KV sharing is not "
"compatible with EAGLE as EAGLE requires correct logits "
"for all tokens while fast prefill gives incorrect logits "
"for prompt tokens."
)
logger
.
warning_once
(
"--kv-sharing-fast-prefill requires changes on model side for "
"correctness and to realize prefill savings. "
)
if
((
not
envs
.
VLLM_USE_V1
)
and
self
.
lora_config
is
not
None
and
self
.
compilation_config
.
level
!=
CompilationLevel
.
NO_COMPILATION
):
...
...
vllm/config/cache.py
View file @
3c529fc9
...
...
@@ -145,19 +145,12 @@ class CacheConfig:
self
.
_verify_cache_dtype
()
self
.
_verify_prefix_caching
()
self
.
_verify_kv_sharing_fast_prefill
()
def metrics_info(self) -> dict[str, str]:
    """Return this config's instance attributes as a string-valued dict.

    Each entry of ``self.__dict__`` is kept under its attribute name and its
    value is converted with ``str()``. The resulting
    ``dict(key: str, value: str)`` is meant for Prometheus metrics info.

    Returns:
        A new dict mapping every instance attribute name to ``str(value)``.
    """
    return {key: str(value) for key, value in self.__dict__.items()}
def
_verify_kv_sharing_fast_prefill
(
self
)
->
None
:
if
self
.
kv_sharing_fast_prefill
and
not
envs
.
VLLM_USE_V1
:
raise
NotImplementedError
(
"Fast prefill optimization for KV sharing is not supported "
"in V0 currently."
)
@
model_validator
(
mode
=
'after'
)
def
_verify_args
(
self
)
->
Self
:
if
self
.
cpu_offload_gb
<
0
:
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment