Unverified commit 3c529fc9, authored by Yong Hoon Shin and committed by GitHub
Browse files

[KV Sharing] Raise error if using eagle with fast prefill (#24350)


Signed-off-by: Yong Hoon Shin <yhshin@meta.com>
parent 35bf1938
@@ -3665,6 +3665,24 @@ class VllmConfig:
" Disabling `torch.compile`.")
self.compilation_config.level = CompilationLevel.NO_COMPILATION
if self.cache_config.kv_sharing_fast_prefill:
if not envs.VLLM_USE_V1:
raise NotImplementedError(
"Fast prefill optimization for KV sharing is not supported "
"in V0 currently.")
if self.speculative_config is not None and \
self.speculative_config.use_eagle():
raise NotImplementedError(
"Fast prefill optimization for KV sharing is not "
"compatible with EAGLE as EAGLE requires correct logits "
"for all tokens while fast prefill gives incorrect logits "
"for prompt tokens.")
logger.warning_once(
"--kv-sharing-fast-prefill requires changes on model side for "
"correctness and to realize prefill savings. ")
if ((not envs.VLLM_USE_V1) and self.lora_config is not None
and self.compilation_config.level
!= CompilationLevel.NO_COMPILATION):
......
@@ -145,19 +145,12 @@ class CacheConfig:
self._verify_cache_dtype()
self._verify_prefix_caching()
self._verify_kv_sharing_fast_prefill()
def metrics_info(self):
# convert cache_config to dict(key: str, value: str) for prometheus
# metrics info
return {key: str(value) for key, value in self.__dict__.items()}
def _verify_kv_sharing_fast_prefill(self) -> None:
if self.kv_sharing_fast_prefill and not envs.VLLM_USE_V1:
raise NotImplementedError(
"Fast prefill optimization for KV sharing is not supported "
"in V0 currently.")
@model_validator(mode='after')
def _verify_args(self) -> Self:
if self.cpu_offload_gb < 0:
......
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment