Unverified commit 9e5bd307, authored by Nick Hill, committed by GitHub
Browse files

[Cleanup] Remove no-longer-used `SpeculativeConfig.enable_chunked_prefill` (#27826)


Signed-off-by: Nick Hill <nhill@redhat.com>
parent fc16f1c4
......@@ -78,10 +78,6 @@ class SpeculativeConfig:
draft_tensor_parallel_size: int | None = Field(default=None, ge=1)
"""The degree of the tensor parallelism for the draft model. Can only be 1
or the same as the target model's tensor parallel size."""
disable_logprobs: bool = True
"""If set to True, token log probabilities are not returned during
speculative decoding. If set to False, token log probabilities are returned
according to the log probability settings in SamplingParams."""
# Draft model configuration
quantization: me_quant.QuantizationMethods | None = None
......@@ -126,12 +122,6 @@ class SpeculativeConfig:
"""The configuration of the target model."""
target_parallel_config: SkipValidation[ParallelConfig] = None # type: ignore
"""The parallel configuration for the target model."""
enable_chunked_prefill: SkipValidation[bool] = None # type: ignore
"""Whether vLLM is configured to use chunked prefill or not. Used for
raising an error since it's not yet compatible with speculative decode."""
disable_log_stats: SkipValidation[bool] = None # type: ignore
"""Whether to disable the periodic printing of stage times in speculative
decoding."""
# params generated in the post-init stage
draft_model_config: SkipValidation[ModelConfig] = None # type: ignore
......
......@@ -1246,8 +1246,6 @@ class EngineArgs:
self,
target_model_config: ModelConfig,
target_parallel_config: ParallelConfig,
enable_chunked_prefill: bool,
disable_log_stats: bool,
) -> SpeculativeConfig | None:
"""Initializes and returns a SpeculativeConfig object based on
`speculative_config`.
......@@ -1267,8 +1265,6 @@ class EngineArgs:
{
"target_model_config": target_model_config,
"target_parallel_config": target_parallel_config,
"enable_chunked_prefill": enable_chunked_prefill,
"disable_log_stats": disable_log_stats,
}
)
return SpeculativeConfig(**self.speculative_config)
......@@ -1561,8 +1557,6 @@ class EngineArgs:
speculative_config = self.create_speculative_config(
target_model_config=model_config,
target_parallel_config=parallel_config,
enable_chunked_prefill=self.enable_chunked_prefill,
disable_log_stats=self.disable_log_stats,
)
# make sure num_lookahead_slots is set appropriately depending on
......
......@@ -241,6 +241,7 @@ async def build_async_engine_client_from_engine_args(
)
# Don't keep the dummy data in memory
assert async_llm is not None
await async_llm.reset_mm_cache()
yield async_llm
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment