Unverified commit 20e4497b, authored by Cyrus Leung, committed by GitHub
Browse files

[V0 Deprecation] Remove `num_lookahead_slots` (#29000)


Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
Co-authored-by: Michael Goin <mgoin64@gmail.com>
parent 1c7bcc55
......@@ -62,15 +62,6 @@ class SchedulerConfig:
"""For chunked prefill, a request is considered long if the prompt is
longer than this number of tokens."""
num_lookahead_slots: int = Field(default=0, ge=0)
"""The number of slots to allocate per sequence per
step, beyond the known token ids. This is used in speculative
decoding to store KV activations of tokens which may or may not be
accepted.
NOTE: This will be replaced by speculative config in the future; it is
present to enable correctness tests until then."""
enable_chunked_prefill: bool = True
"""If True, prefill requests can be chunked based
on the remaining `max_num_batched_tokens`.
......
......@@ -634,16 +634,6 @@ class SpeculativeConfig:
return self
@property
def num_lookahead_slots(self) -> int:
    """Extra slots the scheduler should allocate per step.

    These come on top of the slots allocated for each known token. Every
    speculative token has to be scored, so the value is simply the number
    of speculative tokens.
    """
    extra_slots = self.num_speculative_tokens
    return extra_slots
def use_eagle(self) -> bool:
    """Return True when `self.method` is "eagle", "eagle3" or "mtp"."""
    accepted_methods = ("eagle", "eagle3", "mtp")
    return self.method in accepted_methods
......
......@@ -488,7 +488,6 @@ class EngineArgs:
ray_workers_use_nsight: bool = ParallelConfig.ray_workers_use_nsight
num_gpu_blocks_override: int | None = CacheConfig.num_gpu_blocks_override
num_lookahead_slots: int = SchedulerConfig.num_lookahead_slots
model_loader_extra_config: dict = get_field(LoadConfig, "model_loader_extra_config")
ignore_patterns: str | list[str] = get_field(LoadConfig, "ignore_patterns")
......@@ -1081,9 +1080,6 @@ class EngineArgs:
"--long-prefill-token-threshold",
**scheduler_kwargs["long_prefill_token_threshold"],
)
scheduler_group.add_argument(
"--num-lookahead-slots", **scheduler_kwargs["num_lookahead_slots"]
)
# multi-step scheduling has been removed; corresponding arguments
# are no longer supported.
scheduler_group.add_argument(
......@@ -1653,18 +1649,11 @@ class EngineArgs:
target_parallel_config=parallel_config,
)
# make sure num_lookahead_slots is set appropriately depending on
# whether speculative decoding is enabled
num_lookahead_slots = self.num_lookahead_slots
if speculative_config is not None:
num_lookahead_slots = speculative_config.num_lookahead_slots
scheduler_config = SchedulerConfig(
runner_type=model_config.runner_type,
max_num_batched_tokens=self.max_num_batched_tokens,
max_num_seqs=self.max_num_seqs,
max_model_len=model_config.max_model_len,
num_lookahead_slots=num_lookahead_slots,
enable_chunked_prefill=self.enable_chunked_prefill,
disable_chunked_mm_input=self.disable_chunked_mm_input,
is_multimodal_model=model_config.is_multimodal_model,
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment