Unverified commit c5830381, authored by Woosuk Kwon, committed by GitHub
Browse files

[V0 Deprecation] Remove args for multi-step scheduling (#22779)


Signed-off-by: Woosuk Kwon <woosuk@thinkingmachines.ai>
parent d31f97cf
...@@ -161,7 +161,6 @@ def parser_with_config(): ...@@ -161,7 +161,6 @@ def parser_with_config():
parser.add_argument('--port', type=int) parser.add_argument('--port', type=int)
parser.add_argument('--tensor-parallel-size', type=int) parser.add_argument('--tensor-parallel-size', type=int)
parser.add_argument('--trust-remote-code', action='store_true') parser.add_argument('--trust-remote-code', action='store_true')
parser.add_argument('--multi-step-stream-outputs', action=StoreBoolean)
return parser return parser
......
...@@ -115,12 +115,6 @@ class SchedulerConfig: ...@@ -115,12 +115,6 @@ class SchedulerConfig:
(e.g., beam search), recomputation is not currently supported. In (e.g., beam search), recomputation is not currently supported. In
such a case, we use swapping instead.""" such a case, we use swapping instead."""
num_scheduler_steps: int = 1
"""Maximum number of forward steps per scheduler call."""
multi_step_stream_outputs: bool = True
"""If False, then multi-step will stream outputs at the end of all steps"""
send_delta_data: bool = False send_delta_data: bool = False
"""Private API. If used, scheduler sends delta data to """Private API. If used, scheduler sends delta data to
workers instead of an entire data. It should be enabled only workers instead of an entire data. It should be enabled only
...@@ -193,16 +187,7 @@ class SchedulerConfig: ...@@ -193,16 +187,7 @@ class SchedulerConfig:
if self.max_num_batched_tokens is None: if self.max_num_batched_tokens is None:
if self.enable_chunked_prefill: if self.enable_chunked_prefill:
if self.num_scheduler_steps > 1: self.max_num_batched_tokens = DEFAULT_MAX_NUM_BATCHED_TOKENS
# Multi-step Chunked-Prefill doesn't allow prompt-chunking
# for now. Have max_num_batched_tokens set to max_model_len
# so we don't reject sequences on account of a short
# max_num_batched_tokens.
self.max_num_batched_tokens = max(
self.max_model_len, DEFAULT_MAX_NUM_BATCHED_TOKENS)
else:
self.max_num_batched_tokens = (
DEFAULT_MAX_NUM_BATCHED_TOKENS)
else: else:
# If max_model_len is too short, use # If max_model_len is too short, use
# DEFAULT_MAX_NUM_BATCHED_TOKENS as the default value # DEFAULT_MAX_NUM_BATCHED_TOKENS as the default value
...@@ -293,12 +278,6 @@ class SchedulerConfig: ...@@ -293,12 +278,6 @@ class SchedulerConfig:
f"({self.num_lookahead_slots}) must be greater than or " f"({self.num_lookahead_slots}) must be greater than or "
"equal to 0.") "equal to 0.")
if self.num_scheduler_steps < 1:
raise ValueError(
"num_scheduler_steps "
f"({self.num_scheduler_steps}) must be greater than or "
"equal to 1.")
if self.max_num_partial_prefills < 1: if self.max_num_partial_prefills < 1:
raise ValueError( raise ValueError(
f"max_num_partial_prefills ({self.max_num_partial_prefills}) " f"max_num_partial_prefills ({self.max_num_partial_prefills}) "
...@@ -323,7 +302,3 @@ class SchedulerConfig: ...@@ -323,7 +302,3 @@ class SchedulerConfig:
f"max_num_partial_prefills ({self.max_num_partial_prefills}).") f"max_num_partial_prefills ({self.max_num_partial_prefills}).")
return self return self
@property
def is_multi_step(self) -> bool:
return self.num_scheduler_steps > 1
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment