Unverified commit 67fc16cd, authored by wang.yuqi, committed by GitHub
Browse files

[Bugfix] If chunked_prefill is disabled, end the scheduling early. (#28911)


Signed-off-by: wang.yuqi <yuqi.wang@daocloud.io>
parent 6330f947
...@@ -641,6 +641,34 @@ def test_schedule_concurrent_batches( ...@@ -641,6 +641,34 @@ def test_schedule_concurrent_batches(
scheduler.update_from_output(scheduler_output1, model_runner_output) scheduler.update_from_output(scheduler_output1, model_runner_output)
@pytest.mark.parametrize("enable_chunked_prefill", [True, False])
def test_schedule_order(enable_chunked_prefill: bool):
    """Check how many new requests the first scheduling step admits.

    Two 800-token requests are queued ahead of two 10-token requests
    against a 1024-token batch budget, and schedule() is called once.
    """
    scheduler = create_scheduler(
        max_num_batched_tokens=1024,
        max_num_seqs=3,
        enable_chunked_prefill=enable_chunked_prefill,
    )

    long_requests = create_requests(num_requests=2, num_tokens=800)
    short_requests = create_requests(num_requests=2, num_tokens=10)
    # Long requests are enqueued before the short ones.
    for queued in [*long_requests, *short_requests]:
        scheduler.add_request(queued)

    first_output = scheduler.schedule()
    num_new_reqs = len(first_output.scheduled_new_reqs)

    if not enable_chunked_prefill:
        # With chunked prefill disabled, the scheduler must not skip over
        # a long request in order to schedule the later short requests
        # early, even though some token budget is still left.
        assert num_new_reqs == 1
        return

    # With chunked prefill enabled, long requests are chunked.
    assert num_new_reqs == 2
def test_preempt_during_execution(): def test_preempt_during_execution():
# NOTE(woosuk): The actual number of available blocks is 10 instead of 11 # NOTE(woosuk): The actual number of available blocks is 10 instead of 11
# because block 0 is reserved as the null block. # because block 0 is reserved as the null block.
......
...@@ -42,6 +42,7 @@ def create_scheduler( ...@@ -42,6 +42,7 @@ def create_scheduler(
model: str = "facebook/opt-125m", model: str = "facebook/opt-125m",
max_num_seqs: int = 16, max_num_seqs: int = 16,
max_num_batched_tokens: int = 8192, max_num_batched_tokens: int = 8192,
enable_chunked_prefill: bool = True,
enable_prefix_caching: bool = False, enable_prefix_caching: bool = False,
long_prefill_token_threshold: int = 0, long_prefill_token_threshold: int = 0,
disable_chunked_mm_input: bool = False, disable_chunked_mm_input: bool = False,
...@@ -76,7 +77,7 @@ def create_scheduler( ...@@ -76,7 +77,7 @@ def create_scheduler(
max_model_len=max_model_len, max_model_len=max_model_len,
long_prefill_token_threshold=long_prefill_token_threshold, long_prefill_token_threshold=long_prefill_token_threshold,
disable_chunked_mm_input=disable_chunked_mm_input, disable_chunked_mm_input=disable_chunked_mm_input,
enable_chunked_prefill=True, enable_chunked_prefill=enable_chunked_prefill,
async_scheduling=async_scheduling, async_scheduling=async_scheduling,
) )
model_config = ModelConfig( model_config = ModelConfig(
......
...@@ -508,9 +508,9 @@ class Scheduler(SchedulerInterface): ...@@ -508,9 +508,9 @@ class Scheduler(SchedulerInterface):
not self.scheduler_config.enable_chunked_prefill not self.scheduler_config.enable_chunked_prefill
and num_new_tokens > token_budget and num_new_tokens > token_budget
): ):
self.waiting.pop_request() # If chunked_prefill is disabled,
skipped_waiting_requests.prepend_request(request) # we can stop the scheduling here.
continue break
num_new_tokens = min(num_new_tokens, token_budget) num_new_tokens = min(num_new_tokens, token_budget)
assert num_new_tokens > 0 assert num_new_tokens > 0
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment