Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
938a8169
Unverified
Commit
938a8169
authored
Nov 04, 2025
by
Nick Hill
Committed by
GitHub
Nov 04, 2025
Browse files
[AsyncScheduling] Don't schedule past request max_tokens (#27922)
Signed-off-by: Nick Hill <nhill@redhat.com>
parent
c9f66da8
Changes
3
Hide whitespace changes
Inline
Side-by-side
Showing
3 changed files
with
14 additions
and
4 deletions
+14
-4
tests/v1/core/test_async_scheduler.py
tests/v1/core/test_async_scheduler.py
+7
-0
tests/v1/e2e/test_spec_decode.py
tests/v1/e2e/test_spec_decode.py
+0
-1
vllm/v1/core/sched/scheduler.py
vllm/v1/core/sched/scheduler.py
+7
-3
No files found.
tests/v1/core/test_async_scheduler.py
View file @
938a8169
...
@@ -34,15 +34,20 @@ def test_stop_by_max_tokens(max_tokens: int):
...
@@ -34,15 +34,20 @@ def test_stop_by_max_tokens(max_tokens: int):
requests
=
create_requests
(
num_requests
=
2
,
max_tokens
=
max_tokens
)
requests
=
create_requests
(
num_requests
=
2
,
max_tokens
=
max_tokens
)
req0
,
req1
=
requests
req0
,
req1
=
requests
expected_total_num_scheduled_tokens
=
0
sched_outputs
:
deque
[
SchedulerOutput
]
=
deque
()
sched_outputs
:
deque
[
SchedulerOutput
]
=
deque
()
scheduler
.
add_request
(
req0
)
scheduler
.
add_request
(
req0
)
sched_outputs
.
append
(
scheduler
.
schedule
())
sched_outputs
.
append
(
scheduler
.
schedule
())
expected_total_num_scheduled_tokens
+=
req0
.
num_prompt_tokens
+
max_tokens
-
1
scheduler
.
add_request
(
req1
)
scheduler
.
add_request
(
req1
)
sched_outputs
.
append
(
scheduler
.
schedule
())
sched_outputs
.
append
(
scheduler
.
schedule
())
expected_total_num_scheduled_tokens
+=
req1
.
num_prompt_tokens
+
max_tokens
-
1
total_num_scheduled_tokens
=
0
while
sched_outputs
:
while
sched_outputs
:
sched_output
=
sched_outputs
.
popleft
()
sched_output
=
sched_outputs
.
popleft
()
total_num_scheduled_tokens
+=
sched_output
.
total_num_scheduled_tokens
model_runner_output
=
_make_model_runner_output
(
sched_output
)
model_runner_output
=
_make_model_runner_output
(
sched_output
)
scheduler
.
update_from_output
(
sched_output
,
model_runner_output
)
scheduler
.
update_from_output
(
sched_output
,
model_runner_output
)
...
@@ -53,6 +58,8 @@ def test_stop_by_max_tokens(max_tokens: int):
...
@@ -53,6 +58,8 @@ def test_stop_by_max_tokens(max_tokens: int):
assert
scheduler
.
get_num_unfinished_requests
()
==
0
assert
scheduler
.
get_num_unfinished_requests
()
==
0
assert
req0
.
num_output_tokens
==
max_tokens
assert
req0
.
num_output_tokens
==
max_tokens
assert
req1
.
num_output_tokens
==
max_tokens
assert
req1
.
num_output_tokens
==
max_tokens
# Ensure we aren't scheduling more tokens than necessary.
assert
total_num_scheduled_tokens
==
expected_total_num_scheduled_tokens
def
test_abort
():
def
test_abort
():
...
...
tests/v1/e2e/test_spec_decode.py
View file @
938a8169
...
@@ -155,7 +155,6 @@ def test_suffix_decoding_acceptance(
...
@@ -155,7 +155,6 @@ def test_suffix_decoding_acceptance(
)
)
# Run several times and check that the accepted tokens increase.
# Run several times and check that the accepted tokens increase.
spec_llm
.
chat
(
test_prompts
,
sampling_config
)
num_draft
=
[]
num_draft
=
[]
num_accept
=
[]
num_accept
=
[]
for
i
in
range
(
10
):
# Run multiple times to warm up the cache.
for
i
in
range
(
10
):
# Run multiple times to warm up the cache.
...
...
vllm/v1/core/sched/scheduler.py
View file @
938a8169
...
@@ -217,10 +217,14 @@ class Scheduler(SchedulerInterface):
...
@@ -217,10 +217,14 @@ class Scheduler(SchedulerInterface):
num_new_tokens
=
self
.
scheduler_config
.
long_prefill_token_threshold
num_new_tokens
=
self
.
scheduler_config
.
long_prefill_token_threshold
num_new_tokens
=
min
(
num_new_tokens
,
token_budget
)
num_new_tokens
=
min
(
num_new_tokens
,
token_budget
)
# Make sure the input position does not exceed the max model len.
# Make sure the input position does not exceed the max model len or
# This is necessary when using spec decoding.
# request's max_tokens.
# This is necessary when using spec decoding and/or async scheduling.
max_total_tokens
=
min
(
request
.
num_prompt_tokens
+
request
.
max_tokens
,
self
.
max_model_len
)
num_new_tokens
=
min
(
num_new_tokens
=
min
(
num_new_tokens
,
self
.
max_model_len
-
1
-
request
.
num_computed_tokens
num_new_tokens
,
max_total_tokens
-
1
-
request
.
num_computed_tokens
)
)
# Schedule encoder inputs.
# Schedule encoder inputs.
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment