Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
02859973
Unverified
Commit
02859973
authored
Jan 10, 2026
by
Or Ozeri
Committed by
GitHub
Jan 10, 2026
Browse files
[BugFix] scheduler: Fix resuming of preempted requests after async load (#31583)
Signed-off-by: Or Ozeri <oro@il.ibm.com>
parent
d1fd802f
Changes
3
Hide whitespace changes
Inline
Side-by-side
Showing 3 changed files with 28 additions and 4 deletions
+28
-4
tests/v1/core/test_scheduler.py
tests/v1/core/test_scheduler.py
+21
-2
vllm/v1/core/sched/scheduler.py
vllm/v1/core/sched/scheduler.py
+6
-1
vllm/v1/request.py
vllm/v1/request.py
+1
-1
No files found.
tests/v1/core/test_scheduler.py
View file @
02859973
...
...
@@ -1261,10 +1261,11 @@ def test_kv_connector_unable_to_allocate(use_ec_connector, ec_role):
assert
len
(
scheduler
.
waiting
)
==
0
@
pytest
.
mark
.
parametrize
(
"is_async"
,
[
False
,
True
])
@
pytest
.
mark
.
parametrize
(
"use_ec_connector, ec_role"
,
[(
False
,
None
),
(
True
,
"ec_consumer"
)]
)
def
test_kv_connector_handles_preemption
(
use_ec_connector
,
ec_role
):
def
test_kv_connector_handles_preemption
(
is_async
,
use_ec_connector
,
ec_role
):
"""
Test whether scheduler with KVConnector is able to handle
unable to allocate (run out of blocks in allocate_slots().
...
...
@@ -1277,7 +1278,9 @@ def test_kv_connector_handles_preemption(use_ec_connector, ec_role):
NUM_MATCHED_NEW_TOKENS
=
BLOCK_SIZE
scheduler
=
create_scheduler
(
enable_prefix_caching
=
True
,
use_kv_connector
=
mock_kv
(
matched_tokens
=
NUM_MATCHED_NEW_TOKENS
,
is_async
=
False
),
use_kv_connector
=
mock_kv
(
matched_tokens
=
NUM_MATCHED_NEW_TOKENS
,
is_async
=
is_async
),
block_size
=
BLOCK_SIZE
,
num_blocks
=
NUM_BLOCKS
,
# encoder connector should not affect test results
...
...
@@ -1315,6 +1318,12 @@ def test_kv_connector_handles_preemption(use_ec_connector, ec_role):
# All can be scheduled - 1st token.
output
=
scheduler
.
schedule
()
if
is_async
:
assert
len
(
scheduler
.
waiting
)
==
2
assert
scheduler
.
running
==
[]
_step_until_kv_transfer_finished
(
scheduler
,
req_ids
)
output
=
scheduler
.
schedule
()
_assert_right_scheduler_output
(
output
,
# 2 remote kv cache hits.
...
...
@@ -1367,6 +1376,12 @@ def test_kv_connector_handles_preemption(use_ec_connector, ec_role):
# Restarts the preempted request - generate 3rd token.
# This will have a local and remote cache hit.
output
=
scheduler
.
schedule
()
if
is_async
:
waiting_req_ids
=
[
req
.
request_id
for
req
in
scheduler
.
waiting
]
assert
len
(
waiting_req_ids
)
==
1
_step_until_kv_transfer_finished
(
scheduler
,
waiting_req_ids
)
output
=
scheduler
.
schedule
()
_assert_right_scheduler_output
(
output
,
# 1 remote kv_cache hit!
...
...
@@ -1377,6 +1392,8 @@ def test_kv_connector_handles_preemption(use_ec_connector, ec_role):
)
assert
len
(
scheduler
.
running
)
==
1
assert
len
(
scheduler
.
waiting
)
==
0
assert
output
.
scheduled_cached_reqs
.
num_reqs
==
1
assert
output
.
scheduled_new_reqs
==
[]
_
=
scheduler
.
update_from_output
(
output
,
MODEL_RUNNER_OUTPUT
)
assert
len
(
scheduler
.
running
)
==
1
assert
len
(
scheduler
.
waiting
)
==
0
...
...
@@ -1389,6 +1406,8 @@ def test_kv_connector_handles_preemption(use_ec_connector, ec_role):
num_requests
=
0
,
expected_num_scheduled_tokens
=
1
,
)
assert
output
.
scheduled_cached_reqs
.
num_reqs
==
1
assert
output
.
scheduled_new_reqs
==
[]
assert
len
(
scheduler
.
running
)
==
1
_
=
scheduler
.
update_from_output
(
output
,
MODEL_RUNNER_OUTPUT
)
assert
len
(
scheduler
.
running
)
==
0
...
...
vllm/v1/core/sched/scheduler.py
View file @
02859973
...
...
@@ -445,7 +445,12 @@ class Scheduler(SchedulerInterface):
if
request
.
status
==
RequestStatus
.
WAITING_FOR_REMOTE_KVS
:
is_ready
=
self
.
_update_waiting_for_remote_kv
(
request
)
if
is_ready
:
request
.
status
=
RequestStatus
.
WAITING
if
request
.
num_preemptions
:
# We must be loading for a resumed preemption
# rather than a new request.
request
.
status
=
RequestStatus
.
PREEMPTED
else
:
request
.
status
=
RequestStatus
.
WAITING
else
:
logger
.
debug
(
"%s is still in WAITING_FOR_REMOTE_KVS state."
,
...
...
vllm/v1/request.py
View file @
02859973
...
...
@@ -123,7 +123,7 @@ class Request:
# indicates that the output is corrupted
self
.
num_nans_in_logits
=
0
# The number of requests being preempted by the scheduler
# The number of times this request has been preempted by the scheduler.
self
.
num_preemptions
=
0
# The number of tokens that have been computed remotely.
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please register or sign in to comment