Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
748625cd
Unverified
Commit
748625cd
authored
Feb 10, 2026
by
Krish Gupta
Committed by
GitHub
Feb 10, 2026
Browse files
[V1][BugFix] Fix EAGLE3 encoder cache miss with disable_chunked_mm_input (#34220)
Signed-off-by:
KrxGu
<
krishom70@gmail.com
>
parent
61413973
Changes
2
Show whitespace changes
Inline
Side-by-side
Showing
2 changed files
with
75 additions
and
1 deletion
+75
-1
tests/v1/core/test_scheduler.py
tests/v1/core/test_scheduler.py
+69
-0
vllm/v1/core/sched/scheduler.py
vllm/v1/core/sched/scheduler.py
+6
-1
No files found.
tests/v1/core/test_scheduler.py
View file @
748625cd
...
@@ -3675,3 +3675,72 @@ def test_abort_request_finished_recving():
...
@@ -3675,3 +3675,72 @@ def test_abort_request_finished_recving():
# verify request is deleted
# verify request is deleted
assert
request
.
request_id
not
in
scheduler
.
requests
assert
request
.
request_id
not
in
scheduler
.
requests
assert
not
scheduler
.
finished_recving_kv_req_ids
assert
not
scheduler
.
finished_recving_kv_req_ids
def test_eagle3_mm_encoder_cache_with_shift():
    """Verify encoder inputs are scheduled once the shifted range hits the MM span.

    Regression test for issue #32469. With EAGLE3 enabled and
    disable_chunked_mm_input=True, the scheduler has to take
    shift_computed_tokens into account when it decides where the scheduled
    token range ends relative to a multimodal placeholder. If it does not,
    encoder inputs are left unscheduled at the boundary and the run fails
    with an "Encoder cache miss" error.
    """
    image_offset = 100
    image_length = 576
    trailing_text_tokens = 100

    scheduler = create_scheduler(
        model="llava-hf/llava-1.5-7b-hf",
        max_num_batched_tokens=1024,
        disable_chunked_mm_input=True,
        max_model_len=2048,
        # Speculative tokens switch EAGLE on; the test assumes a shift of 1.
        num_speculative_tokens=4,
    )

    # One request carrying a single multimodal placeholder.
    placeholder = PlaceholderRange(offset=image_offset, length=image_length)
    request = create_requests(
        num_requests=1,
        num_tokens=image_offset + image_length + trailing_text_tokens,
        mm_positions=[[placeholder]],
    )[0]

    # Explicitly start from zero computed tokens, so the schedule() call
    # below is the first scheduling step for this request.
    request.num_computed_tokens = 0
    scheduler.add_request(request)

    step_output = scheduler.schedule()
    assert step_output is not None

    request_key = request.request_id
    assert request_key in step_output.num_scheduled_tokens

    # Mirror the boundary arithmetic the scheduler is expected to perform.
    shift_computed_tokens = 1
    num_scheduled = step_output.num_scheduled_tokens[request_key]
    start_pos = request.mm_features[0].mm_position.offset
    tokens_end = request.num_computed_tokens + num_scheduled
    scheduled_end_with_shift = tokens_end + shift_computed_tokens

    # Sanity check on the setup: the shifted end must reach into the MM range,
    # otherwise the assertion that follows would not be exercising anything.
    assert scheduled_end_with_shift > start_pos, (
        f"Test setup error: expected to schedule into MM range. "
        f"scheduled_end_with_shift={scheduled_end_with_shift}, "
        f"start_pos={start_pos}"
    )

    # Core expectation: an overlap with the MM range (after applying the
    # EAGLE shift) requires the encoder input to be scheduled in this step.
    assert request_key in step_output.scheduled_encoder_inputs, (
        f"Encoder input missing: scheduled {num_scheduled} tokens "
        f"(computed={request.num_computed_tokens}, end={tokens_end}, "
        f"shifted_end={scheduled_end_with_shift}) overlapping MM at "
        f"{start_pos}. The fix must schedule encoder inputs."
    )
vllm/v1/core/sched/scheduler.py
View file @
748625cd
...
@@ -1155,7 +1155,12 @@ class Scheduler(SchedulerInterface):
...
@@ -1155,7 +1155,12 @@ class Scheduler(SchedulerInterface):
and
(
num_computed_tokens
+
num_new_tokens
)
and
(
num_computed_tokens
+
num_new_tokens
)
<
(
start_pos
+
num_encoder_tokens
)
<
(
start_pos
+
num_encoder_tokens
)
):
):
num_new_tokens
=
start_pos
-
num_computed_tokens
# Account for EAGLE shift when rolling back to avoid
# encoder cache miss. This ensures the scheduled range
# stops before start_pos even with the shift.
num_new_tokens
=
max
(
0
,
start_pos
-
(
num_computed_tokens
+
shift_computed_tokens
)
)
break
break
if
not
self
.
encoder_cache_manager
.
can_allocate
(
if
not
self
.
encoder_cache_manager
.
can_allocate
(
request
,
i
,
encoder_compute_budget
,
num_embeds_to_schedule
request
,
i
,
encoder_compute_budget
,
num_embeds_to_schedule
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment