Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
47604137
Unverified
Commit
47604137
authored
Nov 08, 2025
by
Andy Lo
Committed by
GitHub
Nov 08, 2025
Browse files
[Bugfix] Spec decode + structured output + spec model max len edge case (#28298)
Signed-off-by: Andy Lo <andy@mistral.ai>
parent
26990d25
Changes
3
Show whitespace changes
Inline
Side-by-side
Showing 3 changed files with 36 additions and 8 deletions
+36
-8
tests/v1/spec_decode/test_max_len.py
tests/v1/spec_decode/test_max_len.py
+30
-3
vllm/v1/core/sched/scheduler.py
vllm/v1/core/sched/scheduler.py
+4
-4
vllm/v1/structured_output/__init__.py
vllm/v1/structured_output/__init__.py
+2
-1
No files found.
tests/v1/spec_decode/test_max_len.py
View file @
47604137
...
...
@@ -7,6 +7,7 @@ import pytest
from
tests.utils
import
get_attn_backend_list_based_on_platform
from
vllm
import
LLM
,
SamplingParams
from
vllm.platforms
import
current_platform
from
vllm.sampling_params
import
StructuredOutputsParams
_PROMPTS
=
[
"1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1"
,
...
...
@@ -56,8 +57,34 @@ def test_eagle_max_len(
"method"
:
"eagle"
,
"model"
:
"yuhuili/EAGLE-LLaMA3-Instruct-8B"
,
"num_speculative_tokens"
:
num_speculative_tokens
,
"max_model_len"
:
80
,
},
max_model_len
=
100
,
max_model_len
=
200
,
)
sampling_params
=
SamplingParams
(
max_tokens
=
100
,
ignore_eos
=
True
)
llm
.
generate
(
_PROMPTS
,
sampling_params
)
sampling_params
=
SamplingParams
(
max_tokens
=
200
,
ignore_eos
=
True
)
outputs
=
llm
.
generate
(
_PROMPTS
,
sampling_params
)
for
o
in
outputs
:
assert
o
.
outputs
[
0
].
finish_reason
==
"length"
,
(
"This test is only meaningful if the output "
"is truncated due to max length"
)
sampling_params
=
SamplingParams
(
max_tokens
=
200
,
structured_outputs
=
StructuredOutputsParams
(
regex
=
"^"
+
"a b c d e "
*
15
+
"$"
),
)
output
=
llm
.
generate
(
_PROMPTS
,
sampling_params
)
for
o
in
output
:
assert
o
.
prompt_token_ids
is
not
None
assert
(
len
(
o
.
prompt_token_ids
)
<
80
<
len
(
o
.
prompt_token_ids
)
+
len
(
o
.
outputs
[
0
].
token_ids
)
<
200
),
(
"This test is only meaningful if the output "
"is longer than the eagle max length"
)
assert
o
.
outputs
[
0
].
text
==
"a b c d e "
*
15
vllm/v1/core/sched/scheduler.py
View file @
47604137
...
...
@@ -325,6 +325,9 @@ class Scheduler(SchedulerInterface):
scheduled_spec_decode_tokens
[
request
.
request_id
]
=
(
request
.
spec_token_ids
)
# New spec tokens will be set in `update_draft_token_ids` before the
# next step when applicable.
request
.
spec_token_ids
=
[]
# Encoder-related.
if
encoder_inputs_to_schedule
:
...
...
@@ -1149,10 +1152,7 @@ class Scheduler(SchedulerInterface):
continue
# Add newly generated spec token ids to the request.
if
not
spec_token_ids
:
# NOTE(woosuk): request.spec_token_ids should be updated.
request
.
spec_token_ids
.
clear
()
elif
self
.
structured_output_manager
.
should_advance
(
request
):
if
self
.
structured_output_manager
.
should_advance
(
request
):
metadata
=
request
.
structured_output_request
request
.
spec_token_ids
=
metadata
.
grammar
.
validate_tokens
(
# type: ignore[union-attr]
spec_token_ids
...
...
vllm/v1/structured_output/__init__.py
View file @
47604137
...
...
@@ -269,9 +269,10 @@ class StructuredOutputManager:
and
token
is
not
None
and
not
structured_output_request
.
grammar
.
is_terminated
()
):
assert
structured_output_request
.
grammar
.
accept_tokens
(
accepted
=
structured_output_request
.
grammar
.
accept_tokens
(
req_id
,
[
token
]
)
assert
accepted
,
(
token
,
req_id
,
scheduled_spec_decode_tokens
)
state_advancements
+=
1
cumulative_index
+=
1
if
state_advancements
>
0
:
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please register or sign in to comment