Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
ad477f82
Commit
ad477f82
authored
Sep 24, 2025
by
zhuwenwen
Browse files
fix scheduler issue in pp + mtp
parent
1e636721
Changes
2
Show whitespace changes
Inline
Side-by-side
Showing
2 changed files
with
5 additions
and
4 deletions
+5
-4
vllm/v1/core/sched/scheduler.py
vllm/v1/core/sched/scheduler.py
+4
-4
vllm/v1/request.py
vllm/v1/request.py
+1
-0
No files found.
vllm/v1/core/sched/scheduler.py
View file @
ad477f82
...
...
@@ -1099,16 +1099,14 @@ class Scheduler(SchedulerInterface):
for
req
in
itertools
.
chain
(
running_reqs
,
resumed_reqs
):
req_id
=
req
.
request_id
req_ids
.
append
(
req_id
)
num_tokens
=
(
num_scheduled_tokens
[
req_id
]
-
len
(
spec_decode_tokens
.
get
(
req_id
,
())))
num_tokens
=
req
.
num_generated_token_ids
if
self
.
use_pp
:
# When using PP, the scheduler sends the sampled tokens back,
# because there's no direct communication between the first-
# stage worker and the last-stage worker. Otherwise, we don't
# need to send the sampled tokens back because the model runner
# will cache them.
token_ids
=
req
.
all_token_ids
[
req
.
num_computed_tokens
:
req
.
num_computed_tokens
+
num_tokens
]
token_ids
=
req
.
all_token_ids
[
-
num_tokens
:]
new_token_ids
.
append
(
token_ids
)
elif
use_connector
:
# When using a KVConnector, we add a placeholder to avoid index
...
...
@@ -1318,6 +1316,7 @@ class Scheduler(SchedulerInterface):
scheduled_spec_token_ids
=
(
scheduler_output
.
scheduled_spec_decode_tokens
.
get
(
req_id
))
request
.
num_generated_token_ids
=
1
if
scheduled_spec_token_ids
:
num_draft_tokens
=
len
(
scheduled_spec_token_ids
)
num_accepted
=
len
(
generated_token_ids
)
-
1
...
...
@@ -1328,6 +1327,7 @@ class Scheduler(SchedulerInterface):
# num_computed_tokens is decreased by the number of rejected
# tokens.
request
.
num_computed_tokens
-=
num_rejected
request
.
num_generated_token_ids
=
len
(
generated_token_ids
)
spec_decoding_stats
=
self
.
make_spec_decoding_stats
(
spec_decoding_stats
,
num_draft_tokens
=
num_draft_tokens
,
...
...
vllm/v1/request.py
View file @
ad477f82
...
...
@@ -85,6 +85,7 @@ class Request:
self
.
num_output_placeholders
=
0
# Used in async scheduling.
self
.
spec_token_ids
:
list
[
int
]
=
[]
self
.
num_computed_tokens
=
0
self
.
num_generated_token_ids
=
0
self
.
cache_salt
:
Optional
[
str
]
=
cache_salt
# Multi-modal related
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment