Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
6b7cdbf4
Commit
6b7cdbf4
authored
May 09, 2026
by
王敏
Browse files
优化pp+mtp代码
parent
19f117d8
Changes
3
Hide whitespace changes
Inline
Side-by-side
Showing
3 changed files
with
19 additions
and
25 deletions
+19
-25
vllm/model_executor/models/deepseek_mtp.py
vllm/model_executor/models/deepseek_mtp.py
+3
-2
vllm/v1/core/sched/scheduler.py
vllm/v1/core/sched/scheduler.py
+1
-2
vllm/v1/worker/gpu_model_runner.py
vllm/v1/worker/gpu_model_runner.py
+15
-21
No files found.
vllm/model_executor/models/deepseek_mtp.py
View file @
6b7cdbf4
...
@@ -266,6 +266,7 @@ class DeepSeekMTP(nn.Module, DeepseekV2MixtureOfExperts, SupportsPP):
...
@@ -266,6 +266,7 @@ class DeepSeekMTP(nn.Module, DeepseekV2MixtureOfExperts, SupportsPP):
os
.
environ
[
'LLAMA_NN'
]
=
'0'
os
.
environ
[
'LLAMA_NN'
]
=
'0'
os
.
environ
[
'LM_NN'
]
=
'0'
os
.
environ
[
'LM_NN'
]
=
'0'
self
.
use_llama_nn
=
os
.
environ
.
get
(
'LLAMA_NN'
)
==
'1'
self
.
use_llama_nn
=
os
.
environ
.
get
(
'LLAMA_NN'
)
==
'1'
self
.
use_pp
=
vllm_config
.
parallel_config
.
pipeline_parallel_size
>
1
def
set_moe_parameters
(
self
):
def
set_moe_parameters
(
self
):
...
@@ -351,8 +352,8 @@ class DeepSeekMTP(nn.Module, DeepseekV2MixtureOfExperts, SupportsPP):
...
@@ -351,8 +352,8 @@ class DeepSeekMTP(nn.Module, DeepseekV2MixtureOfExperts, SupportsPP):
spec_layer
=
get_spec_layer_idx_from_weight_name
(
self
.
config
,
name
)
spec_layer
=
get_spec_layer_idx_from_weight_name
(
self
.
config
,
name
)
if
spec_layer
is
None
:
if
spec_layer
is
None
:
# load embed_tokens weight from target model i
f mtp weights missing embed_tokens
# load embed_tokens weight from target model i
n pp mode
if
"embed_tokens"
in
name
:
if
"embed_tokens"
in
name
and
self
.
use_pp
:
for
local_name
in
params_dict
.
keys
():
for
local_name
in
params_dict
.
keys
():
if
"embed_tokens"
in
local_name
:
if
"embed_tokens"
in
local_name
:
param
=
params_dict
[
local_name
]
param
=
params_dict
[
local_name
]
...
...
vllm/v1/core/sched/scheduler.py
View file @
6b7cdbf4
...
@@ -1657,11 +1657,10 @@ class Scheduler(SchedulerInterface):
...
@@ -1657,11 +1657,10 @@ class Scheduler(SchedulerInterface):
for
idx
,
req
in
enumerate
(
itertools
.
chain
(
running_reqs
,
resumed_reqs
)):
for
idx
,
req
in
enumerate
(
itertools
.
chain
(
running_reqs
,
resumed_reqs
)):
req_id
=
req
.
request_id
req_id
=
req
.
request_id
req_ids
.
append
(
req_id
)
req_ids
.
append
(
req_id
)
#if self.use_pp:
# NOTE: In PP+async scheduling, we consume token ids via a direct GPU
# NOTE: In PP+async scheduling, we consume token ids via a direct GPU
# broadcast path (`input_batch.prev_sampled_token_ids`), so we can
# broadcast path (`input_batch.prev_sampled_token_ids`), so we can
# omit this payload.
# omit this payload.
if
self
.
use_pp
and
not
self
.
scheduler_config
.
async_scheduling
:
if
self
.
use_pp
:
# When using PP, the scheduler sends the sampled tokens back,
# When using PP, the scheduler sends the sampled tokens back,
# because there's no direct communication between the first-
# because there's no direct communication between the first-
# stage worker and the last-stage worker. Otherwise, we don't
# stage worker and the last-stage worker. Otherwise, we don't
...
...
vllm/v1/worker/gpu_model_runner.py
View file @
6b7cdbf4
...
@@ -931,7 +931,7 @@ class GPUModelRunner(
...
@@ -931,7 +931,7 @@ class GPUModelRunner(
The SamplingMetadata is updated and copied to the GPU if there is a
The SamplingMetadata is updated and copied to the GPU if there is a
new/resumed/paused/finished request in the batch.
new/resumed/paused/finished request in the batch.
"""
"""
if
scheduler_output
.
total_num_scheduled_tokens
==
0
:
if
scheduler_output
.
total_num_scheduled_tokens
==
0
and
not
self
.
use_async_scheduling
:
return
return
# Remove finished requests from the cached states.
# Remove finished requests from the cached states.
for
req_id
in
scheduler_output
.
finished_req_ids
:
for
req_id
in
scheduler_output
.
finished_req_ids
:
...
@@ -1092,26 +1092,20 @@ class GPUModelRunner(
...
@@ -1092,26 +1092,20 @@ class GPUModelRunner(
req_state
.
num_computed_tokens
=
num_computed_tokens
req_state
.
num_computed_tokens
=
num_computed_tokens
if
not
is_last_rank
:
if
not
is_last_rank
:
if
not
req_data
.
new_token_ids
:
# When using PP, the scheduler sends the sampled tokens back,
# Async scheduled PP: Sampled tokens propagated via GPU broadcast.
# because there's no direct communication between the first-
new_token_ids
:
list
[
int
]
=
[]
# stage worker and the last-stage worker.
else
:
new_token_ids
=
req_data
.
new_token_ids
[
i
]
# Non-async scheduling with PP: The scheduler sends
# Add the sampled token(s) from the previous step (if any).
# sampled token ids back because there's no direct communication
# This doesn't include "unverified" tokens like spec tokens.
# between the first-stage worker and the last-stage worker.
num_new_tokens
=
(
new_token_ids
=
req_data
.
new_token_ids
[
i
]
num_computed_tokens
+
len
(
new_token_ids
)
-
req_state
.
num_tokens
# Add the sampled token(s) from the previous step (if any).
)
# This doesn't include "unverified" tokens like spec tokens.
if
num_new_tokens
==
1
:
num_new_tokens
=
(
# Avoid slicing list in most common case.
num_computed_tokens
+
len
(
new_token_ids
)
-
req_state
.
num_tokens
req_state
.
output_token_ids
.
append
(
new_token_ids
[
-
1
])
)
elif
num_new_tokens
>
0
:
if
num_new_tokens
==
1
:
req_state
.
output_token_ids
.
extend
(
new_token_ids
[
-
num_new_tokens
:])
# Avoid slicing list in most common case.
req_state
.
output_token_ids
.
append
(
new_token_ids
[
-
1
])
elif
num_new_tokens
>
0
:
req_state
.
output_token_ids
.
extend
(
new_token_ids
[
-
num_new_tokens
:]
)
elif
num_output_tokens
<
len
(
req_state
.
output_token_ids
):
elif
num_output_tokens
<
len
(
req_state
.
output_token_ids
):
# Some output tokens were discarded due to a sync-KV-load
# Some output tokens were discarded due to a sync-KV-load
# failure. Align the cached state.
# failure. Align the cached state.
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment