Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
55f7b089
Commit
55f7b089
authored
Nov 03, 2025
by
zhuwenwen
Browse files
Merge branch 'v0.9.2-dev-ds' of
http://10.16.6.30/dcutoolkit/deeplearing/vllm
into v0.9.2-dev-ds
parents
5ca1259e
ab485158
Changes
23
Hide whitespace changes
Inline
Side-by-side
Showing
3 changed files
with
33 additions
and
18 deletions
+33
-18
vllm/model_executor/models/deepseek_v2.py
vllm/model_executor/models/deepseek_v2.py
+27
-12
vllm/v1/spec_decode/eagle.py
vllm/v1/spec_decode/eagle.py
+5
-2
vllm/v1/worker/gpu_model_runner.py
vllm/v1/worker/gpu_model_runner.py
+1
-4
No files found.
vllm/model_executor/models/deepseek_v2.py
View file @
55f7b089
...
@@ -43,8 +43,8 @@ from vllm.distributed import (get_ep_group, get_pp_group, get_dp_group,
...
@@ -43,8 +43,8 @@ from vllm.distributed import (get_ep_group, get_pp_group, get_dp_group,
get_tensor_model_parallel_world_size
)
get_tensor_model_parallel_world_size
)
from
vllm.model_executor.layers.activation
import
SiluAndMul
from
vllm.model_executor.layers.activation
import
SiluAndMul
from
vllm.model_executor.layers.fused_moe
import
FusedMoE
from
vllm.model_executor.layers.fused_moe
import
FusedMoE
from
vllm.model_executor.layers.fused_moe.
ep
_moe.layer
import
EP
MoE
from
vllm.model_executor.layers.fused_moe.
mori
_moe.layer
import
Mori
MoE
from
vllm.model_executor.layers.fused_moe.
ep
_moe.ep_moe_utlis
import
EPSharedExperts
from
vllm.model_executor.layers.fused_moe.
mori
_moe.ep_moe_utlis
import
EPSharedExperts
from
vllm.model_executor.layers.layernorm
import
RMSNorm
from
vllm.model_executor.layers.layernorm
import
RMSNorm
from
vllm.model_executor.layers.linear
import
(
ColumnParallelLinear
,
from
vllm.model_executor.layers.linear
import
(
ColumnParallelLinear
,
MergedColumnParallelLinear
,
MergedColumnParallelLinear
,
...
@@ -167,9 +167,10 @@ class DeepseekV2MoE(nn.Module):
...
@@ -167,9 +167,10 @@ class DeepseekV2MoE(nn.Module):
self
.
n_local_physical_experts
)
self
.
n_local_physical_experts
)
dp_size
=
get_dp_group
().
world_size
dp_size
=
get_dp_group
().
world_size
self
.
use_mori_ep
=
envs
.
VLLM_USE_MORI_EP
and
dp_size
>
1
and
parallel_config
.
enable_expert_parallel
self
.
use_mori_ep
=
parallel_config
.
enable_expert_parallel
and
dp_size
>
1
and
envs
.
VLLM_ALL2ALL_BACKEND
==
'mori'
self
.
enable_expert_parallel
=
parallel_config
.
enable_expert_parallel
moe_cls
=
FusedMoE
if
not
self
.
use_mori_ep
else
EP
MoE
moe_cls
=
FusedMoE
if
not
self
.
use_mori_ep
else
Mori
MoE
self
.
experts
=
moe_cls
(
self
.
experts
=
moe_cls
(
num_experts
=
config
.
n_routed_experts
,
num_experts
=
config
.
n_routed_experts
,
top_k
=
config
.
num_experts_per_tok
,
top_k
=
config
.
num_experts_per_tok
,
...
@@ -224,12 +225,12 @@ class DeepseekV2MoE(nn.Module):
...
@@ -224,12 +225,12 @@ class DeepseekV2MoE(nn.Module):
# router_logits: (num_tokens, n_experts)
# router_logits: (num_tokens, n_experts)
router_logits
,
_
=
self
.
gate
(
hidden_states
)
router_logits
,
_
=
self
.
gate
(
hidden_states
)
if
not
self
.
use_mori_ep
:
if
not
self
.
enable_expert_parallel
:
if
envs
.
VLLM_USE_LIGHTOP_MOE_SUM_MUL_ADD
:
if
envs
.
VLLM_USE_LIGHTOP_MOE_SUM_MUL_ADD
:
final_hidden_states
=
self
.
experts
(
final_hidden_states
=
self
.
experts
(
hidden_states
=
hidden_states
,
hidden_states
=
hidden_states
,
router_logits
=
router_logits
,
router_logits
=
router_logits
,
shared_output
=
shared_output
)
shared_output
=
shared_output
)
else
:
else
:
if
hidden_states
.
dtype
!=
torch
.
float16
:
if
hidden_states
.
dtype
!=
torch
.
float16
:
final_hidden_states
=
self
.
experts
(
final_hidden_states
=
self
.
experts
(
...
@@ -248,8 +249,22 @@ class DeepseekV2MoE(nn.Module):
...
@@ -248,8 +249,22 @@ class DeepseekV2MoE(nn.Module):
# See DeepseekV2DecoderLayer for more details.
# See DeepseekV2DecoderLayer for more details.
final_hidden_states
=
final_hidden_states
+
shared_output
\
final_hidden_states
=
final_hidden_states
+
shared_output
\
*
(
1.
/
self
.
routed_scaling_factor
)
*
(
1.
/
self
.
routed_scaling_factor
)
else
:
else
:
final_hidden_states
=
self
.
experts
(
hidden_states
=
hidden_states
,
if
not
self
.
use_mori_ep
:
final_hidden_states
=
self
.
experts
(
hidden_states
=
hidden_states
,
router_logits
=
router_logits
)
if
shared_output
is
not
None
:
if
hidden_states
.
dtype
!=
torch
.
float16
:
final_hidden_states
=
final_hidden_states
+
shared_output
else
:
# Fix FP16 overflow
# See DeepseekV2DecoderLayer for more details.
final_hidden_states
=
final_hidden_states
+
shared_output
\
*
(
1.
/
self
.
routed_scaling_factor
)
else
:
final_hidden_states
=
self
.
experts
(
hidden_states
=
hidden_states
,
router_logits
=
router_logits
)
router_logits
=
router_logits
)
if
not
self
.
use_mori_ep
:
if
not
self
.
use_mori_ep
:
...
@@ -927,7 +942,7 @@ class DeepseekV2ForCausalLM(nn.Module, SupportsPP, MixtureOfExperts):
...
@@ -927,7 +942,7 @@ class DeepseekV2ForCausalLM(nn.Module, SupportsPP, MixtureOfExperts):
parallel_config
=
vllm_config
.
parallel_config
parallel_config
=
vllm_config
.
parallel_config
dp_size
=
get_dp_group
().
world_size
dp_size
=
get_dp_group
().
world_size
self
.
use_mori_ep
=
envs
.
VLLM_
USE_MORI_EP
and
dp_size
>
1
and
parallel_config
.
enable_expert_parallel
self
.
use_mori_ep
=
envs
.
VLLM_
ALL2ALL_BACKEND
==
'mori'
and
dp_size
>
1
and
parallel_config
.
enable_expert_parallel
def
set_eplb_state
(
def
set_eplb_state
(
self
,
self
,
...
@@ -1173,4 +1188,4 @@ def get_spec_layer_idx_from_weight_name(config: PretrainedConfig,
...
@@ -1173,4 +1188,4 @@ def get_spec_layer_idx_from_weight_name(config: PretrainedConfig,
for
i
in
range
(
config
.
num_nextn_predict_layers
):
for
i
in
range
(
config
.
num_nextn_predict_layers
):
if
weight_name
.
startswith
(
f
"model.layers.
{
layer_idx
+
i
}
."
):
if
weight_name
.
startswith
(
f
"model.layers.
{
layer_idx
+
i
}
."
):
return
layer_idx
+
i
return
layer_idx
+
i
return
None
return
None
\ No newline at end of file
vllm/v1/spec_decode/eagle.py
View file @
55f7b089
...
@@ -513,8 +513,11 @@ class EagleProposer:
...
@@ -513,8 +513,11 @@ class EagleProposer:
self
.
hidden_states
[:
num_tokens
],
self
.
hidden_states
[:
num_tokens
],
)
)
if
self
.
dp_size
>
1
and
self
.
enable_expert_parallel
and
self
.
num_speculative_tokens
>
1
:
if
self
.
dp_size
>
1
and
self
.
enable_expert_parallel
and
self
.
num_speculative_tokens
>
1
:
for
_
in
range
(
self
.
num_speculative_tokens
-
1
):
for
_
in
range
(
self
.
num_speculative_tokens
-
1
):
with
set_forward_context
(
attn_metadata
,
self
.
vllm_config
,
num_tokens
=
num_tokens
):
self
.
model
(
self
.
model
(
self
.
input_ids
[:
num_tokens
],
self
.
input_ids
[:
num_tokens
],
self
.
positions
[:
num_tokens
],
self
.
positions
[:
num_tokens
],
...
...
vllm/v1/worker/gpu_model_runner.py
View file @
55f7b089
...
@@ -323,9 +323,6 @@ class GPUModelRunner(LoRAModelRunnerMixin):
...
@@ -323,9 +323,6 @@ class GPUModelRunner(LoRAModelRunnerMixin):
# from the KV cache of `shared_kv_cache_layers[layer_name]`.
# from the KV cache of `shared_kv_cache_layers[layer_name]`.
self
.
shared_kv_cache_layers
:
dict
[
str
,
str
]
=
{}
self
.
shared_kv_cache_layers
:
dict
[
str
,
str
]
=
{}
dp_size
=
self
.
vllm_config
.
parallel_config
.
data_parallel_size
self
.
use_mori_ep
=
envs
.
VLLM_USE_MORI_EP
and
dp_size
>
1
and
parallel_config
.
enable_expert_parallel
def
_may_reorder_batch
(
self
,
scheduler_output
:
"SchedulerOutput"
)
->
None
:
def
_may_reorder_batch
(
self
,
scheduler_output
:
"SchedulerOutput"
)
->
None
:
"""
"""
Update the order of requests in the batch based on the attention
Update the order of requests in the batch based on the attention
...
@@ -1238,7 +1235,7 @@ class GPUModelRunner(LoRAModelRunnerMixin):
...
@@ -1238,7 +1235,7 @@ class GPUModelRunner(LoRAModelRunnerMixin):
# TODO(tms) : There are many cases where padding is enabled for
# TODO(tms) : There are many cases where padding is enabled for
# prefills, causing unnecessary and excessive padding of activations.
# prefills, causing unnecessary and excessive padding of activations.
if
dp_size
==
1
or
self
.
vllm_config
.
model_config
.
enforce_eager
or
self
.
use_mori_ep
:
if
dp_size
==
1
or
self
.
vllm_config
.
model_config
.
enforce_eager
or
envs
.
VLLM_ALL2ALL_BACKEND
==
'naive'
:
# Early exit.
# Early exit.
return
0
,
None
return
0
,
None
...
...
Prev
1
2
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment