Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
6cabbf16
Commit
6cabbf16
authored
Dec 08, 2025
by
王敏
Browse files
[feat]支持deepep ETP,dp4 tp4 ep16相比dp32 tp1 ep32提升明显
parent
ba1999c2
Changes
5
Hide whitespace changes
Inline
Side-by-side
Showing
5 changed files
with
84 additions
and
21 deletions
+84
-21
vllm/model_executor/layers/fused_moe/mori_moe/ep_moe_utlis.py
.../model_executor/layers/fused_moe/mori_moe/ep_moe_utlis.py
+15
-0
vllm/model_executor/layers/linear.py
vllm/model_executor/layers/linear.py
+8
-0
vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe_marlin.py
...ation/compressed_tensors/compressed_tensors_moe_marlin.py
+2
-1
vllm/model_executor/models/deepseek_v2.py
vllm/model_executor/models/deepseek_v2.py
+50
-20
vllm/model_executor/parameter.py
vllm/model_executor/parameter.py
+9
-0
No files found.
vllm/model_executor/layers/fused_moe/mori_moe/ep_moe_utlis.py
View file @
6cabbf16
...
...
@@ -90,6 +90,8 @@ class EPSharedExperts(nn.Module):
quant_config
=
quant_config
,
prefix
=
f
"
{
prefix
}
.gate_up_proj"
,
expect_tp_size
=
1
)
print
(
"#########self.gate_up_proj quant_method:"
,
self
.
gate_up_proj
.
quant_method
)
self
.
down_proj
=
RowParallelLinear
(
intermediate_size
,
hidden_size
,
bias
=
False
,
...
...
@@ -97,6 +99,19 @@ class EPSharedExperts(nn.Module):
reduce_results
=
reduce_results
,
prefix
=
f
"
{
prefix
}
.down_proj"
,
expect_tp_size
=
1
)
print
(
"#########self.down_proj quant_method:"
,
self
.
down_proj
.
quant_method
)
# self.gate_up_proj = MergedColumnParallelLinear(
# hidden_size, [intermediate_size] * 2,
# bias=False,
# quant_config=quant_config,
# prefix=f"{prefix}.gate_up_proj",
# expect_tp_size=1)
# self.down_proj = ReplicatedLinear(intermediate_size,
# hidden_size,
# bias=False,
# quant_config=quant_config,
# prefix=f"{prefix}.down_proj")
if
hidden_act
!=
"silu"
:
raise
ValueError
(
f
"Unsupported activation:
{
hidden_act
}
. "
...
...
vllm/model_executor/layers/linear.py
View file @
6cabbf16
...
...
@@ -783,6 +783,7 @@ class MergedColumnParallelLinear(ColumnParallelLinear):
return
if
is_gguf_weight
:
print
(
"############is_gguf_weight"
)
tp_size
=
get_tensor_model_parallel_world_size
()
tp_rank
=
get_tensor_model_parallel_rank
()
...
...
@@ -978,6 +979,11 @@ class MergedColumnParallelLinear(ColumnParallelLinear):
tp_size
=
get_tensor_model_parallel_world_size
()
if
self
.
expect_tp_size
is
not
None
and
self
.
expect_tp_size
==
1
:
tp_size
=
1
if
hasattr
(
param
,
"expect_tp_size"
):
param
.
expect_tp_size
=
self
.
expect_tp_size
if
isinstance
(
param
,
BlockQuantScaleParameter
):
from
vllm.model_executor.layers.quantization.fp8
import
(
Fp8LinearMethod
,
Fp8MoEMethod
)
...
...
@@ -1519,6 +1525,8 @@ class RowParallelLinear(LinearBase):
assert
loaded_weight
.
numel
()
==
1
loaded_weight
=
loaded_weight
.
reshape
(
1
)
if
self
.
expect_tp_size
is
not
None
and
hasattr
(
param
,
"expect_tp_size"
):
param
.
expect_tp_size
=
self
.
expect_tp_size
param
.
load_row_parallel_weight
(
loaded_weight
=
loaded_weight
)
def
forward
(
...
...
vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe_marlin.py
View file @
6cabbf16
...
...
@@ -83,6 +83,7 @@ class CompressedTensorsW8A8Int8MarlinMoEMethod(CompressedTensorsMarlinMoEMethod)
vllm_config
=
get_current_vllm_config
()
parallel_config
=
vllm_config
.
parallel_config
self
.
dp_size
=
get_dp_group
().
world_size
self
.
ep_size
=
get_ep_group
().
world_size
self
.
use_deepep
=
self
.
dp_size
>
1
and
parallel_config
.
enable_expert_parallel
and
\
(
envs
.
VLLM_ALL2ALL_BACKEND
==
"deepep_high_throughput"
or
\
envs
.
VLLM_ALL2ALL_BACKEND
==
"deepep_low_latency"
)
...
...
@@ -241,7 +242,7 @@ class CompressedTensorsW8A8Int8MarlinMoEMethod(CompressedTensorsMarlinMoEMethod)
#expected_m = max_num_tokens
ori_bs
=
x
.
shape
[
0
]
expected_m
=
ori_bs
*
self
.
dp_size
expected_m
=
ori_bs
*
self
.
ep_size
# expected_m = (
# x.shape[0] * self.dp_size * topk_ids.shape[1]
# + global_num_experts
...
...
vllm/model_executor/models/deepseek_v2.py
View file @
6cabbf16
...
...
@@ -40,7 +40,11 @@ from vllm.compilation.decorators import support_torch_compile
from
vllm.config
import
(
CacheConfig
,
ModelConfig
,
VllmConfig
,
get_current_vllm_config
)
from
vllm.distributed
import
(
get_ep_group
,
get_pp_group
,
get_dp_group
,
get_tensor_model_parallel_world_size
)
get_tensor_model_parallel_world_size
,
tensor_model_parallel_reduce_scatter
,
tensor_model_parallel_all_gather
,
tensor_model_parallel_all_reduce
,
get_tensor_model_parallel_rank
)
from
vllm.model_executor.layers.activation
import
SiluAndMul
from
vllm.model_executor.layers.fused_moe
import
FusedMoE
,
SharedFusedMoE
...
...
@@ -209,8 +213,7 @@ class DeepseekV2MoE(nn.Module):
if
config
.
n_shared_experts
is
not
None
:
intermediate_size
=
(
config
.
moe_intermediate_size
*
config
.
n_shared_experts
)
shared_expert_cls
=
DeepseekV2MLP
if
not
self
.
use_mori_ep
else
EPSharedExperts
self
.
shared_experts
=
shared_expert_cls
(
self
.
shared_experts
=
EPSharedExperts
(
hidden_size
=
config
.
hidden_size
,
intermediate_size
=
intermediate_size
,
hidden_act
=
config
.
hidden_act
,
...
...
@@ -710,6 +713,33 @@ class DeepseekV2DecoderLayer(nn.Module):
# with the layer's index.
layer_idx
=
int
(
prefix
.
split
(
sep
=
'.'
)[
-
1
])
self
.
layer_idx
=
layer_idx
self
.
dp_size
=
get_dp_group
().
world_size
vllm_config
=
get_current_vllm_config
()
parallel_config
=
vllm_config
.
parallel_config
self
.
use_deepep
=
self
.
dp_size
>
1
and
parallel_config
.
enable_expert_parallel
and
\
(
envs
.
VLLM_ALL2ALL_BACKEND
==
"deepep_high_throughput"
or
\
envs
.
VLLM_ALL2ALL_BACKEND
==
"deepep_low_latency"
)
self
.
tp_size
=
get_tensor_model_parallel_world_size
()
if
(
config
.
n_routed_experts
is
not
None
and
layer_idx
>=
config
.
first_k_dense_replace
and
layer_idx
%
config
.
moe_layer_freq
==
0
):
self
.
mlp
=
DeepseekV2MoE
(
config
=
config
,
quant_config
=
quant_config
,
prefix
=
f
"
{
prefix
}
.mlp"
,
enable_eplb
=
enable_eplb
,
)
else
:
self
.
mlp
=
DeepseekV2MLP
(
hidden_size
=
config
.
hidden_size
,
intermediate_size
=
config
.
intermediate_size
,
hidden_act
=
config
.
hidden_act
,
quant_config
=
quant_config
,
prefix
=
f
"
{
prefix
}
.mlp"
,
)
if
model_config
.
use_mla
:
attn_cls
=
DeepseekV2MLAAttention
else
:
...
...
@@ -732,23 +762,6 @@ class DeepseekV2DecoderLayer(nn.Module):
prefix
=
f
"
{
prefix
}
.self_attn"
,
)
if
(
config
.
n_routed_experts
is
not
None
and
layer_idx
>=
config
.
first_k_dense_replace
and
layer_idx
%
config
.
moe_layer_freq
==
0
):
self
.
mlp
=
DeepseekV2MoE
(
config
=
config
,
quant_config
=
quant_config
,
prefix
=
f
"
{
prefix
}
.mlp"
,
enable_eplb
=
enable_eplb
,
)
else
:
self
.
mlp
=
DeepseekV2MLP
(
hidden_size
=
config
.
hidden_size
,
intermediate_size
=
config
.
intermediate_size
,
hidden_act
=
config
.
hidden_act
,
quant_config
=
quant_config
,
prefix
=
f
"
{
prefix
}
.mlp"
,
)
self
.
input_layernorm
=
RMSNorm
(
config
.
hidden_size
,
eps
=
config
.
rms_norm_eps
)
self
.
post_attention_layernorm
=
RMSNorm
(
config
.
hidden_size
,
...
...
@@ -833,8 +846,25 @@ class DeepseekV2DecoderLayer(nn.Module):
# Fully Connected
hidden_states
,
residual
=
self
.
post_attention_layernorm
(
hidden_states
,
residual
)
if
isinstance
(
self
.
mlp
,
DeepseekV2MoE
)
and
self
.
use_deepep
and
self
.
tp_size
>
1
:
self
.
tp_rank
=
get_tensor_model_parallel_rank
()
ori_bs
=
hidden_states
.
shape
[
0
]
pad_size
=
(
ori_bs
+
self
.
tp_size
-
1
)
//
self
.
tp_size
*
self
.
tp_size
-
ori_bs
if
pad_size
>
0
:
hidden_states
=
torch
.
nn
.
functional
.
pad
(
hidden_states
.
contiguous
(),
[
0
,
0
,
0
,
pad_size
],
value
=
0
).
contiguous
()
new_bs
=
(
ori_bs
+
pad_size
)
//
self
.
tp_size
hidden_states
=
hidden_states
[
self
.
tp_rank
*
new_bs
:
(
self
.
tp_rank
+
1
)
*
new_bs
,
:]
hidden_states
=
self
.
mlp
(
hidden_states
)
if
isinstance
(
self
.
mlp
,
DeepseekV2MoE
)
and
self
.
use_deepep
and
self
.
tp_size
>
1
:
hidden_states
=
tensor_model_parallel_all_gather
(
hidden_states
,
dim
=
0
).
contiguous
()
hidden_states
=
hidden_states
[:
ori_bs
,
:].
contiguous
()
if
isinstance
(
self
.
mlp
,
DeepseekV2MLP
)
and
hidden_states
.
dtype
==
torch
.
float16
:
# Fix FP16 overflow
...
...
vllm/model_executor/parameter.py
View file @
6cabbf16
...
...
@@ -96,6 +96,8 @@ class _ColumnvLLMParameter(BasevLLMParameter):
def
__init__
(
self
,
output_dim
:
int
,
**
kwargs
):
self
.
_output_dim
=
output_dim
super
().
__init__
(
**
kwargs
)
self
.
expect_tp_size
=
-
1
@
property
def
output_dim
(
self
):
...
...
@@ -103,6 +105,8 @@ class _ColumnvLLMParameter(BasevLLMParameter):
def
load_column_parallel_weight
(
self
,
loaded_weight
:
torch
.
Tensor
):
tp_rank
=
get_tensor_model_parallel_rank
()
if
self
.
expect_tp_size
==
1
:
tp_rank
=
0
shard_size
=
self
.
data
.
shape
[
self
.
output_dim
]
loaded_weight
=
loaded_weight
.
narrow
(
self
.
output_dim
,
tp_rank
*
shard_size
,
shard_size
)
...
...
@@ -123,6 +127,8 @@ class _ColumnvLLMParameter(BasevLLMParameter):
param_data
=
self
.
data
tp_rank
=
get_tensor_model_parallel_rank
()
if
self
.
expect_tp_size
==
1
:
tp_rank
=
0
param_data
=
param_data
.
narrow
(
self
.
output_dim
,
shard_offset
,
shard_size
)
loaded_weight
=
loaded_weight
.
narrow
(
self
.
output_dim
,
...
...
@@ -167,6 +173,7 @@ class RowvLLMParameter(BasevLLMParameter):
def
__init__
(
self
,
input_dim
:
int
,
**
kwargs
):
self
.
_input_dim
=
input_dim
super
().
__init__
(
**
kwargs
)
self
.
expect_tp_size
=
-
1
@
property
def
input_dim
(
self
):
...
...
@@ -174,6 +181,8 @@ class RowvLLMParameter(BasevLLMParameter):
def
load_row_parallel_weight
(
self
,
loaded_weight
:
torch
.
Tensor
):
tp_rank
=
get_tensor_model_parallel_rank
()
if
self
.
expect_tp_size
==
1
:
tp_rank
=
0
shard_size
=
self
.
data
.
shape
[
self
.
input_dim
]
loaded_weight
=
loaded_weight
.
narrow
(
self
.
input_dim
,
tp_rank
*
shard_size
,
shard_size
)
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment