Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
8419f911
Commit
8419f911
authored
Oct 10, 2025
by
zhuwenwen
Browse files
Merge branch 'v0.9.2-dev-ds-wm' into 'v0.9.2-dev-ds'
[fix]优化mori ep See merge request dcutoolkit/deeplearing/vllm!222
parents
c502ffca
8b791547
Changes
2
Hide whitespace changes
Inline
Side-by-side
Showing
2 changed files
with
36 additions
and
29 deletions
+36
-29
vllm/distributed/parallel_state.py
vllm/distributed/parallel_state.py
+1
-0
vllm/model_executor/layers/fused_moe/ep_moe/layer.py
vllm/model_executor/layers/fused_moe/ep_moe/layer.py
+35
-29
No files found.
vllm/distributed/parallel_state.py
View file @
8419f911
...
@@ -949,6 +949,7 @@ def init_distributed_environment(
...
@@ -949,6 +949,7 @@ def init_distributed_environment(
backend
=
"gloo"
backend
=
"gloo"
# this backend is used for WORLD
# this backend is used for WORLD
parallel_config
=
config
.
parallel_config
data_parallel_size
=
parallel_config
.
data_parallel_size
data_parallel_size
=
parallel_config
.
data_parallel_size
use_mori_ep
=
envs
.
VLLM_USE_MORI_EP
and
data_parallel_size
>
1
and
parallel_config
.
enable_expert_parallel
use_mori_ep
=
envs
.
VLLM_USE_MORI_EP
and
data_parallel_size
>
1
and
parallel_config
.
enable_expert_parallel
if
use_mori_ep
:
if
use_mori_ep
:
...
...
vllm/model_executor/layers/fused_moe/ep_moe/layer.py
View file @
8419f911
...
@@ -21,11 +21,14 @@ from vllm.model_executor.layers.fused_moe.layer import FusedMoEMethodBase, Unqua
...
@@ -21,11 +21,14 @@ from vllm.model_executor.layers.fused_moe.layer import FusedMoEMethodBase, Unqua
from
vllm.model_executor.layers.fused_moe.ep_moe.token_dispatcher
import
MoEAlltoAllTokenDispatcher
from
vllm.model_executor.layers.fused_moe.ep_moe.token_dispatcher
import
MoEAlltoAllTokenDispatcher
from
vllm.model_executor.layers.fused_moe.ep_moe.ep_moe_utlis
import
EpMoeConfig
from
vllm.model_executor.layers.fused_moe.ep_moe.ep_moe_utlis
import
EpMoeConfig
from
vllm.utils
import
direct_register_custom_op
from
vllm.utils
import
direct_register_custom_op
import
mori
import
torch.distributed
as
dist
import
torch.distributed
as
dist
from
lmslim.layers.gemm.int8_utils
import
(
try
:
per_token_quant_int8
)
import
mori
from
lmslim.layers.gemm.int8_utils
import
(
per_token_quant_int8
)
except
ImportError
:
is_mori_available
=
False
logger
=
init_logger
(
__name__
)
logger
=
init_logger
(
__name__
)
...
@@ -239,6 +242,8 @@ class EPMoE(FusedMoE):
...
@@ -239,6 +242,8 @@ class EPMoE(FusedMoE):
self
.
scales
=
None
self
.
scales
=
None
self
.
use_int8_dispatch
=
True
self
.
use_int8_dispatch
=
True
vllm_config
=
get_current_vllm_config
()
self
.
max_num_inp_token_per_rank
=
vllm_config
.
scheduler_config
.
max_num_seqs
self
.
mori_op
=
self
.
get_mori_op
()
self
.
mori_op
=
self
.
get_mori_op
()
self
.
first
=
True
self
.
first
=
True
...
@@ -270,7 +275,7 @@ class EPMoE(FusedMoE):
...
@@ -270,7 +275,7 @@ class EPMoE(FusedMoE):
hidden_dim
=
self
.
hidden_size
,
hidden_dim
=
self
.
hidden_size
,
scale_dim
=
1
if
self
.
use_int8_dispatch
else
0
,
scale_dim
=
1
if
self
.
use_int8_dispatch
else
0
,
scale_type_size
=
mori_scale_type_size
,
scale_type_size
=
mori_scale_type_size
,
max_num_inp_token_per_rank
=
512
,
max_num_inp_token_per_rank
=
self
.
max_num_inp_token_per_rank
,
num_experts_per_rank
=
self
.
local_num_experts
,
num_experts_per_rank
=
self
.
local_num_experts
,
num_experts_per_token
=
self
.
top_k
,
num_experts_per_token
=
self
.
top_k
,
max_token_type_size
=
2
,
max_token_type_size
=
2
,
...
@@ -381,16 +386,33 @@ class EPMoE(FusedMoE):
...
@@ -381,16 +386,33 @@ class EPMoE(FusedMoE):
)
)
#self.sync()
#self.sync()
expect_m
=
hidden_states
.
shape
[
0
]
*
self
.
ep_size
# expect_m = topk_ids.shape[0] * self.ep_size
dispatch_output_clip
=
dispatch_output
[:
expect_m
]
# dispatch_output_clip = dispatch_output[:expect_m]
dispatch_weights_clip
=
dispatch_weights
[:
expect_m
]
# dispatch_weights_clip = dispatch_weights[:expect_m]
dispatch_indices_clip
=
dispatch_indices
[:
expect_m
]
# dispatch_indices_clip = dispatch_indices[:expect_m]
dispatch_scales_clip
=
dispatch_scales
[:
expect_m
]
# dispatch_scales_clip = dispatch_scales[:expect_m]
# expert_output = self.quant_method.apply_ep(
# layer=self,
# x=dispatch_output_clip,
# topk_weights=dispatch_weights_clip,
# topk_ids=dispatch_indices_clip,
# global_num_experts=self.global_num_experts,
# expert_map=self.expert_map,
# activation=self.activation,
# apply_router_weight_on_input=self.apply_router_weight_on_input,
# use_nn_moe=self.use_nn_moe,
# num_local_tokens=dispatch_recv_num_token,
# config_select_bs=hidden_states.shape[0],
# scales=dispatch_scales_clip if self.use_int8_dispatch else None
# #routed_scaling_factor=self.routed_scaling_factor,
# )
expert_output
=
self
.
quant_method
.
apply_ep
(
expert_output
=
self
.
quant_method
.
apply_ep
(
layer
=
self
,
layer
=
self
,
x
=
dispatch_output
_clip
,
x
=
dispatch_output
,
topk_weights
=
dispatch_weights
_clip
,
topk_weights
=
dispatch_weights
,
topk_ids
=
dispatch_indices
_clip
,
topk_ids
=
dispatch_indices
,
global_num_experts
=
self
.
global_num_experts
,
global_num_experts
=
self
.
global_num_experts
,
expert_map
=
self
.
expert_map
,
expert_map
=
self
.
expert_map
,
activation
=
self
.
activation
,
activation
=
self
.
activation
,
...
@@ -398,25 +420,9 @@ class EPMoE(FusedMoE):
...
@@ -398,25 +420,9 @@ class EPMoE(FusedMoE):
use_nn_moe
=
self
.
use_nn_moe
,
use_nn_moe
=
self
.
use_nn_moe
,
num_local_tokens
=
dispatch_recv_num_token
,
num_local_tokens
=
dispatch_recv_num_token
,
config_select_bs
=
hidden_states
.
shape
[
0
],
config_select_bs
=
hidden_states
.
shape
[
0
],
scales
=
dispatch_scales
_clip
if
self
.
use_int8_dispatch
else
None
scales
=
dispatch_scales
if
self
.
use_int8_dispatch
else
None
#routed_scaling_factor=self.routed_scaling_factor,
#routed_scaling_factor=self.routed_scaling_factor,
)
)
# expert_output = self.quant_method.apply_ep(
# layer=self,
# x=dispatch_output,
# topk_weights=dispatch_weights,
# topk_ids=dispatch_indices,
# global_num_experts=self.global_num_experts,
# expert_map=self.expert_map,
# activation=self.activation,
# apply_router_weight_on_input=self.apply_router_weight_on_input,
# use_nn_moe=self.use_nn_moe,
# num_local_tokens=dispatch_recv_num_token,
# config_select_bs=hidden_states.shape[0]*2,
# scales=dispatch_scales if self.use_int8_dispatch else None
# #routed_scaling_factor=self.routed_scaling_factor,
# )
#self.sync()
#self.sync()
combine_output
,
_
=
self
.
mori_op
.
combine
(
expert_output
,
dispatch_weights
,
topk_ids
)
combine_output
,
_
=
self
.
mori_op
.
combine
(
expert_output
,
dispatch_weights
,
topk_ids
)
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment