Commit 8419f911 authored by zhuwenwen
Browse files

Merge branch 'v0.9.2-dev-ds-wm' into 'v0.9.2-dev-ds'

[fix]优化mori ep

See merge request dcutoolkit/deeplearing/vllm!222
parents c502ffca 8b791547
......@@ -949,6 +949,7 @@ def init_distributed_environment(
backend = "gloo"
# this backend is used for WORLD
parallel_config = config.parallel_config
data_parallel_size = parallel_config.data_parallel_size
use_mori_ep = envs.VLLM_USE_MORI_EP and data_parallel_size > 1 and parallel_config.enable_expert_parallel
if use_mori_ep:
......
......@@ -21,11 +21,14 @@ from vllm.model_executor.layers.fused_moe.layer import FusedMoEMethodBase, Unqua
from vllm.model_executor.layers.fused_moe.ep_moe.token_dispatcher import MoEAlltoAllTokenDispatcher
from vllm.model_executor.layers.fused_moe.ep_moe.ep_moe_utlis import EpMoeConfig
from vllm.utils import direct_register_custom_op
import mori
import torch.distributed as dist
from lmslim.layers.gemm.int8_utils import (
per_token_quant_int8)
try:
import mori
from lmslim.layers.gemm.int8_utils import (
per_token_quant_int8)
except ImportError:
is_mori_available = False
logger = init_logger(__name__)
......@@ -239,6 +242,8 @@ class EPMoE(FusedMoE):
self.scales = None
self.use_int8_dispatch = True
vllm_config = get_current_vllm_config()
self.max_num_inp_token_per_rank = vllm_config.scheduler_config.max_num_seqs
self.mori_op = self.get_mori_op()
self.first = True
......@@ -270,7 +275,7 @@ class EPMoE(FusedMoE):
hidden_dim=self.hidden_size,
scale_dim=1 if self.use_int8_dispatch else 0,
scale_type_size=mori_scale_type_size,
max_num_inp_token_per_rank=512,
max_num_inp_token_per_rank=self.max_num_inp_token_per_rank,
num_experts_per_rank=self.local_num_experts,
num_experts_per_token=self.top_k,
max_token_type_size=2,
......@@ -381,16 +386,33 @@ class EPMoE(FusedMoE):
)
#self.sync()
expect_m = hidden_states.shape[0] * self.ep_size
dispatch_output_clip = dispatch_output[:expect_m]
dispatch_weights_clip = dispatch_weights[:expect_m]
dispatch_indices_clip = dispatch_indices[:expect_m]
dispatch_scales_clip = dispatch_scales[:expect_m]
# expect_m = topk_ids.shape[0] * self.ep_size
# dispatch_output_clip = dispatch_output[:expect_m]
# dispatch_weights_clip = dispatch_weights[:expect_m]
# dispatch_indices_clip = dispatch_indices[:expect_m]
# dispatch_scales_clip = dispatch_scales[:expect_m]
# expert_output = self.quant_method.apply_ep(
# layer=self,
# x=dispatch_output_clip,
# topk_weights=dispatch_weights_clip,
# topk_ids=dispatch_indices_clip,
# global_num_experts=self.global_num_experts,
# expert_map=self.expert_map,
# activation=self.activation,
# apply_router_weight_on_input=self.apply_router_weight_on_input,
# use_nn_moe=self.use_nn_moe,
# num_local_tokens=dispatch_recv_num_token,
# config_select_bs=hidden_states.shape[0],
# scales=dispatch_scales_clip if self.use_int8_dispatch else None
# #routed_scaling_factor=self.routed_scaling_factor,
# )
expert_output = self.quant_method.apply_ep(
layer=self,
x=dispatch_output_clip,
topk_weights=dispatch_weights_clip,
topk_ids=dispatch_indices_clip,
x=dispatch_output,
topk_weights=dispatch_weights,
topk_ids=dispatch_indices,
global_num_experts=self.global_num_experts,
expert_map=self.expert_map,
activation=self.activation,
......@@ -398,25 +420,9 @@ class EPMoE(FusedMoE):
use_nn_moe=self.use_nn_moe,
num_local_tokens=dispatch_recv_num_token,
config_select_bs=hidden_states.shape[0],
scales=dispatch_scales_clip if self.use_int8_dispatch else None
scales=dispatch_scales if self.use_int8_dispatch else None
#routed_scaling_factor=self.routed_scaling_factor,
)
# expert_output = self.quant_method.apply_ep(
# layer=self,
# x=dispatch_output,
# topk_weights=dispatch_weights,
# topk_ids=dispatch_indices,
# global_num_experts=self.global_num_experts,
# expert_map=self.expert_map,
# activation=self.activation,
# apply_router_weight_on_input=self.apply_router_weight_on_input,
# use_nn_moe=self.use_nn_moe,
# num_local_tokens=dispatch_recv_num_token,
# config_select_bs=hidden_states.shape[0]*2,
# scales=dispatch_scales if self.use_int8_dispatch else None
# #routed_scaling_factor=self.routed_scaling_factor,
# )
#self.sync()
combine_output, _ = self.mori_op.combine(expert_output, dispatch_weights, topk_ids)
......
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment