Commit 55f7b089 authored by zhuwenwen's avatar zhuwenwen
Browse files

Merge branch 'v0.9.2-dev-ds' of http://10.16.6.30/dcutoolkit/deeplearing/vllm into v0.9.2-dev-ds

parents 5ca1259e ab485158
......@@ -43,8 +43,8 @@ from vllm.distributed import (get_ep_group, get_pp_group, get_dp_group,
get_tensor_model_parallel_world_size)
from vllm.model_executor.layers.activation import SiluAndMul
from vllm.model_executor.layers.fused_moe import FusedMoE
from vllm.model_executor.layers.fused_moe.ep_moe.layer import EPMoE
from vllm.model_executor.layers.fused_moe.ep_moe.ep_moe_utlis import EPSharedExperts
from vllm.model_executor.layers.fused_moe.mori_moe.layer import MoriMoE
from vllm.model_executor.layers.fused_moe.mori_moe.ep_moe_utlis import EPSharedExperts
from vllm.model_executor.layers.layernorm import RMSNorm
from vllm.model_executor.layers.linear import (ColumnParallelLinear,
MergedColumnParallelLinear,
......@@ -167,9 +167,10 @@ class DeepseekV2MoE(nn.Module):
self.n_local_physical_experts)
dp_size = get_dp_group().world_size
self.use_mori_ep = envs.VLLM_USE_MORI_EP and dp_size > 1 and parallel_config.enable_expert_parallel
self.use_mori_ep = parallel_config.enable_expert_parallel and dp_size > 1 and envs.VLLM_ALL2ALL_BACKEND == 'mori'
self.enable_expert_parallel = parallel_config.enable_expert_parallel
moe_cls = FusedMoE if not self.use_mori_ep else EPMoE
moe_cls = FusedMoE if not self.use_mori_ep else MoriMoE
self.experts = moe_cls(
num_experts=config.n_routed_experts,
top_k=config.num_experts_per_tok,
......@@ -224,12 +225,12 @@ class DeepseekV2MoE(nn.Module):
# router_logits: (num_tokens, n_experts)
router_logits, _ = self.gate(hidden_states)
if not self.use_mori_ep:
if not self.enable_expert_parallel:
if envs.VLLM_USE_LIGHTOP_MOE_SUM_MUL_ADD:
final_hidden_states = self.experts(
hidden_states=hidden_states,
router_logits=router_logits,
shared_output=shared_output)
hidden_states=hidden_states,
router_logits=router_logits,
shared_output=shared_output)
else:
if hidden_states.dtype != torch.float16:
final_hidden_states = self.experts(
......@@ -248,8 +249,22 @@ class DeepseekV2MoE(nn.Module):
# See DeepseekV2DecoderLayer for more details.
final_hidden_states = final_hidden_states + shared_output \
* (1. / self.routed_scaling_factor)
else:
final_hidden_states = self.experts(hidden_states=hidden_states,
else:
if not self.use_mori_ep:
final_hidden_states = self.experts(
hidden_states=hidden_states,
router_logits=router_logits)
if shared_output is not None:
if hidden_states.dtype != torch.float16:
final_hidden_states = final_hidden_states + shared_output
else:
# Fix FP16 overflow
# See DeepseekV2DecoderLayer for more details.
final_hidden_states = final_hidden_states + shared_output \
* (1. / self.routed_scaling_factor)
else:
final_hidden_states = self.experts(hidden_states=hidden_states,
router_logits=router_logits)
if not self.use_mori_ep:
......@@ -927,7 +942,7 @@ class DeepseekV2ForCausalLM(nn.Module, SupportsPP, MixtureOfExperts):
parallel_config = vllm_config.parallel_config
dp_size = get_dp_group().world_size
self.use_mori_ep = envs.VLLM_USE_MORI_EP and dp_size > 1 and parallel_config.enable_expert_parallel
self.use_mori_ep = envs.VLLM_ALL2ALL_BACKEND == 'mori' and dp_size > 1 and parallel_config.enable_expert_parallel
def set_eplb_state(
self,
......@@ -1173,4 +1188,4 @@ def get_spec_layer_idx_from_weight_name(config: PretrainedConfig,
for i in range(config.num_nextn_predict_layers):
if weight_name.startswith(f"model.layers.{layer_idx+i}."):
return layer_idx + i
return None
return None
\ No newline at end of file
......@@ -513,8 +513,11 @@ class EagleProposer:
self.hidden_states[:num_tokens],
)
if self.dp_size > 1 and self.enable_expert_parallel and self.num_speculative_tokens > 1:
for _ in range(self.num_speculative_tokens - 1):
if self.dp_size > 1 and self.enable_expert_parallel and self.num_speculative_tokens > 1:
for _ in range(self.num_speculative_tokens - 1):
with set_forward_context(attn_metadata,
self.vllm_config,
num_tokens=num_tokens):
self.model(
self.input_ids[:num_tokens],
self.positions[:num_tokens],
......
......@@ -323,9 +323,6 @@ class GPUModelRunner(LoRAModelRunnerMixin):
# from the KV cache of `shared_kv_cache_layers[layer_name]`.
self.shared_kv_cache_layers: dict[str, str] = {}
dp_size = self.vllm_config.parallel_config.data_parallel_size
self.use_mori_ep = envs.VLLM_USE_MORI_EP and dp_size > 1 and parallel_config.enable_expert_parallel
def _may_reorder_batch(self, scheduler_output: "SchedulerOutput") -> None:
"""
Update the order of requests in the batch based on the attention
......@@ -1238,7 +1235,7 @@ class GPUModelRunner(LoRAModelRunnerMixin):
# TODO(tms) : There are many cases where padding is enabled for
# prefills, causing unnecessary and excessive padding of activations.
if dp_size == 1 or self.vllm_config.model_config.enforce_eager or self.use_mori_ep:
if dp_size == 1 or self.vllm_config.model_config.enforce_eager or envs.VLLM_ALL2ALL_BACKEND == 'naive':
# Early exit.
return 0, None
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment