"vscode:/vscode.git/clone" did not exist on "879f69bed375968ce02e003883752754beb36111"
Unverified Commit 13cdc021 authored by Matthew Bonanni's avatar Matthew Bonanni Committed by GitHub
Browse files

Fix MTP with deepep_low_latency (#25904)


Signed-off-by: default avatarMatthew Bonanni <mbonanni@redhat.com>
parent 502640c3
......@@ -1899,6 +1899,15 @@ class FusedMoE(CustomOp):
staged_hidden_states.copy_(hidden_states, non_blocking=True)
staged_router_logits.copy_(router_logits, non_blocking=True)
# If there are shared experts but we are not using a modular kernel,
# the shared experts must be called here
if (not isinstance(self.quant_method.fused_experts,
FusedMoEModularKernel)
and self.shared_experts is not None):
shared_output = self.shared_experts(staged_hidden_states)
else:
shared_output = None
# Matrix multiply.
final_hidden_states = self.quant_method.apply(
layer=self,
......@@ -1922,8 +1931,13 @@ class FusedMoE(CustomOp):
logical_replica_count=self.logical_replica_count,
)
assert self.shared_experts is None or isinstance(
final_hidden_states, tuple)
if shared_output is not None:
assert not isinstance(final_hidden_states, tuple)
assert self.shared_experts is not None
final_hidden_states = (
shared_output,
final_hidden_states,
)
if self.zero_expert_num is not None and self.zero_expert_num > 0:
assert isinstance(final_hidden_states, tuple)
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment