Unverified commit 228023b3, authored by Martin Vit, committed by GitHub
Browse files

[Bugfix][MoE] Fix 6-8% decode regression: prefer multi-stream shared expert overlap (#38990)


Signed-off-by: Martin Vit <martin@voipmonitor.org>
Signed-off-by: Robert Shaw <robshaw@redhat.com>
Co-authored-by: Robert Shaw <robshaw@redhat.com>
Co-authored-by: Robert Shaw <114415538+robertgshaw2-redhat@users.noreply.github.com>
parent 9a528260
...@@ -93,24 +93,24 @@ class SharedExperts: ...@@ -93,24 +93,24 @@ class SharedExperts:
) )
@property @property
def _has_external_experts(self) -> bool: def _use_external_experts(self) -> bool:
if self._use_dp_chunking:
return False
# Disable shared expert overlap if: # Disable shared expert overlap if:
# - we are using eplb with non-default backend, because of correctness issues # - we are using eplb with non-default backend, because of correctness issues
# - we are using flashinfer with DP, since there is nothing to gain # - we are using flashinfer with DP, since there is nothing to gain
backend = self._moe_config.moe_parallel_config.all2all_backend backend = self._moe_config.moe_parallel_config.all2all_backend
return not ( return (
( self._moe_config.moe_parallel_config.enable_eplb
self._moe_config.moe_parallel_config.enable_eplb and backend != "allgather_reducescatter"
and backend != "allgather_reducescatter" ) or self._moe_config.moe_parallel_config.use_fi_nvl_two_sided_kernels
)
or self._moe_config.moe_parallel_config.use_fi_nvl_two_sided_kernels
)
def _determine_shared_experts_order( def _determine_shared_experts_order(
self, self,
hidden_states: torch.Tensor, hidden_states: torch.Tensor,
) -> SharedExpertsOrder: ) -> SharedExpertsOrder:
if self._has_external_experts and not self._use_dp_chunking: if self._use_external_experts:
return SharedExpertsOrder.EXTERNAL return SharedExpertsOrder.EXTERNAL
if self._quant_method.mk_owns_shared_expert: if self._quant_method.mk_owns_shared_expert:
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment