[Bugfix][MoE] Fix 6-8% decode regression: prefer multi-stream shared expert overlap (#38990)

Signed-off-by: Martin Vit <martin@voipmonitor.org>
Signed-off-by: Robert Shaw <robshaw@redhat.com>
Co-authored-by: Robert Shaw <robshaw@redhat.com>
Co-authored-by: Robert Shaw <114415538+robertgshaw2-redhat@users.noreply.github.com>

[Bugfix][MoE] Fix 6-8% decode regression: prefer multi-stream shared expert overlap (#38990)
Signed-off-by: Martin Vit <martin@voipmonitor.org>
Signed-off-by: Robert Shaw <robshaw@redhat.com>
Co-authored-by: Robert Shaw <robshaw@redhat.com>
Co-authored-by: Robert Shaw <114415538+robertgshaw2-redhat@users.noreply.github.com>
228023b3 · Martin Vit · GitHub · 9a528260 · 228023b3
Unverified Commit 228023b3 authored Apr 05, 2026 by Martin Vit Committed by GitHub Apr 05, 2026
Hide whitespace changes
Inline Side-by-side

Showing with 9 additions and 9 deletions

vllm/model_executor/layers/fused_moe/runner/shared_experts.py .../model_executor/layers/fused_moe/runner/shared_experts.py +9 -9

No files found.
--- a/vllm/model_executor/layers/fused_moe/runner/shared_experts.py
+++ b/vllm/model_executor/layers/fused_moe/runner/shared_experts.py
@@ -93,24 +93,24 @@ class SharedExperts:
                )

    @property
-    def _has_external_experts(self) -> bool:
+    def _use_external_experts(self) -> bool:
+        if self._use_dp_chunking:
+            return False
+
        # Disable shared expert overlap if:
        #   - we are using eplb with non-default backend, because of correctness issues
        #   - we are using flashinfer with DP, since there is nothing to gain
        backend = self._moe_config.moe_parallel_config.all2all_backend
-        return not (
-            (
-                self._moe_config.moe_parallel_config.enable_eplb
-                and backend != "allgather_reducescatter"
-            )
-            or self._moe_config.moe_parallel_config.use_fi_nvl_two_sided_kernels
-        )
+        return (
+            self._moe_config.moe_parallel_config.enable_eplb
+            and backend != "allgather_reducescatter"
+        ) or self._moe_config.moe_parallel_config.use_fi_nvl_two_sided_kernels

    def _determine_shared_experts_order(
        self,
        hidden_states: torch.Tensor,
    ) -> SharedExpertsOrder:
-        if self._has_external_experts and not self._use_dp_chunking:
+        if self._use_external_experts:
            return SharedExpertsOrder.EXTERNAL

        if self._quant_method.mk_owns_shared_expert: