feat: enable shared expert overlap.

ee19dca6 · wanglong3 · ffc00331 · ee19dca6 · ee19dca6 · ee19dca6
Commit ee19dca6 authored Jan 03, 2026 by wanglong3
5 changed files
--- a/vllm/config.py
+++ b/vllm/config.py
@@ -1939,6 +1939,24 @@ class ParallelConfig:
        assert last_exc is not None
        raise last_exc
+    # The all_reduce at the end of attention (during o_proj) means that
+    # inputs are replicated across each rank of the tensor parallel group.
+    # If using expert-parallelism with DeepEP All2All ops, replicated
+    # tokens results in useless duplicate computation and communication.
+    #
+    # In this case, ensure the input to the experts is sequence parallel
+    # to avoid the excess work.
+    #
+    # Not needed for pplx-kernels as it can handle duplicate input tokens.
+    @property
+    def use_sequence_parallel_moe(self) -> bool:
+        return (envs.VLLM_ALL2ALL_BACKEND
+                in ("allgather_reducescatter", "naive",
+                    "deepep_high_throughput", "deepep_low_latency")
+                and self.enable_expert_parallel
+                and self.tensor_parallel_size > 1
+                and self.data_parallel_size > 1)
    @staticmethod
    def has_unfinished_dp(dp_group: "ProcessGroup",
                          has_unfinished: bool) -> bool:

--- a/vllm/envs.py
+++ b/vllm/envs.py
@@ -204,6 +204,7 @@ if TYPE_CHECKING:
    VLLM_ZERO_OVERHEAD_ENHANCE: bool = False
    VLLM_USE_FUSED_QA_KVA_GEMM: bool = False
    VLLM_V1_FAST_TOKEN_ID_COPY: bool = False
+    VLLM_DISABLE_SHARED_EXPERTS_STREAM:bool = True
 def get_default_cache_root():
    return os.getenv(
@@ -1306,6 +1307,7 @@ environment_variables: dict[str, Callable[[], Any]] = {
    "VLLM_ENABLE_DEEPEP_HT_DEEPGEMM":
        lambda: (os.getenv('VLLM_ENABLE_DEEPEP_HT_DEEPGEMM', '1').lower() in
                 ("true", "1")),
    # Only quantized DeepSeek models supported.
    # Unquantized versions are not supported.
    "VLLM_USE_FUSED_QA_KVA_GEMM":
@@ -1318,6 +1320,11 @@ environment_variables: dict[str, Callable[[], Any]] = {
    "VLLM_V1_FAST_TOKEN_ID_COPY":
        lambda: (os.environ.get("VLLM_V1_FAST_TOKEN_ID_COPY", "False").lower() in
                 ("true", "1")),
+    "VLLM_DISABLE_SHARED_EXPERTS_STREAM": lambda: bool(
+        int(os.getenv("VLLM_DISABLE_SHARED_EXPERTS_STREAM", "1"))
+    ),
 }
 # --8<-- [end:env-vars-definition]

--- a/vllm/model_executor/layers/fused_moe/layer.py
+++ b/vllm/model_executor/layers/fused_moe/layer.py
@@ -28,8 +28,8 @@ from vllm.model_executor.layers.fused_moe.config import (
    FusedMoEConfig, FusedMoEParallelConfig)
 # yapf: enable
 from vllm.model_executor.layers.fused_moe.modular_kernel import (
-    FusedMoEActivationFormat, FusedMoEModularKernel, 
+    FusedMoEActivationFormat, FusedMoEModularKernel,
-    DeepGemmDisabledFusedMoEModularKernel, FusedMoEPermuteExpertsUnpermute, 
+    DeepGemmDisabledFusedMoEModularKernel, FusedMoEPermuteExpertsUnpermute,
    FusedMoEPrepareAndFinalize)
 # from vllm.model_executor.layers.fused_moe.rocm_aiter_fused_moe import (
 #     is_rocm_aiter_moe_enabled)
@@ -74,6 +74,26 @@ else:
 logger = init_logger(__name__)
+# Global auxilary stream for running operations in background streams.
+# We have single global auxilary stream to avoid an explosion of streams
+# for every layer (and make profiling look sane).
+#
+# aux_stream() is currently used for:
+#   - MoE shared_expert overlap with router
+_aux_stream: torch.cuda.Stream | None = None
+def aux_stream() -> torch.cuda.Stream | None:
+    """
+    Ensures aux_stream is initialized only once
+    """
+    global _aux_stream
+    from vllm.platforms import current_platform
+    if _aux_stream is None and current_platform.is_cuda_alike():
+        _aux_stream = torch.cuda.Stream()
+    return _aux_stream
 class FusedMoeWeightScaleSupported(Enum):
    TENSOR = "tensor"
@@ -170,7 +190,7 @@ class FusedMoEMethodBase(QuantizeMethodBase):
                                == current_platform.fp8_dtype()
                                and moe.quant_config.block_shape
                                == DEEPEP_QUANT_BLOCK_SHAPE)
            use_int8_dispatch = moe.quant_config.quant_dtype == torch.int8
            # Note (varun): Whether to use FP8 dispatch or not needs some
@@ -698,6 +718,21 @@ class FusedMoE(torch.nn.Module):
        routed_scaling_factor: Optional[float] = 1.0,
    ):
        super().__init__()
+        # Allow disabling of the separate shared experts stream for
+        # debug purposes.
+        # TODO: Remove this after more extensive testings with TP/DP
+        # and other execution modes
+        if envs.VLLM_DISABLE_SHARED_EXPERTS_STREAM:
+            logger.info_once("Disabling MoE shared_experts cuda stream")
+            self.shared_experts_stream = None
+        else:
+            # TODO(rob): enable shared expert overlap with non-cuda-alike.
+            # aux_stream() returns None on non-cuda-alike platforms.
+            self.shared_experts_stream = aux_stream()
+            if self.shared_experts_stream is not None:
+                logger.info_once("Enabled separate cuda stream for MoE shared_experts")
        if params_dtype is None:
            params_dtype = torch.get_default_dtype()
        self.params_dtype = params_dtype
@@ -814,7 +849,7 @@ class FusedMoE(torch.nn.Module):
                # please refer to the implementation in `Fp8MoEMethod`.
                raise NotImplementedError("EPLB is only supported for FP8 "
                                          "quantization for now.")
        if quant_config is None:
            # Not considering quant for now, temporarily
            self.use_nn_moe = int(os.environ.get('MOE_NN', 1)) == 1
@@ -909,9 +944,9 @@ class FusedMoE(torch.nn.Module):
    @property
    def use_deepep_ll_kernels(self):
        return self.moe_parallel_config.use_deepep_ll_kernels
    @property
-    def shared_experts(self) -> Optional[torch.nn.Module]:
+    def shared_experts(self) -> torch.nn.Module | None:
        return None
    def _load_per_tensor_weight_scale(self, shard_id: str,
@@ -1451,6 +1486,7 @@ class FusedMoE(torch.nn.Module):
    def forward(self, hidden_states: torch.Tensor,
                router_logits: torch.Tensor,
+                hidden_states_copy: Optional[torch.Tensor] = None,  # for shared expert overlap
                shared_output: Optional[torch.Tensor] = None,
                i_q: Optional[torch.Tensor] = None,
                i_s: Optional[torch.Tensor] = None, **_
@@ -1458,7 +1494,7 @@ class FusedMoE(torch.nn.Module):
        # TODO: Once the OOM issue for the TPU backend is resolved, we will
        # switch to using the moe_forward custom op.
        if current_platform.is_tpu():
-            assert i_q is None and i_s is None, "moe.quant fused not support TPU now" 
+            assert i_q is None and i_s is None, "moe.quant fused not support TPU now"
            return self.forward_impl(hidden_states, router_logits)
        else:
            if self.shared_experts is None:
@@ -1467,7 +1503,7 @@ class FusedMoE(torch.nn.Module):
                                                i_q, i_s)
            else:
                return torch.ops.vllm.moe_forward_shared(hidden_states, router_logits,
-                                                self.layer_name, shared_output)
+                                                self.layer_name, hidden_states_copy, shared_output)
    def forward_impl_chunked(self, full_hidden_states: torch.Tensor,
                             full_router_logits: torch.Tensor):
@@ -1547,10 +1583,22 @@ class FusedMoE(torch.nn.Module):
    def forward_impl(self, hidden_states: torch.Tensor,
                     router_logits: torch.Tensor,
+                     hidden_states_copy: Optional[torch.Tensor] = None,
                     shared_output: Optional[torch.Tensor] = None,
                     i_q: Optional[torch.Tensor] = None,
-                     i_s: Optional[torch.Tensor] = None, **_):
+                     i_s: Optional[torch.Tensor] = None, **_)-> torch.Tensor | tuple[torch.Tensor, torch.Tensor]:
        assert self.quant_method is not None
+        enable_shared_experts_overlap = False
+        if (self.shared_experts_stream is not None
+                and hidden_states_copy is not None
+                and self.shared_experts is not None
+                and not self.moe_parallel_config.use_pplx_kernels):
+            enable_shared_experts_overlap = True
+            hidden_states_copy.record_stream(self.shared_experts_stream)
+            self.shared_experts_stream.wait_stream(torch.cuda.current_stream())
        if (self.moe_parallel_config.use_pplx_kernels):
                #or self.moe_parallel_config.use_deepep_ll_kernels):
            return self.forward_impl_chunked(hidden_states, router_logits)
@@ -1619,18 +1667,45 @@ class FusedMoE(torch.nn.Module):
                use_fused_gate=self.use_fused_gate,
            )
-        if do_naive_dispatch_combine:
+            if enable_shared_experts_overlap:
-            final_hidden_states = get_ep_group().combine(final_hidden_states)
+                assert self.shared_experts is not None
+                # Run shared experts in parallel on a separate stream
+                # NOTE: We start the separate stream here and mark the
+                # sync end point immediately after it is done. This is
+                # important to avoid excessive stream allocations by the cuda
+                # graph replay later.
+                with torch.cuda.stream(self.shared_experts_stream):
+                    # Note that hidden_states clone() is necessary here to avoid
+                    # conflict with the main stream
+                    assert hidden_states_copy is not None
+                    shared_output = self.shared_experts(hidden_states_copy)
+                torch.cuda.current_stream().wait_stream(self.shared_experts_stream)
+                final_hidden_states = (
+                    shared_output,
+                    final_hidden_states,
+                )
-        if self.reduce_results and (self.tp_size > 1 or self.ep_size > 1):
+        def combine_output(states: torch.Tensor) -> torch.Tensor:
-            # Default set to False. (May have to add shared expert outputs.
+            if do_naive_dispatch_combine:
-            if envs.VLLM_ENABLE_TBO:
+                states = get_ep_group().combine(states)
-                final_hidden_states = self.tbo_all_reduce(final_hidden_states)
-            else:
-                final_hidden_states = self.maybe_all_reduce_tensor_model_parallel(
-                    final_hidden_states)
-        return final_hidden_states
+            if self.reduce_results and (self.tp_size > 1 or self.ep_size > 1):
+                # Default set to False. (May have to add shared expert outputs.
+                if envs.VLLM_ENABLE_TBO:
+                    states = self.tbo_all_reduce(states)
+                else:
+                    states = self.maybe_all_reduce_tensor_model_parallel(
+                        states)
+            return states
+        if enable_shared_experts_overlap and not envs.USE_FUSED_RMS_QUANT:
+            return (
+                final_hidden_states[0],
+                combine_output(final_hidden_states[1]),
+            )
+        else:
+            return combine_output(final_hidden_states)
    @classmethod
    def make_expert_params_mapping(
@@ -1686,7 +1761,7 @@ class FusedMoE(torch.nn.Module):
        return s
-def moe_forward(hidden_states: torch.Tensor, router_logits: torch.Tensor, 
+def moe_forward(hidden_states: torch.Tensor, router_logits: torch.Tensor,
                layer_name: str, shared_output: Optional[torch.Tensor] = None,
                i_q: Optional[torch.Tensor] = None,
                i_s: Optional[torch.Tensor] = None) -> torch.Tensor:
@@ -1697,7 +1772,7 @@ def moe_forward(hidden_states: torch.Tensor, router_logits: torch.Tensor,
        return self.forward_impl(hidden_states, router_logits, shared_output, i_q, i_s)
    else:
        return self.forward_impl(hidden_states, router_logits, shared_output)
 def moe_forward_fake(hidden_states: torch.Tensor, router_logits: torch.Tensor,
@@ -1720,18 +1795,20 @@ def moe_forward_shared(
    hidden_states: torch.Tensor,
    router_logits: torch.Tensor,
    layer_name: str,
+    hidden_states_copy: Optional[torch.Tensor] = None,
    shared_output: Optional[torch.Tensor] = None
 ) -> tuple[torch.Tensor, torch.Tensor]:
    forward_context: ForwardContext = get_forward_context()
    self = forward_context.no_compile_layers[layer_name]
    assert self.shared_experts is not None
-    return self.forward_impl(hidden_states, router_logits, shared_output)
+    return self.forward_impl(hidden_states, router_logits, hidden_states_copy, shared_output)
 def moe_forward_shared_fake(
    hidden_states: torch.Tensor,
    router_logits: torch.Tensor,
    layer_name: str,
+    hidden_states_copy: Optional[torch.Tensor] = None,
    shared_output: Optional[torch.Tensor] = None
 ) -> tuple[torch.Tensor, torch.Tensor]:
    shared_out = torch.empty_like(hidden_states)
@@ -1742,7 +1819,7 @@ def moe_forward_shared_fake(
 direct_register_custom_op(
    op_name="moe_forward_shared",
    op_func=moe_forward_shared,
-    mutates_args=["hidden_states"],
+    mutates_args=["hidden_states", "hidden_states_copy"],
    fake_impl=moe_forward_shared_fake,
    tags=(torch.Tag.needs_fixed_stride_order,),
 )
\ No newline at end of file
--- a/vllm/model_executor/layers/fused_moe/shared_fused_moe.py
+++ b/vllm/model_executor/layers/fused_moe/shared_fused_moe.py
@@ -34,7 +34,8 @@ class SharedFusedMoE(FusedMoE):
        self,
        hidden_states: torch.Tensor,
        router_logits: torch.Tensor,
-    ) -> torch.Tensor:
+        hidden_states_copy: Optional[torch.Tensor] = None
+    ) -> tuple[torch.Tensor, torch.Tensor]|torch.Tensor:
        if not self.use_overlapped:
            shared_out = self._shared_experts(hidden_states)
@@ -53,6 +54,6 @@ class SharedFusedMoE(FusedMoE):
            fused_out = super().forward(
                hidden_states=hidden_states,
                router_logits=router_logits,
+                hidden_states_copy = hidden_states_copy,
            )
        return fused_out
--- a/vllm/model_executor/models/deepseek_v2.py
+++ b/vllm/model_executor/models/deepseek_v2.py