Unverified commit 0afd6832 authored by Xun Sun, committed by GitHub

Update Mooncake EP's a2a interface (#12391)

parent 6f858930
@@ -2,7 +2,7 @@ from __future__ import annotations
 
 import logging
 from dataclasses import dataclass
-from typing import NamedTuple, Optional, Tuple
+from typing import TYPE_CHECKING, NamedTuple, Optional, Tuple
 
 from sglang.srt.elastic_ep.elastic_ep import ElasticEPStateManager
 from sglang.srt.eplb.expert_distribution import get_global_expert_distribution_recorder
@@ -18,6 +18,9 @@ from sglang.srt.layers.moe.topk import TopKOutput
 from sglang.srt.layers.moe.utils import DeepEPMode
 from sglang.srt.utils import get_int_env_var
 
+if TYPE_CHECKING:
+    from sglang.srt.single_batch_overlap import CombineOverlapArgs
+
 try:
     from mooncake.mooncake_ep_buffer import Buffer
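The new import sits behind `typing.TYPE_CHECKING`, so `CombineOverlapArgs` is available for annotations while `sglang.srt.single_batch_overlap` is never imported at runtime, the standard way to avoid a circular or costly import. A minimal sketch of the pattern, with a hypothetical `heavy_module` standing in for the real module:

```python
from __future__ import annotations  # annotations are stored as strings, not evaluated

from typing import TYPE_CHECKING, Optional

if TYPE_CHECKING:
    # Executed only by static type checkers (mypy, pyright); skipped at runtime.
    from heavy_module import CombineOverlapArgs  # hypothetical stand-in


def combine_a(overlap_args: Optional[CombineOverlapArgs] = None) -> None:
    # Valid at runtime even though heavy_module was never imported,
    # because the annotation above is never evaluated as code.
    ...
```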
@@ -234,13 +237,14 @@ class _MooncakeEPDispatcherImpl:
         hidden_states: torch.Tensor,
         topk_ids: torch.Tensor,
         topk_weights: torch.Tensor,
+        overlap_args: Optional[CombineOverlapArgs] = None,
     ):
         hidden_states, event, hook = self._combine_core(
             hidden_states,
             topk_ids,
             topk_weights,
         )
-        return hidden_states, event, hook
+        return hidden_states, event, hook, overlap_args
 
     def combine_b(self, hidden_states, event, hook):
         hook() if self.return_recv_hook else event.current_stream_wait()
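Since the impl-level `combine_a` now returns `overlap_args` as a fourth element, whatever stores and later consumes the intermediate state has to account for the extra value. A hedged sketch of that handoff, assuming the store-then-unpack pattern the dispatcher uses for its intermediate state (`impl` and the unpacking site are illustrative):

```python
# Phase A: kick off the combine and stash the intermediate state (now 4 values).
inner_state = impl.combine_a(hidden_states, topk_ids, topk_weights, overlap_args)

# Phase B: unpack all four values; impl.combine_b itself still takes three.
hidden_states, event, hook, overlap_args = inner_state
impl.combine_b(hidden_states, event, hook)
```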
@@ -342,23 +346,27 @@ class MooncakeEPDispatcher(BaseDispatcher):
         del self._dispatch_intermediate_state
         return self._get_impl().dispatch_b(*inner_state)
 
-    def combine(self, *args, **kwargs) -> Tuple:
-        self.combine_a(*args, **kwargs)
+    def combine(
+        self,
+        combine_input: CombineInput,
+        overlap_args: Optional[CombineOverlapArgs] = None,
+    ) -> Tuple:
+        self.combine_a(combine_input, overlap_args)
         ret = self.combine_b()
         return ret
 
     def combine_a(
         self,
-        hidden_states: torch.Tensor,
-        topk_ids: torch.Tensor,
-        topk_weights: torch.Tensor,
-        overlap_args: Optional = None,
+        combine_input: CombineInput,
+        overlap_args: Optional[CombineOverlapArgs] = None,
     ):
+        hidden_states, topk_ids, topk_weights = combine_input
         self._update_stage(_Stage.AFTER_DISPATCH_B, _Stage.AFTER_COMBINE_A)
         inner_state = self._get_impl().combine_a(
             hidden_states=hidden_states,
             topk_ids=topk_ids,
             topk_weights=topk_weights,
+            overlap_args=overlap_args,
         )
         self._combine_intermediate_state = inner_state
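At the call site, the public `combine` now takes a single `CombineInput` plus optional overlap arguments instead of three positional tensors; the tuple-unpacking above shows `CombineInput` must behave like a `(hidden_states, topk_ids, topk_weights)` triple. A hedged usage sketch; the tensor shapes, the keyword construction of `CombineInput`, and the `dispatcher` object are assumptions:

```python
import torch

# Assumed: CombineInput is iterable as (hidden_states, topk_ids, topk_weights),
# matching the unpacking in combine_a above.
combine_input = CombineInput(
    hidden_states=torch.randn(num_tokens, hidden_dim, dtype=torch.bfloat16),
    topk_ids=topk_ids,
    topk_weights=topk_weights,
)

ret = dispatcher.combine(combine_input)                # overlap disabled (default)
ret = dispatcher.combine(combine_input, overlap_args)  # with single-batch overlap
```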
@@ -962,9 +962,8 @@ class Fp8MoEMethod(FusedMoEMethodBase):
         moe_runner_backend = get_moe_runner_backend()
         if moe_runner_backend.is_auto():
-            if (
-                deep_gemm_wrapper.ENABLE_JIT_DEEPGEMM
-                and get_moe_a2a_backend().is_deepep()
+            if deep_gemm_wrapper.ENABLE_JIT_DEEPGEMM and (
+                get_moe_a2a_backend().is_deepep() or get_moe_a2a_backend().is_mooncake()
             ):
                 moe_runner_backend = MoeRunnerBackend.DEEP_GEMM
             else:
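The condition change means the auto runner selection now routes both the DeepEP and the Mooncake all-to-all backends to the DeepGEMM runner whenever JIT DeepGEMM is enabled, instead of DeepEP alone. A condensed restatement of that logic as a sketch (the else branch is elided in the diff, so `fallback_backend` here is hypothetical):

```python
a2a_backend = get_moe_a2a_backend()

if deep_gemm_wrapper.ENABLE_JIT_DEEPGEMM and (
    a2a_backend.is_deepep() or a2a_backend.is_mooncake()  # Mooncake newly included
):
    moe_runner_backend = MoeRunnerBackend.DEEP_GEMM
else:
    moe_runner_backend = fallback_backend  # hypothetical; elided in the diff
```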