Unverified commit 6c88f6c8 authored by Cheng Wan, committed by GitHub

[5/N] MoE Refactor: Update MoE parallelism arguments (#8658)

parent c8d3a402
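
This PR replaces the boolean MoE parallelism flags with explicit arguments: `--enable-ep-moe` is superseded by setting `--ep-size` to the tensor parallel size, and `--enable-deepep-moe` by `--moe-a2a-backend deepep`. The old flags remain as deprecated aliases that `ServerArgs.__post_init__` rewrites with a warning. A before/after sketch of the launch flags, mirroring the test updates in this diff (model path and unrelated flags elided):

```python
# Deprecated flags (still accepted; __post_init__ rewrites them and warns):
old_ep_flags = ["--tp", "8", "--enable-ep-moe"]
old_deepep_flags = ["--tp", "8", "--enable-deepep-moe", "--deepep-mode", "auto"]

# Preferred replacements after this PR:
new_ep_flags = ["--tp", "8", "--ep-size", "8"]
new_deepep_flags = ["--tp", "8", "--moe-a2a-backend", "deepep"]  # --deepep-mode still defaults to "auto"
```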
......@@ -29,6 +29,7 @@ from torch import nn
from transformers import PretrainedConfig
from sglang.srt.distributed import (
get_moe_expert_parallel_world_size,
get_tensor_model_parallel_rank,
get_tensor_model_parallel_world_size,
tensor_model_parallel_all_gather,
......@@ -117,7 +118,7 @@ class Grok1MoE(nn.Module):
)
kwargs = {}
if global_server_args_dict["enable_ep_moe"]:
if get_moe_expert_parallel_world_size() > 1:
MoEImpl = EPMoE
else:
MoEImpl = FusedMoE
......@@ -616,8 +617,7 @@ class Grok1ForCausalLM(nn.Module):
# Params for weights, fp8 weight scales, fp8 activation scales
# (param_name, weight_name, expert_id, shard_id)
MoEImpl = EPMoE if global_server_args_dict["enable_ep_moe"] else FusedMoE
expert_params_mapping = MoEImpl.make_expert_params_mapping(
expert_params_mapping = FusedMoE.make_expert_params_mapping(
ckpt_gate_proj_name="w1",
ckpt_down_proj_name="w2",
ckpt_up_proj_name="w3",
......
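In each model file the implementation choice now keys off the distributed EP world size instead of the `enable_ep_moe` server flag, while the checkpoint-name mapping is requested from `FusedMoE` directly: the `(param_name, weight_name, expert_id, shard_id)` tuples depend only on checkpoint naming, not on which implementation ends up instantiated. A stand-alone sketch of that mapping shape (the real classmethod lives on `FusedMoE`; this body is an assumption for illustration):

```python
from typing import List, Tuple

def make_expert_params_mapping_sketch(
    ckpt_gate_proj_name: str,  # "w1" for Grok/Mixtral, "gate_proj" for Qwen
    ckpt_down_proj_name: str,
    ckpt_up_proj_name: str,
    num_experts: int,
) -> List[Tuple[str, str, int, str]]:
    # (param_name, weight_name, expert_id, shard_id), as in the diff's comment.
    mapping = []
    for expert_id in range(num_experts):
        for shard_id, ckpt_name in [
            ("w1", ckpt_gate_proj_name),
            ("w2", ckpt_down_proj_name),
            ("w3", ckpt_up_proj_name),
        ]:
            # Gate/up shards load into the fused w13 weight, down into w2.
            param_name = (
                "experts.w13_weight" if shard_id in ("w1", "w3") else "experts.w2_weight"
            )
            mapping.append(
                (param_name, f"experts.{expert_id}.{ckpt_name}.", expert_id, shard_id)
            )
    return mapping
```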
......@@ -24,6 +24,7 @@ from torch import nn
from transformers import MixtralConfig
from sglang.srt.distributed import (
get_moe_expert_parallel_world_size,
get_pp_group,
get_tensor_model_parallel_world_size,
tensor_model_parallel_all_reduce,
......@@ -94,7 +95,7 @@ class MixtralMoE(nn.Module):
renormalize=True,
)
MoEImpl = EPMoE if global_server_args_dict["enable_ep_moe"] else FusedMoE
MoEImpl = EPMoE if get_moe_expert_parallel_world_size() > 1 else FusedMoE
self.experts = MoEImpl(
num_experts=num_experts,
top_k=top_k,
......@@ -398,8 +399,7 @@ class MixtralForCausalLM(nn.Module):
# Params for weights, fp8 weight scales, fp8 activation scales
# (param_name, weight_name, expert_id, shard_id)
MoEImpl = EPMoE if global_server_args_dict["enable_ep_moe"] else FusedMoE
expert_params_mapping = MoEImpl.make_expert_params_mapping(
expert_params_mapping = FusedMoE.make_expert_params_mapping(
ckpt_gate_proj_name="w1",
ckpt_down_proj_name="w2",
ckpt_up_proj_name="w3",
......
......@@ -148,7 +148,6 @@ class Qwen2MoeSparseMoeBlock(nn.Module):
**(
dict(
enable_flashinfer_cutlass_moe=True,
enable_ep_moe=global_server_args_dict["enable_ep_moe"],
)
if global_server_args_dict["enable_flashinfer_cutlass_moe"]
else {}
......@@ -616,9 +615,7 @@ class Qwen2MoeForCausalLM(nn.Module):
("gate_up_proj", "up_proj", 1),
]
MoEImpl = EPMoE if global_server_args_dict["enable_ep_moe"] else FusedMoE
expert_params_mapping = MoEImpl.make_expert_params_mapping(
expert_params_mapping = FusedMoE.make_expert_params_mapping(
ckpt_gate_proj_name="gate_proj",
ckpt_down_proj_name="down_proj",
ckpt_up_proj_name="up_proj",
......
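The `enable_ep_moe` kwarg disappears from the FlashInfer CUTLASS path because, after this refactor series, the layer can read the EP layout from the distributed state rather than a server flag. A stand-alone sketch of that idea (the helper is stubbed here; the real one is `sglang.srt.distributed.get_moe_expert_parallel_world_size`, imported elsewhere in this diff, and the class body is assumed):

```python
def get_moe_expert_parallel_world_size() -> int:
    """Stub standing in for sglang.srt.distributed's helper; 1 means no EP."""
    return 1

class FusedMoESketch:
    """Assumed constructor shape: EP is inferred, not passed as a flag."""
    def __init__(self, num_experts: int, enable_flashinfer_cutlass_moe: bool = False):
        self.ep_size = get_moe_expert_parallel_world_size()
        self.use_ep = self.ep_size > 1  # replaces the old enable_ep_moe kwarg
        self.num_local_experts = num_experts // max(self.ep_size, 1)
        self.enable_flashinfer_cutlass_moe = enable_flashinfer_cutlass_moe
```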
......@@ -24,6 +24,7 @@ import torch
from torch import nn
from sglang.srt.distributed import (
get_moe_expert_parallel_world_size,
get_pp_group,
get_tensor_model_parallel_rank,
get_tensor_model_parallel_world_size,
......@@ -51,7 +52,6 @@ from sglang.srt.layers.linear import (
)
from sglang.srt.layers.logits_processor import LogitsProcessor, LogitsProcessorOutput
from sglang.srt.layers.moe.ep_moe.layer import get_moe_impl_class
from sglang.srt.layers.moe.ep_moe.token_dispatcher import DeepEPDispatcher
from sglang.srt.layers.moe.topk import TopK
from sglang.srt.layers.quantization.base_config import QuantizationConfig
from sglang.srt.layers.radix_attention import RadixAttention
......@@ -72,7 +72,7 @@ from sglang.srt.model_loader.weight_utils import default_weight_loader
from sglang.srt.models.qwen2_moe import Qwen2MoeMLP as Qwen3MoeMLP
from sglang.srt.models.qwen2_moe import Qwen2MoeModel
from sglang.srt.two_batch_overlap import MaybeTboDeepEPDispatcher
from sglang.srt.utils import DeepEPMode, add_prefix, is_cuda, is_non_idle_and_non_empty
from sglang.srt.utils import add_prefix, is_cuda, is_non_idle_and_non_empty
Qwen3MoeConfig = None
......@@ -113,15 +113,14 @@ class Qwen3MoeSparseMoeBlock(nn.Module):
quant_config=quant_config,
prefix=add_prefix("experts", prefix),
**(
dict(deepep_mode=DeepEPMode[global_server_args_dict["deepep_mode"]])
if global_server_args_dict["enable_deepep_moe"]
dict(deepep_mode=global_server_args_dict["deepep_mode"])
if global_server_args_dict["moe_a2a_backend"].is_deepep()
else {}
),
# Additional args for FusedMoE
**(
dict(
enable_flashinfer_cutlass_moe=True,
enable_ep_moe=global_server_args_dict["enable_ep_moe"],
)
if global_server_args_dict["enable_flashinfer_cutlass_moe"]
else {}
......@@ -136,9 +135,9 @@ class Qwen3MoeSparseMoeBlock(nn.Module):
prefix=add_prefix("gate", prefix),
)
if global_server_args_dict["enable_deepep_moe"]:
if global_server_args_dict["moe_a2a_backend"].is_deepep():
# TODO: we will support tp < ep in the future
self.ep_size = get_tensor_model_parallel_world_size()
self.ep_size = get_moe_expert_parallel_world_size()
self.num_experts = (
config.num_experts + global_server_args_dict["ep_num_redundant_experts"]
)
......@@ -148,7 +147,7 @@ class Qwen3MoeSparseMoeBlock(nn.Module):
self, hidden_states: torch.Tensor, forward_batch: Optional[ForwardBatch] = None
) -> torch.Tensor:
if not global_server_args_dict["enable_deepep_moe"]:
if not global_server_args_dict["moe_a2a_backend"].is_deepep():
return self.forward_normal(hidden_states)
else:
return self.forward_deepep(hidden_states, forward_batch)
......
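`global_server_args_dict["moe_a2a_backend"]` is consulted through an `is_deepep()` method above, so the stored value is a small backend wrapper rather than a raw string. A minimal stdlib sketch of such a wrapper (the real class ships in an earlier PR of this series; the names here are assumptions):

```python
from enum import Enum
from typing import Optional


class MoeA2ABackendSketch(Enum):
    NONE = "none"
    DEEPEP = "deepep"

    @classmethod
    def from_server_arg(cls, value: Optional[str]) -> "MoeA2ABackendSketch":
        # --moe-a2a-backend defaults to None, meaning no A2A backend.
        return cls.NONE if value is None else cls(value)

    def is_deepep(self) -> bool:
        return self is MoeA2ABackendSketch.DEEPEP


assert MoeA2ABackendSketch.from_server_arg("deepep").is_deepep()
assert not MoeA2ABackendSketch.from_server_arg(None).is_deepep()
```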
......@@ -146,7 +146,7 @@ class Step3TextMoEMLP(nn.Module):
prefix=add_prefix("gate", prefix),
)
if global_server_args_dict["enable_deepep_moe"]:
if global_server_args_dict["moe_a2a_backend"].is_deepep():
raise NotImplementedError("DeepEP MoE is not supported yet in Step3 model.")
def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
......
......@@ -4,7 +4,7 @@ from typing import List, Optional
import torch
from sglang.srt import operations
from sglang.srt.layers.moe.ep_moe.token_dispatcher import DeepEPConfig
from sglang.srt.layers.moe.token_dispatcher import DeepEPConfig
from sglang.srt.model_executor.forward_batch_info import ForwardMode
from sglang.srt.operations import Operation
......
......@@ -172,12 +172,11 @@ class ServerArgs:
# Expert parallelism
ep_size: int = 1
enable_ep_moe: bool = False
enable_deepep_moe: bool = False
moe_a2a_backend: Optional[Literal["deepep"]] = None
enable_flashinfer_cutlass_moe: bool = False
enable_flashinfer_trtllm_moe: bool = False
enable_flashinfer_allreduce_fusion: bool = False
deepep_mode: Optional[Literal["auto", "normal", "low_latency"]] = "auto"
deepep_mode: Literal["auto", "normal", "low_latency"] = "auto"
ep_num_redundant_experts: int = 0
ep_dispatch_algorithm: Optional[Literal["static", "dynamic", "fake"]] = None
init_expert_location: str = "trivial"
......@@ -272,7 +271,27 @@ class ServerArgs:
enable_pdmux: bool = False
sm_group_num: int = 3
# Deprecated arguments
enable_ep_moe: bool = False
enable_deepep_moe: bool = False
def __post_init__(self):
# Check deprecated arguments
def print_deprecated_warning(message: str):
logger.warning(f"\033[33m{message}\033[0m")
if self.enable_ep_moe:
self.ep_size = self.tp_size
print_deprecated_warning(
"NOTE: --enable-ep-moe is deprecated. Please set `--ep-size` to the same value as `--tp-size` instead."
)
if self.enable_deepep_moe:
self.moe_a2a_backend = "deepep"
print_deprecated_warning(
"NOTE: --enable-deepep-moe is deprecated. Please set `--moe-a2a-backend` to 'deepep' instead."
)
# Set missing default values
if self.tokenizer_path is None:
self.tokenizer_path = self.model_path
......@@ -455,14 +474,13 @@ class ServerArgs:
self.quantization == "modelopt_fp4"
), "modelopt_fp4 quantization is required for Flashinfer MOE"
os.environ["TRTLLM_ENABLE_PDL"] = "1"
if self.enable_ep_moe:
self.ep_size = self.tp_size
logger.warning(
f"Flashinfer cutlass MoE and EP MoE are enabled. The expert parallel size is adjusted to be the same as the tensor parallel size[{self.tp_size}]."
)
assert self.ep_size in [
1,
self.tp_size,
], "The expert parallel size must be 1 or the same as the tensor parallel size"
# DeepEP MoE
if self.enable_deepep_moe:
if self.moe_a2a_backend == "deepep":
if self.deepep_mode == "normal":
logger.warning("Cuda graph is disabled because deepep_mode=`normal`")
self.disable_cuda_graph = True
......@@ -486,7 +504,7 @@ class ServerArgs:
)
if self.enable_eplb:
assert self.enable_ep_moe or self.enable_deepep_moe
assert self.ep_size > 1 or self.moe_a2a_backend is not None
if self.enable_expert_distribution_metrics and (
self.expert_distribution_recorder_mode is None
......@@ -1354,30 +1372,27 @@ class ServerArgs:
help="The expert parallelism size.",
)
parser.add_argument(
"--enable-ep-moe",
action="store_true",
help="Enabling expert parallelism for moe. The ep size is equal to the tp size.",
"--moe-a2a-backend",
type=str,
choices=["deepep"],
default=ServerArgs.moe_a2a_backend,
help="Choose the backend for MoE A2A.",
)
parser.add_argument(
"--enable-flashinfer-cutlass-moe",
action="store_true",
help="Enable FlashInfer CUTLASS MoE backend for modelopt_fp4 quant on Blackwell. Supports MoE-EP with --enable-ep-moe",
help="Enable FlashInfer CUTLASS MoE backend for modelopt_fp4 quant on Blackwell. Supports MoE-EP",
)
parser.add_argument(
"--enable-flashinfer-trtllm-moe",
action="store_true",
help="Enable FlashInfer TRTLLM MoE backend on Blackwell. Supports BlockScale FP8 MoE-EP with --enable-ep-moe",
help="Enable FlashInfer TRTLLM MoE backend on Blackwell. Supports BlockScale FP8 MoE-EP",
)
parser.add_argument(
"--enable-flashinfer-allreduce-fusion",
action="store_true",
help="Enable FlashInfer allreduce fusion for Add_RMSNorm.",
)
parser.add_argument(
"--enable-deepep-moe",
action="store_true",
help="Enabling DeepEP MoE implementation for EP MoE.",
)
parser.add_argument(
"--deepep-mode",
type=str,
......@@ -1839,6 +1854,18 @@ class ServerArgs:
help="Disable mmap while loading weight using safetensors.",
)
# Deprecated arguments
parser.add_argument(
"--enable-ep-moe",
action="store_true",
help="(Deprecated) Enabling expert parallelism for moe. The ep size is equal to the tp size.",
)
parser.add_argument(
"--enable-deepep-moe",
action="store_true",
help="(Deprecated) Enabling DeepEP MoE implementation for EP MoE.",
)
@classmethod
def from_cli_args(cls, args: argparse.Namespace):
args.tp_size = args.tensor_parallel_size
......
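The `__post_init__` shim keeps old launch scripts working: each deprecated boolean is translated into its replacement before validation runs. A trimmed, stand-alone sketch of that behavior (only the relevant fields of `ServerArgs` are reproduced; the warning logging is omitted):

```python
from dataclasses import dataclass
from typing import Literal, Optional


@dataclass
class ServerArgsSketch:
    tp_size: int = 1
    ep_size: int = 1
    moe_a2a_backend: Optional[Literal["deepep"]] = None
    # Deprecated arguments
    enable_ep_moe: bool = False
    enable_deepep_moe: bool = False

    def __post_init__(self):
        if self.enable_ep_moe:
            self.ep_size = self.tp_size  # the old flag implied ep_size == tp_size
        if self.enable_deepep_moe:
            self.moe_a2a_backend = "deepep"


args = ServerArgsSketch(tp_size=8, enable_ep_moe=True, enable_deepep_moe=True)
assert args.ep_size == 8 and args.moe_a2a_backend == "deepep"
```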
......@@ -13,17 +13,18 @@ from sglang.srt.layers.communicator import (
CommunicateSummableTensorPairFn,
ScatterMode,
)
from sglang.srt.layers.moe.ep_moe.token_dispatcher import DeepEPDispatcher
from sglang.srt.layers.moe.token_dispatcher import DeepEPDispatcher
from sglang.srt.layers.moe.utils import DeepEPMode
from sglang.srt.layers.quantization import deep_gemm_wrapper
from sglang.srt.managers.schedule_batch import ScheduleBatch, global_server_args_dict
from sglang.srt.model_executor.forward_batch_info import ForwardBatch, ForwardMode
from sglang.srt.operations import execute_operations, execute_overlapped_operations
from sglang.srt.operations_strategy import OperationsStrategy
from sglang.srt.speculative.eagle_utils import EagleDraftInput, EagleVerifyInput
from sglang.srt.utils import BumpAllocator, DeepEPMode, get_bool_env_var
from sglang.srt.utils import BumpAllocator, get_bool_env_var
if TYPE_CHECKING:
from sglang.srt.layers.moe.ep_moe.token_dispatcher import DispatchOutput
from sglang.srt.layers.moe.token_dispatcher import DispatchOutput
_tbo_debug = get_bool_env_var("SGLANG_TBO_DEBUG")
......@@ -310,7 +311,7 @@ class TboDPAttentionPreparer:
and not local_batch.forward_mode.is_target_verify()
)
and enable_deepep_moe
and (resolved_deepep_mode == DeepEPMode.low_latency)
and (resolved_deepep_mode == DeepEPMode.LOW_LATENCY)
)
else:
self.local_tbo_split_seq_index = 0
......
......@@ -2205,27 +2205,6 @@ def flatten_nested_list(nested_list):
return [nested_list]
class DeepEPMode(Enum):
normal = "normal"
low_latency = "low_latency"
auto = "auto"
def enable_normal(self):
return self in [DeepEPMode.normal, DeepEPMode.auto]
def enable_low_latency(self):
return self in [DeepEPMode.low_latency, DeepEPMode.auto]
def resolve(self, is_extend_in_batch: bool):
if self != DeepEPMode.auto:
return self
if is_extend_in_batch:
return DeepEPMode.normal
else:
return DeepEPMode.low_latency
def is_non_idle_and_non_empty(forward_mode, hidden_states):
return (
(forward_mode is not None)
......@@ -2414,7 +2393,7 @@ def require_mlp_tp_gather(server_args):
return True
elif not server_args.enable_dp_lm_head:
return True
elif not server_args.enable_deepep_moe:
elif server_args.moe_a2a_backend is None:
return True
else:
return (
......@@ -2430,7 +2409,7 @@ def require_attn_tp_gather(server_args):
Check if the input of attention is scattered.
"""
assert server_args.moe_dense_tp_size in [1, None]
if server_args.enable_deepep_moe or server_args.moe_dense_tp_size == 1:
if server_args.moe_a2a_backend is not None or server_args.moe_dense_tp_size == 1:
if server_args.enable_dp_attention:
return server_args.dp_size < server_args.tp_size
else:
......
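The `DeepEPMode` enum deleted from `sglang/srt/utils.py` above now lives in `sglang/srt/layers/moe/utils.py` (see the new import in `two_batch_overlap.py`), with its members renamed to uppercase (`DeepEPMode.LOW_LATENCY`). A reconstruction of the relocated enum, inferred from the removed body and the renamed usage; treat it as a sketch rather than the verbatim new code:

```python
from enum import Enum


class DeepEPMode(Enum):
    NORMAL = "normal"
    LOW_LATENCY = "low_latency"
    AUTO = "auto"

    def enable_normal(self) -> bool:
        return self in (DeepEPMode.NORMAL, DeepEPMode.AUTO)

    def enable_low_latency(self) -> bool:
        return self in (DeepEPMode.LOW_LATENCY, DeepEPMode.AUTO)

    def resolve(self, is_extend_in_batch: bool) -> "DeepEPMode":
        if self != DeepEPMode.AUTO:
            return self
        # Prefill (extend) batches resolve to normal mode; decode to low latency.
        return DeepEPMode.NORMAL if is_extend_in_batch else DeepEPMode.LOW_LATENCY
```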
......@@ -499,7 +499,6 @@ class SRTRunner:
chunked_prefill_size: Optional[int] = None,
dp_size: int = 1,
tokenizer_path: Optional[str] = None,
enable_ep_moe: bool = False,
mem_fraction_static: float = 0.65,
trust_remote_code: bool = False,
speculative_draft_model_path: Optional[str] = None,
......@@ -550,7 +549,6 @@ class SRTRunner:
enable_dp_attention=enable_dp_attention,
dp_size=dp_size,
tokenizer_path=tokenizer_path,
enable_ep_moe=enable_ep_moe,
disable_overlap_schedule=disable_overlap_schedule,
cuda_graph_max_bs=cuda_graph_max_bs,
disable_custom_all_reduce=disable_custom_all_reduce,
......
......@@ -33,7 +33,8 @@ class TestDeepseek(CustomTestCase):
"--moe-dense-tp-size",
"1",
"--enable-dp-lm-head",
"--enable-deepep-moe",
"--moe-a2a-backend",
"deepep",
"--enable-two-batch-overlap",
"--ep-num-redundant-experts",
"32",
......@@ -88,7 +89,8 @@ class TestDeepseekMTP(CustomTestCase):
"--moe-dense-tp-size",
"1",
"--enable-dp-lm-head",
"--enable-deepep-moe",
"--moe-a2a-backend",
"deepep",
"--enable-two-batch-overlap",
"--ep-num-redundant-experts",
"32",
......
......@@ -31,7 +31,8 @@ class TestPureDP(CustomTestCase):
"--enable-dp-attention",
"--dp",
"4",
"--enable-deepep-moe",
"--moe-a2a-backend",
"deepep",
"--cuda-graph-max-bs",
"128",
"--max-running-requests",
......@@ -77,7 +78,8 @@ class TestHybridDPTP(CustomTestCase):
"--enable-dp-attention",
"--dp",
"2",
"--enable-deepep-moe",
"--moe-a2a-backend",
"deepep",
"--cuda-graph-max-bs",
"128",
"--max-running-requests",
......@@ -118,7 +120,8 @@ class TestTP(CustomTestCase):
"--trust-remote-code",
"--tp",
"4",
"--enable-deepep-moe",
"--moe-a2a-backend",
"deepep",
"--cuda-graph-max-bs",
"128",
"--max-running-requests",
......@@ -166,7 +169,8 @@ class TestNoGatherdBuffer(CustomTestCase):
"--moe-dense-tp-size",
"1",
"--enable-dp-lm-head",
"--enable-deepep-moe",
"--moe-a2a-backend",
"deepep",
"--cuda-graph-max-bs",
"32",
"--max-running-requests",
......@@ -212,7 +216,8 @@ class TestTBO(CustomTestCase):
"4",
"--moe-dense-tp-size",
"1",
"--enable-deepep-moe",
"--moe-a2a-backend",
"deepep",
"--enable-two-batch-overlap",
"--cuda-graph-max-bs",
"128",
......@@ -259,7 +264,8 @@ class TestMTP(CustomTestCase):
"--dp",
"2",
"--enable-dp-lm-head",
"--enable-deepep-moe",
"--moe-a2a-backend",
"deepep",
"--speculative-algo",
"EAGLE",
"--speculative-draft",
......@@ -326,7 +332,8 @@ class TestMTPWithTBO(CustomTestCase):
"--dp-size",
"4",
"--enable-two-batch-overlap",
"--enable-deepep-moe",
"--moe-a2a-backend",
"deepep",
"--trust-remote-code",
"--speculative-algorithm",
"EAGLE",
......
......@@ -34,7 +34,8 @@ class _BaseTestDynamicEPLB(CustomTestCase):
"--dp",
"2",
"--enable-dp-attention",
"--enable-deepep-moe",
"--moe-a2a-backend",
"deepep",
"--deepep-mode",
"normal",
"--disable-cuda-graph",
......@@ -96,8 +97,7 @@ class TestStaticEPLB(CustomTestCase):
trust_remote_code=True,
ep_num_redundant_experts=4,
enable_dp_attention=True,
enable_deepep_moe=True,
deepep_mode="normal",
moe_a2a_backend="deepep",
disable_cuda_graph=True,
expert_distribution_recorder_mode="stat",
tp_size=2,
......
......@@ -407,9 +407,8 @@ class Test10(CustomTestCase):
"--trust-remote-code",
"--tp",
"8",
"--enable-deepep-moe",
"--deepep-mode",
"auto",
"--moe-a2a-backend",
"deepep",
"--cuda-graph-max-bs",
"128",
],
......@@ -449,9 +448,8 @@ class Test11(CustomTestCase):
"--enable-dp-attention",
"--dp",
"4",
"--enable-deepep-moe",
"--deepep-mode",
"auto",
"--moe-a2a-backend",
"deepep",
"--cuda-graph-max-bs",
"128",
],
......@@ -491,9 +489,8 @@ class Test12(CustomTestCase):
"--enable-dp-attention",
"--dp",
"8",
"--enable-deepep-moe",
"--deepep-mode",
"auto",
"--moe-a2a-backend",
"deepep",
"--cuda-graph-max-bs",
"128",
],
......@@ -532,9 +529,8 @@ class Test13(CustomTestCase):
"8",
"--moe-dense-tp-size",
"1",
"--enable-deepep-moe",
"--deepep-mode",
"auto",
"--moe-a2a-backend",
"deepep",
"--cuda-graph-max-bs",
"128",
],
......@@ -576,9 +572,8 @@ class Test14(CustomTestCase):
"4",
"--moe-dense-tp-size",
"1",
"--enable-deepep-moe",
"--deepep-mode",
"auto",
"--moe-a2a-backend",
"deepep",
"--cuda-graph-max-bs",
"128",
],
......@@ -620,9 +615,8 @@ class Test15(CustomTestCase):
"8",
"--moe-dense-tp-size",
"1",
"--enable-deepep-moe",
"--deepep-mode",
"auto",
"--moe-a2a-backend",
"deepep",
"--cuda-graph-max-bs",
"128",
],
......@@ -663,9 +657,8 @@ class Test16(CustomTestCase):
"--dp",
"4",
"--enable-dp-lm-head",
"--enable-deepep-moe",
"--deepep-mode",
"auto",
"--moe-a2a-backend",
"deepep",
"--cuda-graph-max-bs",
"128",
],
......@@ -706,9 +699,8 @@ class Test17(CustomTestCase):
"--dp",
"8",
"--enable-dp-lm-head",
"--enable-deepep-moe",
"--deepep-mode",
"auto",
"--moe-a2a-backend",
"deepep",
"--cuda-graph-max-bs",
"128",
],
......@@ -751,9 +743,8 @@ class Test18(CustomTestCase):
"--moe-dense-tp-size",
"1",
"--enable-dp-lm-head",
"--enable-deepep-moe",
"--deepep-mode",
"auto",
"--moe-a2a-backend",
"deepep",
"--cuda-graph-max-bs",
"128",
],
......@@ -796,9 +787,8 @@ class Test19(CustomTestCase):
"--moe-dense-tp-size",
"1",
"--enable-dp-lm-head",
"--enable-deepep-moe",
"--deepep-mode",
"auto",
"--moe-a2a-backend",
"deepep",
"--cuda-graph-max-bs",
"128",
],
......@@ -835,7 +825,8 @@ class Test20(CustomTestCase):
"--trust-remote-code",
"--tp",
"8",
"--enable-ep-moe",
"--ep",
"8",
],
)
......@@ -873,7 +864,8 @@ class Test21(CustomTestCase):
"--enable-dp-attention",
"--dp",
"4",
"--enable-ep-moe",
"--ep",
"8",
],
)
......@@ -911,7 +903,8 @@ class Test22(CustomTestCase):
"--enable-dp-attention",
"--dp",
"8",
"--enable-ep-moe",
"--ep",
"8",
],
)
......@@ -948,7 +941,8 @@ class Test23(CustomTestCase):
"8",
"--moe-dense-tp-size",
"1",
"--enable-ep-moe",
"--ep",
"8",
],
)
......@@ -988,7 +982,8 @@ class Test24(CustomTestCase):
"4",
"--moe-dense-tp-size",
"1",
"--enable-ep-moe",
"--ep",
"8",
],
)
......@@ -1028,7 +1023,8 @@ class Test25(CustomTestCase):
"8",
"--moe-dense-tp-size",
"1",
"--enable-ep-moe",
"--ep",
"8",
],
)
......@@ -1067,7 +1063,8 @@ class Test26(CustomTestCase):
"--dp",
"4",
"--enable-dp-lm-head",
"--enable-ep-moe",
"--ep",
"8",
],
)
......@@ -1106,7 +1103,8 @@ class Test27(CustomTestCase):
"--dp",
"8",
"--enable-dp-lm-head",
"--enable-ep-moe",
"--ep",
"8",
],
)
......@@ -1147,7 +1145,8 @@ class Test28(CustomTestCase):
"--moe-dense-tp-size",
"1",
"--enable-dp-lm-head",
"--enable-ep-moe",
"--ep",
"8",
],
)
......@@ -1188,7 +1187,8 @@ class Test29(CustomTestCase):
"--moe-dense-tp-size",
"1",
"--enable-dp-lm-head",
"--enable-ep-moe",
"--ep",
"8",
],
)
......@@ -1701,9 +1701,8 @@ class Test40(CustomTestCase):
"--trust-remote-code",
"--tp",
"8",
"--enable-deepep-moe",
"--deepep-mode",
"auto",
"--moe-a2a-backend",
"deepep",
"--cuda-graph-max-bs",
"32",
"--max-running-requests",
......@@ -1755,9 +1754,8 @@ class Test41(CustomTestCase):
"--enable-dp-attention",
"--dp",
"4",
"--enable-deepep-moe",
"--deepep-mode",
"auto",
"--moe-a2a-backend",
"deepep",
"--cuda-graph-max-bs",
"32",
"--max-running-requests",
......@@ -1809,9 +1807,8 @@ class Test42(CustomTestCase):
"--enable-dp-attention",
"--dp",
"8",
"--enable-deepep-moe",
"--deepep-mode",
"auto",
"--moe-a2a-backend",
"deepep",
"--cuda-graph-max-bs",
"32",
"--max-running-requests",
......@@ -1862,9 +1859,8 @@ class Test43(CustomTestCase):
"8",
"--moe-dense-tp-size",
"1",
"--enable-deepep-moe",
"--deepep-mode",
"auto",
"--moe-a2a-backend",
"deepep",
"--cuda-graph-max-bs",
"32",
"--max-running-requests",
......@@ -1918,9 +1914,8 @@ class Test44(CustomTestCase):
"4",
"--moe-dense-tp-size",
"1",
"--enable-deepep-moe",
"--deepep-mode",
"auto",
"--moe-a2a-backend",
"deepep",
"--cuda-graph-max-bs",
"32",
"--max-running-requests",
......@@ -1974,9 +1969,8 @@ class Test45(CustomTestCase):
"8",
"--moe-dense-tp-size",
"1",
"--enable-deepep-moe",
"--deepep-mode",
"auto",
"--moe-a2a-backend",
"deepep",
"--cuda-graph-max-bs",
"32",
"--max-running-requests",
......@@ -2029,9 +2023,8 @@ class Test46(CustomTestCase):
"--dp",
"4",
"--enable-dp-lm-head",
"--enable-deepep-moe",
"--deepep-mode",
"auto",
"--moe-a2a-backend",
"deepep",
"--cuda-graph-max-bs",
"32",
"--max-running-requests",
......@@ -2084,9 +2077,8 @@ class Test47(CustomTestCase):
"--dp",
"8",
"--enable-dp-lm-head",
"--enable-deepep-moe",
"--deepep-mode",
"auto",
"--moe-a2a-backend",
"deepep",
"--cuda-graph-max-bs",
"32",
"--max-running-requests",
......@@ -2141,9 +2133,8 @@ class Test48(CustomTestCase):
"--moe-dense-tp-size",
"1",
"--enable-dp-lm-head",
"--enable-deepep-moe",
"--deepep-mode",
"auto",
"--moe-a2a-backend",
"deepep",
"--cuda-graph-max-bs",
"32",
"--max-running-requests",
......@@ -2198,9 +2189,8 @@ class Test49(CustomTestCase):
"--moe-dense-tp-size",
"1",
"--enable-dp-lm-head",
"--enable-deepep-moe",
"--deepep-mode",
"auto",
"--moe-a2a-backend",
"deepep",
"--cuda-graph-max-bs",
"32",
"--max-running-requests",
......@@ -2249,7 +2239,8 @@ class Test50(CustomTestCase):
"--trust-remote-code",
"--tp",
"8",
"--enable-ep-moe",
"--ep",
"8",
"--speculative-algo",
"EAGLE",
"--speculative-draft",
......@@ -2297,7 +2288,8 @@ class Test51(CustomTestCase):
"--enable-dp-attention",
"--dp",
"4",
"--enable-ep-moe",
"--ep",
"8",
"--speculative-algo",
"EAGLE",
"--speculative-draft",
......@@ -2345,7 +2337,8 @@ class Test52(CustomTestCase):
"--enable-dp-attention",
"--dp",
"8",
"--enable-ep-moe",
"--ep",
"8",
"--speculative-algo",
"EAGLE",
"--speculative-draft",
......@@ -2392,7 +2385,8 @@ class Test53(CustomTestCase):
"8",
"--moe-dense-tp-size",
"1",
"--enable-ep-moe",
"--ep",
"8",
"--speculative-algo",
"EAGLE",
"--speculative-draft",
......@@ -2442,7 +2436,8 @@ class Test54(CustomTestCase):
"4",
"--moe-dense-tp-size",
"1",
"--enable-ep-moe",
"--ep",
"8",
"--speculative-algo",
"EAGLE",
"--speculative-draft",
......@@ -2492,7 +2487,8 @@ class Test55(CustomTestCase):
"8",
"--moe-dense-tp-size",
"1",
"--enable-ep-moe",
"--ep",
"8",
"--speculative-algo",
"EAGLE",
"--speculative-draft",
......@@ -2541,7 +2537,8 @@ class Test56(CustomTestCase):
"--dp",
"4",
"--enable-dp-lm-head",
"--enable-ep-moe",
"--ep",
"8",
"--speculative-algo",
"EAGLE",
"--speculative-draft",
......@@ -2590,7 +2587,8 @@ class Test57(CustomTestCase):
"--dp",
"8",
"--enable-dp-lm-head",
"--enable-ep-moe",
"--ep",
"8",
"--speculative-algo",
"EAGLE",
"--speculative-draft",
......@@ -2641,7 +2639,8 @@ class Test58(CustomTestCase):
"--moe-dense-tp-size",
"1",
"--enable-dp-lm-head",
"--enable-ep-moe",
"--ep",
"8",
"--speculative-algo",
"EAGLE",
"--speculative-draft",
......@@ -2692,7 +2691,8 @@ class Test59(CustomTestCase):
"--moe-dense-tp-size",
"1",
"--enable-dp-lm-head",
"--enable-ep-moe",
"--ep",
"8",
"--speculative-algo",
"EAGLE",
"--speculative-draft",
......
......@@ -27,7 +27,8 @@ class TestPureTP(CustomTestCase):
"--trust-remote-code",
"--tp",
"2",
"--enable-deepep-moe",
"--moe-a2a-backend",
"deepep",
"--disable-cuda-graph",
],
)
......@@ -65,7 +66,8 @@ class TestDPAttn(unittest.TestCase):
"--dp",
"2",
"--enable-dp-attention",
"--enable-deepep-moe",
"--moe-a2a-backend",
"deepep",
"--deepep-mode",
"normal",
"--disable-cuda-graph",
......
......@@ -31,7 +31,8 @@ class TestMoEDeepEPEvalAccuracyLarge(CustomTestCase):
"--trust-remote-code",
"--tp",
"8",
"--enable-deepep-moe",
"--moe-a2a-backend",
"deepep",
"--cuda-graph-max-bs",
"128",
],
......
......@@ -27,7 +27,6 @@ class TestEpMoE(CustomTestCase):
"2",
"--ep-size",
"2",
"--enable-ep-moe",
],
)
......@@ -75,7 +74,6 @@ class TestEpMoEFP8(CustomTestCase):
"2",
"--ep-size",
"2",
"--enable-ep-moe",
"--quantization",
"fp8",
],
......
......@@ -33,7 +33,8 @@ class TestTwoBatchOverlap(unittest.TestCase):
"--dp",
"2",
"--enable-dp-attention",
"--enable-deepep-moe",
"--moe-a2a-backend",
"deepep",
"--deepep-mode",
"normal",
"--disable-cuda-graph", # DeepEP normal does not support CUDA Graph
......@@ -122,7 +123,8 @@ class TestQwen3TwoBatchOverlap(TestTwoBatchOverlap):
"--dp",
"2",
"--enable-dp-attention",
"--enable-deepep-moe",
"--moe-a2a-backend",
"deepep",
"--deepep-mode",
"normal",
"--disable-cuda-graph", # DeepEP normal does not support CUDA Graph
......