Commit 6c88f6c8 (unverified), authored by Cheng Wan, committed by GitHub
Browse files

[5/N] MoE Refactor: Update MoE parallelism arguments (#8658)

parent c8d3a402
...@@ -29,6 +29,7 @@ from torch import nn ...@@ -29,6 +29,7 @@ from torch import nn
from transformers import PretrainedConfig from transformers import PretrainedConfig
from sglang.srt.distributed import ( from sglang.srt.distributed import (
get_moe_expert_parallel_world_size,
get_tensor_model_parallel_rank, get_tensor_model_parallel_rank,
get_tensor_model_parallel_world_size, get_tensor_model_parallel_world_size,
tensor_model_parallel_all_gather, tensor_model_parallel_all_gather,
...@@ -117,7 +118,7 @@ class Grok1MoE(nn.Module): ...@@ -117,7 +118,7 @@ class Grok1MoE(nn.Module):
) )
kwargs = {} kwargs = {}
if global_server_args_dict["enable_ep_moe"]: if get_moe_expert_parallel_world_size() > 1:
MoEImpl = EPMoE MoEImpl = EPMoE
else: else:
MoEImpl = FusedMoE MoEImpl = FusedMoE
...@@ -616,8 +617,7 @@ class Grok1ForCausalLM(nn.Module): ...@@ -616,8 +617,7 @@ class Grok1ForCausalLM(nn.Module):
# Params for weights, fp8 weight scales, fp8 activation scales # Params for weights, fp8 weight scales, fp8 activation scales
# (param_name, weight_name, expert_id, shard_id) # (param_name, weight_name, expert_id, shard_id)
MoEImpl = EPMoE if global_server_args_dict["enable_ep_moe"] else FusedMoE expert_params_mapping = FusedMoE.make_expert_params_mapping(
expert_params_mapping = MoEImpl.make_expert_params_mapping(
ckpt_gate_proj_name="w1", ckpt_gate_proj_name="w1",
ckpt_down_proj_name="w2", ckpt_down_proj_name="w2",
ckpt_up_proj_name="w3", ckpt_up_proj_name="w3",
......
...@@ -24,6 +24,7 @@ from torch import nn ...@@ -24,6 +24,7 @@ from torch import nn
from transformers import MixtralConfig from transformers import MixtralConfig
from sglang.srt.distributed import ( from sglang.srt.distributed import (
get_moe_expert_parallel_world_size,
get_pp_group, get_pp_group,
get_tensor_model_parallel_world_size, get_tensor_model_parallel_world_size,
tensor_model_parallel_all_reduce, tensor_model_parallel_all_reduce,
...@@ -94,7 +95,7 @@ class MixtralMoE(nn.Module): ...@@ -94,7 +95,7 @@ class MixtralMoE(nn.Module):
renormalize=True, renormalize=True,
) )
MoEImpl = EPMoE if global_server_args_dict["enable_ep_moe"] else FusedMoE MoEImpl = EPMoE if get_moe_expert_parallel_world_size() > 1 else FusedMoE
self.experts = MoEImpl( self.experts = MoEImpl(
num_experts=num_experts, num_experts=num_experts,
top_k=top_k, top_k=top_k,
...@@ -398,8 +399,7 @@ class MixtralForCausalLM(nn.Module): ...@@ -398,8 +399,7 @@ class MixtralForCausalLM(nn.Module):
# Params for weights, fp8 weight scales, fp8 activation scales # Params for weights, fp8 weight scales, fp8 activation scales
# (param_name, weight_name, expert_id, shard_id) # (param_name, weight_name, expert_id, shard_id)
MoEImpl = EPMoE if global_server_args_dict["enable_ep_moe"] else FusedMoE expert_params_mapping = FusedMoE.make_expert_params_mapping(
expert_params_mapping = MoEImpl.make_expert_params_mapping(
ckpt_gate_proj_name="w1", ckpt_gate_proj_name="w1",
ckpt_down_proj_name="w2", ckpt_down_proj_name="w2",
ckpt_up_proj_name="w3", ckpt_up_proj_name="w3",
......
...@@ -148,7 +148,6 @@ class Qwen2MoeSparseMoeBlock(nn.Module): ...@@ -148,7 +148,6 @@ class Qwen2MoeSparseMoeBlock(nn.Module):
**( **(
dict( dict(
enable_flashinfer_cutlass_moe=True, enable_flashinfer_cutlass_moe=True,
enable_ep_moe=global_server_args_dict["enable_ep_moe"],
) )
if global_server_args_dict["enable_flashinfer_cutlass_moe"] if global_server_args_dict["enable_flashinfer_cutlass_moe"]
else {} else {}
...@@ -616,9 +615,7 @@ class Qwen2MoeForCausalLM(nn.Module): ...@@ -616,9 +615,7 @@ class Qwen2MoeForCausalLM(nn.Module):
("gate_up_proj", "up_proj", 1), ("gate_up_proj", "up_proj", 1),
] ]
MoEImpl = EPMoE if global_server_args_dict["enable_ep_moe"] else FusedMoE expert_params_mapping = FusedMoE.make_expert_params_mapping(
expert_params_mapping = MoEImpl.make_expert_params_mapping(
ckpt_gate_proj_name="gate_proj", ckpt_gate_proj_name="gate_proj",
ckpt_down_proj_name="down_proj", ckpt_down_proj_name="down_proj",
ckpt_up_proj_name="up_proj", ckpt_up_proj_name="up_proj",
......
...@@ -24,6 +24,7 @@ import torch ...@@ -24,6 +24,7 @@ import torch
from torch import nn from torch import nn
from sglang.srt.distributed import ( from sglang.srt.distributed import (
get_moe_expert_parallel_world_size,
get_pp_group, get_pp_group,
get_tensor_model_parallel_rank, get_tensor_model_parallel_rank,
get_tensor_model_parallel_world_size, get_tensor_model_parallel_world_size,
...@@ -51,7 +52,6 @@ from sglang.srt.layers.linear import ( ...@@ -51,7 +52,6 @@ from sglang.srt.layers.linear import (
) )
from sglang.srt.layers.logits_processor import LogitsProcessor, LogitsProcessorOutput from sglang.srt.layers.logits_processor import LogitsProcessor, LogitsProcessorOutput
from sglang.srt.layers.moe.ep_moe.layer import get_moe_impl_class from sglang.srt.layers.moe.ep_moe.layer import get_moe_impl_class
from sglang.srt.layers.moe.ep_moe.token_dispatcher import DeepEPDispatcher
from sglang.srt.layers.moe.topk import TopK from sglang.srt.layers.moe.topk import TopK
from sglang.srt.layers.quantization.base_config import QuantizationConfig from sglang.srt.layers.quantization.base_config import QuantizationConfig
from sglang.srt.layers.radix_attention import RadixAttention from sglang.srt.layers.radix_attention import RadixAttention
...@@ -72,7 +72,7 @@ from sglang.srt.model_loader.weight_utils import default_weight_loader ...@@ -72,7 +72,7 @@ from sglang.srt.model_loader.weight_utils import default_weight_loader
from sglang.srt.models.qwen2_moe import Qwen2MoeMLP as Qwen3MoeMLP from sglang.srt.models.qwen2_moe import Qwen2MoeMLP as Qwen3MoeMLP
from sglang.srt.models.qwen2_moe import Qwen2MoeModel from sglang.srt.models.qwen2_moe import Qwen2MoeModel
from sglang.srt.two_batch_overlap import MaybeTboDeepEPDispatcher from sglang.srt.two_batch_overlap import MaybeTboDeepEPDispatcher
from sglang.srt.utils import DeepEPMode, add_prefix, is_cuda, is_non_idle_and_non_empty from sglang.srt.utils import add_prefix, is_cuda, is_non_idle_and_non_empty
Qwen3MoeConfig = None Qwen3MoeConfig = None
...@@ -113,15 +113,14 @@ class Qwen3MoeSparseMoeBlock(nn.Module): ...@@ -113,15 +113,14 @@ class Qwen3MoeSparseMoeBlock(nn.Module):
quant_config=quant_config, quant_config=quant_config,
prefix=add_prefix("experts", prefix), prefix=add_prefix("experts", prefix),
**( **(
dict(deepep_mode=DeepEPMode[global_server_args_dict["deepep_mode"]]) dict(deepep_mode=global_server_args_dict["deepep_mode"])
if global_server_args_dict["enable_deepep_moe"] if global_server_args_dict["moe_a2a_backend"].is_deepep()
else {} else {}
), ),
# Additional args for FusedMoE # Additional args for FusedMoE
**( **(
dict( dict(
enable_flashinfer_cutlass_moe=True, enable_flashinfer_cutlass_moe=True,
enable_ep_moe=global_server_args_dict["enable_ep_moe"],
) )
if global_server_args_dict["enable_flashinfer_cutlass_moe"] if global_server_args_dict["enable_flashinfer_cutlass_moe"]
else {} else {}
...@@ -136,9 +135,9 @@ class Qwen3MoeSparseMoeBlock(nn.Module): ...@@ -136,9 +135,9 @@ class Qwen3MoeSparseMoeBlock(nn.Module):
prefix=add_prefix("gate", prefix), prefix=add_prefix("gate", prefix),
) )
if global_server_args_dict["enable_deepep_moe"]: if global_server_args_dict["moe_a2a_backend"].is_deepep():
# TODO: we will support tp < ep in the future # TODO: we will support tp < ep in the future
self.ep_size = get_tensor_model_parallel_world_size() self.ep_size = get_moe_expert_parallel_world_size()
self.num_experts = ( self.num_experts = (
config.num_experts + global_server_args_dict["ep_num_redundant_experts"] config.num_experts + global_server_args_dict["ep_num_redundant_experts"]
) )
...@@ -148,7 +147,7 @@ class Qwen3MoeSparseMoeBlock(nn.Module): ...@@ -148,7 +147,7 @@ class Qwen3MoeSparseMoeBlock(nn.Module):
self, hidden_states: torch.Tensor, forward_batch: Optional[ForwardBatch] = None self, hidden_states: torch.Tensor, forward_batch: Optional[ForwardBatch] = None
) -> torch.Tensor: ) -> torch.Tensor:
if not global_server_args_dict["enable_deepep_moe"]: if not global_server_args_dict["moe_a2a_backend"].is_deepep():
return self.forward_normal(hidden_states) return self.forward_normal(hidden_states)
else: else:
return self.forward_deepep(hidden_states, forward_batch) return self.forward_deepep(hidden_states, forward_batch)
......
...@@ -146,7 +146,7 @@ class Step3TextMoEMLP(nn.Module): ...@@ -146,7 +146,7 @@ class Step3TextMoEMLP(nn.Module):
prefix=add_prefix("gate", prefix), prefix=add_prefix("gate", prefix),
) )
if global_server_args_dict["enable_deepep_moe"]: if global_server_args_dict["moe_a2a_backend"].is_deepep():
raise NotImplementedError("DeepEP MoE is not supported yet in Step3 model.") raise NotImplementedError("DeepEP MoE is not supported yet in Step3 model.")
def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
......
...@@ -4,7 +4,7 @@ from typing import List, Optional ...@@ -4,7 +4,7 @@ from typing import List, Optional
import torch import torch
from sglang.srt import operations from sglang.srt import operations
from sglang.srt.layers.moe.ep_moe.token_dispatcher import DeepEPConfig from sglang.srt.layers.moe.token_dispatcher import DeepEPConfig
from sglang.srt.model_executor.forward_batch_info import ForwardMode from sglang.srt.model_executor.forward_batch_info import ForwardMode
from sglang.srt.operations import Operation from sglang.srt.operations import Operation
......
...@@ -172,12 +172,11 @@ class ServerArgs: ...@@ -172,12 +172,11 @@ class ServerArgs:
# Expert parallelism # Expert parallelism
ep_size: int = 1 ep_size: int = 1
enable_ep_moe: bool = False moe_a2a_backend: Optional[Literal["deepep"]] = None
enable_deepep_moe: bool = False
enable_flashinfer_cutlass_moe: bool = False enable_flashinfer_cutlass_moe: bool = False
enable_flashinfer_trtllm_moe: bool = False enable_flashinfer_trtllm_moe: bool = False
enable_flashinfer_allreduce_fusion: bool = False enable_flashinfer_allreduce_fusion: bool = False
deepep_mode: Optional[Literal["auto", "normal", "low_latency"]] = "auto" deepep_mode: Literal["auto", "normal", "low_latency"] = "auto"
ep_num_redundant_experts: int = 0 ep_num_redundant_experts: int = 0
ep_dispatch_algorithm: Optional[Literal["static", "dynamic", "fake"]] = None ep_dispatch_algorithm: Optional[Literal["static", "dynamic", "fake"]] = None
init_expert_location: str = "trivial" init_expert_location: str = "trivial"
...@@ -272,7 +271,27 @@ class ServerArgs: ...@@ -272,7 +271,27 @@ class ServerArgs:
enable_pdmux: bool = False enable_pdmux: bool = False
sm_group_num: int = 3 sm_group_num: int = 3
# Deprecated arguments
enable_ep_moe: bool = False
enable_deepep_moe: bool = False
def __post_init__(self): def __post_init__(self):
# Check deprecated arguments
def print_deprecated_warning(message: str):
logger.warning(f"\033[33m{message}\033[0m")
if self.enable_ep_moe:
self.ep_size = self.tp_size
print_deprecated_warning(
"NOTE: --enable-ep-moe is deprecated. Please set `--ep-size` to the same value as `--tp-size` instead."
)
if self.enable_deepep_moe:
self.moe_a2a_backend = "deepep"
print_deprecated_warning(
"NOTE: --enable-deepep-moe is deprecated. Please set `--moe-a2a-backend` to 'deepep' instead."
)
# Set missing default values # Set missing default values
if self.tokenizer_path is None: if self.tokenizer_path is None:
self.tokenizer_path = self.model_path self.tokenizer_path = self.model_path
...@@ -455,14 +474,13 @@ class ServerArgs: ...@@ -455,14 +474,13 @@ class ServerArgs:
self.quantization == "modelopt_fp4" self.quantization == "modelopt_fp4"
), "modelopt_fp4 quantization is required for Flashinfer MOE" ), "modelopt_fp4 quantization is required for Flashinfer MOE"
os.environ["TRTLLM_ENABLE_PDL"] = "1" os.environ["TRTLLM_ENABLE_PDL"] = "1"
if self.enable_ep_moe: assert self.ep_size in [
self.ep_size = self.tp_size 1,
logger.warning( self.tp_size,
f"Flashinfer cutlass MoE and EP MoE are enabled. The expert parallel size is adjusted to be the same as the tensor parallel size[{self.tp_size}]." ], "The expert parallel size must be 1 or the same as the tensor parallel size"
)
# DeepEP MoE # DeepEP MoE
if self.enable_deepep_moe: if self.moe_a2a_backend == "deepep":
if self.deepep_mode == "normal": if self.deepep_mode == "normal":
logger.warning("Cuda graph is disabled because deepep_mode=`normal`") logger.warning("Cuda graph is disabled because deepep_mode=`normal`")
self.disable_cuda_graph = True self.disable_cuda_graph = True
...@@ -486,7 +504,7 @@ class ServerArgs: ...@@ -486,7 +504,7 @@ class ServerArgs:
) )
if self.enable_eplb: if self.enable_eplb:
assert self.enable_ep_moe or self.enable_deepep_moe assert self.ep_size > 1 or self.moe_a2a_backend is not None
if self.enable_expert_distribution_metrics and ( if self.enable_expert_distribution_metrics and (
self.expert_distribution_recorder_mode is None self.expert_distribution_recorder_mode is None
...@@ -1354,30 +1372,27 @@ class ServerArgs: ...@@ -1354,30 +1372,27 @@ class ServerArgs:
help="The expert parallelism size.", help="The expert parallelism size.",
) )
parser.add_argument( parser.add_argument(
"--enable-ep-moe", "--moe-a2a-backend",
action="store_true", type=str,
help="Enabling expert parallelism for moe. The ep size is equal to the tp size.", choices=["deepep"],
default=ServerArgs.moe_a2a_backend,
help="Choose the backend for MoE A2A.",
) )
parser.add_argument( parser.add_argument(
"--enable-flashinfer-cutlass-moe", "--enable-flashinfer-cutlass-moe",
action="store_true", action="store_true",
help="Enable FlashInfer CUTLASS MoE backend for modelopt_fp4 quant on Blackwell. Supports MoE-EP with --enable-ep-moe", help="Enable FlashInfer CUTLASS MoE backend for modelopt_fp4 quant on Blackwell. Supports MoE-EP",
) )
parser.add_argument( parser.add_argument(
"--enable-flashinfer-trtllm-moe", "--enable-flashinfer-trtllm-moe",
action="store_true", action="store_true",
help="Enable FlashInfer TRTLLM MoE backend on Blackwell. Supports BlockScale FP8 MoE-EP with --enable-ep-moe", help="Enable FlashInfer TRTLLM MoE backend on Blackwell. Supports BlockScale FP8 MoE-EP",
) )
parser.add_argument( parser.add_argument(
"--enable-flashinfer-allreduce-fusion", "--enable-flashinfer-allreduce-fusion",
action="store_true", action="store_true",
help="Enable FlashInfer allreduce fusion for Add_RMSNorm.", help="Enable FlashInfer allreduce fusion for Add_RMSNorm.",
) )
parser.add_argument(
"--enable-deepep-moe",
action="store_true",
help="Enabling DeepEP MoE implementation for EP MoE.",
)
parser.add_argument( parser.add_argument(
"--deepep-mode", "--deepep-mode",
type=str, type=str,
...@@ -1839,6 +1854,18 @@ class ServerArgs: ...@@ -1839,6 +1854,18 @@ class ServerArgs:
help="Disable mmap while loading weight using safetensors.", help="Disable mmap while loading weight using safetensors.",
) )
# Deprecated arguments
parser.add_argument(
"--enable-ep-moe",
action="store_true",
help="(Deprecated) Enabling expert parallelism for moe. The ep size is equal to the tp size.",
)
parser.add_argument(
"--enable-deepep-moe",
action="store_true",
help="(Deprecated) Enabling DeepEP MoE implementation for EP MoE.",
)
@classmethod @classmethod
def from_cli_args(cls, args: argparse.Namespace): def from_cli_args(cls, args: argparse.Namespace):
args.tp_size = args.tensor_parallel_size args.tp_size = args.tensor_parallel_size
......
...@@ -13,17 +13,18 @@ from sglang.srt.layers.communicator import ( ...@@ -13,17 +13,18 @@ from sglang.srt.layers.communicator import (
CommunicateSummableTensorPairFn, CommunicateSummableTensorPairFn,
ScatterMode, ScatterMode,
) )
from sglang.srt.layers.moe.ep_moe.token_dispatcher import DeepEPDispatcher from sglang.srt.layers.moe.token_dispatcher import DeepEPDispatcher
from sglang.srt.layers.moe.utils import DeepEPMode
from sglang.srt.layers.quantization import deep_gemm_wrapper from sglang.srt.layers.quantization import deep_gemm_wrapper
from sglang.srt.managers.schedule_batch import ScheduleBatch, global_server_args_dict from sglang.srt.managers.schedule_batch import ScheduleBatch, global_server_args_dict
from sglang.srt.model_executor.forward_batch_info import ForwardBatch, ForwardMode from sglang.srt.model_executor.forward_batch_info import ForwardBatch, ForwardMode
from sglang.srt.operations import execute_operations, execute_overlapped_operations from sglang.srt.operations import execute_operations, execute_overlapped_operations
from sglang.srt.operations_strategy import OperationsStrategy from sglang.srt.operations_strategy import OperationsStrategy
from sglang.srt.speculative.eagle_utils import EagleDraftInput, EagleVerifyInput from sglang.srt.speculative.eagle_utils import EagleDraftInput, EagleVerifyInput
from sglang.srt.utils import BumpAllocator, DeepEPMode, get_bool_env_var from sglang.srt.utils import BumpAllocator, get_bool_env_var
if TYPE_CHECKING: if TYPE_CHECKING:
from sglang.srt.layers.moe.ep_moe.token_dispatcher import DispatchOutput from sglang.srt.layers.moe.token_dispatcher import DispatchOutput
_tbo_debug = get_bool_env_var("SGLANG_TBO_DEBUG") _tbo_debug = get_bool_env_var("SGLANG_TBO_DEBUG")
...@@ -310,7 +311,7 @@ class TboDPAttentionPreparer: ...@@ -310,7 +311,7 @@ class TboDPAttentionPreparer:
and not local_batch.forward_mode.is_target_verify() and not local_batch.forward_mode.is_target_verify()
) )
and enable_deepep_moe and enable_deepep_moe
and (resolved_deepep_mode == DeepEPMode.low_latency) and (resolved_deepep_mode == DeepEPMode.LOW_LATENCY)
) )
else: else:
self.local_tbo_split_seq_index = 0 self.local_tbo_split_seq_index = 0
......
...@@ -2205,27 +2205,6 @@ def flatten_nested_list(nested_list): ...@@ -2205,27 +2205,6 @@ def flatten_nested_list(nested_list):
return [nested_list] return [nested_list]
class DeepEPMode(Enum):
    """Enumeration of the three DeepEP mode values: normal, low_latency, auto.

    Each member's value is the same string as its name, so a member can be
    looked up either by name (``DeepEPMode["auto"]``) or by value
    (``DeepEPMode("auto")``).
    """

    normal = "normal"
    low_latency = "low_latency"
    auto = "auto"

    def enable_normal(self) -> bool:
        """Return True when this mode is ``normal`` or ``auto``."""
        return self in (DeepEPMode.normal, DeepEPMode.auto)

    def enable_low_latency(self) -> bool:
        """Return True when this mode is ``low_latency`` or ``auto``."""
        return self in (DeepEPMode.low_latency, DeepEPMode.auto)

    def resolve(self, is_extend_in_batch: bool) -> "DeepEPMode":
        """Collapse ``auto`` into a concrete mode for one batch.

        Args:
            is_extend_in_batch: flag supplied by the caller; when truthy,
                ``auto`` resolves to ``normal``, otherwise to ``low_latency``.

        Returns:
            ``self`` unchanged for ``normal`` and ``low_latency``; for
            ``auto``, the concrete member selected by ``is_extend_in_batch``.
        """
        # Explicit modes are never overridden by the batch contents.
        if self != DeepEPMode.auto:
            return self

        if is_extend_in_batch:
            return DeepEPMode.normal
        return DeepEPMode.low_latency
def is_non_idle_and_non_empty(forward_mode, hidden_states): def is_non_idle_and_non_empty(forward_mode, hidden_states):
return ( return (
(forward_mode is not None) (forward_mode is not None)
...@@ -2414,7 +2393,7 @@ def require_mlp_tp_gather(server_args): ...@@ -2414,7 +2393,7 @@ def require_mlp_tp_gather(server_args):
return True return True
elif not server_args.enable_dp_lm_head: elif not server_args.enable_dp_lm_head:
return True return True
elif not server_args.enable_deepep_moe: elif server_args.moe_a2a_backend is None:
return True return True
else: else:
return ( return (
...@@ -2430,7 +2409,7 @@ def require_attn_tp_gather(server_args): ...@@ -2430,7 +2409,7 @@ def require_attn_tp_gather(server_args):
Check if the input of attention is scattered. Check if the input of attention is scattered.
""" """
assert server_args.moe_dense_tp_size in [1, None] assert server_args.moe_dense_tp_size in [1, None]
if server_args.enable_deepep_moe or server_args.moe_dense_tp_size == 1: if server_args.moe_a2a_backend is not None or server_args.moe_dense_tp_size == 1:
if server_args.enable_dp_attention: if server_args.enable_dp_attention:
return server_args.dp_size < server_args.tp_size return server_args.dp_size < server_args.tp_size
else: else:
......
...@@ -499,7 +499,6 @@ class SRTRunner: ...@@ -499,7 +499,6 @@ class SRTRunner:
chunked_prefill_size: Optional[int] = None, chunked_prefill_size: Optional[int] = None,
dp_size: int = 1, dp_size: int = 1,
tokenizer_path: Optional[str] = None, tokenizer_path: Optional[str] = None,
enable_ep_moe: bool = False,
mem_fraction_static: float = 0.65, mem_fraction_static: float = 0.65,
trust_remote_code: bool = False, trust_remote_code: bool = False,
speculative_draft_model_path: Optional[str] = None, speculative_draft_model_path: Optional[str] = None,
...@@ -550,7 +549,6 @@ class SRTRunner: ...@@ -550,7 +549,6 @@ class SRTRunner:
enable_dp_attention=enable_dp_attention, enable_dp_attention=enable_dp_attention,
dp_size=dp_size, dp_size=dp_size,
tokenizer_path=tokenizer_path, tokenizer_path=tokenizer_path,
enable_ep_moe=enable_ep_moe,
disable_overlap_schedule=disable_overlap_schedule, disable_overlap_schedule=disable_overlap_schedule,
cuda_graph_max_bs=cuda_graph_max_bs, cuda_graph_max_bs=cuda_graph_max_bs,
disable_custom_all_reduce=disable_custom_all_reduce, disable_custom_all_reduce=disable_custom_all_reduce,
......
...@@ -33,7 +33,8 @@ class TestDeepseek(CustomTestCase): ...@@ -33,7 +33,8 @@ class TestDeepseek(CustomTestCase):
"--moe-dense-tp-size", "--moe-dense-tp-size",
"1", "1",
"--enable-dp-lm-head", "--enable-dp-lm-head",
"--enable-deepep-moe", "--moe-a2a-backend",
"deepep",
"--enable-two-batch-overlap", "--enable-two-batch-overlap",
"--ep-num-redundant-experts", "--ep-num-redundant-experts",
"32", "32",
...@@ -88,7 +89,8 @@ class TestDeepseekMTP(CustomTestCase): ...@@ -88,7 +89,8 @@ class TestDeepseekMTP(CustomTestCase):
"--moe-dense-tp-size", "--moe-dense-tp-size",
"1", "1",
"--enable-dp-lm-head", "--enable-dp-lm-head",
"--enable-deepep-moe", "--moe-a2a-backend",
"deepep",
"--enable-two-batch-overlap", "--enable-two-batch-overlap",
"--ep-num-redundant-experts", "--ep-num-redundant-experts",
"32", "32",
......
...@@ -31,7 +31,8 @@ class TestPureDP(CustomTestCase): ...@@ -31,7 +31,8 @@ class TestPureDP(CustomTestCase):
"--enable-dp-attention", "--enable-dp-attention",
"--dp", "--dp",
"4", "4",
"--enable-deepep-moe", "--moe-a2a-backend",
"deepep",
"--cuda-graph-max-bs", "--cuda-graph-max-bs",
"128", "128",
"--max-running-requests", "--max-running-requests",
...@@ -77,7 +78,8 @@ class TestHybridDPTP(CustomTestCase): ...@@ -77,7 +78,8 @@ class TestHybridDPTP(CustomTestCase):
"--enable-dp-attention", "--enable-dp-attention",
"--dp", "--dp",
"2", "2",
"--enable-deepep-moe", "--moe-a2a-backend",
"deepep",
"--cuda-graph-max-bs", "--cuda-graph-max-bs",
"128", "128",
"--max-running-requests", "--max-running-requests",
...@@ -118,7 +120,8 @@ class TestTP(CustomTestCase): ...@@ -118,7 +120,8 @@ class TestTP(CustomTestCase):
"--trust-remote-code", "--trust-remote-code",
"--tp", "--tp",
"4", "4",
"--enable-deepep-moe", "--moe-a2a-backend",
"deepep",
"--cuda-graph-max-bs", "--cuda-graph-max-bs",
"128", "128",
"--max-running-requests", "--max-running-requests",
...@@ -166,7 +169,8 @@ class TestNoGatherdBuffer(CustomTestCase): ...@@ -166,7 +169,8 @@ class TestNoGatherdBuffer(CustomTestCase):
"--moe-dense-tp-size", "--moe-dense-tp-size",
"1", "1",
"--enable-dp-lm-head", "--enable-dp-lm-head",
"--enable-deepep-moe", "--moe-a2a-backend",
"deepep",
"--cuda-graph-max-bs", "--cuda-graph-max-bs",
"32", "32",
"--max-running-requests", "--max-running-requests",
...@@ -212,7 +216,8 @@ class TestTBO(CustomTestCase): ...@@ -212,7 +216,8 @@ class TestTBO(CustomTestCase):
"4", "4",
"--moe-dense-tp-size", "--moe-dense-tp-size",
"1", "1",
"--enable-deepep-moe", "--moe-a2a-backend",
"deepep",
"--enable-two-batch-overlap", "--enable-two-batch-overlap",
"--cuda-graph-max-bs", "--cuda-graph-max-bs",
"128", "128",
...@@ -259,7 +264,8 @@ class TestMTP(CustomTestCase): ...@@ -259,7 +264,8 @@ class TestMTP(CustomTestCase):
"--dp", "--dp",
"2", "2",
"--enable-dp-lm-head", "--enable-dp-lm-head",
"--enable-deepep-moe", "--moe-a2a-backend",
"deepep",
"--speculative-algo", "--speculative-algo",
"EAGLE", "EAGLE",
"--speculative-draft", "--speculative-draft",
...@@ -326,7 +332,8 @@ class TestMTPWithTBO(CustomTestCase): ...@@ -326,7 +332,8 @@ class TestMTPWithTBO(CustomTestCase):
"--dp-size", "--dp-size",
"4", "4",
"--enable-two-batch-overlap", "--enable-two-batch-overlap",
"--enable-deepep-moe", "--moe-a2a-backend",
"deepep",
"--trust-remote-code", "--trust-remote-code",
"--speculative-algorithm", "--speculative-algorithm",
"EAGLE", "EAGLE",
......
...@@ -34,7 +34,8 @@ class _BaseTestDynamicEPLB(CustomTestCase): ...@@ -34,7 +34,8 @@ class _BaseTestDynamicEPLB(CustomTestCase):
"--dp", "--dp",
"2", "2",
"--enable-dp-attention", "--enable-dp-attention",
"--enable-deepep-moe", "--moe-a2a-backend",
"deepep",
"--deepep-mode", "--deepep-mode",
"normal", "normal",
"--disable-cuda-graph", "--disable-cuda-graph",
...@@ -96,8 +97,7 @@ class TestStaticEPLB(CustomTestCase): ...@@ -96,8 +97,7 @@ class TestStaticEPLB(CustomTestCase):
trust_remote_code=True, trust_remote_code=True,
ep_num_redundant_experts=4, ep_num_redundant_experts=4,
enable_dp_attention=True, enable_dp_attention=True,
enable_deepep_moe=True, moe_a2a_backend="deepep",
deepep_mode="normal",
disable_cuda_graph=True, disable_cuda_graph=True,
expert_distribution_recorder_mode="stat", expert_distribution_recorder_mode="stat",
tp_size=2, tp_size=2,
......
...@@ -407,9 +407,8 @@ class Test10(CustomTestCase): ...@@ -407,9 +407,8 @@ class Test10(CustomTestCase):
"--trust-remote-code", "--trust-remote-code",
"--tp", "--tp",
"8", "8",
"--enable-deepep-moe", "--moe-a2a-backend",
"--deepep-mode", "deepep",
"auto",
"--cuda-graph-max-bs", "--cuda-graph-max-bs",
"128", "128",
], ],
...@@ -449,9 +448,8 @@ class Test11(CustomTestCase): ...@@ -449,9 +448,8 @@ class Test11(CustomTestCase):
"--enable-dp-attention", "--enable-dp-attention",
"--dp", "--dp",
"4", "4",
"--enable-deepep-moe", "--moe-a2a-backend",
"--deepep-mode", "deepep",
"auto",
"--cuda-graph-max-bs", "--cuda-graph-max-bs",
"128", "128",
], ],
...@@ -491,9 +489,8 @@ class Test12(CustomTestCase): ...@@ -491,9 +489,8 @@ class Test12(CustomTestCase):
"--enable-dp-attention", "--enable-dp-attention",
"--dp", "--dp",
"8", "8",
"--enable-deepep-moe", "--moe-a2a-backend",
"--deepep-mode", "deepep",
"auto",
"--cuda-graph-max-bs", "--cuda-graph-max-bs",
"128", "128",
], ],
...@@ -532,9 +529,8 @@ class Test13(CustomTestCase): ...@@ -532,9 +529,8 @@ class Test13(CustomTestCase):
"8", "8",
"--moe-dense-tp-size", "--moe-dense-tp-size",
"1", "1",
"--enable-deepep-moe", "--moe-a2a-backend",
"--deepep-mode", "deepep",
"auto",
"--cuda-graph-max-bs", "--cuda-graph-max-bs",
"128", "128",
], ],
...@@ -576,9 +572,8 @@ class Test14(CustomTestCase): ...@@ -576,9 +572,8 @@ class Test14(CustomTestCase):
"4", "4",
"--moe-dense-tp-size", "--moe-dense-tp-size",
"1", "1",
"--enable-deepep-moe", "--moe-a2a-backend",
"--deepep-mode", "deepep",
"auto",
"--cuda-graph-max-bs", "--cuda-graph-max-bs",
"128", "128",
], ],
...@@ -620,9 +615,8 @@ class Test15(CustomTestCase): ...@@ -620,9 +615,8 @@ class Test15(CustomTestCase):
"8", "8",
"--moe-dense-tp-size", "--moe-dense-tp-size",
"1", "1",
"--enable-deepep-moe", "--moe-a2a-backend",
"--deepep-mode", "deepep",
"auto",
"--cuda-graph-max-bs", "--cuda-graph-max-bs",
"128", "128",
], ],
...@@ -663,9 +657,8 @@ class Test16(CustomTestCase): ...@@ -663,9 +657,8 @@ class Test16(CustomTestCase):
"--dp", "--dp",
"4", "4",
"--enable-dp-lm-head", "--enable-dp-lm-head",
"--enable-deepep-moe", "--moe-a2a-backend",
"--deepep-mode", "deepep",
"auto",
"--cuda-graph-max-bs", "--cuda-graph-max-bs",
"128", "128",
], ],
...@@ -706,9 +699,8 @@ class Test17(CustomTestCase): ...@@ -706,9 +699,8 @@ class Test17(CustomTestCase):
"--dp", "--dp",
"8", "8",
"--enable-dp-lm-head", "--enable-dp-lm-head",
"--enable-deepep-moe", "--moe-a2a-backend",
"--deepep-mode", "deepep",
"auto",
"--cuda-graph-max-bs", "--cuda-graph-max-bs",
"128", "128",
], ],
...@@ -751,9 +743,8 @@ class Test18(CustomTestCase): ...@@ -751,9 +743,8 @@ class Test18(CustomTestCase):
"--moe-dense-tp-size", "--moe-dense-tp-size",
"1", "1",
"--enable-dp-lm-head", "--enable-dp-lm-head",
"--enable-deepep-moe", "--moe-a2a-backend",
"--deepep-mode", "deepep",
"auto",
"--cuda-graph-max-bs", "--cuda-graph-max-bs",
"128", "128",
], ],
...@@ -796,9 +787,8 @@ class Test19(CustomTestCase): ...@@ -796,9 +787,8 @@ class Test19(CustomTestCase):
"--moe-dense-tp-size", "--moe-dense-tp-size",
"1", "1",
"--enable-dp-lm-head", "--enable-dp-lm-head",
"--enable-deepep-moe", "--moe-a2a-backend",
"--deepep-mode", "deepep",
"auto",
"--cuda-graph-max-bs", "--cuda-graph-max-bs",
"128", "128",
], ],
...@@ -835,7 +825,8 @@ class Test20(CustomTestCase): ...@@ -835,7 +825,8 @@ class Test20(CustomTestCase):
"--trust-remote-code", "--trust-remote-code",
"--tp", "--tp",
"8", "8",
"--enable-ep-moe", "--ep",
"8",
], ],
) )
...@@ -873,7 +864,8 @@ class Test21(CustomTestCase): ...@@ -873,7 +864,8 @@ class Test21(CustomTestCase):
"--enable-dp-attention", "--enable-dp-attention",
"--dp", "--dp",
"4", "4",
"--enable-ep-moe", "--ep",
"8",
], ],
) )
...@@ -911,7 +903,8 @@ class Test22(CustomTestCase): ...@@ -911,7 +903,8 @@ class Test22(CustomTestCase):
"--enable-dp-attention", "--enable-dp-attention",
"--dp", "--dp",
"8", "8",
"--enable-ep-moe", "--ep",
"8",
], ],
) )
...@@ -948,7 +941,8 @@ class Test23(CustomTestCase): ...@@ -948,7 +941,8 @@ class Test23(CustomTestCase):
"8", "8",
"--moe-dense-tp-size", "--moe-dense-tp-size",
"1", "1",
"--enable-ep-moe", "--ep",
"8",
], ],
) )
...@@ -988,7 +982,8 @@ class Test24(CustomTestCase): ...@@ -988,7 +982,8 @@ class Test24(CustomTestCase):
"4", "4",
"--moe-dense-tp-size", "--moe-dense-tp-size",
"1", "1",
"--enable-ep-moe", "--ep",
"8",
], ],
) )
...@@ -1028,7 +1023,8 @@ class Test25(CustomTestCase): ...@@ -1028,7 +1023,8 @@ class Test25(CustomTestCase):
"8", "8",
"--moe-dense-tp-size", "--moe-dense-tp-size",
"1", "1",
"--enable-ep-moe", "--ep",
"8",
], ],
) )
...@@ -1067,7 +1063,8 @@ class Test26(CustomTestCase): ...@@ -1067,7 +1063,8 @@ class Test26(CustomTestCase):
"--dp", "--dp",
"4", "4",
"--enable-dp-lm-head", "--enable-dp-lm-head",
"--enable-ep-moe", "--ep",
"8",
], ],
) )
...@@ -1106,7 +1103,8 @@ class Test27(CustomTestCase): ...@@ -1106,7 +1103,8 @@ class Test27(CustomTestCase):
"--dp", "--dp",
"8", "8",
"--enable-dp-lm-head", "--enable-dp-lm-head",
"--enable-ep-moe", "--ep",
"8",
], ],
) )
...@@ -1147,7 +1145,8 @@ class Test28(CustomTestCase): ...@@ -1147,7 +1145,8 @@ class Test28(CustomTestCase):
"--moe-dense-tp-size", "--moe-dense-tp-size",
"1", "1",
"--enable-dp-lm-head", "--enable-dp-lm-head",
"--enable-ep-moe", "--ep",
"8",
], ],
) )
...@@ -1188,7 +1187,8 @@ class Test29(CustomTestCase): ...@@ -1188,7 +1187,8 @@ class Test29(CustomTestCase):
"--moe-dense-tp-size", "--moe-dense-tp-size",
"1", "1",
"--enable-dp-lm-head", "--enable-dp-lm-head",
"--enable-ep-moe", "--ep",
"8",
], ],
) )
...@@ -1701,9 +1701,8 @@ class Test40(CustomTestCase): ...@@ -1701,9 +1701,8 @@ class Test40(CustomTestCase):
"--trust-remote-code", "--trust-remote-code",
"--tp", "--tp",
"8", "8",
"--enable-deepep-moe", "--moe-a2a-backend",
"--deepep-mode", "deepep",
"auto",
"--cuda-graph-max-bs", "--cuda-graph-max-bs",
"32", "32",
"--max-running-requests", "--max-running-requests",
...@@ -1755,9 +1754,8 @@ class Test41(CustomTestCase): ...@@ -1755,9 +1754,8 @@ class Test41(CustomTestCase):
"--enable-dp-attention", "--enable-dp-attention",
"--dp", "--dp",
"4", "4",
"--enable-deepep-moe", "--moe-a2a-backend",
"--deepep-mode", "deepep",
"auto",
"--cuda-graph-max-bs", "--cuda-graph-max-bs",
"32", "32",
"--max-running-requests", "--max-running-requests",
...@@ -1809,9 +1807,8 @@ class Test42(CustomTestCase): ...@@ -1809,9 +1807,8 @@ class Test42(CustomTestCase):
"--enable-dp-attention", "--enable-dp-attention",
"--dp", "--dp",
"8", "8",
"--enable-deepep-moe", "--moe-a2a-backend",
"--deepep-mode", "deepep",
"auto",
"--cuda-graph-max-bs", "--cuda-graph-max-bs",
"32", "32",
"--max-running-requests", "--max-running-requests",
...@@ -1862,9 +1859,8 @@ class Test43(CustomTestCase): ...@@ -1862,9 +1859,8 @@ class Test43(CustomTestCase):
"8", "8",
"--moe-dense-tp-size", "--moe-dense-tp-size",
"1", "1",
"--enable-deepep-moe", "--moe-a2a-backend",
"--deepep-mode", "deepep",
"auto",
"--cuda-graph-max-bs", "--cuda-graph-max-bs",
"32", "32",
"--max-running-requests", "--max-running-requests",
...@@ -1918,9 +1914,8 @@ class Test44(CustomTestCase): ...@@ -1918,9 +1914,8 @@ class Test44(CustomTestCase):
"4", "4",
"--moe-dense-tp-size", "--moe-dense-tp-size",
"1", "1",
"--enable-deepep-moe", "--moe-a2a-backend",
"--deepep-mode", "deepep",
"auto",
"--cuda-graph-max-bs", "--cuda-graph-max-bs",
"32", "32",
"--max-running-requests", "--max-running-requests",
...@@ -1974,9 +1969,8 @@ class Test45(CustomTestCase): ...@@ -1974,9 +1969,8 @@ class Test45(CustomTestCase):
"8", "8",
"--moe-dense-tp-size", "--moe-dense-tp-size",
"1", "1",
"--enable-deepep-moe", "--moe-a2a-backend",
"--deepep-mode", "deepep",
"auto",
"--cuda-graph-max-bs", "--cuda-graph-max-bs",
"32", "32",
"--max-running-requests", "--max-running-requests",
...@@ -2029,9 +2023,8 @@ class Test46(CustomTestCase): ...@@ -2029,9 +2023,8 @@ class Test46(CustomTestCase):
"--dp", "--dp",
"4", "4",
"--enable-dp-lm-head", "--enable-dp-lm-head",
"--enable-deepep-moe", "--moe-a2a-backend",
"--deepep-mode", "deepep",
"auto",
"--cuda-graph-max-bs", "--cuda-graph-max-bs",
"32", "32",
"--max-running-requests", "--max-running-requests",
...@@ -2084,9 +2077,8 @@ class Test47(CustomTestCase): ...@@ -2084,9 +2077,8 @@ class Test47(CustomTestCase):
"--dp", "--dp",
"8", "8",
"--enable-dp-lm-head", "--enable-dp-lm-head",
"--enable-deepep-moe", "--moe-a2a-backend",
"--deepep-mode", "deepep",
"auto",
"--cuda-graph-max-bs", "--cuda-graph-max-bs",
"32", "32",
"--max-running-requests", "--max-running-requests",
...@@ -2141,9 +2133,8 @@ class Test48(CustomTestCase): ...@@ -2141,9 +2133,8 @@ class Test48(CustomTestCase):
"--moe-dense-tp-size", "--moe-dense-tp-size",
"1", "1",
"--enable-dp-lm-head", "--enable-dp-lm-head",
"--enable-deepep-moe", "--moe-a2a-backend",
"--deepep-mode", "deepep",
"auto",
"--cuda-graph-max-bs", "--cuda-graph-max-bs",
"32", "32",
"--max-running-requests", "--max-running-requests",
...@@ -2198,9 +2189,8 @@ class Test49(CustomTestCase): ...@@ -2198,9 +2189,8 @@ class Test49(CustomTestCase):
"--moe-dense-tp-size", "--moe-dense-tp-size",
"1", "1",
"--enable-dp-lm-head", "--enable-dp-lm-head",
"--enable-deepep-moe", "--moe-a2a-backend",
"--deepep-mode", "deepep",
"auto",
"--cuda-graph-max-bs", "--cuda-graph-max-bs",
"32", "32",
"--max-running-requests", "--max-running-requests",
...@@ -2249,7 +2239,8 @@ class Test50(CustomTestCase): ...@@ -2249,7 +2239,8 @@ class Test50(CustomTestCase):
"--trust-remote-code", "--trust-remote-code",
"--tp", "--tp",
"8", "8",
"--enable-ep-moe", "--ep",
"8",
"--speculative-algo", "--speculative-algo",
"EAGLE", "EAGLE",
"--speculative-draft", "--speculative-draft",
...@@ -2297,7 +2288,8 @@ class Test51(CustomTestCase): ...@@ -2297,7 +2288,8 @@ class Test51(CustomTestCase):
"--enable-dp-attention", "--enable-dp-attention",
"--dp", "--dp",
"4", "4",
"--enable-ep-moe", "--ep",
"8",
"--speculative-algo", "--speculative-algo",
"EAGLE", "EAGLE",
"--speculative-draft", "--speculative-draft",
...@@ -2345,7 +2337,8 @@ class Test52(CustomTestCase): ...@@ -2345,7 +2337,8 @@ class Test52(CustomTestCase):
"--enable-dp-attention", "--enable-dp-attention",
"--dp", "--dp",
"8", "8",
"--enable-ep-moe", "--ep",
"8",
"--speculative-algo", "--speculative-algo",
"EAGLE", "EAGLE",
"--speculative-draft", "--speculative-draft",
...@@ -2392,7 +2385,8 @@ class Test53(CustomTestCase): ...@@ -2392,7 +2385,8 @@ class Test53(CustomTestCase):
"8", "8",
"--moe-dense-tp-size", "--moe-dense-tp-size",
"1", "1",
"--enable-ep-moe", "--ep",
"8",
"--speculative-algo", "--speculative-algo",
"EAGLE", "EAGLE",
"--speculative-draft", "--speculative-draft",
...@@ -2442,7 +2436,8 @@ class Test54(CustomTestCase): ...@@ -2442,7 +2436,8 @@ class Test54(CustomTestCase):
"4", "4",
"--moe-dense-tp-size", "--moe-dense-tp-size",
"1", "1",
"--enable-ep-moe", "--ep",
"8",
"--speculative-algo", "--speculative-algo",
"EAGLE", "EAGLE",
"--speculative-draft", "--speculative-draft",
...@@ -2492,7 +2487,8 @@ class Test55(CustomTestCase): ...@@ -2492,7 +2487,8 @@ class Test55(CustomTestCase):
"8", "8",
"--moe-dense-tp-size", "--moe-dense-tp-size",
"1", "1",
"--enable-ep-moe", "--ep",
"8",
"--speculative-algo", "--speculative-algo",
"EAGLE", "EAGLE",
"--speculative-draft", "--speculative-draft",
...@@ -2541,7 +2537,8 @@ class Test56(CustomTestCase): ...@@ -2541,7 +2537,8 @@ class Test56(CustomTestCase):
"--dp", "--dp",
"4", "4",
"--enable-dp-lm-head", "--enable-dp-lm-head",
"--enable-ep-moe", "--ep",
"8",
"--speculative-algo", "--speculative-algo",
"EAGLE", "EAGLE",
"--speculative-draft", "--speculative-draft",
...@@ -2590,7 +2587,8 @@ class Test57(CustomTestCase): ...@@ -2590,7 +2587,8 @@ class Test57(CustomTestCase):
"--dp", "--dp",
"8", "8",
"--enable-dp-lm-head", "--enable-dp-lm-head",
"--enable-ep-moe", "--ep",
"8",
"--speculative-algo", "--speculative-algo",
"EAGLE", "EAGLE",
"--speculative-draft", "--speculative-draft",
...@@ -2641,7 +2639,8 @@ class Test58(CustomTestCase): ...@@ -2641,7 +2639,8 @@ class Test58(CustomTestCase):
"--moe-dense-tp-size", "--moe-dense-tp-size",
"1", "1",
"--enable-dp-lm-head", "--enable-dp-lm-head",
"--enable-ep-moe", "--ep",
"8",
"--speculative-algo", "--speculative-algo",
"EAGLE", "EAGLE",
"--speculative-draft", "--speculative-draft",
...@@ -2692,7 +2691,8 @@ class Test59(CustomTestCase): ...@@ -2692,7 +2691,8 @@ class Test59(CustomTestCase):
"--moe-dense-tp-size", "--moe-dense-tp-size",
"1", "1",
"--enable-dp-lm-head", "--enable-dp-lm-head",
"--enable-ep-moe", "--ep",
"8",
"--speculative-algo", "--speculative-algo",
"EAGLE", "EAGLE",
"--speculative-draft", "--speculative-draft",
......
...@@ -27,7 +27,8 @@ class TestPureTP(CustomTestCase): ...@@ -27,7 +27,8 @@ class TestPureTP(CustomTestCase):
"--trust-remote-code", "--trust-remote-code",
"--tp", "--tp",
"2", "2",
"--enable-deepep-moe", "--moe-a2a-backend",
"deepep",
"--disable-cuda-graph", "--disable-cuda-graph",
], ],
) )
...@@ -65,7 +66,8 @@ class TestDPAttn(unittest.TestCase): ...@@ -65,7 +66,8 @@ class TestDPAttn(unittest.TestCase):
"--dp", "--dp",
"2", "2",
"--enable-dp-attention", "--enable-dp-attention",
"--enable-deepep-moe", "--moe-a2a-backend",
"deepep",
"--deepep-mode", "--deepep-mode",
"normal", "normal",
"--disable-cuda-graph", "--disable-cuda-graph",
......
...@@ -31,7 +31,8 @@ class TestMoEDeepEPEvalAccuracyLarge(CustomTestCase): ...@@ -31,7 +31,8 @@ class TestMoEDeepEPEvalAccuracyLarge(CustomTestCase):
"--trust-remote-code", "--trust-remote-code",
"--tp", "--tp",
"8", "8",
"--enable-deepep-moe", "--moe-a2a-backend",
"deepep",
"--cuda-graph-max-bs", "--cuda-graph-max-bs",
"128", "128",
], ],
......
...@@ -27,7 +27,6 @@ class TestEpMoE(CustomTestCase): ...@@ -27,7 +27,6 @@ class TestEpMoE(CustomTestCase):
"2", "2",
"--ep-size", "--ep-size",
"2", "2",
"--enable-ep-moe",
], ],
) )
...@@ -75,7 +74,6 @@ class TestEpMoEFP8(CustomTestCase): ...@@ -75,7 +74,6 @@ class TestEpMoEFP8(CustomTestCase):
"2", "2",
"--ep-size", "--ep-size",
"2", "2",
"--enable-ep-moe",
"--quantization", "--quantization",
"fp8", "fp8",
], ],
......
...@@ -33,7 +33,8 @@ class TestTwoBatchOverlap(unittest.TestCase): ...@@ -33,7 +33,8 @@ class TestTwoBatchOverlap(unittest.TestCase):
"--dp", "--dp",
"2", "2",
"--enable-dp-attention", "--enable-dp-attention",
"--enable-deepep-moe", "--moe-a2a-backend",
"deepep",
"--deepep-mode", "--deepep-mode",
"normal", "normal",
"--disable-cuda-graph", # DeepEP normal does not support CUDA Graph "--disable-cuda-graph", # DeepEP normal does not support CUDA Graph
...@@ -122,7 +123,8 @@ class TestQwen3TwoBatchOverlap(TestTwoBatchOverlap): ...@@ -122,7 +123,8 @@ class TestQwen3TwoBatchOverlap(TestTwoBatchOverlap):
"--dp", "--dp",
"2", "2",
"--enable-dp-attention", "--enable-dp-attention",
"--enable-deepep-moe", "--moe-a2a-backend",
"deepep",
"--deepep-mode", "--deepep-mode",
"normal", "normal",
"--disable-cuda-graph", # DeepEP normal does not support CUDA Graph "--disable-cuda-graph", # DeepEP normal does not support CUDA Graph
......
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment