Unverified commit 6c88f6c8 authored by Cheng Wan, committed by GitHub

[5/N] MoE Refactor: Update MoE parallelism arguments (#8658)

parent c8d3a402
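
This PR replaces the boolean MoE parallelism flags with explicit arguments: `--enable-ep-moe` is superseded by setting `--ep-size` to the tensor parallel size, and `--enable-deepep-moe` by `--moe-a2a-backend deepep`. The old flags remain as deprecated aliases that `ServerArgs.__post_init__` rewrites with a warning. A before/after sketch of the launch flags, mirroring the test updates in this diff (model path and unrelated flags elided):

```python
# Deprecated flags (still accepted; __post_init__ rewrites them and warns):
old_ep_flags = ["--tp", "8", "--enable-ep-moe"]
old_deepep_flags = ["--tp", "8", "--enable-deepep-moe", "--deepep-mode", "auto"]

# Preferred replacements after this PR:
new_ep_flags = ["--tp", "8", "--ep-size", "8"]
new_deepep_flags = ["--tp", "8", "--moe-a2a-backend", "deepep"]  # --deepep-mode still defaults to "auto"
```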
......@@ -29,6 +29,7 @@ from torch import nn
from transformers import PretrainedConfig
from sglang.srt.distributed import (
get_moe_expert_parallel_world_size,
get_tensor_model_parallel_rank,
get_tensor_model_parallel_world_size,
tensor_model_parallel_all_gather,
......@@ -117,7 +118,7 @@ class Grok1MoE(nn.Module):
)
kwargs = {}
if global_server_args_dict["enable_ep_moe"]:
if get_moe_expert_parallel_world_size() > 1:
MoEImpl = EPMoE
else:
MoEImpl = FusedMoE
......@@ -616,8 +617,7 @@ class Grok1ForCausalLM(nn.Module):
# Params for weights, fp8 weight scales, fp8 activation scales
# (param_name, weight_name, expert_id, shard_id)
MoEImpl = EPMoE if global_server_args_dict["enable_ep_moe"] else FusedMoE
expert_params_mapping = MoEImpl.make_expert_params_mapping(
expert_params_mapping = FusedMoE.make_expert_params_mapping(
ckpt_gate_proj_name="w1",
ckpt_down_proj_name="w2",
ckpt_up_proj_name="w3",
......
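In each model file the implementation choice now keys off the distributed EP world size instead of the `enable_ep_moe` server flag, while the checkpoint-name mapping is requested from `FusedMoE` directly: the `(param_name, weight_name, expert_id, shard_id)` tuples depend only on checkpoint naming, not on which implementation ends up instantiated. A stand-alone sketch of that mapping shape (the real classmethod lives on `FusedMoE`; this body is an assumption for illustration):

```python
from typing import List, Tuple

def make_expert_params_mapping_sketch(
    ckpt_gate_proj_name: str,  # "w1" for Grok/Mixtral, "gate_proj" for Qwen
    ckpt_down_proj_name: str,
    ckpt_up_proj_name: str,
    num_experts: int,
) -> List[Tuple[str, str, int, str]]:
    # (param_name, weight_name, expert_id, shard_id), as in the diff's comment.
    mapping = []
    for expert_id in range(num_experts):
        for shard_id, ckpt_name in [
            ("w1", ckpt_gate_proj_name),
            ("w2", ckpt_down_proj_name),
            ("w3", ckpt_up_proj_name),
        ]:
            # Gate/up shards load into the fused w13 weight, down into w2.
            param_name = (
                "experts.w13_weight" if shard_id in ("w1", "w3") else "experts.w2_weight"
            )
            mapping.append(
                (param_name, f"experts.{expert_id}.{ckpt_name}.", expert_id, shard_id)
            )
    return mapping
```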
......@@ -24,6 +24,7 @@ from torch import nn
from transformers import MixtralConfig
from sglang.srt.distributed import (
get_moe_expert_parallel_world_size,
get_pp_group,
get_tensor_model_parallel_world_size,
tensor_model_parallel_all_reduce,
......@@ -94,7 +95,7 @@ class MixtralMoE(nn.Module):
renormalize=True,
)
MoEImpl = EPMoE if global_server_args_dict["enable_ep_moe"] else FusedMoE
MoEImpl = EPMoE if get_moe_expert_parallel_world_size() > 1 else FusedMoE
self.experts = MoEImpl(
num_experts=num_experts,
top_k=top_k,
......@@ -398,8 +399,7 @@ class MixtralForCausalLM(nn.Module):
# Params for weights, fp8 weight scales, fp8 activation scales
# (param_name, weight_name, expert_id, shard_id)
MoEImpl = EPMoE if global_server_args_dict["enable_ep_moe"] else FusedMoE
expert_params_mapping = MoEImpl.make_expert_params_mapping(
expert_params_mapping = FusedMoE.make_expert_params_mapping(
ckpt_gate_proj_name="w1",
ckpt_down_proj_name="w2",
ckpt_up_proj_name="w3",
......
......@@ -148,7 +148,6 @@ class Qwen2MoeSparseMoeBlock(nn.Module):
**(
dict(
enable_flashinfer_cutlass_moe=True,
enable_ep_moe=global_server_args_dict["enable_ep_moe"],
)
if global_server_args_dict["enable_flashinfer_cutlass_moe"]
else {}
......@@ -616,9 +615,7 @@ class Qwen2MoeForCausalLM(nn.Module):
("gate_up_proj", "up_proj", 1),
]
MoEImpl = EPMoE if global_server_args_dict["enable_ep_moe"] else FusedMoE
expert_params_mapping = MoEImpl.make_expert_params_mapping(
expert_params_mapping = FusedMoE.make_expert_params_mapping(
ckpt_gate_proj_name="gate_proj",
ckpt_down_proj_name="down_proj",
ckpt_up_proj_name="up_proj",
......
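The `enable_ep_moe` kwarg disappears from the FlashInfer CUTLASS path because, after this refactor series, the layer can read the EP layout from the distributed state rather than a server flag. A stand-alone sketch of that idea (the helper is stubbed here; the real one is `sglang.srt.distributed.get_moe_expert_parallel_world_size`, imported elsewhere in this diff, and the class body is assumed):

```python
def get_moe_expert_parallel_world_size() -> int:
    """Stub standing in for sglang.srt.distributed's helper; 1 means no EP."""
    return 1

class FusedMoESketch:
    """Assumed constructor shape: EP is inferred, not passed as a flag."""
    def __init__(self, num_experts: int, enable_flashinfer_cutlass_moe: bool = False):
        self.ep_size = get_moe_expert_parallel_world_size()
        self.use_ep = self.ep_size > 1  # replaces the old enable_ep_moe kwarg
        self.num_local_experts = num_experts // max(self.ep_size, 1)
        self.enable_flashinfer_cutlass_moe = enable_flashinfer_cutlass_moe
```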
......@@ -24,6 +24,7 @@ import torch
from torch import nn
from sglang.srt.distributed import (
get_moe_expert_parallel_world_size,
get_pp_group,
get_tensor_model_parallel_rank,
get_tensor_model_parallel_world_size,
......@@ -51,7 +52,6 @@ from sglang.srt.layers.linear import (
)
from sglang.srt.layers.logits_processor import LogitsProcessor, LogitsProcessorOutput
from sglang.srt.layers.moe.ep_moe.layer import get_moe_impl_class
from sglang.srt.layers.moe.ep_moe.token_dispatcher import DeepEPDispatcher
from sglang.srt.layers.moe.topk import TopK
from sglang.srt.layers.quantization.base_config import QuantizationConfig
from sglang.srt.layers.radix_attention import RadixAttention
......@@ -72,7 +72,7 @@ from sglang.srt.model_loader.weight_utils import default_weight_loader
from sglang.srt.models.qwen2_moe import Qwen2MoeMLP as Qwen3MoeMLP
from sglang.srt.models.qwen2_moe import Qwen2MoeModel
from sglang.srt.two_batch_overlap import MaybeTboDeepEPDispatcher
from sglang.srt.utils import DeepEPMode, add_prefix, is_cuda, is_non_idle_and_non_empty
from sglang.srt.utils import add_prefix, is_cuda, is_non_idle_and_non_empty
Qwen3MoeConfig = None
......@@ -113,15 +113,14 @@ class Qwen3MoeSparseMoeBlock(nn.Module):
quant_config=quant_config,
prefix=add_prefix("experts", prefix),
**(
dict(deepep_mode=DeepEPMode[global_server_args_dict["deepep_mode"]])
if global_server_args_dict["enable_deepep_moe"]
dict(deepep_mode=global_server_args_dict["deepep_mode"])
if global_server_args_dict["moe_a2a_backend"].is_deepep()
else {}
),
# Additional args for FusedMoE
**(
dict(
enable_flashinfer_cutlass_moe=True,
enable_ep_moe=global_server_args_dict["enable_ep_moe"],
)
if global_server_args_dict["enable_flashinfer_cutlass_moe"]
else {}
......@@ -136,9 +135,9 @@ class Qwen3MoeSparseMoeBlock(nn.Module):
prefix=add_prefix("gate", prefix),
)
if global_server_args_dict["enable_deepep_moe"]:
if global_server_args_dict["moe_a2a_backend"].is_deepep():
# TODO: we will support tp < ep in the future
self.ep_size = get_tensor_model_parallel_world_size()
self.ep_size = get_moe_expert_parallel_world_size()
self.num_experts = (
config.num_experts + global_server_args_dict["ep_num_redundant_experts"]
)
......@@ -148,7 +147,7 @@ class Qwen3MoeSparseMoeBlock(nn.Module):
self, hidden_states: torch.Tensor, forward_batch: Optional[ForwardBatch] = None
) -> torch.Tensor:
if not global_server_args_dict["enable_deepep_moe"]:
if not global_server_args_dict["moe_a2a_backend"].is_deepep():
return self.forward_normal(hidden_states)
else:
return self.forward_deepep(hidden_states, forward_batch)
......
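`global_server_args_dict["moe_a2a_backend"]` is consulted through an `is_deepep()` method above, so the stored value is a small backend wrapper rather than a raw string. A minimal stdlib sketch of such a wrapper (the real class ships in an earlier PR of this series; the names here are assumptions):

```python
from enum import Enum
from typing import Optional


class MoeA2ABackendSketch(Enum):
    NONE = "none"
    DEEPEP = "deepep"

    @classmethod
    def from_server_arg(cls, value: Optional[str]) -> "MoeA2ABackendSketch":
        # --moe-a2a-backend defaults to None, meaning no A2A backend.
        return cls.NONE if value is None else cls(value)

    def is_deepep(self) -> bool:
        return self is MoeA2ABackendSketch.DEEPEP


assert MoeA2ABackendSketch.from_server_arg("deepep").is_deepep()
assert not MoeA2ABackendSketch.from_server_arg(None).is_deepep()
```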
......@@ -146,7 +146,7 @@ class Step3TextMoEMLP(nn.Module):
prefix=add_prefix("gate", prefix),
)
if global_server_args_dict["enable_deepep_moe"]:
if global_server_args_dict["moe_a2a_backend"].is_deepep():
raise NotImplementedError("DeepEP MoE is not supported yet in Step3 model.")
def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
......
......@@ -4,7 +4,7 @@ from typing import List, Optional
import torch
from sglang.srt import operations
from sglang.srt.layers.moe.ep_moe.token_dispatcher import DeepEPConfig
from sglang.srt.layers.moe.token_dispatcher import DeepEPConfig
from sglang.srt.model_executor.forward_batch_info import ForwardMode
from sglang.srt.operations import Operation
......
......@@ -172,12 +172,11 @@ class ServerArgs:
# Expert parallelism
ep_size: int = 1
enable_ep_moe: bool = False
enable_deepep_moe: bool = False
moe_a2a_backend: Optional[Literal["deepep"]] = None
enable_flashinfer_cutlass_moe: bool = False
enable_flashinfer_trtllm_moe: bool = False
enable_flashinfer_allreduce_fusion: bool = False
deepep_mode: Optional[Literal["auto", "normal", "low_latency"]] = "auto"
deepep_mode: Literal["auto", "normal", "low_latency"] = "auto"
ep_num_redundant_experts: int = 0
ep_dispatch_algorithm: Optional[Literal["static", "dynamic", "fake"]] = None
init_expert_location: str = "trivial"
......@@ -272,7 +271,27 @@ class ServerArgs:
enable_pdmux: bool = False
sm_group_num: int = 3
# Deprecated arguments
enable_ep_moe: bool = False
enable_deepep_moe: bool = False
def __post_init__(self):
# Check deprecated arguments
def print_deprecated_warning(message: str):
logger.warning(f"\033[33m{message}\033[0m")
if self.enable_ep_moe:
self.ep_size = self.tp_size
print_deprecated_warning(
"NOTE: --enable-ep-moe is deprecated. Please set `--ep-size` to the same value as `--tp-size` instead."
)
if self.enable_deepep_moe:
self.moe_a2a_backend = "deepep"
print_deprecated_warning(
"NOTE: --enable-deepep-moe is deprecated. Please set `--moe-a2a-backend` to 'deepep' instead."
)
# Set missing default values
if self.tokenizer_path is None:
self.tokenizer_path = self.model_path
......@@ -455,14 +474,13 @@ class ServerArgs:
self.quantization == "modelopt_fp4"
), "modelopt_fp4 quantization is required for Flashinfer MOE"
os.environ["TRTLLM_ENABLE_PDL"] = "1"
if self.enable_ep_moe:
self.ep_size = self.tp_size
logger.warning(
f"Flashinfer cutlass MoE and EP MoE are enabled. The expert parallel size is adjusted to be the same as the tensor parallel size[{self.tp_size}]."
)
assert self.ep_size in [
1,
self.tp_size,
], "The expert parallel size must be 1 or the same as the tensor parallel size"
# DeepEP MoE
if self.enable_deepep_moe:
if self.moe_a2a_backend == "deepep":
if self.deepep_mode == "normal":
logger.warning("Cuda graph is disabled because deepep_mode=`normal`")
self.disable_cuda_graph = True
......@@ -486,7 +504,7 @@ class ServerArgs:
)
if self.enable_eplb:
assert self.enable_ep_moe or self.enable_deepep_moe
assert self.ep_size > 1 or self.moe_a2a_backend is not None
if self.enable_expert_distribution_metrics and (
self.expert_distribution_recorder_mode is None
......@@ -1354,30 +1372,27 @@ class ServerArgs:
help="The expert parallelism size.",
)
parser.add_argument(
"--enable-ep-moe",
action="store_true",
help="Enabling expert parallelism for moe. The ep size is equal to the tp size.",
"--moe-a2a-backend",
type=str,
choices=["deepep"],
default=ServerArgs.moe_a2a_backend,
help="Choose the backend for MoE A2A.",
)
parser.add_argument(
"--enable-flashinfer-cutlass-moe",
action="store_true",
help="Enable FlashInfer CUTLASS MoE backend for modelopt_fp4 quant on Blackwell. Supports MoE-EP with --enable-ep-moe",
help="Enable FlashInfer CUTLASS MoE backend for modelopt_fp4 quant on Blackwell. Supports MoE-EP",
)
parser.add_argument(
"--enable-flashinfer-trtllm-moe",
action="store_true",
help="Enable FlashInfer TRTLLM MoE backend on Blackwell. Supports BlockScale FP8 MoE-EP with --enable-ep-moe",
help="Enable FlashInfer TRTLLM MoE backend on Blackwell. Supports BlockScale FP8 MoE-EP",
)
parser.add_argument(
"--enable-flashinfer-allreduce-fusion",
action="store_true",
help="Enable FlashInfer allreduce fusion for Add_RMSNorm.",
)
parser.add_argument(
"--enable-deepep-moe",
action="store_true",
help="Enabling DeepEP MoE implementation for EP MoE.",
)
parser.add_argument(
"--deepep-mode",
type=str,
......@@ -1839,6 +1854,18 @@ class ServerArgs:
help="Disable mmap while loading weight using safetensors.",
)
# Deprecated arguments
parser.add_argument(
"--enable-ep-moe",
action="store_true",
help="(Deprecated) Enabling expert parallelism for moe. The ep size is equal to the tp size.",
)
parser.add_argument(
"--enable-deepep-moe",
action="store_true",
help="(Deprecated) Enabling DeepEP MoE implementation for EP MoE.",
)
@classmethod
def from_cli_args(cls, args: argparse.Namespace):
args.tp_size = args.tensor_parallel_size
......
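The `__post_init__` shim keeps old launch scripts working: each deprecated boolean is translated into its replacement before validation runs. A trimmed, stand-alone sketch of that behavior (only the relevant fields of `ServerArgs` are reproduced; the warning logging is omitted):

```python
from dataclasses import dataclass
from typing import Literal, Optional


@dataclass
class ServerArgsSketch:
    tp_size: int = 1
    ep_size: int = 1
    moe_a2a_backend: Optional[Literal["deepep"]] = None
    # Deprecated arguments
    enable_ep_moe: bool = False
    enable_deepep_moe: bool = False

    def __post_init__(self):
        if self.enable_ep_moe:
            self.ep_size = self.tp_size  # the old flag implied ep_size == tp_size
        if self.enable_deepep_moe:
            self.moe_a2a_backend = "deepep"


args = ServerArgsSketch(tp_size=8, enable_ep_moe=True, enable_deepep_moe=True)
assert args.ep_size == 8 and args.moe_a2a_backend == "deepep"
```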
......@@ -13,17 +13,18 @@ from sglang.srt.layers.communicator import (
CommunicateSummableTensorPairFn,
ScatterMode,
)
from sglang.srt.layers.moe.ep_moe.token_dispatcher import DeepEPDispatcher
from sglang.srt.layers.moe.token_dispatcher import DeepEPDispatcher
from sglang.srt.layers.moe.utils import DeepEPMode
from sglang.srt.layers.quantization import deep_gemm_wrapper
from sglang.srt.managers.schedule_batch import ScheduleBatch, global_server_args_dict
from sglang.srt.model_executor.forward_batch_info import ForwardBatch, ForwardMode
from sglang.srt.operations import execute_operations, execute_overlapped_operations
from sglang.srt.operations_strategy import OperationsStrategy
from sglang.srt.speculative.eagle_utils import EagleDraftInput, EagleVerifyInput
from sglang.srt.utils import BumpAllocator, DeepEPMode, get_bool_env_var
from sglang.srt.utils import BumpAllocator, get_bool_env_var
if TYPE_CHECKING:
from sglang.srt.layers.moe.ep_moe.token_dispatcher import DispatchOutput
from sglang.srt.layers.moe.token_dispatcher import DispatchOutput
_tbo_debug = get_bool_env_var("SGLANG_TBO_DEBUG")
......@@ -310,7 +311,7 @@ class TboDPAttentionPreparer:
and not local_batch.forward_mode.is_target_verify()
)
and enable_deepep_moe
and (resolved_deepep_mode == DeepEPMode.low_latency)
and (resolved_deepep_mode == DeepEPMode.LOW_LATENCY)
)
else:
self.local_tbo_split_seq_index = 0
......
......@@ -2205,27 +2205,6 @@ def flatten_nested_list(nested_list):
return [nested_list]
class DeepEPMode(Enum):
normal = "normal"
low_latency = "low_latency"
auto = "auto"
def enable_normal(self):
return self in [DeepEPMode.normal, DeepEPMode.auto]
def enable_low_latency(self):
return self in [DeepEPMode.low_latency, DeepEPMode.auto]
def resolve(self, is_extend_in_batch: bool):
if self != DeepEPMode.auto:
return self
if is_extend_in_batch:
return DeepEPMode.normal
else:
return DeepEPMode.low_latency
def is_non_idle_and_non_empty(forward_mode, hidden_states):
return (
(forward_mode is not None)
......@@ -2414,7 +2393,7 @@ def require_mlp_tp_gather(server_args):
return True
elif not server_args.enable_dp_lm_head:
return True
elif not server_args.enable_deepep_moe:
elif server_args.moe_a2a_backend is None:
return True
else:
return (
......@@ -2430,7 +2409,7 @@ def require_attn_tp_gather(server_args):
Check if the input of attention is scattered.
"""
assert server_args.moe_dense_tp_size in [1, None]
if server_args.enable_deepep_moe or server_args.moe_dense_tp_size == 1:
if server_args.moe_a2a_backend is not None or server_args.moe_dense_tp_size == 1:
if server_args.enable_dp_attention:
return server_args.dp_size < server_args.tp_size
else:
......
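The `DeepEPMode` enum deleted from `sglang/srt/utils.py` above now lives in `sglang/srt/layers/moe/utils.py` (see the new import in `two_batch_overlap.py`), with its members renamed to uppercase (`DeepEPMode.LOW_LATENCY`). A reconstruction of the relocated enum, inferred from the removed body and the renamed usage; treat it as a sketch rather than the verbatim new code:

```python
from enum import Enum


class DeepEPMode(Enum):
    NORMAL = "normal"
    LOW_LATENCY = "low_latency"
    AUTO = "auto"

    def enable_normal(self) -> bool:
        return self in (DeepEPMode.NORMAL, DeepEPMode.AUTO)

    def enable_low_latency(self) -> bool:
        return self in (DeepEPMode.LOW_LATENCY, DeepEPMode.AUTO)

    def resolve(self, is_extend_in_batch: bool) -> "DeepEPMode":
        if self != DeepEPMode.AUTO:
            return self
        # Prefill (extend) batches resolve to normal mode; decode to low latency.
        return DeepEPMode.NORMAL if is_extend_in_batch else DeepEPMode.LOW_LATENCY
```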
......@@ -499,7 +499,6 @@ class SRTRunner:
chunked_prefill_size: Optional[int] = None,
dp_size: int = 1,
tokenizer_path: Optional[str] = None,
enable_ep_moe: bool = False,
mem_fraction_static: float = 0.65,
trust_remote_code: bool = False,
speculative_draft_model_path: Optional[str] = None,
......@@ -550,7 +549,6 @@ class SRTRunner:
enable_dp_attention=enable_dp_attention,
dp_size=dp_size,
tokenizer_path=tokenizer_path,
enable_ep_moe=enable_ep_moe,
disable_overlap_schedule=disable_overlap_schedule,
cuda_graph_max_bs=cuda_graph_max_bs,
disable_custom_all_reduce=disable_custom_all_reduce,
......
......@@ -33,7 +33,8 @@ class TestDeepseek(CustomTestCase):
"--moe-dense-tp-size",
"1",
"--enable-dp-lm-head",
"--enable-deepep-moe",
"--moe-a2a-backend",
"deepep",
"--enable-two-batch-overlap",
"--ep-num-redundant-experts",
"32",
......@@ -88,7 +89,8 @@ class TestDeepseekMTP(CustomTestCase):
"--moe-dense-tp-size",
"1",
"--enable-dp-lm-head",
"--enable-deepep-moe",
"--moe-a2a-backend",
"deepep",
"--enable-two-batch-overlap",
"--ep-num-redundant-experts",
"32",
......
......@@ -31,7 +31,8 @@ class TestPureDP(CustomTestCase):
"--enable-dp-attention",
"--dp",
"4",
"--enable-deepep-moe",
"--moe-a2a-backend",
"deepep",
"--cuda-graph-max-bs",
"128",
"--max-running-requests",
......@@ -77,7 +78,8 @@ class TestHybridDPTP(CustomTestCase):
"--enable-dp-attention",
"--dp",
"2",
"--enable-deepep-moe",
"--moe-a2a-backend",
"deepep",
"--cuda-graph-max-bs",
"128",
"--max-running-requests",
......@@ -118,7 +120,8 @@ class TestTP(CustomTestCase):
"--trust-remote-code",
"--tp",
"4",
"--enable-deepep-moe",
"--moe-a2a-backend",
"deepep",
"--cuda-graph-max-bs",
"128",
"--max-running-requests",
......@@ -166,7 +169,8 @@ class TestNoGatherdBuffer(CustomTestCase):
"--moe-dense-tp-size",
"1",
"--enable-dp-lm-head",
"--enable-deepep-moe",
"--moe-a2a-backend",
"deepep",
"--cuda-graph-max-bs",
"32",
"--max-running-requests",
......@@ -212,7 +216,8 @@ class TestTBO(CustomTestCase):
"4",
"--moe-dense-tp-size",
"1",
"--enable-deepep-moe",
"--moe-a2a-backend",
"deepep",
"--enable-two-batch-overlap",
"--cuda-graph-max-bs",
"128",
......@@ -259,7 +264,8 @@ class TestMTP(CustomTestCase):
"--dp",
"2",
"--enable-dp-lm-head",
"--enable-deepep-moe",
"--moe-a2a-backend",
"deepep",
"--speculative-algo",
"EAGLE",
"--speculative-draft",
......@@ -326,7 +332,8 @@ class TestMTPWithTBO(CustomTestCase):
"--dp-size",
"4",
"--enable-two-batch-overlap",
"--enable-deepep-moe",
"--moe-a2a-backend",
"deepep",
"--trust-remote-code",
"--speculative-algorithm",
"EAGLE",
......
......@@ -34,7 +34,8 @@ class _BaseTestDynamicEPLB(CustomTestCase):
"--dp",
"2",
"--enable-dp-attention",
"--enable-deepep-moe",
"--moe-a2a-backend",
"deepep",
"--deepep-mode",
"normal",
"--disable-cuda-graph",
......@@ -96,8 +97,7 @@ class TestStaticEPLB(CustomTestCase):
trust_remote_code=True,
ep_num_redundant_experts=4,
enable_dp_attention=True,
enable_deepep_moe=True,
deepep_mode="normal",
moe_a2a_backend="deepep",
disable_cuda_graph=True,
expert_distribution_recorder_mode="stat",
tp_size=2,
......
......@@ -407,9 +407,8 @@ class Test10(CustomTestCase):
"--trust-remote-code",
"--tp",
"8",
"--enable-deepep-moe",
"--deepep-mode",
"auto",
"--moe-a2a-backend",
"deepep",
"--cuda-graph-max-bs",
"128",
],
......@@ -449,9 +448,8 @@ class Test11(CustomTestCase):
"--enable-dp-attention",
"--dp",
"4",
"--enable-deepep-moe",
"--deepep-mode",
"auto",
"--moe-a2a-backend",
"deepep",
"--cuda-graph-max-bs",
"128",
],
......@@ -491,9 +489,8 @@ class Test12(CustomTestCase):
"--enable-dp-attention",
"--dp",
"8",
"--enable-deepep-moe",
"--deepep-mode",
"auto",
"--moe-a2a-backend",
"deepep",
"--cuda-graph-max-bs",
"128",
],
......@@ -532,9 +529,8 @@ class Test13(CustomTestCase):
"8",
"--moe-dense-tp-size",
"1",
"--enable-deepep-moe",
"--deepep-mode",
"auto",
"--moe-a2a-backend",
"deepep",
"--cuda-graph-max-bs",
"128",
],
......@@ -576,9 +572,8 @@ class Test14(CustomTestCase):
"4",
"--moe-dense-tp-size",
"1",
"--enable-deepep-moe",
"--deepep-mode",
"auto",
"--moe-a2a-backend",
"deepep",
"--cuda-graph-max-bs",
"128",
],
......@@ -620,9 +615,8 @@ class Test15(CustomTestCase):
"8",
"--moe-dense-tp-size",
"1",
"--enable-deepep-moe",
"--deepep-mode",
"auto",
"--moe-a2a-backend",
"deepep",
"--cuda-graph-max-bs",
"128",
],
......@@ -663,9 +657,8 @@ class Test16(CustomTestCase):
"--dp",
"4",
"--enable-dp-lm-head",
"--enable-deepep-moe",
"--deepep-mode",
"auto",
"--moe-a2a-backend",
"deepep",
"--cuda-graph-max-bs",
"128",
],
......@@ -706,9 +699,8 @@ class Test17(CustomTestCase):
"--dp",
"8",
"--enable-dp-lm-head",
"--enable-deepep-moe",
"--deepep-mode",
"auto",
"--moe-a2a-backend",
"deepep",
"--cuda-graph-max-bs",
"128",
],
......@@ -751,9 +743,8 @@ class Test18(CustomTestCase):
"--moe-dense-tp-size",
"1",
"--enable-dp-lm-head",
"--enable-deepep-moe",
"--deepep-mode",
"auto",
"--moe-a2a-backend",
"deepep",
"--cuda-graph-max-bs",
"128",
],
......@@ -796,9 +787,8 @@ class Test19(CustomTestCase):
"--moe-dense-tp-size",
"1",
"--enable-dp-lm-head",
"--enable-deepep-moe",
"--deepep-mode",
"auto",
"--moe-a2a-backend",
"deepep",
"--cuda-graph-max-bs",
"128",
],
......@@ -835,7 +825,8 @@ class Test20(CustomTestCase):
"--trust-remote-code",
"--tp",
"8",
"--enable-ep-moe",
"--ep",
"8",
],
)
......@@ -873,7 +864,8 @@ class Test21(CustomTestCase):
"--enable-dp-attention",
"--dp",
"4",
"--enable-ep-moe",
"--ep",
"8",
],
)
......@@ -911,7 +903,8 @@ class Test22(CustomTestCase):
"--enable-dp-attention",
"--dp",
"8",
"--enable-ep-moe",
"--ep",
"8",
],
)
......@@ -948,7 +941,8 @@ class Test23(CustomTestCase):
"8",
"--moe-dense-tp-size",
"1",
"--enable-ep-moe",
"--ep",
"8",
],
)
......@@ -988,7 +982,8 @@ class Test24(CustomTestCase):
"4",
"--moe-dense-tp-size",
"1",
"--enable-ep-moe",
"--ep",
"8",
],
)
......@@ -1028,7 +1023,8 @@ class Test25(CustomTestCase):
"8",
"--moe-dense-tp-size",
"1",
"--enable-ep-moe",
"--ep",
"8",
],
)
......@@ -1067,7 +1063,8 @@ class Test26(CustomTestCase):
"--dp",
"4",
"--enable-dp-lm-head",
"--enable-ep-moe",
"--ep",
"8",
],
)
......@@ -1106,7 +1103,8 @@ class Test27(CustomTestCase):
"--dp",
"8",
"--enable-dp-lm-head",
"--enable-ep-moe",
"--ep",
"8",
],
)
......@@ -1147,7 +1145,8 @@ class Test28(CustomTestCase):
"--moe-dense-tp-size",
"1",
"--enable-dp-lm-head",
"--enable-ep-moe",
"--ep",
"8",
],
)
......@@ -1188,7 +1187,8 @@ class Test29(CustomTestCase):
"--moe-dense-tp-size",
"1",
"--enable-dp-lm-head",
"--enable-ep-moe",
"--ep",
"8",
],
)
......@@ -1701,9 +1701,8 @@ class Test40(CustomTestCase):
"--trust-remote-code",
"--tp",
"8",
"--enable-deepep-moe",
"--deepep-mode",
"auto",
"--moe-a2a-backend",
"deepep",
"--cuda-graph-max-bs",
"32",
"--max-running-requests",
......@@ -1755,9 +1754,8 @@ class Test41(CustomTestCase):
"--enable-dp-attention",
"--dp",
"4",
"--enable-deepep-moe",
"--deepep-mode",
"auto",
"--moe-a2a-backend",
"deepep",
"--cuda-graph-max-bs",
"32",
"--max-running-requests",
......@@ -1809,9 +1807,8 @@ class Test42(CustomTestCase):
"--enable-dp-attention",
"--dp",
"8",
"--enable-deepep-moe",
"--deepep-mode",
"auto",
"--moe-a2a-backend",
"deepep",
"--cuda-graph-max-bs",
"32",
"--max-running-requests",
......@@ -1862,9 +1859,8 @@ class Test43(CustomTestCase):
"8",
"--moe-dense-tp-size",
"1",
"--enable-deepep-moe",
"--deepep-mode",
"auto",
"--moe-a2a-backend",
"deepep",
"--cuda-graph-max-bs",
"32",
"--max-running-requests",
......@@ -1918,9 +1914,8 @@ class Test44(CustomTestCase):
"4",
"--moe-dense-tp-size",
"1",
"--enable-deepep-moe",
"--deepep-mode",
"auto",
"--moe-a2a-backend",
"deepep",
"--cuda-graph-max-bs",
"32",
"--max-running-requests",
......@@ -1974,9 +1969,8 @@ class Test45(CustomTestCase):
"8",
"--moe-dense-tp-size",
"1",
"--enable-deepep-moe",
"--deepep-mode",
"auto",
"--moe-a2a-backend",
"deepep",
"--cuda-graph-max-bs",
"32",
"--max-running-requests",
......@@ -2029,9 +2023,8 @@ class Test46(CustomTestCase):
"--dp",
"4",
"--enable-dp-lm-head",
"--enable-deepep-moe",
"--deepep-mode",
"auto",
"--moe-a2a-backend",
"deepep",
"--cuda-graph-max-bs",
"32",
"--max-running-requests",
......@@ -2084,9 +2077,8 @@ class Test47(CustomTestCase):
"--dp",
"8",
"--enable-dp-lm-head",
"--enable-deepep-moe",
"--deepep-mode",
"auto",
"--moe-a2a-backend",
"deepep",
"--cuda-graph-max-bs",
"32",
"--max-running-requests",
......@@ -2141,9 +2133,8 @@ class Test48(CustomTestCase):
"--moe-dense-tp-size",
"1",
"--enable-dp-lm-head",
"--enable-deepep-moe",
"--deepep-mode",
"auto",
"--moe-a2a-backend",
"deepep",
"--cuda-graph-max-bs",
"32",
"--max-running-requests",
......@@ -2198,9 +2189,8 @@ class Test49(CustomTestCase):
"--moe-dense-tp-size",
"1",
"--enable-dp-lm-head",
"--enable-deepep-moe",
"--deepep-mode",
"auto",
"--moe-a2a-backend",
"deepep",
"--cuda-graph-max-bs",
"32",
"--max-running-requests",
......@@ -2249,7 +2239,8 @@ class Test50(CustomTestCase):
"--trust-remote-code",
"--tp",
"8",
"--enable-ep-moe",
"--ep",
"8",
"--speculative-algo",
"EAGLE",
"--speculative-draft",
......@@ -2297,7 +2288,8 @@ class Test51(CustomTestCase):
"--enable-dp-attention",
"--dp",
"4",
"--enable-ep-moe",
"--ep",
"8",
"--speculative-algo",
"EAGLE",
"--speculative-draft",
......@@ -2345,7 +2337,8 @@ class Test52(CustomTestCase):
"--enable-dp-attention",
"--dp",
"8",
"--enable-ep-moe",
"--ep",
"8",
"--speculative-algo",
"EAGLE",
"--speculative-draft",
......@@ -2392,7 +2385,8 @@ class Test53(CustomTestCase):
"8",
"--moe-dense-tp-size",
"1",
"--enable-ep-moe",
"--ep",
"8",
"--speculative-algo",
"EAGLE",
"--speculative-draft",
......@@ -2442,7 +2436,8 @@ class Test54(CustomTestCase):
"4",
"--moe-dense-tp-size",
"1",
"--enable-ep-moe",
"--ep",
"8",
"--speculative-algo",
"EAGLE",
"--speculative-draft",
......@@ -2492,7 +2487,8 @@ class Test55(CustomTestCase):
"8",
"--moe-dense-tp-size",
"1",
"--enable-ep-moe",
"--ep",
"8",
"--speculative-algo",
"EAGLE",
"--speculative-draft",
......@@ -2541,7 +2537,8 @@ class Test56(CustomTestCase):
"--dp",
"4",
"--enable-dp-lm-head",
"--enable-ep-moe",
"--ep",
"8",
"--speculative-algo",
"EAGLE",
"--speculative-draft",
......@@ -2590,7 +2587,8 @@ class Test57(CustomTestCase):
"--dp",
"8",
"--enable-dp-lm-head",
"--enable-ep-moe",
"--ep",
"8",
"--speculative-algo",
"EAGLE",
"--speculative-draft",
......@@ -2641,7 +2639,8 @@ class Test58(CustomTestCase):
"--moe-dense-tp-size",
"1",
"--enable-dp-lm-head",
"--enable-ep-moe",
"--ep",
"8",
"--speculative-algo",
"EAGLE",
"--speculative-draft",
......@@ -2692,7 +2691,8 @@ class Test59(CustomTestCase):
"--moe-dense-tp-size",
"1",
"--enable-dp-lm-head",
"--enable-ep-moe",
"--ep",
"8",
"--speculative-algo",
"EAGLE",
"--speculative-draft",
......
......@@ -27,7 +27,8 @@ class TestPureTP(CustomTestCase):
"--trust-remote-code",
"--tp",
"2",
"--enable-deepep-moe",
"--moe-a2a-backend",
"deepep",
"--disable-cuda-graph",
],
)
......@@ -65,7 +66,8 @@ class TestDPAttn(unittest.TestCase):
"--dp",
"2",
"--enable-dp-attention",
"--enable-deepep-moe",
"--moe-a2a-backend",
"deepep",
"--deepep-mode",
"normal",
"--disable-cuda-graph",
......
......@@ -31,7 +31,8 @@ class TestMoEDeepEPEvalAccuracyLarge(CustomTestCase):
"--trust-remote-code",
"--tp",
"8",
"--enable-deepep-moe",
"--moe-a2a-backend",
"deepep",
"--cuda-graph-max-bs",
"128",
],
......
......@@ -27,7 +27,6 @@ class TestEpMoE(CustomTestCase):
"2",
"--ep-size",
"2",
"--enable-ep-moe",
],
)
......@@ -75,7 +74,6 @@ class TestEpMoEFP8(CustomTestCase):
"2",
"--ep-size",
"2",
"--enable-ep-moe",
"--quantization",
"fp8",
],
......
......@@ -33,7 +33,8 @@ class TestTwoBatchOverlap(unittest.TestCase):
"--dp",
"2",
"--enable-dp-attention",
"--enable-deepep-moe",
"--moe-a2a-backend",
"deepep",
"--deepep-mode",
"normal",
"--disable-cuda-graph", # DeepEP normal does not support CUDA Graph
......@@ -122,7 +123,8 @@ class TestQwen3TwoBatchOverlap(TestTwoBatchOverlap):
"--dp",
"2",
"--enable-dp-attention",
"--enable-deepep-moe",
"--moe-a2a-backend",
"deepep",
"--deepep-mode",
"normal",
"--disable-cuda-graph", # DeepEP normal does not support CUDA Graph
......