},f"moe_dense_tp_size only support 1 and None currently"
        if self.attention_backend == "flashmla":
            logger.warning(
                "FlashMLA only supports a page_size of 64, change page_size to 64."
            )
            self.page_size = 64
...
@@ -1101,6 +1107,12 @@ class ServerArgs:
action="store_true",
action="store_true",
help="Enabling DeepEP MoE implementation for EP MoE.",
help="Enabling DeepEP MoE implementation for EP MoE.",
)
)
        parser.add_argument(
            "--moe-dense-tp-size",
            type=int,
            default=ServerArgs.moe_dense_tp_size,
            help="TP size for MoE dense MLP layers. This is useful with large TP sizes, where sharded MLP weights can end up with a dimension smaller than the minimum dimension GEMM supports, causing errors.",
        )