"Data parallel size is adjusted to be the same as tensor parallel size. "
"Overlap scheduler is disabled."
)
# Expert parallelism: when enable_ep_moe is set, the expert-parallel size is
# forced to match the tensor-parallel size, overwriting any previous ep_size.
if self.enable_ep_moe:
    self.ep_size = self.tp_size
    # Tell the operator that ep_size was overridden and what it is now.
    logger.info(
        f"EP MoE is enabled. The expert parallel size is adjusted to be the same as the tensor parallel size[{self.tp_size}]."
    )
# GGUF
if(
...
...
@@ -526,6 +535,14 @@ class ServerArgs:
"shortest_queue",
],
)
# Expert parallelism
# Register the integer expert-parallel size option; `--ep-size` is a short
# alias. No explicit `dest` is given, so argparse derives it from the first
# long option and the parsed value lands on `expert_parallel_size`.
parser.add_argument(
    "--expert-parallel-size",
    "--ep-size",
    help="The expert parallelism size.",
    default=ServerArgs.ep_size,
    type=int,
)
# Multi-node distributed serving
parser.add_argument(
...
...
@@ -681,6 +698,11 @@ class ServerArgs:
action="store_true",
help="Enabling data parallelism for attention and tensor parallelism for FFN. The dp size should be equal to the tp size. Currently only DeepSeek-V2 is supported.",
)
parser.add_argument(
"--enable-ep-moe",
action="store_true",
help="Enabling expert parallelism for moe. The ep size is equal to the tp size.",