f"DP attention is enabled. The chunked prefill size is adjusted to {self.chunked_prefill_size} to avoid MoE workload issue. "
"The CUDA graph is disabled."
)
ifself.enable_overlap_schedule:
ifself.enable_overlap_schedule:
logger.warning(
logger.warning(
"Overlap scheduler mode is enabled. This is an experimental feature. "
"Overlap scheduler mode is enabled. This is an experimental feature. "
...
@@ -669,6 +680,11 @@ class ServerArgs:
...
@@ -669,6 +680,11 @@ class ServerArgs:
action="store_true",
action="store_true",
help="Enabling mixing prefill and decode in a batch when using chunked prefill.",
help="Enabling mixing prefill and decode in a batch when using chunked prefill.",
)
)
parser.add_argument(
"--enable-dp-attention",
action="store_true",
help="Enabling data parallelism for attention and tensor parallelism for FFN. The dp size should be equal to the tp size. Currently only DeepSeek-V2 is supported.",