help="Specify the parser for handling tool-call interactions. Options include: 'qwen25', 'mistral', 'llama3', 'deepseekv3', and 'pythonic'.",
)
# Data parallelism
parser.add_argument(
...
...
@@ -949,15 +960,6 @@ class ServerArgs:
],
)
# Expert parallelism
parser.add_argument(
"--expert-parallel-size",
"--ep-size",
type=int,
default=ServerArgs.ep_size,
help="The expert parallelism size.",
)
# Multi-node distributed serving
parser.add_argument(
"--dist-init-addr",
...
...
@@ -1038,21 +1040,6 @@ class ServerArgs:
default=ServerArgs.grammar_backend,
help="Choose the backend for grammar-guided decoding.",
)
parser.add_argument(
"--enable-flashinfer-mla",
action=DeprecatedAction,
help="--enable-flashinfer-mla is deprecated. Please use '--attention-backend flashinfer' instead.",
)
parser.add_argument(
"--enable-flashmla",
action=DeprecatedAction,
help="--enable-flashmla is deprecated. Please use '--attention-backend flashmla' instead.",
)
parser.add_argument(
"--flashinfer-mla-disable-ragged",
action="store_true",
help="Not using ragged prefill wrapper when running flashinfer mla",
)
# Speculative decoding
parser.add_argument(
...
...
@@ -1102,6 +1089,109 @@ class ServerArgs:
help="The path of the draft model's small vocab table.",
default=ServerArgs.speculative_token_map,
)
parser.add_argument(
"--mm-attention-backend",
type=str,
choices=["sdpa","fa3","triton_attn"],
default=ServerArgs.mm_attention_backend,
help="Set multimodal attention backend.",
)
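# Illustrative usage (hypothetical model path; launcher invocation assumed from
# the surrounding project): pin the multimodal attention kernel explicitly:
#   python -m sglang.launch_server --model-path <vlm-model> --mm-attention-backend fa3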
# Expert parallelism
parser.add_argument(
"--expert-parallel-size",
"--ep-size",
type=int,
default=ServerArgs.ep_size,
help="The expert parallelism size.",
)
parser.add_argument(
"--enable-ep-moe",
action="store_true",
help="Enabling expert parallelism for moe. The ep size is equal to the tp size.",
)
parser.add_argument(
"--enable-deepep-moe",
action="store_true",
help="Enabling DeepEP MoE implementation for EP MoE.",
)
parser.add_argument(
"--deepep-mode",
type=str,
choices=["normal","low_latency","auto"],
default="auto",
help="Select the mode when enable DeepEP MoE, could be `normal`, `low_latency` or `auto`. Default is `auto`, which means `low_latency` for decode batch and `normal` for prefill batch.",
)
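# Sketch of a combined invocation (model path and tp size are illustrative, not
# taken from this diff): serve a MoE model with DeepEP expert parallelism and
# force low-latency dispatch for every batch:
#   python -m sglang.launch_server --model-path <moe-model> --tp-size 8 \
#       --enable-deepep-moe --deepep-mode low_latency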
parser.add_argument(
"--ep-num-redundant-experts",
type=int,
default=ServerArgs.ep_num_redundant_experts,
help="Allocate this number of redundant experts in expert parallel.",
)
parser.add_argument(
"--ep-dispatch-algorithm",
type=str,
default=ServerArgs.ep_dispatch_algorithm,
help="The algorithm to choose ranks for redundant experts in expert parallel.",
)
parser.add_argument(
"--init-expert-location",
type=str,
default=ServerArgs.init_expert_location,
help="Initial location of EP experts.",
)
parser.add_argument(
"--enable-eplb",
action="store_true",
help="Enable EPLB algorithm",
)
parser.add_argument(
"--eplb-algorithm",
type=str,
default=ServerArgs.eplb_algorithm,
help="Chosen EPLB algorithm",
)
parser.add_argument(
"--eplb-rebalance-num-iterations",
type=int,
default=ServerArgs.eplb_rebalance_num_iterations,
help="Number of iterations to automatically trigger a EPLB re-balance.",
help="Circular buffer size of expert distribution recorder. Set to -1 to denote infinite buffer.",
)
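# Sketch of an EPLB setup using only the flags defined above (model path and
# values are illustrative): record expert distribution and re-balance every
# 1000 iterations with 16 redundant experts:
#   python -m sglang.launch_server --model-path <moe-model> --enable-ep-moe \
#       --enable-eplb --ep-num-redundant-experts 16 \
#       --eplb-rebalance-num-iterations 1000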
parser.add_argument(
"--enable-expert-distribution-metrics",
action="store_true",
help="Enable logging metrics for expert balancedness",
)
parser.add_argument(
"--deepep-config",
type=str,
default=ServerArgs.deepep_config,
help="Tuned DeepEP config suitable for your own cluster. It can be either a string with JSON content or a file path.",
)
parser.add_argument(
"--moe-dense-tp-size",
type=int,
default=ServerArgs.moe_dense_tp_size,
help="TP size for MoE dense MLP layers. This flag is useful when, with large TP size, there are errors caused by weights in MLP layers having dimension smaller than the min dimension GEMM supports.",
)
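# Example motivating --moe-dense-tp-size (values are illustrative): with a large
# attention TP size, keep the dense MLP layers at a smaller TP so their weight
# shards stay above the minimum GEMM dimension:
#   --tp-size 16 --moe-dense-tp-size 2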
# Double Sparsity
parser.add_argument(
...
...
@@ -1146,6 +1236,18 @@ class ServerArgs:
action="store_true",
help="Disable RadixAttention for prefix caching.",
)
parser.add_argument(
"--cuda-graph-max-bs",
type=int,
default=ServerArgs.cuda_graph_max_bs,
help="Set the maximum batch size for cuda graph. It will extend the cuda graph capture batch size to this value.",
)
parser.add_argument(
"--cuda-graph-bs",
type=int,
nargs="+",
help="Set the list of batch sizes for cuda graph.",
)
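# Example (values are illustrative): either raise the capture cap uniformly or
# pin the exact batch sizes to capture:
#   --cuda-graph-max-bs 256
#   --cuda-graph-bs 1 2 4 8 16 32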
parser.add_argument(
"--disable-cuda-graph",
action="store_true",
...
...
@@ -1156,6 +1258,11 @@ class ServerArgs:
action="store_true",
help="Disable cuda graph when padding is needed. Still uses cuda graph when padding is not needed.",
)
parser.add_argument(
"--enable-profile-cuda-graph",
action="store_true",
help="Enable profiling of cuda graph capture.",
)
parser.add_argument(
"--enable-nccl-nvls",
action="store_true",
...
...
@@ -1186,6 +1293,11 @@ class ServerArgs:
action="store_true",
help="Disable the overlap scheduler, which overlaps the CPU scheduler with GPU model worker.",
)
parser.add_argument(
"--disable-overlap-cg-plan",
action="store_true",
help="Disable the overlap optimization for cudagraph preparation in eagle verify.",
)
parser.add_argument(
"--enable-mixed-chunk",
action="store_true",
...
...
@@ -1201,11 +1313,6 @@ class ServerArgs:
action="store_true",
help="Enable vocabulary parallel across the attention TP group to avoid all-gather across DP groups, optimizing performance under DP attention.",
)
parser.add_argument(
"--enable-ep-moe",
action="store_true",
help="Enabling expert parallelism for moe. The ep size is equal to the tp size.",
)
parser.add_argument(
"--enable-two-batch-overlap",
action="store_true",
...
...
@@ -1222,18 +1329,6 @@ class ServerArgs:
default=ServerArgs.torch_compile_max_bs,
help="Set the maximum batch size when using torch compile.",
)
parser.add_argument(
"--cuda-graph-max-bs",
type=int,
default=ServerArgs.cuda_graph_max_bs,
help="Set the maximum batch size for cuda graph. It will extend the cuda graph capture batch size to this value.",
)
parser.add_argument(
"--cuda-graph-bs",
type=int,
nargs="+",
help="Set the list of batch sizes for cuda graph.",
)
parser.add_argument(
"--torchao-config",
type=str,
...
...
@@ -1290,13 +1385,6 @@ class ServerArgs:
action="store_true",
help="Enable users to pass custom logit processors to the server (disabled by default for security)",
help="Specify the parser for handling tool-call interactions. Options include: 'qwen25', 'mistral', 'llama3', 'deepseekv3', and 'pythonic'.",
)
parser.add_argument(
"--enable-hierarchical-cache",
action="store_true",
...
...
@@ -1322,86 +1410,9 @@ class ServerArgs:
help="The write policy of hierarchical cache.",
)
parser.add_argument(
"--enable-deepep-moe",
action="store_true",
help="Enabling DeepEP MoE implementation for EP MoE.",
)
parser.add_argument(
"--moe-dense-tp-size",
type=int,
default=ServerArgs.moe_dense_tp_size,
help="TP size for MoE dense MLP layers. This flag is useful when, with large TP size, there are errors caused by weights in MLP layers having dimension smaller than the min dimension GEMM supports.",
)
parser.add_argument(
"--deepep-mode",
type=str,
choices=["normal","low_latency","auto"],
default="auto",
help="Select the mode when enable DeepEP MoE, could be `normal`, `low_latency` or `auto`. Default is `auto`, which means `low_latency` for decode batch and `normal` for prefill batch.",
)
parser.add_argument(
"--ep-num-redundant-experts",
type=int,
default=ServerArgs.ep_num_redundant_experts,
help="Allocate this number of redundant experts in expert parallel.",
)
parser.add_argument(
"--ep-dispatch-algorithm",
type=str,
default=ServerArgs.ep_dispatch_algorithm,
help="The algorithm to choose ranks for redundant experts in expert parallel.",
)
parser.add_argument(
"--init-expert-location",
type=str,
default=ServerArgs.init_expert_location,
help="Initial location of EP experts.",
)
parser.add_argument(
"--enable-eplb",
action="store_true",
help="Enable EPLB algorithm",
)
parser.add_argument(
"--eplb-algorithm",
type=str,
default=ServerArgs.eplb_algorithm,
help="Chosen EPLB algorithm",
)
parser.add_argument(
"--eplb-rebalance-num-iterations",
type=int,
default=ServerArgs.eplb_rebalance_num_iterations,
help="Number of iterations to automatically trigger a EPLB re-balance.",
help="Circular buffer size of expert distribution recorder. Set to -1 to denote infinite buffer.",
)
parser.add_argument(
"--enable-expert-distribution-metrics",
action="store_true",
help="Enable logging metrics for expert balancedness.",
)
parser.add_argument(
"--deepep-config",
type=str,
default=ServerArgs.deepep_config,
help="Tuned DeepEP config suitable for your own cluster. It can be either a string with JSON content or a file path.",
)
parser.add_argument(
"--flashinfer-mla-disable-ragged",
action="store_true",
help="Do not use the ragged prefill wrapper when running FlashInfer MLA.",
)
parser.add_argument(
"--disable-shared-experts-fusion",
...
...
@@ -1418,8 +1429,6 @@ class ServerArgs:
action="store_true",
help="Adopt base image processor instead of fast image processor.",
)
# Server warmups
parser.add_argument(
"--warmups",
type=str,
...
...
@@ -1447,6 +1456,11 @@ class ServerArgs:
default=ServerArgs.debug_tensor_dump_inject,
help="Inject the outputs from jax as the input of every layer.",
)
parser.add_argument(
"--debug-tensor-dump-prefill-only",
action="store_true",
help="Only dump the tensors for prefill requests (i.e. batch size > 1).",
)
# Disaggregation
parser.add_argument(
...
...
@@ -1456,12 +1470,6 @@ class ServerArgs:
choices=["null","prefill","decode"],
help='Only used for PD disaggregation. "prefill" for a prefill-only server, "decode" for a decode-only server. If not specified, the server is not PD-disaggregated.',
)
parser.add_argument(
"--disaggregation-bootstrap-port",
type=int,
default=ServerArgs.disaggregation_bootstrap_port,
help="Bootstrap server port on the prefill server. Default is 8998.",
)
parser.add_argument(
"--disaggregation-transfer-backend",
type=str,
...
...
@@ -1469,6 +1477,12 @@ class ServerArgs:
choices=["mooncake","nixl"],
help="The backend for disaggregation transfer. Default is mooncake.",
)
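# Sketch of a PD-disaggregated pair (model path and port are placeholders; the
# --disaggregation-mode flag is taken from the choices above):
#   prefill node: python -m sglang.launch_server --model-path <model> \
#       --disaggregation-mode prefill --disaggregation-bootstrap-port 8998
#   decode node:  python -m sglang.launch_server --model-path <model> \
#       --disaggregation-mode decode --disaggregation-transfer-backend mooncake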
parser.add_argument(
"--disaggregation-bootstrap-port",
type=int,
default=ServerArgs.disaggregation_bootstrap_port,
help="Bootstrap server port on the prefill server. Default is 8998.",