help="Specify the parser for handling tool-call interactions. Options include: 'qwen25', 'mistral', 'llama3', 'deepseekv3', and 'pythonic'.",
)
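# Example (hypothetical model path): selecting a tool-call parser at launch:
#   python -m sglang.launch_server --model-path my/model --tool-call-parser llama3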
# Data parallelism
parser.add_argument(
...
@@ -949,15 +960,6 @@ class ServerArgs:
],
)
# Expert parallelism
parser.add_argument(
"--expert-parallel-size",
"--ep-size",
type=int,
default=ServerArgs.ep_size,
help="The expert parallelism size.",
)
# Multi-node distributed serving
parser.add_argument(
"--dist-init-addr",
...
@@ -1038,21 +1040,6 @@ class ServerArgs:
default=ServerArgs.grammar_backend,
help="Choose the backend for grammar-guided decoding.",
)
parser.add_argument(
"--enable-flashinfer-mla",
action=DeprecatedAction,
help="--enable-flashinfer-mla is deprecated. Please use '--attention-backend flashinfer' instead.",
)
parser.add_argument(
"--enable-flashmla",
action=DeprecatedAction,
help="--enable-flashmla is deprecated. Please use '--attention-backend flashmla' instead.",
)
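# A minimal sketch (an assumption, not necessarily the actual implementation)
# of an argparse action like DeprecatedAction that rejects a removed flag with
# its help text:
#
#   import argparse
#
#   class DeprecatedAction(argparse.Action):
#       def __init__(self, option_strings, dest, nargs=0, **kwargs):
#           super().__init__(option_strings, dest, nargs=nargs, **kwargs)
#
#       def __call__(self, parser, namespace, values, option_string=None):
#           # Fail loudly and point the user at the replacement flag.
#           raise ValueError(self.help)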
parser.add_argument(
"--flashinfer-mla-disable-ragged",
action="store_true",
help="Not using ragged prefill wrapper when running flashinfer mla",
)
# Speculative decoding
parser.add_argument(
...
@@ -1102,6 +1089,109 @@ class ServerArgs:
help="The path of the draft model's small vocab table.",
help="The path of the draft model's small vocab table.",
default=ServerArgs.speculative_token_map,
default=ServerArgs.speculative_token_map,
)
)
parser.add_argument(
"--mm-attention-backend",
type=str,
choices=["sdpa","fa3","triton_attn"],
default=ServerArgs.mm_attention_backend,
help="Set multimodal attention backend.",
)
# Expert parallelism
parser.add_argument(
"--expert-parallel-size",
"--ep-size",
type=int,
default=ServerArgs.ep_size,
help="The expert parallelism size.",
)
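# Note on the aliased flags above: without an explicit dest=, argparse derives
# the attribute name from the first long option. Standalone sketch:
#
#   import argparse
#   p = argparse.ArgumentParser()
#   p.add_argument("--expert-parallel-size", "--ep-size", type=int, default=1)
#   p.parse_args(["--ep-size", "4"]).expert_parallel_size  # -> 4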
parser.add_argument(
"--enable-ep-moe",
action="store_true",
help="Enabling expert parallelism for moe. The ep size is equal to the tp size.",
)
parser.add_argument(
"--enable-deepep-moe",
action="store_true",
help="Enabling DeepEP MoE implementation for EP MoE.",
)
parser.add_argument(
"--deepep-mode",
type=str,
choices=["normal","low_latency","auto"],
default="auto",
help="Select the mode when enable DeepEP MoE, could be `normal`, `low_latency` or `auto`. Default is `auto`, which means `low_latency` for decode batch and `normal` for prefill batch.",
)
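# A minimal sketch (hypothetical helper, not from this diff) of the `auto`
# policy described in the help text above:
#
#   def resolve_deepep_mode(deepep_mode: str, is_decode_batch: bool) -> str:
#       if deepep_mode != "auto":
#           return deepep_mode
#       return "low_latency" if is_decode_batch else "normal"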
parser.add_argument(
"--ep-num-redundant-experts",
type=int,
default=ServerArgs.ep_num_redundant_experts,
help="Allocate this number of redundant experts in expert parallel.",
)
parser.add_argument(
"--ep-dispatch-algorithm",
type=str,
default=ServerArgs.ep_dispatch_algorithm,
help="The algorithm to choose ranks for redundant experts in expert parallel.",
)
parser.add_argument(
"--init-expert-location",
type=str,
default=ServerArgs.init_expert_location,
help="Initial location of EP experts.",
)
parser.add_argument(
"--enable-eplb",
action="store_true",
help="Enable EPLB algorithm",
)
parser.add_argument(
"--eplb-algorithm",
type=str,
default=ServerArgs.eplb_algorithm,
help="Chosen EPLB algorithm",
)
parser.add_argument(
"--eplb-rebalance-num-iterations",
type=int,
default=ServerArgs.eplb_rebalance_num_iterations,
help="Number of iterations to automatically trigger a EPLB re-balance.",
help="Circular buffer size of expert distribution recorder. Set to -1 to denote infinite buffer.",
)
parser.add_argument(
"--enable-expert-distribution-metrics",
action="store_true",
help="Enable logging metrics for expert balancedness",
)
parser.add_argument(
"--deepep-config",
type=str,
default=ServerArgs.deepep_config,
help="Tuned DeepEP config suitable for your own cluster. It can be either a string with JSON content or a file path.",
)
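# A minimal sketch (an assumption, not the actual loader) of accepting either
# inline JSON or a file path for --deepep-config:
#
#   import json
#   import os
#
#   def load_deepep_config(value: str) -> dict:
#       if os.path.isfile(value):
#           with open(value) as f:
#               return json.load(f)
#       return json.loads(value)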
parser.add_argument(
"--moe-dense-tp-size",
type=int,
default=ServerArgs.moe_dense_tp_size,
help="TP size for MoE dense MLP layers. This flag is useful when, with large TP size, there are errors caused by weights in MLP layers having dimension smaller than the min dimension GEMM supports.",
)
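# Example invocation (hypothetical model path and values) combining the
# expert-parallel flags above:
#
#   python -m sglang.launch_server --model-path my/moe-model \
#       --enable-deepep-moe --deepep-mode auto \
#       --ep-num-redundant-experts 16 --enable-eplb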
# Double Sparsity
parser.add_argument(
...
@@ -1146,6 +1236,18 @@ class ServerArgs:
action="store_true",
action="store_true",
help="Disable RadixAttention for prefix caching.",
help="Disable RadixAttention for prefix caching.",
)
)
parser.add_argument(
"--cuda-graph-max-bs",
type=int,
default=ServerArgs.cuda_graph_max_bs,
help="Set the maximum batch size for cuda graph. It will extend the cuda graph capture batch size to this value.",
)
parser.add_argument(
"--cuda-graph-bs",
type=int,
nargs="+",
help="Set the list of batch sizes for cuda graph.",
)
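# nargs="+" collects one or more values into a list, e.g. (illustrative values):
#   --cuda-graph-bs 1 2 4 8  ->  args.cuda_graph_bs == [1, 2, 4, 8]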
parser.add_argument(
"--disable-cuda-graph",
action="store_true",
...
@@ -1156,6 +1258,11 @@ class ServerArgs:
action="store_true",
action="store_true",
help="Disable cuda graph when padding is needed. Still uses cuda graph when padding is not needed.",
help="Disable cuda graph when padding is needed. Still uses cuda graph when padding is not needed.",
)
)
parser.add_argument(
"--enable-profile-cuda-graph",
action="store_true",
help="Enable profiling of cuda graph capture.",
)
parser.add_argument(
"--enable-nccl-nvls",
action="store_true",
...
@@ -1186,6 +1293,11 @@ class ServerArgs:
action="store_true",
action="store_true",
help="Disable the overlap scheduler, which overlaps the CPU scheduler with GPU model worker.",
help="Disable the overlap scheduler, which overlaps the CPU scheduler with GPU model worker.",
)
)
parser.add_argument(
"--disable-overlap-cg-plan",
action="store_true",
help="Disable the overlap optimization for cudagraph preparation in eagle verify.",
)
parser.add_argument(
"--enable-mixed-chunk",
action="store_true",
...
@@ -1201,11 +1313,6 @@ class ServerArgs:
action="store_true",
action="store_true",
help="Enable vocabulary parallel across the attention TP group to avoid all-gather across DP groups, optimizing performance under DP attention.",
help="Enable vocabulary parallel across the attention TP group to avoid all-gather across DP groups, optimizing performance under DP attention.",
)
)
parser.add_argument(
"--enable-ep-moe",
action="store_true",
help="Enabling expert parallelism for moe. The ep size is equal to the tp size.",
)
parser.add_argument(
"--enable-two-batch-overlap",
action="store_true",
...
@@ -1222,18 +1329,6 @@ class ServerArgs:
default=ServerArgs.torch_compile_max_bs,
help="Set the maximum batch size when using torch compile.",
)
parser.add_argument(
"--cuda-graph-max-bs",
type=int,
default=ServerArgs.cuda_graph_max_bs,
help="Set the maximum batch size for cuda graph. It will extend the cuda graph capture batch size to this value.",
)
parser.add_argument(
"--cuda-graph-bs",
type=int,
nargs="+",
help="Set the list of batch sizes for cuda graph.",
)
parser.add_argument(
"--torchao-config",
type=str,
...
@@ -1290,13 +1385,6 @@ class ServerArgs:
action="store_true",
action="store_true",
help="Enable users to pass custom logit processors to the server (disabled by default for security)",
help="Enable users to pass custom logit processors to the server (disabled by default for security)",
help="Specify the parser for handling tool-call interactions. Options include: 'qwen25', 'mistral', 'llama3', 'deepseekv3', and 'pythonic'.",
)
parser.add_argument(
"--enable-hierarchical-cache",
action="store_true",
...
@@ -1322,86 +1410,9 @@ class ServerArgs:
help="The write policy of hierarchical cache.",
help="The write policy of hierarchical cache.",
)
)
parser.add_argument(
"--enable-deepep-moe",
action="store_true",
help="Enabling DeepEP MoE implementation for EP MoE.",
)
parser.add_argument(
"--moe-dense-tp-size",
type=int,
default=ServerArgs.moe_dense_tp_size,
help="TP size for MoE dense MLP layers. This flag is useful when, with large TP size, there are errors caused by weights in MLP layers having dimension smaller than the min dimension GEMM supports.",
)
parser.add_argument(
"--deepep-mode",
type=str,
choices=["normal","low_latency","auto"],
default="auto",
help="Select the mode when enable DeepEP MoE, could be `normal`, `low_latency` or `auto`. Default is `auto`, which means `low_latency` for decode batch and `normal` for prefill batch.",
)
parser.add_argument(
"--ep-num-redundant-experts",
type=int,
default=ServerArgs.ep_num_redundant_experts,
help="Allocate this number of redundant experts in expert parallel.",
)
parser.add_argument(
"--ep-dispatch-algorithm",
type=str,
default=ServerArgs.ep_dispatch_algorithm,
help="The algorithm to choose ranks for redundant experts in expert parallel.",
)
parser.add_argument(
"--init-expert-location",
type=str,
default=ServerArgs.init_expert_location,
help="Initial location of EP experts.",
)
parser.add_argument(
"--enable-eplb",
action="store_true",
help="Enable EPLB algorithm",
)
parser.add_argument(
"--eplb-algorithm",
type=str,
default=ServerArgs.eplb_algorithm,
help="Chosen EPLB algorithm",
)
parser.add_argument(
"--eplb-rebalance-num-iterations",
type=int,
default=ServerArgs.eplb_rebalance_num_iterations,
help="Number of iterations to automatically trigger a EPLB re-balance.",
help="Circular buffer size of expert distribution recorder. Set to -1 to denote infinite buffer.",
)
parser.add_argument(
"--enable-expert-distribution-metrics",
action="store_true",
help="Enable logging metrics for expert balancedness",
)
parser.add_argument(
"--deepep-config",
type=str,
default=ServerArgs.deepep_config,
help="Tuned DeepEP config suitable for your own cluster. It can be either a string with JSON content or a file path.",
)
parser.add_argument(
"--flashinfer-mla-disable-ragged",
action="store_true",
help="Not using ragged prefill wrapper when running flashinfer mla",
)
parser.add_argument(
"--disable-shared-experts-fusion",
...
@@ -1418,8 +1429,6 @@ class ServerArgs:
action="store_true",
action="store_true",
help="Adopt base image processor instead of fast image processor.",
help="Adopt base image processor instead of fast image processor.",
)
)
# Server warmups
parser.add_argument(
"--warmups",
type=str,
...
@@ -1447,6 +1456,11 @@ class ServerArgs:
default=ServerArgs.debug_tensor_dump_inject,
help="Inject the outputs from jax as the input of every layer.",
)
parser.add_argument(
"--debug-tensor-dump-prefill-only",
action="store_true",
help="Only dump the tensors for prefill requests (i.e. batch size > 1).",
)
# Disaggregation
parser.add_argument(
...
@@ -1456,12 +1470,6 @@ class ServerArgs:
choices=["null","prefill","decode"],
choices=["null","prefill","decode"],
help='Only used for PD disaggregation. "prefill" for prefill-only server, and "decode" for decode-only server. If not specified, it is not PD disaggregated',
help='Only used for PD disaggregation. "prefill" for prefill-only server, and "decode" for decode-only server. If not specified, it is not PD disaggregated',
)
)
parser.add_argument(
"--disaggregation-bootstrap-port",
type=int,
default=ServerArgs.disaggregation_bootstrap_port,
help="Bootstrap server port on the prefill server. Default is 8998.",
)
parser.add_argument(
"--disaggregation-transfer-backend",
type=str,
...
@@ -1469,6 +1477,12 @@ class ServerArgs:
choices=["mooncake","nixl"],
choices=["mooncake","nixl"],
help="The backend for disaggregation transfer. Default is mooncake.",
help="The backend for disaggregation transfer. Default is mooncake.",
)
)
parser.add_argument(
"--disaggregation-bootstrap-port",
type=int,
default=ServerArgs.disaggregation_bootstrap_port,
help="Bootstrap server port on the prefill server. Default is 8998.",