Unverified Commit 4caca1ba authored by Lianmin Zheng, committed by GitHub

Clean up server args & Add CI scripts (#12124)

parent ceb105a7
......@@ -3,7 +3,7 @@
You can install SGLang using one of the methods below.
This page primarily applies to common NVIDIA GPU platforms.
For other or newer platforms, please refer to the dedicated pages for [AMD GPUs](../platforms/amd_gpu.md), [Intel Xeon CPUs](../platforms/cpu_server.md), [NVIDIA Jetson](../platforms/nvidia_jetson.md), [Ascend NPUs](../platforms/ascend_npu.md).
For other or newer platforms, please refer to the dedicated pages for [AMD GPUs](../platforms/amd_gpu.md), [Intel Xeon CPUs](../platforms/cpu_server.md), [TPU](../platforms/tpu.md), [NVIDIA DGX Spark](https://lmsys.org/blog/2025-10-13-nvidia-dgx-spark/), [NVIDIA Jetson](../platforms/nvidia_jetson.md), [Ascend NPUs](../platforms/ascend_npu.md).
## Method 1: With pip or uv
......
......@@ -12,10 +12,12 @@ if __name__ == "__main__":
try:
if server_args.grpc_mode:
# Handle gRPC server
from sglang.srt.entrypoints.grpc_server import serve_grpc
asyncio.run(serve_grpc(server_args))
else:
# Handle HTTP server
from sglang.srt.entrypoints.http_server import launch_server
launch_server(server_args)
......
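For reference, below is a minimal, self-contained sketch of the dispatch shown in this hunk. It assumes a `prepare_server_args` helper in `sglang.srt.server_args` that turns CLI argv into a `ServerArgs` instance (that helper is not part of this diff), and it omits the surrounding try/finally cleanup.

```python
import asyncio
import sys

# Assumption: prepare_server_args(argv) parses CLI flags into a ServerArgs.
from sglang.srt.server_args import prepare_server_args

if __name__ == "__main__":
    server_args = prepare_server_args(sys.argv[1:])

    if server_args.grpc_mode:
        # gRPC path: import lazily so HTTP-only deployments do not pull in grpc deps.
        from sglang.srt.entrypoints.grpc_server import serve_grpc

        asyncio.run(serve_grpc(server_args))
    else:
        # Default path: the OpenAI-compatible HTTP server.
        from sglang.srt.entrypoints.http_server import launch_server

        launch_server(server_args)
```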
......@@ -30,7 +30,7 @@ from sglang.srt.connector import ConnectorType
from sglang.srt.function_call.function_call_parser import FunctionCallParser
from sglang.srt.lora.lora_registry import LoRARef
from sglang.srt.parser.reasoning_parser import ReasoningParser
from sglang.srt.utils import (
from sglang.srt.utils.common import (
LORA_TARGET_ALL_MODULES,
SUPPORTED_LORA_TARGET_MODULES,
configure_ipv6,
......@@ -127,8 +127,6 @@ GRAMMAR_BACKEND_CHOICES = ["xgrammar", "outlines", "llguidance", "none"]
DETERMINISTIC_ATTENTION_BACKEND_CHOICES = ["flashinfer", "fa3", "triton"]
DEFAULT_LORA_EVICTION_POLICY = "lru"
NSA_CHOICES = ["flashmla_sparse", "flashmla_kv", "fa3", "tilelang", "aiter"]
RADIX_EVICTION_POLICY_CHOICES = ["lru", "lfu"]
......@@ -181,6 +179,15 @@ def add_radix_eviction_policy_choices(choices):
@dataclasses.dataclass
class ServerArgs:
"""
The arguments of the server.
NOTE: When you add new arguments, please make sure the order
in this class definition is the same as the order in the function
`ServerArgs.add_cli_args`.
Please follow the existing style and group new arguments into related groups, or create new groups when needed.
"""
# Model and tokenizer
model_path: str
tokenizer_path: Optional[str] = None
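To make the ordering note in the docstring above concrete: a new field should sit in its group in the dataclass and appear at the same relative position in `add_cli_args`. The trimmed-down class and the `--enable-fake-flag` argument below are hypothetical, purely to illustrate the convention.

```python
import argparse
import dataclasses
from typing import Optional


@dataclasses.dataclass
class ServerArgsExample:
    # Model and tokenizer
    model_path: str
    tokenizer_path: Optional[str] = None
    enable_fake_flag: bool = False  # hypothetical new argument, appended to its group

    @staticmethod
    def add_cli_args(parser: argparse.ArgumentParser):
        # Model and tokenizer -- same group, same order as the fields above.
        parser.add_argument("--model-path", type=str, required=True)
        parser.add_argument("--tokenizer-path", type=str, default=None)
        parser.add_argument("--enable-fake-flag", action="store_true")
```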
......@@ -190,11 +197,6 @@ class ServerArgs:
load_format: str = "auto"
model_loader_extra_config: str = "{}"
trust_remote_code: bool = False
modelopt_quant: Optional[Union[str, Dict]] = None
modelopt_checkpoint_restore_path: Optional[str] = None
modelopt_checkpoint_save_path: Optional[str] = None
modelopt_export_path: Optional[str] = None
quantize_and_serve: bool = False
context_length: Optional[int] = None
is_embedding: bool = False
enable_multimodal: Optional[bool] = None
......@@ -216,6 +218,11 @@ class ServerArgs:
quantization_param_path: Optional[str] = None
kv_cache_dtype: str = "auto"
enable_fp32_lm_head: bool = False
modelopt_quant: Optional[Union[str, Dict]] = None
modelopt_checkpoint_restore_path: Optional[str] = None
modelopt_checkpoint_save_path: Optional[str] = None
modelopt_export_path: Optional[str] = None
quantize_and_serve: bool = False
# Memory and scheduling
mem_fraction_static: Optional[float] = None
......@@ -238,8 +245,6 @@ class ServerArgs:
# Runtime options
device: Optional[str] = None
elastic_ep_backend: Literal[None, "mooncake"] = None
mooncake_ib_device: Optional[str] = None
tp_size: int = 1
pp_size: int = 1
pp_max_micro_batch_size: Optional[int] = None
......@@ -272,10 +277,10 @@ class ServerArgs:
collect_tokens_histogram: bool = False
prompt_tokens_buckets: Optional[List[str]] = None
generation_tokens_buckets: Optional[List[str]] = None
gc_warning_threshold_secs: float = 0.0
decode_log_interval: int = 40
enable_request_time_stats_logging: bool = False
kv_events_config: Optional[str] = None
gc_warning_threshold_secs: float = 0.0
enable_trace: bool = False
oltp_traces_endpoint: str = "localhost:4317"
......@@ -317,7 +322,7 @@ class ServerArgs:
] = None
max_loaded_loras: Optional[int] = None
max_loras_per_batch: int = 8
lora_eviction_policy: str = DEFAULT_LORA_EVICTION_POLICY
lora_eviction_policy: str = "lru"
lora_backend: str = "triton"
max_lora_chunk_size: Optional[int] = 16
......@@ -375,6 +380,8 @@ class ServerArgs:
enable_expert_distribution_metrics: bool = False
deepep_config: Optional[str] = None
moe_dense_tp_size: Optional[int] = None
elastic_ep_backend: Literal[None, "mooncake"] = None
mooncake_ib_device: Optional[str] = None
# Mamba cache
max_mamba_cache_size: Optional[int] = None
......@@ -472,8 +479,8 @@ class ServerArgs:
enable_return_hidden_states: bool = False
scheduler_recv_interval: int = 1
numa_node: Optional[List[int]] = None
rl_on_policy_target: Optional[str] = None
enable_deterministic_inference: bool = False
rl_on_policy_target: Optional[str] = None
# Dynamic batch tokenizer
enable_dynamic_batch_tokenizer: bool = False
......@@ -510,19 +517,6 @@ class ServerArgs:
pdmux_config_path: Optional[str] = None
sm_group_num: int = 8
def get_attention_backends(server_args):
prefill_attention_backend_str = (
server_args.prefill_attention_backend
if server_args.prefill_attention_backend
else server_args.attention_backend
)
decode_attention_backend_str = (
server_args.decode_attention_backend
if server_args.decode_attention_backend
else server_args.attention_backend
)
return prefill_attention_backend_str, decode_attention_backend_str
def __post_init__(self):
"""
Orchestrates the handling of various server arguments, ensuring proper configuration and validation.
......@@ -615,22 +609,6 @@ class ServerArgs:
)
self.tool_call_parser = deprecated_tool_call_parsers[self.tool_call_parser]
def _handle_ktransformers_configs(self):
from sglang.srt.layers.quantization.compressed_tensors.compressed_tensors_moe import (
CompressedTensorsWNA16AMXEPMoEMethod,
override_config,
)
override_config(
CompressedTensorsWNA16AMXEPMoEMethod,
self.kt_num_gpu_experts,
self.kt_cpuinfer,
self.kt_threadpool_count,
self.kt_amx_weight_path,
self.kt_amx_method,
self.chunked_prefill_size,
)
def _handle_missing_default_values(self):
if self.tokenizer_path is None:
self.tokenizer_path = self.model_path
......@@ -685,7 +663,7 @@ class ServerArgs:
self.cuda_graph_max_bs = 64
elif gpu_mem < 35 * 1024:
# A10, 4090, 5090
# (chunked_prefill_size 2k, cuda_graph_max_bs 16 if tp < 4 else 80)
# (chunked_prefill_size 2k, cuda_graph_max_bs 24 if tp < 4 else 80)
if self.chunked_prefill_size is None:
self.chunked_prefill_size = 2048
if self.cuda_graph_max_bs is None:
......@@ -693,7 +671,7 @@ class ServerArgs:
# However, when serving models with TP4 or TP8, we need to enable cuda graphs to maintain high performance. In this case, we can set `cuda_graph_max_bs` to 80 (half of the default value 160) to reduce the memory overhead of creating cuda graphs.
# Logs from TP4 serving of qwen2-72b show that a value of 80 is sufficient, and it avoids the OOM issues that the original 160 can cause on lower-end GPUs.
if self.tp_size < 4:
self.cuda_graph_max_bs = 16
self.cuda_graph_max_bs = 24
else:
self.cuda_graph_max_bs = 80
elif gpu_mem < 60 * 1024:
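To summarize the branch being changed here, a standalone sketch of the "under ~35 GB" tier (A10/4090/5090) follows, assuming `gpu_mem` is reported in MiB; the other memory tiers are elided.

```python
from typing import Optional, Tuple


def small_gpu_defaults(
    gpu_mem_mib: float,
    tp_size: int,
    chunked_prefill_size: Optional[int] = None,
    cuda_graph_max_bs: Optional[int] = None,
) -> Tuple[Optional[int], Optional[int]]:
    """Sketch of the '< 35 GiB' tier only; other tiers are handled elsewhere."""
    if gpu_mem_mib < 35 * 1024:  # A10, 4090, 5090
        if chunked_prefill_size is None:
            chunked_prefill_size = 2048
        if cuda_graph_max_bs is None:
            # Small TP: 24 keeps CUDA graph memory low (raised from 16 by this commit).
            # TP >= 4: 80 (half of the default 160) preserves throughput while
            # avoiding OOM during graph capture on lower-end GPUs.
            cuda_graph_max_bs = 24 if tp_size < 4 else 80
    return chunked_prefill_size, cuda_graph_max_bs
```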
......@@ -1174,6 +1152,22 @@ class ServerArgs:
if self.grammar_backend is None:
self.grammar_backend = "xgrammar"
def _handle_ktransformers_configs(self):
from sglang.srt.layers.quantization.compressed_tensors.compressed_tensors_moe import (
CompressedTensorsWNA16AMXEPMoEMethod,
override_config,
)
override_config(
CompressedTensorsWNA16AMXEPMoEMethod,
self.kt_num_gpu_experts,
self.kt_cpuinfer,
self.kt_threadpool_count,
self.kt_amx_weight_path,
self.kt_amx_method,
self.chunked_prefill_size,
)
def _handle_data_parallelism(self):
if self.dp_size == 1:
self.enable_dp_attention = False
......@@ -1299,10 +1293,11 @@ class ServerArgs:
raise ValueError(
"Currently standalone speculative decoding does not support dp attention."
)
if self.max_running_requests is None:
self.max_running_requests = 48
logger.warning(
"Max running requests is reset to 48 for speculative decoding."
"Max running requests is reset to 48 for speculative decoding. You can override this by explicitly setting --max-running-requests."
)
if self.speculative_algorithm == "EAGLE" and self.enable_beta_spec:
......@@ -1314,7 +1309,7 @@ class ServerArgs:
if not self.enable_beta_spec:
self.disable_overlap_schedule = True
logger.warning(
"Overlap scheduler is disabled because of using eagle3 and standalone speculative decoding."
"Overlap scheduler is disabled because of using eagle3 or standalone speculative decoding."
)
if self.enable_mixed_chunk:
......@@ -1383,8 +1378,13 @@ class ServerArgs:
raise ValueError(
"Ngram speculative decoding only supports CUDA device."
)
if self.max_running_requests is None:
self.max_running_requests = 48
logger.warning(
"Max running requests is reset to 48 for speculative decoding. You can override this by explicitly setting --max-running-requests."
)
self.disable_overlap_schedule = True
self.enable_mixed_chunk = False
self.speculative_eagle_topk = self.speculative_ngram_max_bfs_breadth
......@@ -1785,6 +1785,18 @@ class ServerArgs:
"KV cache dtype is FP8. Otherwise, KV cache scaling factors "
"default to 1.0, which may cause accuracy issues. ",
)
parser.add_argument(
"--kv-cache-dtype",
type=str,
default=ServerArgs.kv_cache_dtype,
choices=["auto", "fp8_e5m2", "fp8_e4m3", "bf16", "bfloat16"],
help='Data type for kv cache storage. "auto" will use model data type. "bf16" or "bfloat16" for BF16 KV cache. "fp8_e5m2" and "fp8_e4m3" are supported for CUDA 11.8+.',
)
parser.add_argument(
"--enable-fp32-lm-head",
action="store_true",
help="If set, the LM head outputs (logits) are in FP32.",
)
parser.add_argument(
"--modelopt-quant",
type=str,
......@@ -1824,18 +1836,6 @@ class ServerArgs:
"This is useful for development and prototyping. For production, it's recommended "
"to use separate quantization and deployment steps.",
)
parser.add_argument(
"--kv-cache-dtype",
type=str,
default=ServerArgs.kv_cache_dtype,
choices=["auto", "fp8_e5m2", "fp8_e4m3", "bf16", "bfloat16"],
help='Data type for kv cache storage. "auto" will use model data type. "bf16" or "bfloat16" for BF16 KV cache. "fp8_e5m2" and "fp8_e4m3" are supported for CUDA 11.8+.',
)
parser.add_argument(
"--enable-fp32-lm-head",
action="store_true",
help="If set, the LM head outputs (logits) are in FP32.",
)
# Memory and scheduling
parser.add_argument(
......@@ -1940,7 +1940,14 @@ class ServerArgs:
parser.add_argument(
"--disable-hybrid-swa-memory",
action="store_true",
help="Disable the hybrid SWA memory.",
help="Disable the hybrid SWA memory pool.",
)
parser.add_argument(
"--radix-eviction-policy",
type=str,
choices=RADIX_EVICTION_POLICY_CHOICES,
default=ServerArgs.radix_eviction_policy,
help="The eviction policy of radix trees. 'lru' stands for Least Recently Used, 'lfu' stands for Least Frequently Used.",
)
# Runtime options
......@@ -1950,21 +1957,6 @@ class ServerArgs:
default=ServerArgs.device,
help="The device to use ('cuda', 'xpu', 'hpu', 'npu', 'cpu'). Defaults to auto-detection if not specified.",
)
parser.add_argument(
"--elastic-ep-backend",
type=str,
default=ServerArgs.elastic_ep_backend,
choices=["none", "mooncake"],
help="Specify the collective communication backend for elastic EP. Currently supports 'mooncake'.",
)
parser.add_argument(
"--mooncake-ib-device",
type=str,
default=ServerArgs.mooncake_ib_device,
help="The InfiniBand devices for Mooncake Backend transfer, accepts multiple comma-separated devices "
"(e.g., --mooncake-ib-device mlx5_0,mlx5_1). "
"Default is None, which triggers automatic device detection when Mooncake Backend is enabled.",
)
parser.add_argument(
"--tensor-parallel-size",
"--tp-size",
......@@ -2252,6 +2244,12 @@ class ServerArgs:
default=ServerArgs.tool_call_parser,
help=f"Specify the parser for handling tool-call interactions. Options include: {tool_call_parser_choices}.",
)
parser.add_argument(
"--tool-server",
type=str,
default=None,
help="Either 'demo' or a comma-separated list of tool server urls to use for the model. If not specified, no tool server will be used.",
)
parser.add_argument(
"--sampling-defaults",
type=str,
......@@ -2262,12 +2260,6 @@ class ServerArgs:
"'model' uses the model's generation_config.json to get the recommended "
"sampling parameters if available. Default is 'model'.",
)
parser.add_argument(
"--tool-server",
type=str,
default=None,
help="Either 'demo' or a comma-separated list of tool server urls to use for the model. If not specified, no tool server will be used.",
)
# Data parallelism
parser.add_argument(
......@@ -2374,7 +2366,7 @@ class ServerArgs:
parser.add_argument(
"--lora-eviction-policy",
type=str,
default=DEFAULT_LORA_EVICTION_POLICY,
default=ServerArgs.lora_eviction_policy,
choices=["lru", "fifo"],
help="LoRA adapter eviction policy when memory pool is full. 'lru': Least Recently Used (default, better cache efficiency). 'fifo': First-In-First-Out.",
)
......@@ -2686,6 +2678,21 @@ class ServerArgs:
default=ServerArgs.moe_dense_tp_size,
help="TP size for MoE dense MLP layers. This flag is useful when, with large TP size, there are errors caused by weights in MLP layers having dimension smaller than the min dimension GEMM supports.",
)
parser.add_argument(
"--elastic-ep-backend",
type=str,
default=ServerArgs.elastic_ep_backend,
choices=["none", "mooncake"],
help="Specify the collective communication backend for elastic EP. Currently supports 'mooncake'.",
)
parser.add_argument(
"--mooncake-ib-device",
type=str,
default=ServerArgs.mooncake_ib_device,
help="The InfiniBand devices for Mooncake Backend transfer, accepts multiple comma-separated devices "
"(e.g., --mooncake-ib-device mlx5_0,mlx5_1). "
"Default is None, which triggers automatic device detection when Mooncake Backend is enabled.",
)
# Mamba Cache
parser.add_argument(
......@@ -2733,13 +2740,6 @@ class ServerArgs:
default=ServerArgs.hicache_write_policy,
help="The write policy of hierarchical cache.",
)
parser.add_argument(
"--radix-eviction-policy",
type=str,
choices=RADIX_EVICTION_POLICY_CHOICES,
default=ServerArgs.radix_eviction_policy,
help="The eviction policy of radix trees. 'lru' stands for Least Recently Used, 'lfu' stands for Least Frequently Used.",
)
parser.add_argument(
"--hicache-io-backend",
type=str,
......@@ -3153,26 +3153,20 @@ class ServerArgs:
nargs="+",
help="Sets the numa node for the subprocesses. i-th element corresponds to i-th subprocess.",
)
# Debug tensor dumps
parser.add_argument(
"--debug-tensor-dump-output-folder",
type=str,
default=ServerArgs.debug_tensor_dump_output_folder,
help="The output folder for dumping tensors.",
)
parser.add_argument(
"--debug-tensor-dump-input-file",
type=str,
default=ServerArgs.debug_tensor_dump_input_file,
help="The input filename for dumping tensors",
"--enable-deterministic-inference",
action="store_true",
help="Enable deterministic inference mode with batch invariant ops.",
)
parser.add_argument(
"--debug-tensor-dump-inject",
"--rl-on-policy-target",
type=str,
default=ServerArgs.debug_tensor_dump_inject,
help="Inject the outputs from jax as the input of every layer.",
default=ServerArgs.rl_on_policy_target,
choices=["fsdp"],
help="The training system that SGLang needs to match for true on-policy.",
)
# Dynamic batch tokenizer
parser.add_argument(
"--enable-dynamic-batch-tokenizer",
action="store_true",
......@@ -3191,6 +3185,26 @@ class ServerArgs:
help="[Only used if --enable-dynamic-batch-tokenizer is set] Timeout in seconds for batching tokenization requests.",
)
# Debug tensor dumps
parser.add_argument(
"--debug-tensor-dump-output-folder",
type=str,
default=ServerArgs.debug_tensor_dump_output_folder,
help="The output folder for dumping tensors.",
)
parser.add_argument(
"--debug-tensor-dump-input-file",
type=str,
default=ServerArgs.debug_tensor_dump_input_file,
help="The input filename for dumping tensors",
)
parser.add_argument(
"--debug-tensor-dump-inject",
type=str,
default=ServerArgs.debug_tensor_dump_inject,
help="Inject the outputs from jax as the input of every layer.",
)
# PD disaggregation
parser.add_argument(
"--disaggregation-mode",
......@@ -3300,7 +3314,6 @@ class ServerArgs:
default=None,
help="The path of the PD-Multiplexing config file.",
)
parser.add_argument(
"--sm-group-num",
type=int,
......@@ -3308,57 +3321,6 @@ class ServerArgs:
help="Number of sm partition groups.",
)
# For deterministic inference
parser.add_argument(
"--rl-on-policy-target",
type=str,
default=ServerArgs.rl_on_policy_target,
choices=["fsdp"],
help="The training system that SGLang needs to match for true on-policy.",
)
parser.add_argument(
"--enable-deterministic-inference",
action="store_true",
help="Enable deterministic inference mode with batch invariant ops.",
)
# Deprecated arguments
parser.add_argument(
"--enable-ep-moe",
action=DeprecatedAction,
help="NOTE: --enable-ep-moe is deprecated. Please set `--ep-size` to the same value as `--tp-size` instead.",
)
parser.add_argument(
"--enable-deepep-moe",
action=DeprecatedAction,
help="NOTE: --enable-deepep-moe is deprecated. Please set `--moe-a2a-backend` to 'deepep' instead.",
)
parser.add_argument(
"--enable-flashinfer-cutlass-moe",
action=DeprecatedAction,
help="NOTE: --enable-flashinfer-cutlass-moe is deprecated. Please set `--moe-runner-backend` to 'flashinfer_cutlass' instead.",
)
parser.add_argument(
"--enable-flashinfer-cutedsl-moe",
action=DeprecatedAction,
help="NOTE: --enable-flashinfer-cutedsl-moe is deprecated. Please set `--moe-runner-backend` to 'flashinfer_cutedsl' instead.",
)
parser.add_argument(
"--enable-flashinfer-trtllm-moe",
action=DeprecatedAction,
help="NOTE: --enable-flashinfer-trtllm-moe is deprecated. Please set `--moe-runner-backend` to 'flashinfer_trtllm' instead.",
)
parser.add_argument(
"--enable-triton-kernel-moe",
action=DeprecatedAction,
help="NOTE: --enable-triton-kernel-moe is deprecated. Please set `--moe-runner-backend` to 'triton_kernel' instead.",
)
parser.add_argument(
"--enable-flashinfer-mxfp4-moe",
action=DeprecatedAction,
help="NOTE: --enable-flashinfer-mxfp4-moe is deprecated. Please set `--moe-runner-backend` to 'flashinfer_mxfp4' instead.",
)
# Configuration file support
parser.add_argument(
"--config",
......@@ -3393,6 +3355,19 @@ class ServerArgs:
)
return hf_config
def get_attention_backends(server_args):
prefill_attention_backend_str = (
server_args.prefill_attention_backend
if server_args.prefill_attention_backend
else server_args.attention_backend
)
decode_attention_backend_str = (
server_args.decode_attention_backend
if server_args.decode_attention_backend
else server_args.attention_backend
)
return prefill_attention_backend_str, decode_attention_backend_str
def check_server_args(self):
# Check parallel size constraints
assert (
......
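A quick usage sketch for the relocated `get_attention_backends` helper, which lets per-phase backends fall back to the shared `attention_backend`. The backend names below are placeholders, and a `SimpleNamespace` stands in for a full `ServerArgs` so that `__post_init__` does not need to run.

```python
from types import SimpleNamespace

from sglang.srt.server_args import ServerArgs

# Stand-in object carrying just the three fields the helper reads.
args = SimpleNamespace(
    attention_backend="fa3",
    prefill_attention_backend=None,        # unset -> falls back to attention_backend
    decode_attention_backend="flashinfer",
)

prefill, decode = ServerArgs.get_attention_backends(args)
print(prefill, decode)  # fa3 flashinfer
```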
......@@ -138,6 +138,7 @@ def is_xpu() -> bool:
return hasattr(torch, "xpu") and torch.xpu.is_available()
@lru_cache(maxsize=1)
def is_npu() -> bool:
return hasattr(torch, "npu") and torch.npu.is_available()
......
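The `@lru_cache(maxsize=1)` added above memoizes the hardware probe so it runs at most once per process. A tiny self-contained sketch of that caching behavior (the probe below is made up):

```python
from functools import lru_cache

calls = 0


@lru_cache(maxsize=1)
def expensive_probe() -> bool:
    """Stands in for a device check like is_npu(); the body runs only once."""
    global calls
    calls += 1
    return False


assert expensive_probe() is False
assert expensive_probe() is False
assert calls == 1  # second call is served from the cache
```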