Unverified Commit 4caca1ba authored by Lianmin Zheng, committed by GitHub

Clean up server args & Add CI scripts (#12124)

parent ceb105a7
......@@ -3,7 +3,7 @@
You can install SGLang using one of the methods below.
This page primarily applies to common NVIDIA GPU platforms.
For other or newer platforms, please refer to the dedicated pages for [AMD GPUs](../platforms/amd_gpu.md), [Intel Xeon CPUs](../platforms/cpu_server.md), [NVIDIA Jetson](../platforms/nvidia_jetson.md), [Ascend NPUs](../platforms/ascend_npu.md).
For other or newer platforms, please refer to the dedicated pages for [AMD GPUs](../platforms/amd_gpu.md), [Intel Xeon CPUs](../platforms/cpu_server.md), [TPU](../platforms/tpu.md), [NVIDIA DGX Spark](https://lmsys.org/blog/2025-10-13-nvidia-dgx-spark/), [NVIDIA Jetson](../platforms/nvidia_jetson.md), [Ascend NPUs](../platforms/ascend_npu.md).
## Method 1: With pip or uv
......
......@@ -12,10 +12,12 @@ if __name__ == "__main__":
try:
if server_args.grpc_mode:
# Handle gRPC server
from sglang.srt.entrypoints.grpc_server import serve_grpc
asyncio.run(serve_grpc(server_args))
else:
# Handle HTTP server
from sglang.srt.entrypoints.http_server import launch_server
launch_server(server_args)
......
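For reference, below is a minimal, self-contained sketch of the dispatch shown in this hunk. It assumes a `prepare_server_args` helper in `sglang.srt.server_args` that turns CLI argv into a `ServerArgs` instance (that helper is not part of this diff), and it omits the surrounding try/finally cleanup.

```python
import asyncio
import sys

# Assumption: prepare_server_args(argv) parses CLI flags into a ServerArgs.
from sglang.srt.server_args import prepare_server_args

if __name__ == "__main__":
    server_args = prepare_server_args(sys.argv[1:])

    if server_args.grpc_mode:
        # gRPC path: import lazily so HTTP-only deployments do not pull in grpc deps.
        from sglang.srt.entrypoints.grpc_server import serve_grpc

        asyncio.run(serve_grpc(server_args))
    else:
        # Default path: the OpenAI-compatible HTTP server.
        from sglang.srt.entrypoints.http_server import launch_server

        launch_server(server_args)
```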
......@@ -30,7 +30,7 @@ from sglang.srt.connector import ConnectorType
from sglang.srt.function_call.function_call_parser import FunctionCallParser
from sglang.srt.lora.lora_registry import LoRARef
from sglang.srt.parser.reasoning_parser import ReasoningParser
from sglang.srt.utils import (
from sglang.srt.utils.common import (
LORA_TARGET_ALL_MODULES,
SUPPORTED_LORA_TARGET_MODULES,
configure_ipv6,
......@@ -127,8 +127,6 @@ GRAMMAR_BACKEND_CHOICES = ["xgrammar", "outlines", "llguidance", "none"]
DETERMINISTIC_ATTENTION_BACKEND_CHOICES = ["flashinfer", "fa3", "triton"]
DEFAULT_LORA_EVICTION_POLICY = "lru"
NSA_CHOICES = ["flashmla_sparse", "flashmla_kv", "fa3", "tilelang", "aiter"]
RADIX_EVICTION_POLICY_CHOICES = ["lru", "lfu"]
......@@ -181,6 +179,15 @@ def add_radix_eviction_policy_choices(choices):
@dataclasses.dataclass
class ServerArgs:
"""
The arguments of the server.
NOTE: When you add new arguments, please make sure the order
in this class definition is the same as the order in the function
`ServerArgs.add_cli_args`.
Please follow the existing style and group new arguments into related groups, or create new groups when needed.
"""
# Model and tokenizer
model_path: str
tokenizer_path: Optional[str] = None
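To make the ordering note in the docstring above concrete: a new field should sit in its group in the dataclass and appear at the same relative position in `add_cli_args`. The trimmed-down class and the `--enable-fake-flag` argument below are hypothetical, purely to illustrate the convention.

```python
import argparse
import dataclasses
from typing import Optional


@dataclasses.dataclass
class ServerArgsExample:
    # Model and tokenizer
    model_path: str
    tokenizer_path: Optional[str] = None
    enable_fake_flag: bool = False  # hypothetical new argument, appended to its group

    @staticmethod
    def add_cli_args(parser: argparse.ArgumentParser):
        # Model and tokenizer -- same group, same order as the fields above.
        parser.add_argument("--model-path", type=str, required=True)
        parser.add_argument("--tokenizer-path", type=str, default=None)
        parser.add_argument("--enable-fake-flag", action="store_true")
```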
......@@ -190,11 +197,6 @@ class ServerArgs:
load_format: str = "auto"
model_loader_extra_config: str = "{}"
trust_remote_code: bool = False
modelopt_quant: Optional[Union[str, Dict]] = None
modelopt_checkpoint_restore_path: Optional[str] = None
modelopt_checkpoint_save_path: Optional[str] = None
modelopt_export_path: Optional[str] = None
quantize_and_serve: bool = False
context_length: Optional[int] = None
is_embedding: bool = False
enable_multimodal: Optional[bool] = None
......@@ -216,6 +218,11 @@ class ServerArgs:
quantization_param_path: Optional[str] = None
kv_cache_dtype: str = "auto"
enable_fp32_lm_head: bool = False
modelopt_quant: Optional[Union[str, Dict]] = None
modelopt_checkpoint_restore_path: Optional[str] = None
modelopt_checkpoint_save_path: Optional[str] = None
modelopt_export_path: Optional[str] = None
quantize_and_serve: bool = False
# Memory and scheduling
mem_fraction_static: Optional[float] = None
......@@ -238,8 +245,6 @@ class ServerArgs:
# Runtime options
device: Optional[str] = None
elastic_ep_backend: Literal[None, "mooncake"] = None
mooncake_ib_device: Optional[str] = None
tp_size: int = 1
pp_size: int = 1
pp_max_micro_batch_size: Optional[int] = None
......@@ -272,10 +277,10 @@ class ServerArgs:
collect_tokens_histogram: bool = False
prompt_tokens_buckets: Optional[List[str]] = None
generation_tokens_buckets: Optional[List[str]] = None
gc_warning_threshold_secs: float = 0.0
decode_log_interval: int = 40
enable_request_time_stats_logging: bool = False
kv_events_config: Optional[str] = None
gc_warning_threshold_secs: float = 0.0
enable_trace: bool = False
oltp_traces_endpoint: str = "localhost:4317"
......@@ -317,7 +322,7 @@ class ServerArgs:
] = None
max_loaded_loras: Optional[int] = None
max_loras_per_batch: int = 8
lora_eviction_policy: str = DEFAULT_LORA_EVICTION_POLICY
lora_eviction_policy: str = "lru"
lora_backend: str = "triton"
max_lora_chunk_size: Optional[int] = 16
......@@ -375,6 +380,8 @@ class ServerArgs:
enable_expert_distribution_metrics: bool = False
deepep_config: Optional[str] = None
moe_dense_tp_size: Optional[int] = None
elastic_ep_backend: Literal[None, "mooncake"] = None
mooncake_ib_device: Optional[str] = None
# Mamba cache
max_mamba_cache_size: Optional[int] = None
......@@ -472,8 +479,8 @@ class ServerArgs:
enable_return_hidden_states: bool = False
scheduler_recv_interval: int = 1
numa_node: Optional[List[int]] = None
rl_on_policy_target: Optional[str] = None
enable_deterministic_inference: bool = False
rl_on_policy_target: Optional[str] = None
# Dynamic batch tokenizer
enable_dynamic_batch_tokenizer: bool = False
......@@ -510,19 +517,6 @@ class ServerArgs:
pdmux_config_path: Optional[str] = None
sm_group_num: int = 8
def get_attention_backends(server_args):
prefill_attention_backend_str = (
server_args.prefill_attention_backend
if server_args.prefill_attention_backend
else server_args.attention_backend
)
decode_attention_backend_str = (
server_args.decode_attention_backend
if server_args.decode_attention_backend
else server_args.attention_backend
)
return prefill_attention_backend_str, decode_attention_backend_str
def __post_init__(self):
"""
Orchestrates the handling of various server arguments, ensuring proper configuration and validation.
......@@ -615,22 +609,6 @@ class ServerArgs:
)
self.tool_call_parser = deprecated_tool_call_parsers[self.tool_call_parser]
def _handle_ktransformers_configs(self):
from sglang.srt.layers.quantization.compressed_tensors.compressed_tensors_moe import (
CompressedTensorsWNA16AMXEPMoEMethod,
override_config,
)
override_config(
CompressedTensorsWNA16AMXEPMoEMethod,
self.kt_num_gpu_experts,
self.kt_cpuinfer,
self.kt_threadpool_count,
self.kt_amx_weight_path,
self.kt_amx_method,
self.chunked_prefill_size,
)
def _handle_missing_default_values(self):
if self.tokenizer_path is None:
self.tokenizer_path = self.model_path
......@@ -685,7 +663,7 @@ class ServerArgs:
self.cuda_graph_max_bs = 64
elif gpu_mem < 35 * 1024:
# A10, 4090, 5090
# (chunked_prefill_size 2k, cuda_graph_max_bs 16 if tp < 4 else 80)
# (chunked_prefill_size 2k, cuda_graph_max_bs 24 if tp < 4 else 80)
if self.chunked_prefill_size is None:
self.chunked_prefill_size = 2048
if self.cuda_graph_max_bs is None:
......@@ -693,7 +671,7 @@ class ServerArgs:
# However, when serving models with TP4 or TP8, we need to enable cuda graphs to maintain high performance. In this case, we can set `cuda_graph_max_bs` to 80 (half of the default value 160) to reduce the memory overhead of creating cuda graphs.
# Logs from TP4 serving of qwen2-72b show that a value of 80 is sufficient, and it avoids the OOM issues that the original 160 can cause on lower-end GPUs.
if self.tp_size < 4:
self.cuda_graph_max_bs = 16
self.cuda_graph_max_bs = 24
else:
self.cuda_graph_max_bs = 80
elif gpu_mem < 60 * 1024:
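To summarize the branch being changed here, a standalone sketch of the "under ~35 GB" tier (A10/4090/5090) follows, assuming `gpu_mem` is reported in MiB; the other memory tiers are elided.

```python
from typing import Optional, Tuple


def small_gpu_defaults(
    gpu_mem_mib: float,
    tp_size: int,
    chunked_prefill_size: Optional[int] = None,
    cuda_graph_max_bs: Optional[int] = None,
) -> Tuple[Optional[int], Optional[int]]:
    """Sketch of the '< 35 GiB' tier only; other tiers are handled elsewhere."""
    if gpu_mem_mib < 35 * 1024:  # A10, 4090, 5090
        if chunked_prefill_size is None:
            chunked_prefill_size = 2048
        if cuda_graph_max_bs is None:
            # Small TP: 24 keeps CUDA graph memory low (raised from 16 by this commit).
            # TP >= 4: 80 (half of the default 160) preserves throughput while
            # avoiding OOM during graph capture on lower-end GPUs.
            cuda_graph_max_bs = 24 if tp_size < 4 else 80
    return chunked_prefill_size, cuda_graph_max_bs
```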
......@@ -1174,6 +1152,22 @@ class ServerArgs:
if self.grammar_backend is None:
self.grammar_backend = "xgrammar"
def _handle_ktransformers_configs(self):
from sglang.srt.layers.quantization.compressed_tensors.compressed_tensors_moe import (
CompressedTensorsWNA16AMXEPMoEMethod,
override_config,
)
override_config(
CompressedTensorsWNA16AMXEPMoEMethod,
self.kt_num_gpu_experts,
self.kt_cpuinfer,
self.kt_threadpool_count,
self.kt_amx_weight_path,
self.kt_amx_method,
self.chunked_prefill_size,
)
def _handle_data_parallelism(self):
if self.dp_size == 1:
self.enable_dp_attention = False
......@@ -1299,10 +1293,11 @@ class ServerArgs:
raise ValueError(
"Currently standalone speculative decoding does not support dp attention."
)
if self.max_running_requests is None:
self.max_running_requests = 48
logger.warning(
"Max running requests is reset to 48 for speculative decoding."
"Max running requests is reset to 48 for speculative decoding. You can override this by explicitly setting --max-running-requests."
)
if self.speculative_algorithm == "EAGLE" and self.enable_beta_spec:
......@@ -1314,7 +1309,7 @@ class ServerArgs:
if not self.enable_beta_spec:
self.disable_overlap_schedule = True
logger.warning(
"Overlap scheduler is disabled because of using eagle3 and standalone speculative decoding."
"Overlap scheduler is disabled because of using eagle3 or standalone speculative decoding."
)
if self.enable_mixed_chunk:
......@@ -1383,8 +1378,13 @@ class ServerArgs:
raise ValueError(
"Ngram speculative decoding only supports CUDA device."
)
if self.max_running_requests is None:
self.max_running_requests = 48
logger.warning(
"Max running requests is reset to 48 for speculative decoding. You can override this by explicitly setting --max-running-requests."
)
self.disable_overlap_schedule = True
self.enable_mixed_chunk = False
self.speculative_eagle_topk = self.speculative_ngram_max_bfs_breadth
......@@ -1785,6 +1785,18 @@ class ServerArgs:
"KV cache dtype is FP8. Otherwise, KV cache scaling factors "
"default to 1.0, which may cause accuracy issues. ",
)
parser.add_argument(
"--kv-cache-dtype",
type=str,
default=ServerArgs.kv_cache_dtype,
choices=["auto", "fp8_e5m2", "fp8_e4m3", "bf16", "bfloat16"],
help='Data type for kv cache storage. "auto" will use model data type. "bf16" or "bfloat16" for BF16 KV cache. "fp8_e5m2" and "fp8_e4m3" are supported for CUDA 11.8+.',
)
parser.add_argument(
"--enable-fp32-lm-head",
action="store_true",
help="If set, the LM head outputs (logits) are in FP32.",
)
parser.add_argument(
"--modelopt-quant",
type=str,
......@@ -1824,18 +1836,6 @@ class ServerArgs:
"This is useful for development and prototyping. For production, it's recommended "
"to use separate quantization and deployment steps.",
)
parser.add_argument(
"--kv-cache-dtype",
type=str,
default=ServerArgs.kv_cache_dtype,
choices=["auto", "fp8_e5m2", "fp8_e4m3", "bf16", "bfloat16"],
help='Data type for kv cache storage. "auto" will use model data type. "bf16" or "bfloat16" for BF16 KV cache. "fp8_e5m2" and "fp8_e4m3" are supported for CUDA 11.8+.',
)
parser.add_argument(
"--enable-fp32-lm-head",
action="store_true",
help="If set, the LM head outputs (logits) are in FP32.",
)
# Memory and scheduling
parser.add_argument(
......@@ -1940,7 +1940,14 @@ class ServerArgs:
parser.add_argument(
"--disable-hybrid-swa-memory",
action="store_true",
help="Disable the hybrid SWA memory.",
help="Disable the hybrid SWA memory pool.",
)
parser.add_argument(
"--radix-eviction-policy",
type=str,
choices=RADIX_EVICTION_POLICY_CHOICES,
default=ServerArgs.radix_eviction_policy,
help="The eviction policy of radix trees. 'lru' stands for Least Recently Used, 'lfu' stands for Least Frequently Used.",
)
# Runtime options
......@@ -1950,21 +1957,6 @@ class ServerArgs:
default=ServerArgs.device,
help="The device to use ('cuda', 'xpu', 'hpu', 'npu', 'cpu'). Defaults to auto-detection if not specified.",
)
parser.add_argument(
"--elastic-ep-backend",
type=str,
default=ServerArgs.elastic_ep_backend,
choices=["none", "mooncake"],
help="Specify the collective communication backend for elastic EP. Currently supports 'mooncake'.",
)
parser.add_argument(
"--mooncake-ib-device",
type=str,
default=ServerArgs.mooncake_ib_device,
help="The InfiniBand devices for Mooncake Backend transfer, accepts multiple comma-separated devices "
"(e.g., --mooncake-ib-device mlx5_0,mlx5_1). "
"Default is None, which triggers automatic device detection when Mooncake Backend is enabled.",
)
parser.add_argument(
"--tensor-parallel-size",
"--tp-size",
......@@ -2252,6 +2244,12 @@ class ServerArgs:
default=ServerArgs.tool_call_parser,
help=f"Specify the parser for handling tool-call interactions. Options include: {tool_call_parser_choices}.",
)
parser.add_argument(
"--tool-server",
type=str,
default=None,
help="Either 'demo' or a comma-separated list of tool server urls to use for the model. If not specified, no tool server will be used.",
)
parser.add_argument(
"--sampling-defaults",
type=str,
......@@ -2262,12 +2260,6 @@ class ServerArgs:
"'model' uses the model's generation_config.json to get the recommended "
"sampling parameters if available. Default is 'model'.",
)
parser.add_argument(
"--tool-server",
type=str,
default=None,
help="Either 'demo' or a comma-separated list of tool server urls to use for the model. If not specified, no tool server will be used.",
)
# Data parallelism
parser.add_argument(
......@@ -2374,7 +2366,7 @@ class ServerArgs:
parser.add_argument(
"--lora-eviction-policy",
type=str,
default=DEFAULT_LORA_EVICTION_POLICY,
default=ServerArgs.lora_eviction_policy,
choices=["lru", "fifo"],
help="LoRA adapter eviction policy when memory pool is full. 'lru': Least Recently Used (default, better cache efficiency). 'fifo': First-In-First-Out.",
)
......@@ -2686,6 +2678,21 @@ class ServerArgs:
default=ServerArgs.moe_dense_tp_size,
help="TP size for MoE dense MLP layers. This flag is useful when, with large TP size, there are errors caused by weights in MLP layers having dimension smaller than the min dimension GEMM supports.",
)
parser.add_argument(
"--elastic-ep-backend",
type=str,
default=ServerArgs.elastic_ep_backend,
choices=["none", "mooncake"],
help="Specify the collective communication backend for elastic EP. Currently supports 'mooncake'.",
)
parser.add_argument(
"--mooncake-ib-device",
type=str,
default=ServerArgs.mooncake_ib_device,
help="The InfiniBand devices for Mooncake Backend transfer, accepts multiple comma-separated devices "
"(e.g., --mooncake-ib-device mlx5_0,mlx5_1). "
"Default is None, which triggers automatic device detection when Mooncake Backend is enabled.",
)
# Mamba Cache
parser.add_argument(
......@@ -2733,13 +2740,6 @@ class ServerArgs:
default=ServerArgs.hicache_write_policy,
help="The write policy of hierarchical cache.",
)
parser.add_argument(
"--radix-eviction-policy",
type=str,
choices=RADIX_EVICTION_POLICY_CHOICES,
default=ServerArgs.radix_eviction_policy,
help="The eviction policy of radix trees. 'lru' stands for Least Recently Used, 'lfu' stands for Least Frequently Used.",
)
parser.add_argument(
"--hicache-io-backend",
type=str,
......@@ -3153,26 +3153,20 @@ class ServerArgs:
nargs="+",
help="Sets the numa node for the subprocesses. i-th element corresponds to i-th subprocess.",
)
# Debug tensor dumps
parser.add_argument(
"--debug-tensor-dump-output-folder",
type=str,
default=ServerArgs.debug_tensor_dump_output_folder,
help="The output folder for dumping tensors.",
)
parser.add_argument(
"--debug-tensor-dump-input-file",
type=str,
default=ServerArgs.debug_tensor_dump_input_file,
help="The input filename for dumping tensors",
"--enable-deterministic-inference",
action="store_true",
help="Enable deterministic inference mode with batch invariant ops.",
)
parser.add_argument(
"--debug-tensor-dump-inject",
"--rl-on-policy-target",
type=str,
default=ServerArgs.debug_tensor_dump_inject,
help="Inject the outputs from jax as the input of every layer.",
default=ServerArgs.rl_on_policy_target,
choices=["fsdp"],
help="The training system that SGLang needs to match for true on-policy.",
)
# Dynamic batch tokenizer
parser.add_argument(
"--enable-dynamic-batch-tokenizer",
action="store_true",
......@@ -3191,6 +3185,26 @@ class ServerArgs:
help="[Only used if --enable-dynamic-batch-tokenizer is set] Timeout in seconds for batching tokenization requests.",
)
# Debug tensor dumps
parser.add_argument(
"--debug-tensor-dump-output-folder",
type=str,
default=ServerArgs.debug_tensor_dump_output_folder,
help="The output folder for dumping tensors.",
)
parser.add_argument(
"--debug-tensor-dump-input-file",
type=str,
default=ServerArgs.debug_tensor_dump_input_file,
help="The input filename for dumping tensors",
)
parser.add_argument(
"--debug-tensor-dump-inject",
type=str,
default=ServerArgs.debug_tensor_dump_inject,
help="Inject the outputs from jax as the input of every layer.",
)
# PD disaggregation
parser.add_argument(
"--disaggregation-mode",
......@@ -3300,7 +3314,6 @@ class ServerArgs:
default=None,
help="The path of the PD-Multiplexing config file.",
)
parser.add_argument(
"--sm-group-num",
type=int,
......@@ -3308,57 +3321,6 @@ class ServerArgs:
help="Number of sm partition groups.",
)
# For deterministic inference
parser.add_argument(
"--rl-on-policy-target",
type=str,
default=ServerArgs.rl_on_policy_target,
choices=["fsdp"],
help="The training system that SGLang needs to match for true on-policy.",
)
parser.add_argument(
"--enable-deterministic-inference",
action="store_true",
help="Enable deterministic inference mode with batch invariant ops.",
)
# Deprecated arguments
parser.add_argument(
"--enable-ep-moe",
action=DeprecatedAction,
help="NOTE: --enable-ep-moe is deprecated. Please set `--ep-size` to the same value as `--tp-size` instead.",
)
parser.add_argument(
"--enable-deepep-moe",
action=DeprecatedAction,
help="NOTE: --enable-deepep-moe is deprecated. Please set `--moe-a2a-backend` to 'deepep' instead.",
)
parser.add_argument(
"--enable-flashinfer-cutlass-moe",
action=DeprecatedAction,
help="NOTE: --enable-flashinfer-cutlass-moe is deprecated. Please set `--moe-runner-backend` to 'flashinfer_cutlass' instead.",
)
parser.add_argument(
"--enable-flashinfer-cutedsl-moe",
action=DeprecatedAction,
help="NOTE: --enable-flashinfer-cutedsl-moe is deprecated. Please set `--moe-runner-backend` to 'flashinfer_cutedsl' instead.",
)
parser.add_argument(
"--enable-flashinfer-trtllm-moe",
action=DeprecatedAction,
help="NOTE: --enable-flashinfer-trtllm-moe is deprecated. Please set `--moe-runner-backend` to 'flashinfer_trtllm' instead.",
)
parser.add_argument(
"--enable-triton-kernel-moe",
action=DeprecatedAction,
help="NOTE: --enable-triton-kernel-moe is deprecated. Please set `--moe-runner-backend` to 'triton_kernel' instead.",
)
parser.add_argument(
"--enable-flashinfer-mxfp4-moe",
action=DeprecatedAction,
help="NOTE: --enable-flashinfer-mxfp4-moe is deprecated. Please set `--moe-runner-backend` to 'flashinfer_mxfp4' instead.",
)
# Configuration file support
parser.add_argument(
"--config",
......@@ -3393,6 +3355,19 @@ class ServerArgs:
)
return hf_config
def get_attention_backends(server_args):
prefill_attention_backend_str = (
server_args.prefill_attention_backend
if server_args.prefill_attention_backend
else server_args.attention_backend
)
decode_attention_backend_str = (
server_args.decode_attention_backend
if server_args.decode_attention_backend
else server_args.attention_backend
)
return prefill_attention_backend_str, decode_attention_backend_str
def check_server_args(self):
# Check parallel size constraints
assert (
......
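A quick usage sketch for the relocated `get_attention_backends` helper, which lets per-phase backends fall back to the shared `attention_backend`. The backend names below are placeholders, and a `SimpleNamespace` stands in for a full `ServerArgs` so that `__post_init__` does not need to run.

```python
from types import SimpleNamespace

from sglang.srt.server_args import ServerArgs

# Stand-in object carrying just the three fields the helper reads.
args = SimpleNamespace(
    attention_backend="fa3",
    prefill_attention_backend=None,        # unset -> falls back to attention_backend
    decode_attention_backend="flashinfer",
)

prefill, decode = ServerArgs.get_attention_backends(args)
print(prefill, decode)  # fa3 flashinfer
```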
......@@ -138,6 +138,7 @@ def is_xpu() -> bool:
return hasattr(torch, "xpu") and torch.xpu.is_available()
@lru_cache(maxsize=1)
def is_npu() -> bool:
return hasattr(torch, "npu") and torch.npu.is_available()
......
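The `@lru_cache(maxsize=1)` added above memoizes the hardware probe so it runs at most once per process. A tiny self-contained sketch of that caching behavior (the probe below is made up):

```python
from functools import lru_cache

calls = 0


@lru_cache(maxsize=1)
def expensive_probe() -> bool:
    """Stands in for a device check like is_npu(); the body runs only once."""
    global calls
    calls += 1
    return False


assert expensive_probe() is False
assert expensive_probe() is False
assert calls == 1  # second call is served from the cache
```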