You can install SGLang using one of the methods below.
This page primarily applies to common NVIDIA GPU platforms.
For other or newer platforms, please refer to the dedicated pages for [AMD GPUs](../platforms/amd_gpu.md), [Intel Xeon CPUs](../platforms/cpu_server.md), [TPU](../platforms/tpu.md), [NVIDIA DGX Spark](https://lmsys.org/blog/2025-10-13-nvidia-dgx-spark/), [NVIDIA Jetson](../platforms/nvidia_jetson.md), [Ascend NPUs](../platforms/ascend_npu.md).
# However, when serving models with TP4 or TP8, CUDA graphs must stay enabled to maintain high performance.
# In that case, set `cuda_graph_max_bs` to 80 (half of the default 160) to reduce the memory overhead of
# capturing CUDA graphs. Logs from TP4 serving of qwen2-72b show that 80 is sufficient, and it avoids the
# OOM issues seen on lower-end GPUs with the original value of 160.
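To make the tuning concrete, here is a hedged launch sketch: `--tp-size` and `--cuda-graph-max-bs` are the server flags discussed above, while the model path is only illustrative.

```python
# Sketch: serve a 72B model with TP4 and a halved CUDA graph batch size.
# Assumes SGLang is installed; the model path is an example, not a recommendation.
import subprocess

subprocess.run([
    "python", "-m", "sglang.launch_server",
    "--model-path", "Qwen/Qwen2-72B-Instruct",  # illustrative
    "--tp-size", "4",
    "--cuda-graph-max-bs", "80",  # half of the default 160 to avoid OOM
])
```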
help='Data type for kv cache storage. "auto" will use model data type. "bf16" or "bfloat16" for BF16 KV cache. "fp8_e5m2" and "fp8_e4m3" are supported for CUDA 11.8+.',
)
parser.add_argument(
"--enable-fp32-lm-head",
action="store_true",
help="If set, the LM head outputs (logits) are in FP32.",
)
parser.add_argument(
"--modelopt-quant",
type=str,
...
...
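The relocated `--kv-cache-dtype` and `--enable-fp32-lm-head` arguments above can be exercised together; a hedged usage sketch (flags taken from the diff, model path illustrative):

```python
# Sketch: enable an FP8 KV cache (CUDA 11.8+ per the help text) and FP32 logits.
import subprocess

subprocess.run([
    "python", "-m", "sglang.launch_server",
    "--model-path", "meta-llama/Llama-3.1-8B-Instruct",  # illustrative
    "--kv-cache-dtype", "fp8_e5m2",  # 8-bit KV entries: half the memory of bf16
    "--enable-fp32-lm-head",         # LM head outputs (logits) in FP32
])
```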
@@ -1824,18 +1836,6 @@ class ServerArgs:
"This is useful for development and prototyping. For production, it's recommended "
"to use separate quantization and deployment steps.",
help='Data type for kv cache storage. "auto" will use model data type. "bf16" or "bfloat16" for BF16 KV cache. "fp8_e5m2" and "fp8_e4m3" are supported for CUDA 11.8+.',
)
parser.add_argument(
"--enable-fp32-lm-head",
action="store_true",
help="If set, the LM head outputs (logits) are in FP32.",
)
# Memory and scheduling
parser.add_argument(
...
...
@@ -1940,7 +1940,14 @@ class ServerArgs:
parser.add_argument(
"--disable-hybrid-swa-memory",
action="store_true",
help="Disable the hybrid SWA memory.",
help="Disable the hybrid SWA memory pool.",
)
+parser.add_argument(
+"--radix-eviction-policy",
+type=str,
+choices=RADIX_EVICTION_POLICY_CHOICES,
+default=ServerArgs.radix_eviction_policy,
+help="The eviction policy of radix trees. 'lru' stands for Least Recently Used, 'lfu' stands for Least Frequently Used.",
+)
# Runtime options
...
...
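For reviewers unfamiliar with the two policies, here is a minimal standalone sketch (not SGLang's radix-tree code) of how 'lru' and 'lfu' pick different eviction victims:

```python
# Toy eviction tracker: LRU evicts the node touched longest ago,
# LFU the node touched least often. Illustrative only.
import itertools

class EvictionTracker:
    def __init__(self, policy: str):
        self.policy = policy            # "lru" or "lfu"
        self.clock = itertools.count()  # global access counter
        self.stats = {}                 # node -> (last_access, hit_count)

    def touch(self, node: str) -> None:
        _, hits = self.stats.get(node, (0, 0))
        self.stats[node] = (next(self.clock), hits + 1)

    def victim(self) -> str:
        idx = 0 if self.policy == "lru" else 1  # oldest access vs. fewest hits
        return min(self.stats, key=lambda n: self.stats[n][idx])

t = EvictionTracker("lfu")
for node in ["a", "b", "a", "c", "a", "b"]:
    t.touch(node)
print(t.victim())  # "c": touched once, so least frequently used
```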
@@ -1950,21 +1957,6 @@ class ServerArgs:
default=ServerArgs.device,
help="The device to use ('cuda', 'xpu', 'hpu', 'npu', 'cpu'). Defaults to auto-detection if not specified.",
)
-parser.add_argument(
-"--elastic-ep-backend",
-type=str,
-default=ServerArgs.elastic_ep_backend,
-choices=["none","mooncake"],
-help="Specify the collective communication backend for elastic EP. Currently supports 'mooncake'. "
-"Default is None, which triggers automatic device detection when Mooncake Backend is enabled.",
-)
parser.add_argument(
"--tensor-parallel-size",
"--tp-size",
...
...
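One detail worth flagging in the `--elastic-ep-backend` block above: multi-line argparse help strings must rely on implicit string concatenation. With a comma between the two string literals, the second one becomes a positional argument after a keyword argument, which is a SyntaxError. A minimal illustration outside SGLang (the flag name is hypothetical):

```python
import argparse

p = argparse.ArgumentParser()
p.add_argument(
    "--demo-flag",  # hypothetical flag, for illustration only
    help="First half of the help text "  # no trailing comma here:
    "and its continuation are joined into one string by Python itself.",
)
print(p.format_help())
```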
@@ -2252,6 +2244,12 @@ class ServerArgs:
default=ServerArgs.tool_call_parser,
help=f"Specify the parser for handling tool-call interactions. Options include: {tool_call_parser_choices}.",
)
+parser.add_argument(
+"--tool-server",
+type=str,
+default=None,
+help="Either 'demo' or a comma-separated list of tool server urls to use for the model. If not specified, no tool server will be used.",
+)
parser.add_argument(
"--sampling-defaults",
type=str,
...
...
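A quick note on the new `--tool-server` value format: besides the literal 'demo', it accepts comma-separated URLs. An illustrative parse (placeholder URLs, not necessarily the exact code SGLang uses):

```python
# Splitting a --tool-server value into individual tool server URLs.
raw = "http://localhost:8001,http://localhost:8002"  # placeholder URLs
urls = [u.strip() for u in raw.split(",") if u.strip()]
print(urls)  # ['http://localhost:8001', 'http://localhost:8002']
```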
@@ -2262,12 +2260,6 @@ class ServerArgs:
"'model' uses the model's generation_config.json to get the recommended "
"sampling parameters if available. Default is 'model'.",
)
-parser.add_argument(
-"--tool-server",
-type=str,
-default=None,
-help="Either 'demo' or a comma-separated list of tool server urls to use for the model. If not specified, no tool server will be used.",
-)
# Data parallelism
parser.add_argument(
...
...
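Context for `--sampling-defaults model` above: the recommended sampling parameters come from the model's `generation_config.json`. An illustrative file (field names follow the Hugging Face generation_config convention; the values are made up):

```python
# What a generation_config.json consumed by --sampling-defaults model might contain.
import json

config = {
    "temperature": 0.7,
    "top_p": 0.8,
    "top_k": 20,
    "repetition_penalty": 1.05,
}
print(json.dumps(config, indent=2))
```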
@@ -2374,7 +2366,7 @@ class ServerArgs:
parser.add_argument(
"--lora-eviction-policy",
type=str,
-default=DEFAULT_LORA_EVICTION_POLICY,
+default=ServerArgs.lora_eviction_policy,
choices=["lru","fifo"],
help="LoRA adapter eviction policy when memory pool is full. 'lru': Least Recently Used (default, better cache efficiency). 'fifo': First-In-First-Out.",
)
...
...
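The LoRA pool's 'lru' vs. 'fifo' choice above differs from the radix-tree policies: FIFO evicts by insertion order regardless of reuse. A toy contrast (illustrative only, not SGLang's pool code; adapter names are made up):

```python
# FIFO evicts the adapter loaded first; LRU evicts the one used longest ago.
from collections import OrderedDict

insertion_order = ["lora_a", "lora_b", "lora_c"]  # load order
recency = OrderedDict((a, None) for a in insertion_order)
recency.move_to_end("lora_a")                     # "lora_a" was just used

print("fifo evicts:", insertion_order[0])         # lora_a, oldest load
print("lru evicts:", next(iter(recency)))         # lora_b, stalest use
```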
@@ -2686,6 +2678,21 @@ class ServerArgs:
default=ServerArgs.moe_dense_tp_size,
help="TP size for MoE dense MLP layers. This flag is useful when, with large TP size, there are errors caused by weights in MLP layers having dimension smaller than the min dimension GEMM supports.",
)
+parser.add_argument(
+"--elastic-ep-backend",
+type=str,
+default=ServerArgs.elastic_ep_backend,
+choices=["none","mooncake"],
+help="Specify the collective communication backend for elastic EP. Currently supports 'mooncake'. "