Unverified Commit 6406408a authored by Lianmin Zheng, committed by GitHub

Clean up server_args.py (#7037)

parent 019851d0
......@@ -118,7 +118,7 @@ def _compile_warning_1():
if not _IN_PRECOMPILE_STAGE and _IS_FIRST_RANK_ON_NODE:
logger.warning(
"Entering DeepGEMM JIT Pre-Compile session. "
"And it may takes a long time(Typically 10-20 mins) "
"It may takes a long time (typically 10-20 mins) "
"if you have not run `sglang.compile_deep_gemm`. "
"It is recommended to run `sglang.compile_deep_gemm` with same args as `sglang.launch_server`"
" for pre-compilation to reduce the overhead if you have not run it before. "
......
......@@ -72,32 +72,33 @@ INIT_INCREMENTAL_DETOKENIZATION_OFFSET = 5
GLOBAL_SERVER_ARGS_KEYS = [
"attention_backend",
"mm_attention_backend",
"debug_tensor_dump_inject",
"debug_tensor_dump_output_folder",
"chunked_prefill_size",
"deepep_mode",
"device",
"disable_chunked_prefix_cache",
"disable_radix_cache",
"enable_deepep_moe",
"enable_dp_attention",
"enable_two_batch_overlap",
"enable_dp_lm_head",
"enable_deepep_moe",
"deepep_mode",
"enable_ep_moe",
"moe_dense_tp_size",
"ep_dispatch_algorithm",
"deepep_config",
"ep_num_redundant_experts",
"enable_nan_detection",
"flashinfer_mla_disable_ragged",
"max_micro_batch_size",
"moe_dense_tp_size",
"ep_dispatch_algorithm",
"disable_shared_experts_fusion",
"sampling_backend",
"speculative_accept_threshold_acc",
"speculative_accept_threshold_single",
"torchao_config",
"triton_attention_reduce_in_fp32",
"ep_num_redundant_experts",
"mm_attention_backend",
"num_reserved_decode_tokens",
]
# Put some global args for easy access
......
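The keys above are read from ServerArgs and mirrored into module-level state for easy access (per the comment above). A minimal sketch of that pattern, with an illustrative dataclass and key list standing in for the real ServerArgs and GLOBAL_SERVER_ARGS_KEYS:

from dataclasses import dataclass

# Illustrative stand-ins; the real code uses ServerArgs and GLOBAL_SERVER_ARGS_KEYS.
KEYS = ["attention_backend", "device", "chunked_prefill_size"]

@dataclass
class FakeServerArgs:
    attention_backend: str = "flashinfer"
    device: str = "cuda"
    chunked_prefill_size: int = 8192

server_args = FakeServerArgs()
# Copy the selected fields into a plain dict so other modules can read them
# without importing the full server args object.
global_args = {key: getattr(server_args, key) for key in KEYS}
print(global_args)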
......@@ -17,12 +17,14 @@ from __future__ import annotations
import bisect
import inspect
import logging
import os
from contextlib import contextmanager
from typing import TYPE_CHECKING, Callable, Optional, Union
import torch
import tqdm
from torch.profiler import ProfilerActivity, profile
from sglang.srt.custom_op import CustomOp
from sglang.srt.distributed import get_tensor_model_parallel_rank
......@@ -40,11 +42,14 @@ from sglang.srt.model_executor.forward_batch_info import (
from sglang.srt.patch_torch import monkey_patch_torch_compile
from sglang.srt.two_batch_overlap import TboCudaGraphRunnerPlugin
from sglang.srt.utils import (
empty_context,
get_available_gpu_memory,
get_device_memory_capacity,
rank0_log,
)
logger = logging.getLogger(__name__)
if TYPE_CHECKING:
from sglang.srt.model_executor.model_runner import ModelRunner
......@@ -207,6 +212,9 @@ class CudaGraphRunner:
model_runner.server_args.enable_two_batch_overlap
)
self.speculative_algorithm = model_runner.server_args.speculative_algorithm
self.enable_profile_cuda_graph = (
model_runner.server_args.enable_profile_cuda_graph
)
self.tp_size = model_runner.server_args.tp_size
self.dp_size = model_runner.server_args.dp_size
self.pp_size = model_runner.server_args.pp_size
......@@ -339,44 +347,67 @@ class CudaGraphRunner:
return is_bs_supported and is_encoder_lens_supported and is_tbo_supported
def capture(self):
with graph_capture() as graph_capture_context:
self.stream = graph_capture_context.stream
avail_mem = get_available_gpu_memory(
self.model_runner.device, self.model_runner.gpu_id, empty_cache=False
def capture(self) -> None:
profile_context = empty_context()
if self.enable_profile_cuda_graph:
profile_context = profile(
activities=[ProfilerActivity.CPU, ProfilerActivity.CUDA],
record_shapes=True,
)
# Reverse the order to enable better memory sharing across cuda graphs.
capture_range = (
tqdm.tqdm(list(reversed(self.capture_bs)))
if get_tensor_model_parallel_rank() == 0
else reversed(self.capture_bs)
)
for bs in capture_range:
if get_tensor_model_parallel_rank() == 0:
avail_mem = get_available_gpu_memory(
self.model_runner.device,
self.model_runner.gpu_id,
empty_cache=False,
)
capture_range.set_description(
f"Capturing batches ({avail_mem=:.2f} GB)"
)
with patch_model(
self.model_runner.model,
bs in self.compile_bs,
num_tokens=bs * self.num_tokens_per_bs,
tp_group=self.model_runner.tp_group,
) as forward:
(
graph,
output_buffers,
) = self.capture_one_batch_size(bs, forward)
self.graphs[bs] = graph
self.output_buffers[bs] = output_buffers
# Save gemlite cache after each capture
save_gemlite_cache()
with graph_capture() as graph_capture_context:
with profile_context as prof:
self.stream = graph_capture_context.stream
avail_mem = get_available_gpu_memory(
self.model_runner.device,
self.model_runner.gpu_id,
empty_cache=False,
)
# Reverse the order to enable better memory sharing across cuda graphs.
capture_range = (
tqdm.tqdm(list(reversed(self.capture_bs)))
if get_tensor_model_parallel_rank() == 0
else reversed(self.capture_bs)
)
for i, bs in enumerate(capture_range):
if get_tensor_model_parallel_rank() == 0:
avail_mem = get_available_gpu_memory(
self.model_runner.device,
self.model_runner.gpu_id,
empty_cache=False,
)
capture_range.set_description(
f"Capturing batches ({avail_mem=:.2f} GB)"
)
with patch_model(
self.model_runner.model,
bs in self.compile_bs,
num_tokens=bs * self.num_tokens_per_bs,
tp_group=self.model_runner.tp_group,
) as forward:
(
graph,
output_buffers,
) = self.capture_one_batch_size(bs, forward)
self.graphs[bs] = graph
self.output_buffers[bs] = output_buffers
# Save gemlite cache after each capture
save_gemlite_cache()
if self.enable_profile_cuda_graph:
log_message = (
"Sorted by CUDA Time:\n"
+ prof.key_averages(group_by_input_shape=True).table(
sort_by="cuda_time_total", row_limit=10
)
+ "\n\nSorted by CPU Time:\n"
+ prof.key_averages(group_by_input_shape=True).table(
sort_by="cpu_time_total", row_limit=10
)
)
logger.info(log_message)
def capture_one_batch_size(self, bs: int, forward: Callable):
graph = torch.cuda.CUDAGraph()
......@@ -443,7 +474,7 @@ class CudaGraphRunner:
token_to_kv_pool=self.model_runner.token_to_kv_pool,
attn_backend=self.model_runner.attn_backend,
out_cache_loc=out_cache_loc,
seq_lens_sum=seq_lens.sum(),
seq_lens_sum=seq_lens.sum().item(),
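# .item() turns the 0-dim tensor from seq_lens.sum() into a plain Python int;
# the forward batch's seq_lens_sum field is assumed to expect an int rather than a tensor.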
encoder_lens=encoder_lens,
return_logprob=False,
positions=positions,
......
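The change above wraps the entire capture loop in an optional torch.profiler context and falls back to a no-op context when profiling is disabled. A minimal standalone sketch of that pattern (the function name and the stand-in workload are illustrative, not part of this commit; meaningful CUDA timings assume a GPU is available):

import contextlib

import torch
from torch.profiler import ProfilerActivity, profile

def capture_with_optional_profile(enable_profile: bool) -> None:
    # Swap in a no-op context when profiling is off, mirroring empty_context() above.
    profile_context = (
        profile(
            activities=[ProfilerActivity.CPU, ProfilerActivity.CUDA],
            record_shapes=True,
        )
        if enable_profile
        else contextlib.nullcontext()
    )
    with profile_context as prof:
        # Stand-in workload; the real capture() records one CUDA graph per batch size here.
        device = "cuda" if torch.cuda.is_available() else "cpu"
        x = torch.randn(512, 512, device=device)
        for _ in range(8):
            x = x @ x
    if enable_profile:
        # Same reporting as capture(): top-10 ops sorted by CUDA time, then by CPU time.
        print(prof.key_averages(group_by_input_shape=True).table(sort_by="cuda_time_total", row_limit=10))
        print(prof.key_averages(group_by_input_shape=True).table(sort_by="cpu_time_total", row_limit=10))

capture_with_optional_profile(enable_profile=True)

In the server, this behavior is toggled by the new --enable-profile-cuda-graph flag registered in server_args.py below.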
......@@ -112,14 +112,12 @@ class ServerArgs:
file_storage_path: str = "sglang_storage"
enable_cache_report: bool = False
reasoning_parser: Optional[str] = None
tool_call_parser: Optional[str] = None
# Data parallelism
dp_size: int = 1
load_balance_method: str = "round_robin"
# Expert parallelism
ep_size: int = 1
# Multi-node distributed serving
dist_init_addr: Optional[str] = None
nnodes: int = 1
......@@ -138,6 +136,7 @@ class ServerArgs:
attention_backend: Optional[str] = None
sampling_backend: Optional[str] = None
grammar_backend: Optional[str] = None
mm_attention_backend: Optional[str] = None
# Speculative decoding
speculative_algorithm: Optional[str] = None
......@@ -149,6 +148,26 @@ class ServerArgs:
speculative_accept_threshold_acc: float = 1.0
speculative_token_map: Optional[str] = None
# Expert parallelism
ep_size: int = 1
enable_ep_moe: bool = False
enable_deepep_moe: bool = False
deepep_mode: Optional[Literal["auto", "normal", "low_latency"]] = "auto"
ep_num_redundant_experts: int = 0
ep_dispatch_algorithm: Optional[Literal["static", "dynamic", "fake"]] = None
init_expert_location: str = "trivial"
enable_eplb: bool = False
eplb_algorithm: str = "auto"
eplb_rebalance_num_iterations: int = 1000
eplb_rebalance_layers_per_chunk: Optional[int] = None
expert_distribution_recorder_mode: Optional[
Literal["stat", "stat_approx", "per_pass", "per_token"]
] = None
expert_distribution_recorder_buffer_size: Optional[int] = None
enable_expert_distribution_metrics: bool = False
deepep_config: Optional[str] = None
moe_dense_tp_size: Optional[int] = None
# Double Sparsity
enable_double_sparsity: bool = False
ds_channel_config_path: Optional[str] = None
......@@ -159,38 +178,24 @@ class ServerArgs:
# Optimization/debug options
disable_radix_cache: bool = False
cuda_graph_max_bs: Optional[int] = None
cuda_graph_bs: Optional[List[int]] = None
disable_cuda_graph: bool = False
disable_cuda_graph_padding: bool = False
enable_profile_cuda_graph: bool = False
enable_nccl_nvls: bool = False
enable_tokenizer_batch_encode: bool = False
disable_outlines_disk_cache: bool = False
disable_custom_all_reduce: bool = False
enable_mscclpp: bool = False
disable_overlap_schedule: bool = False
disable_overlap_cg_plan: bool = False
enable_mixed_chunk: bool = False
enable_dp_attention: bool = False
enable_dp_lm_head: bool = False
enable_two_batch_overlap: bool = False
enable_ep_moe: bool = False
enable_deepep_moe: bool = False
deepep_mode: Optional[Literal["auto", "normal", "low_latency"]] = "auto"
ep_num_redundant_experts: int = 0
ep_dispatch_algorithm: Optional[Literal["static", "dynamic", "fake"]] = None
init_expert_location: str = "trivial"
enable_eplb: bool = False
eplb_algorithm: str = "auto"
eplb_rebalance_num_iterations: int = 1000
eplb_rebalance_layers_per_chunk: Optional[int] = None
expert_distribution_recorder_mode: Optional[
Literal["stat", "stat_approx", "per_pass", "per_token"]
] = None
expert_distribution_recorder_buffer_size: Optional[int] = None
enable_expert_distribution_metrics: bool = False
deepep_config: Optional[str] = None
enable_torch_compile: bool = False
torch_compile_max_bs: int = 32
cuda_graph_max_bs: Optional[int] = None
cuda_graph_bs: Optional[List[int]] = None
torchao_config: str = ""
enable_nan_detection: bool = False
enable_p2p_check: bool = False
......@@ -201,29 +206,28 @@ class ServerArgs:
enable_memory_saver: bool = False
allow_auto_truncate: bool = False
enable_custom_logit_processor: bool = False
tool_call_parser: Optional[str] = None
enable_hierarchical_cache: bool = False
hicache_ratio: float = 2.0
hicache_size: int = 0
hicache_write_policy: str = "write_through_selective"
flashinfer_mla_disable_ragged: bool = False
warmups: Optional[str] = None
moe_dense_tp_size: Optional[int] = None
disable_shared_experts_fusion: bool = False
disable_chunked_prefix_cache: bool = False
disable_fast_image_processor: bool = False
mm_attention_backend: Optional[str] = None
warmups: Optional[str] = None
# Debug tensor dumps
debug_tensor_dump_output_folder: Optional[str] = None
debug_tensor_dump_input_file: Optional[str] = None
debug_tensor_dump_inject: bool = False
debug_tensor_dump_prefill_only: bool = False
# For PD disaggregation: can be "null" (not disaggregated), "prefill" (prefill-only), or "decode" (decode-only)
disaggregation_mode: str = "null"
disaggregation_bootstrap_port: int = 8998
disaggregation_transfer_backend: str = "mooncake"
disaggregation_bootstrap_port: int = 8998
disaggregation_ib_device: Optional[str] = None
num_reserved_decode_tokens: int = 512 # used for decode kv cache offload in PD
pdlb_url: Optional[str] = None
def __post_init__(self):
......@@ -390,7 +394,7 @@ class ServerArgs:
if self.enable_eplb and (self.expert_distribution_recorder_mode is None):
self.expert_distribution_recorder_mode = "stat"
logger.info(
f"EPLB is enabled. The expert_distribution_recorder_mode is automatically set."
"EPLB is enabled. The expert_distribution_recorder_mode is automatically set."
)
if (self.enable_eplb or (self.init_expert_location is not None)) and (
......@@ -398,7 +402,7 @@ class ServerArgs:
):
self.ep_dispatch_algorithm = "static"
logger.info(
f"EPLB is enabled or init_expert_location is provided. ep_dispatch_algorithm is configured."
"EPLB is enabled or init_expert_location is provided. ep_dispatch_algorithm is configured."
)
if self.enable_expert_distribution_metrics and (
......@@ -929,6 +933,13 @@ class ServerArgs:
default=ServerArgs.reasoning_parser,
help=f"Specify the parser for reasoning models, supported parsers are: {list(ReasoningParser.DetectorMap.keys())}.",
)
parser.add_argument(
"--tool-call-parser",
type=str,
choices=["qwen25", "mistral", "llama3", "deepseekv3", "pythonic"],
default=ServerArgs.tool_call_parser,
help="Specify the parser for handling tool-call interactions. Options include: 'qwen25', 'mistral', 'llama3', 'deepseekv3', and 'pythonic'.",
)
# Data parallelism
parser.add_argument(
......@@ -949,15 +960,6 @@ class ServerArgs:
],
)
# Expert parallelism
parser.add_argument(
"--expert-parallel-size",
"--ep-size",
type=int,
default=ServerArgs.ep_size,
help="The expert parallelism size.",
)
# Multi-node distributed serving
parser.add_argument(
"--dist-init-addr",
......@@ -1038,21 +1040,6 @@ class ServerArgs:
default=ServerArgs.grammar_backend,
help="Choose the backend for grammar-guided decoding.",
)
parser.add_argument(
"--enable-flashinfer-mla",
action=DeprecatedAction,
help="--enable-flashinfer-mla is deprecated. Please use '--attention-backend flashinfer' instead.",
)
parser.add_argument(
"--enable-flashmla",
action=DeprecatedAction,
help="--enable-flashmla is deprecated. Please use '--attention-backend flashmla' instead.",
)
parser.add_argument(
"--flashinfer-mla-disable-ragged",
action="store_true",
help="Not using ragged prefill wrapper when running flashinfer mla",
)
# Speculative decoding
parser.add_argument(
......@@ -1102,6 +1089,109 @@ class ServerArgs:
help="The path of the draft model's small vocab table.",
default=ServerArgs.speculative_token_map,
)
parser.add_argument(
"--mm-attention-backend",
type=str,
choices=["sdpa", "fa3", "triton_attn"],
default=ServerArgs.mm_attention_backend,
help="Set multimodal attention backend.",
)
# Expert parallelism
parser.add_argument(
"--expert-parallel-size",
"--ep-size",
type=int,
default=ServerArgs.ep_size,
help="The expert parallelism size.",
)
parser.add_argument(
"--enable-ep-moe",
action="store_true",
help="Enabling expert parallelism for moe. The ep size is equal to the tp size.",
)
parser.add_argument(
"--enable-deepep-moe",
action="store_true",
help="Enabling DeepEP MoE implementation for EP MoE.",
)
parser.add_argument(
"--deepep-mode",
type=str,
choices=["normal", "low_latency", "auto"],
default="auto",
help="Select the mode when enable DeepEP MoE, could be `normal`, `low_latency` or `auto`. Default is `auto`, which means `low_latency` for decode batch and `normal` for prefill batch.",
)
parser.add_argument(
"--ep-num-redundant-experts",
type=int,
default=ServerArgs.ep_num_redundant_experts,
help="Allocate this number of redundant experts in expert parallel.",
)
parser.add_argument(
"--ep-dispatch-algorithm",
type=str,
default=ServerArgs.ep_dispatch_algorithm,
help="The algorithm to choose ranks for redundant experts in expert parallel.",
)
parser.add_argument(
"--init-expert-location",
type=str,
default=ServerArgs.init_expert_location,
help="Initial location of EP experts.",
)
parser.add_argument(
"--enable-eplb",
action="store_true",
help="Enable EPLB algorithm",
)
parser.add_argument(
"--eplb-algorithm",
type=str,
default=ServerArgs.eplb_algorithm,
help="Chosen EPLB algorithm",
)
parser.add_argument(
"--eplb-rebalance-num-iterations",
type=int,
default=ServerArgs.eplb_rebalance_num_iterations,
help="Number of iterations to automatically trigger a EPLB re-balance.",
)
parser.add_argument(
"--eplb-rebalance-layers-per-chunk",
type=int,
default=ServerArgs.eplb_rebalance_layers_per_chunk,
help="Number of layers to rebalance per forward pass.",
)
parser.add_argument(
"--expert-distribution-recorder-mode",
type=str,
default=ServerArgs.expert_distribution_recorder_mode,
help="Mode of expert distribution recorder.",
)
parser.add_argument(
"--expert-distribution-recorder-buffer-size",
type=int,
default=ServerArgs.expert_distribution_recorder_buffer_size,
help="Circular buffer size of expert distribution recorder. Set to -1 to denote infinite buffer.",
)
parser.add_argument(
"--enable-expert-distribution-metrics",
action="store_true",
help="Enable logging metrics for expert balancedness",
)
parser.add_argument(
"--deepep-config",
type=str,
default=ServerArgs.deepep_config,
help="Tuned DeepEP config suitable for your own cluster. It can be either a string with JSON content or a file path.",
)
parser.add_argument(
"--moe-dense-tp-size",
type=int,
default=ServerArgs.moe_dense_tp_size,
help="TP size for MoE dense MLP layers. This flag is useful when, with large TP size, there are errors caused by weights in MLP layers having dimension smaller than the min dimension GEMM supports.",
)
# Double Sparsity
parser.add_argument(
......@@ -1146,6 +1236,18 @@ class ServerArgs:
action="store_true",
help="Disable RadixAttention for prefix caching.",
)
parser.add_argument(
"--cuda-graph-max-bs",
type=int,
default=ServerArgs.cuda_graph_max_bs,
help="Set the maximum batch size for cuda graph. It will extend the cuda graph capture batch size to this value.",
)
parser.add_argument(
"--cuda-graph-bs",
type=int,
nargs="+",
help="Set the list of batch sizes for cuda graph.",
)
parser.add_argument(
"--disable-cuda-graph",
action="store_true",
......@@ -1156,6 +1258,11 @@ class ServerArgs:
action="store_true",
help="Disable cuda graph when padding is needed. Still uses cuda graph when padding is not needed.",
)
parser.add_argument(
"--enable-profile-cuda-graph",
action="store_true",
help="Enable profiling of cuda graph capture.",
)
parser.add_argument(
"--enable-nccl-nvls",
action="store_true",
......@@ -1186,6 +1293,11 @@ class ServerArgs:
action="store_true",
help="Disable the overlap scheduler, which overlaps the CPU scheduler with GPU model worker.",
)
parser.add_argument(
"--disable-overlap-cg-plan",
action="store_true",
help="Disable the overlap optimization for cudagraph preparation in eagle verify.",
)
parser.add_argument(
"--enable-mixed-chunk",
action="store_true",
......@@ -1201,11 +1313,6 @@ class ServerArgs:
action="store_true",
help="Enable vocabulary parallel across the attention TP group to avoid all-gather across DP groups, optimizing performance under DP attention.",
)
parser.add_argument(
"--enable-ep-moe",
action="store_true",
help="Enabling expert parallelism for moe. The ep size is equal to the tp size.",
)
parser.add_argument(
"--enable-two-batch-overlap",
action="store_true",
......@@ -1222,18 +1329,6 @@ class ServerArgs:
default=ServerArgs.torch_compile_max_bs,
help="Set the maximum batch size when using torch compile.",
)
parser.add_argument(
"--cuda-graph-max-bs",
type=int,
default=ServerArgs.cuda_graph_max_bs,
help="Set the maximum batch size for cuda graph. It will extend the cuda graph capture batch size to this value.",
)
parser.add_argument(
"--cuda-graph-bs",
type=int,
nargs="+",
help="Set the list of batch sizes for cuda graph.",
)
parser.add_argument(
"--torchao-config",
type=str,
......@@ -1290,13 +1385,6 @@ class ServerArgs:
action="store_true",
help="Enable users to pass custom logit processors to the server (disabled by default for security)",
)
parser.add_argument(
"--tool-call-parser",
type=str,
choices=["qwen25", "mistral", "llama3", "deepseekv3", "pythonic"],
default=ServerArgs.tool_call_parser,
help="Specify the parser for handling tool-call interactions. Options include: 'qwen25', 'mistral', 'llama3', 'deepseekv3', and 'pythonic'.",
)
parser.add_argument(
"--enable-hierarchical-cache",
action="store_true",
......@@ -1322,86 +1410,9 @@ class ServerArgs:
help="The write policy of hierarchical cache.",
)
parser.add_argument(
"--enable-deepep-moe",
action="store_true",
help="Enabling DeepEP MoE implementation for EP MoE.",
)
parser.add_argument(
"--moe-dense-tp-size",
type=int,
default=ServerArgs.moe_dense_tp_size,
help="TP size for MoE dense MLP layers. This flag is useful when, with large TP size, there are errors caused by weights in MLP layers having dimension smaller than the min dimension GEMM supports.",
)
parser.add_argument(
"--deepep-mode",
type=str,
choices=["normal", "low_latency", "auto"],
default="auto",
help="Select the mode when enable DeepEP MoE, could be `normal`, `low_latency` or `auto`. Default is `auto`, which means `low_latency` for decode batch and `normal` for prefill batch.",
)
parser.add_argument(
"--ep-num-redundant-experts",
type=int,
default=ServerArgs.ep_num_redundant_experts,
help="Allocate this number of redundant experts in expert parallel.",
)
parser.add_argument(
"--ep-dispatch-algorithm",
type=str,
default=ServerArgs.ep_dispatch_algorithm,
help="The algorithm to choose ranks for redundant experts in expert parallel.",
)
parser.add_argument(
"--init-expert-location",
type=str,
default=ServerArgs.init_expert_location,
help="Initial location of EP experts.",
)
parser.add_argument(
"--enable-eplb",
action="store_true",
help="Enable EPLB algorithm",
)
parser.add_argument(
"--eplb-algorithm",
type=str,
default=ServerArgs.eplb_algorithm,
help="Chosen EPLB algorithm",
)
parser.add_argument(
"--eplb-rebalance-num-iterations",
type=int,
default=ServerArgs.eplb_rebalance_num_iterations,
help="Number of iterations to automatically trigger a EPLB re-balance.",
)
parser.add_argument(
"--eplb-rebalance-layers-per-chunk",
type=int,
default=ServerArgs.eplb_rebalance_layers_per_chunk,
help="Number of layers to rebalance per forward pass.",
)
parser.add_argument(
"--expert-distribution-recorder-mode",
type=str,
default=ServerArgs.expert_distribution_recorder_mode,
help="Mode of expert distribution recorder.",
)
parser.add_argument(
"--expert-distribution-recorder-buffer-size",
type=int,
default=ServerArgs.expert_distribution_recorder_buffer_size,
help="Circular buffer size of expert distribution recorder. Set to -1 to denote infinite buffer.",
)
parser.add_argument(
"--enable-expert-distribution-metrics",
"--flashinfer-mla-disable-ragged",
action="store_true",
help="Enable logging metrics for expert balancedness",
)
parser.add_argument(
"--deepep-config",
type=str,
default=ServerArgs.deepep_config,
help="Tuned DeepEP config suitable for your own cluster. It can be either a string with JSON content or a file path.",
help="Not using ragged prefill wrapper when running flashinfer mla",
)
parser.add_argument(
"--disable-shared-experts-fusion",
......@@ -1418,8 +1429,6 @@ class ServerArgs:
action="store_true",
help="Adopt base image processor instead of fast image processor.",
)
# Server warmups
parser.add_argument(
"--warmups",
type=str,
......@@ -1447,6 +1456,11 @@ class ServerArgs:
default=ServerArgs.debug_tensor_dump_inject,
help="Inject the outputs from jax as the input of every layer.",
)
parser.add_argument(
"--debug-tensor-dump-prefill-only",
action="store_true",
help="Only dump the tensors for prefill requests (i.e. batch size > 1).",
)
# Disaggregation
parser.add_argument(
......@@ -1456,12 +1470,6 @@ class ServerArgs:
choices=["null", "prefill", "decode"],
help='Only used for PD disaggregation. "prefill" for prefill-only server, and "decode" for decode-only server. If not specified, it is not PD disaggregated',
)
parser.add_argument(
"--disaggregation-bootstrap-port",
type=int,
default=ServerArgs.disaggregation_bootstrap_port,
help="Bootstrap server port on the prefill server. Default is 8998.",
)
parser.add_argument(
"--disaggregation-transfer-backend",
type=str,
......@@ -1469,6 +1477,12 @@ class ServerArgs:
choices=["mooncake", "nixl"],
help="The backend for disaggregation transfer. Default is mooncake.",
)
parser.add_argument(
"--disaggregation-bootstrap-port",
type=int,
default=ServerArgs.disaggregation_bootstrap_port,
help="Bootstrap server port on the prefill server. Default is 8998.",
)
parser.add_argument(
"--disaggregation-ib-device",
type=str,
......@@ -1477,6 +1491,12 @@ class ServerArgs:
"or multiple comma-separated devices (e.g., --disaggregation-ib-device mlx5_0,mlx5_1). "
"Default is None, which triggers automatic device detection when mooncake backend is enabled.",
)
parser.add_argument(
"--num-reserved-decode-tokens",
type=int,
default=ServerArgs.num_reserved_decode_tokens,
help="Number of decode tokens that will have memory reserved when adding new request to the running batch.",
)
parser.add_argument(
"--pdlb-url",
type=str,
......@@ -1484,14 +1504,6 @@ class ServerArgs:
help="The URL of the PD disaggregation load balancer. If set, the prefill/decode server will register with the load balancer.",
)
parser.add_argument(
"--mm-attention-backend",
type=str,
choices=["sdpa", "fa3", "triton_attn"],
default=ServerArgs.mm_attention_backend,
help="Set multimodal attention backend.",
)
@classmethod
def from_cli_args(cls, args: argparse.Namespace):
args.tp_size = args.tensor_parallel_size
......
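Since the expert-parallelism options now live together in ServerArgs and its CLI registration, a short usage sketch through the from_cli_args path shown above (assuming the usual ServerArgs.add_cli_args entry point; the model path is a placeholder, and __post_init__ may further adjust values or expect a GPU environment):

import argparse

from sglang.srt.server_args import ServerArgs

parser = argparse.ArgumentParser()
ServerArgs.add_cli_args(parser)
# Placeholder model path; the flags below are the regrouped expert-parallelism options.
cli_args = parser.parse_args([
    "--model-path", "/path/to/moe-model",
    "--tp-size", "8",
    "--ep-size", "8",
    "--enable-deepep-moe",
    "--deepep-mode", "auto",
])
server_args = ServerArgs.from_cli_args(cli_args)
print(server_args.ep_size, server_args.deepep_mode, server_args.moe_dense_tp_size)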
......@@ -41,6 +41,9 @@ class EAGLEDraftCudaGraphRunner:
self.tp_size = self.model_runner.tp_size
self.topk = model_runner.server_args.speculative_eagle_topk
self.speculative_num_steps = model_runner.server_args.speculative_num_steps
self.enable_profile_cuda_graph = (
model_runner.server_args.enable_profile_cuda_graph
)
server_args = model_runner.server_args
# Batch sizes to capture
......
......@@ -39,6 +39,9 @@ class EAGLEDraftExtendCudaGraphRunner:
self.dp_size = model_runner.server_args.dp_size
self.speculative_num_steps = model_runner.server_args.speculative_num_steps
self.topk = model_runner.server_args.speculative_eagle_topk
self.enable_profile_cuda_graph = (
model_runner.server_args.enable_profile_cuda_graph
)
self.capture_bs, self.compile_bs = get_batch_sizes_to_capture(model_runner)
self.padded_static_len = -1
......
......@@ -837,6 +837,7 @@ class CustomCacheManager(FileCacheManager):
def set_ulimit(target_soft_limit=65535):
# number of open files
resource_type = resource.RLIMIT_NOFILE
current_soft, current_hard = resource.getrlimit(resource_type)
......@@ -846,6 +847,18 @@ def set_ulimit(target_soft_limit=65535):
except ValueError as e:
logger.warning(f"Fail to set RLIMIT_NOFILE: {e}")
# stack size
resource_type = resource.RLIMIT_STACK
current_soft, current_hard = resource.getrlimit(resource_type)
target_soft_limit_stack_size = 1024 * target_soft_limit
if current_soft < target_soft_limit_stack_size:
try:
resource.setrlimit(
resource_type, (target_soft_limit_stack_size, current_hard)
)
except ValueError as e:
logger.warning(f"Fail to set RLIMIT_STACK: {e}")
def add_api_key_middleware(app, api_key: str):
@app.middleware("http")
......
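The new RLIMIT_STACK handling mirrors the existing RLIMIT_NOFILE logic: try to raise the soft stack limit to 1024 * target_soft_limit and log a warning if the OS rejects the new value. A quick, POSIX-only way to inspect the limits involved (values are platform-dependent; purely illustrative):

import resource

# Print soft/hard limits; resource.RLIM_INFINITY marks an unlimited hard limit.
for name, rtype in [
    ("RLIMIT_NOFILE", resource.RLIMIT_NOFILE),
    ("RLIMIT_STACK", resource.RLIMIT_STACK),
]:
    soft, hard = resource.getrlimit(rtype)
    print(f"{name}: soft={soft}, hard={hard}")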