Unverified commit 6406408a authored by Lianmin Zheng, committed by GitHub

Clean up server_args.py (#7037)

parent 019851d0
@@ -118,7 +118,7 @@ def _compile_warning_1():
     if not _IN_PRECOMPILE_STAGE and _IS_FIRST_RANK_ON_NODE:
         logger.warning(
             "Entering DeepGEMM JIT Pre-Compile session. "
-            "And it may takes a long time(Typically 10-20 mins) "
+            "It may take a long time (typically 10-20 mins) "
             "if you have not run `sglang.compile_deep_gemm`. "
             "It is recommended to run `sglang.compile_deep_gemm` with same args as `sglang.launch_server`"
             " for pre-compilation to reduce the overhead if you have not run it before. "
...
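For context, the pre-compilation step this warning points at is normally run ahead of time with the same arguments as the server launch, e.g. `python3 -m sglang.compile_deep_gemm --model-path <model> --tp <tp-size>`; the exact flags are whatever you already pass to `sglang.launch_server`, and the ones shown here are only illustrative.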
@@ -72,32 +72,33 @@ INIT_INCREMENTAL_DETOKENIZATION_OFFSET = 5

 GLOBAL_SERVER_ARGS_KEYS = [
     "attention_backend",
+    "mm_attention_backend",
     "debug_tensor_dump_inject",
     "debug_tensor_dump_output_folder",
     "chunked_prefill_size",
+    "deepep_mode",
     "device",
     "disable_chunked_prefix_cache",
     "disable_radix_cache",
+    "enable_deepep_moe",
     "enable_dp_attention",
     "enable_two_batch_overlap",
     "enable_dp_lm_head",
-    "enable_deepep_moe",
-    "deepep_mode",
     "enable_ep_moe",
+    "moe_dense_tp_size",
+    "ep_dispatch_algorithm",
     "deepep_config",
+    "ep_num_redundant_experts",
     "enable_nan_detection",
     "flashinfer_mla_disable_ragged",
     "max_micro_batch_size",
-    "moe_dense_tp_size",
-    "ep_dispatch_algorithm",
     "disable_shared_experts_fusion",
     "sampling_backend",
     "speculative_accept_threshold_acc",
     "speculative_accept_threshold_single",
     "torchao_config",
     "triton_attention_reduce_in_fp32",
-    "ep_num_redundant_experts",
-    "mm_attention_backend",
+    "num_reserved_decode_tokens",
 ]

 # Put some global args for easy access
...
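For orientation, a whitelist like GLOBAL_SERVER_ARGS_KEYS is typically flattened into a plain dict that schedulers and workers can read without carrying the full server-args object around. The sketch below is illustrative only; the trimmed ServerArgs class and the helper name are stand-ins, not the sglang implementation:

# Minimal sketch (not the sglang implementation): turn a key whitelist into a shared dict.
from dataclasses import dataclass

GLOBAL_SERVER_ARGS_KEYS = ["attention_backend", "chunked_prefill_size", "device"]

@dataclass
class ServerArgs:  # hypothetical stand-in for the real ServerArgs
    attention_backend: str = "flashinfer"
    chunked_prefill_size: int = 8192
    device: str = "cuda"

def build_global_args_dict(args: ServerArgs) -> dict:
    # Copy only the whitelisted keys so downstream code sees a small, stable view.
    return {key: getattr(args, key) for key in GLOBAL_SERVER_ARGS_KEYS}

print(build_global_args_dict(ServerArgs()))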
@@ -17,12 +17,14 @@ from __future__ import annotations

 import bisect
 import inspect
+import logging
 import os
 from contextlib import contextmanager
 from typing import TYPE_CHECKING, Callable, Optional, Union

 import torch
 import tqdm
+from torch.profiler import ProfilerActivity, profile

 from sglang.srt.custom_op import CustomOp
 from sglang.srt.distributed import get_tensor_model_parallel_rank
@@ -40,11 +42,14 @@ from sglang.srt.model_executor.forward_batch_info import (
 from sglang.srt.patch_torch import monkey_patch_torch_compile
 from sglang.srt.two_batch_overlap import TboCudaGraphRunnerPlugin
 from sglang.srt.utils import (
+    empty_context,
     get_available_gpu_memory,
     get_device_memory_capacity,
     rank0_log,
 )

+logger = logging.getLogger(__name__)
+
 if TYPE_CHECKING:
     from sglang.srt.model_executor.model_runner import ModelRunner
@@ -207,6 +212,9 @@ class CudaGraphRunner:
             model_runner.server_args.enable_two_batch_overlap
         )
         self.speculative_algorithm = model_runner.server_args.speculative_algorithm
+        self.enable_profile_cuda_graph = (
+            model_runner.server_args.enable_profile_cuda_graph
+        )
         self.tp_size = model_runner.server_args.tp_size
         self.dp_size = model_runner.server_args.dp_size
         self.pp_size = model_runner.server_args.pp_size
@@ -339,44 +347,67 @@ class CudaGraphRunner:
         return is_bs_supported and is_encoder_lens_supported and is_tbo_supported

-    def capture(self):
-        with graph_capture() as graph_capture_context:
-            self.stream = graph_capture_context.stream
-            avail_mem = get_available_gpu_memory(
-                self.model_runner.device, self.model_runner.gpu_id, empty_cache=False
-            )
-            # Reverse the order to enable better memory sharing across cuda graphs.
-            capture_range = (
-                tqdm.tqdm(list(reversed(self.capture_bs)))
-                if get_tensor_model_parallel_rank() == 0
-                else reversed(self.capture_bs)
-            )
-            for bs in capture_range:
-                if get_tensor_model_parallel_rank() == 0:
-                    avail_mem = get_available_gpu_memory(
-                        self.model_runner.device,
-                        self.model_runner.gpu_id,
-                        empty_cache=False,
-                    )
-                    capture_range.set_description(
-                        f"Capturing batches ({avail_mem=:.2f} GB)"
-                    )
-
-                with patch_model(
-                    self.model_runner.model,
-                    bs in self.compile_bs,
-                    num_tokens=bs * self.num_tokens_per_bs,
-                    tp_group=self.model_runner.tp_group,
-                ) as forward:
-                    (
-                        graph,
-                        output_buffers,
-                    ) = self.capture_one_batch_size(bs, forward)
-                    self.graphs[bs] = graph
-                    self.output_buffers[bs] = output_buffers
-
-                # Save gemlite cache after each capture
-                save_gemlite_cache()
+    def capture(self) -> None:
+        profile_context = empty_context()
+        if self.enable_profile_cuda_graph:
+            profile_context = profile(
+                activities=[ProfilerActivity.CPU, ProfilerActivity.CUDA],
+                record_shapes=True,
+            )
+
+        with graph_capture() as graph_capture_context:
+            with profile_context as prof:
+                self.stream = graph_capture_context.stream
+                avail_mem = get_available_gpu_memory(
+                    self.model_runner.device,
+                    self.model_runner.gpu_id,
+                    empty_cache=False,
+                )
+                # Reverse the order to enable better memory sharing across cuda graphs.
+                capture_range = (
+                    tqdm.tqdm(list(reversed(self.capture_bs)))
+                    if get_tensor_model_parallel_rank() == 0
+                    else reversed(self.capture_bs)
+                )
+                for i, bs in enumerate(capture_range):
+                    if get_tensor_model_parallel_rank() == 0:
+                        avail_mem = get_available_gpu_memory(
+                            self.model_runner.device,
+                            self.model_runner.gpu_id,
+                            empty_cache=False,
+                        )
+                        capture_range.set_description(
+                            f"Capturing batches ({avail_mem=:.2f} GB)"
+                        )
+
+                    with patch_model(
+                        self.model_runner.model,
+                        bs in self.compile_bs,
+                        num_tokens=bs * self.num_tokens_per_bs,
+                        tp_group=self.model_runner.tp_group,
+                    ) as forward:
+                        (
+                            graph,
+                            output_buffers,
+                        ) = self.capture_one_batch_size(bs, forward)
+                        self.graphs[bs] = graph
+                        self.output_buffers[bs] = output_buffers
+
+                    # Save gemlite cache after each capture
+                    save_gemlite_cache()
+
+        if self.enable_profile_cuda_graph:
+            log_message = (
+                "Sorted by CUDA Time:\n"
+                + prof.key_averages(group_by_input_shape=True).table(
+                    sort_by="cuda_time_total", row_limit=10
+                )
+                + "\n\nSorted by CPU Time:\n"
+                + prof.key_averages(group_by_input_shape=True).table(
+                    sort_by="cpu_time_total", row_limit=10
+                )
            )
+            logger.info(log_message)

     def capture_one_batch_size(self, bs: int, forward: Callable):
         graph = torch.cuda.CUDAGraph()
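The new profiling path follows a common pattern: wrap a region in torch.profiler.profile only when the flag is on, otherwise use a no-op context manager. A minimal standalone sketch of that pattern, using contextlib.nullcontext in place of sglang's empty_context (the helper name and workload below are illustrative):

import contextlib

import torch
from torch.profiler import ProfilerActivity, profile


def run_with_optional_profile(fn, enable_profile: bool = False):
    # Only ask the profiler for CUDA activity when a GPU is actually available.
    activities = [ProfilerActivity.CPU]
    if torch.cuda.is_available():
        activities.append(ProfilerActivity.CUDA)
    ctx = (
        profile(activities=activities, record_shapes=True)
        if enable_profile
        else contextlib.nullcontext()  # stand-in for sglang's empty_context()
    )
    with ctx as prof:
        fn()
    if enable_profile:
        # Same reporting style as the diff: top ops, grouped by input shape.
        sort_key = "cuda_time_total" if torch.cuda.is_available() else "cpu_time_total"
        print(prof.key_averages(group_by_input_shape=True).table(sort_by=sort_key, row_limit=10))


run_with_optional_profile(lambda: torch.mm(torch.randn(256, 256), torch.randn(256, 256)), enable_profile=True)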
@@ -443,7 +474,7 @@ class CudaGraphRunner:
             token_to_kv_pool=self.model_runner.token_to_kv_pool,
             attn_backend=self.model_runner.attn_backend,
             out_cache_loc=out_cache_loc,
-            seq_lens_sum=seq_lens.sum(),
+            seq_lens_sum=seq_lens.sum().item(),
             encoder_lens=encoder_lens,
             return_logprob=False,
             positions=positions,
...
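The seq_lens_sum change swaps a 0-dim tensor for a plain Python scalar, presumably so the forward-batch metadata carries a host-side int rather than a device tensor. A quick illustration of the difference:

import torch

seq_lens = torch.tensor([3, 5, 7])
as_tensor = seq_lens.sum()         # tensor(15): still a torch.Tensor (0-dim)
as_int = seq_lens.sum().item()     # 15: a plain Python int
print(type(as_tensor), type(as_int))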
This diff is collapsed.
@@ -41,6 +41,9 @@ class EAGLEDraftCudaGraphRunner:
         self.tp_size = self.model_runner.tp_size
         self.topk = model_runner.server_args.speculative_eagle_topk
         self.speculative_num_steps = model_runner.server_args.speculative_num_steps
+        self.enable_profile_cuda_graph = (
+            model_runner.server_args.enable_profile_cuda_graph
+        )
         server_args = model_runner.server_args

         # Batch sizes to capture
...
@@ -39,6 +39,9 @@ class EAGLEDraftExtendCudaGraphRunner:
         self.dp_size = model_runner.server_args.dp_size
         self.speculative_num_steps = model_runner.server_args.speculative_num_steps
         self.topk = model_runner.server_args.speculative_eagle_topk
+        self.enable_profile_cuda_graph = (
+            model_runner.server_args.enable_profile_cuda_graph
+        )
         self.capture_bs, self.compile_bs = get_batch_sizes_to_capture(model_runner)
         self.padded_static_len = -1
...
@@ -837,6 +837,7 @@ class CustomCacheManager(FileCacheManager):

 def set_ulimit(target_soft_limit=65535):
+    # number of open files
     resource_type = resource.RLIMIT_NOFILE
     current_soft, current_hard = resource.getrlimit(resource_type)

@@ -846,6 +847,18 @@ def set_ulimit(target_soft_limit=65535):
     except ValueError as e:
         logger.warning(f"Fail to set RLIMIT_NOFILE: {e}")

+    # stack size
+    resource_type = resource.RLIMIT_STACK
+    current_soft, current_hard = resource.getrlimit(resource_type)
+    target_soft_limit_stack_size = 1024 * target_soft_limit
+    if current_soft < target_soft_limit_stack_size:
+        try:
+            resource.setrlimit(
+                resource_type, (target_soft_limit_stack_size, current_hard)
+            )
+        except ValueError as e:
+            logger.warning(f"Fail to set RLIMIT_STACK: {e}")
+

 def add_api_key_middleware(app, api_key: str):
     @app.middleware("http")
...
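For reference, RLIMIT_STACK values are expressed in bytes, so the new target of 1024 * 65535 bytes is roughly 64 MiB; only the soft limit is raised and the hard limit is left untouched. A small standalone sketch for inspecting the limits (not part of this commit):

import resource

# Soft/hard stack limits in bytes (or resource.RLIM_INFINITY when unlimited).
soft, hard = resource.getrlimit(resource.RLIMIT_STACK)
target = 1024 * 65535  # 67,107,840 bytes, about 64 MiB, same as the diff's target
print(f"soft={soft} hard={hard} target={target}")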