Unverified commit 3ddb1c46 authored by Lianmin Zheng, committed by GitHub

[Minor] Fix logger and style (#2325)

parent 480e38a7
@@ -163,7 +163,6 @@ async def async_request_openai_completions(
             "max_tokens": request_func_input.output_len,
             "stream": not args.disable_stream,
             "ignore_eos": not args.disable_ignore_eos,
-            "lora_path": request_func_input.lora_name,
             **request_func_input.extra_request_body,
         }
         headers = {"Authorization": f"Bearer {os.environ.get('OPENAI_API_KEY')}"}
...
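For orientation, here is a minimal sketch of posting such a payload to an OpenAI-compatible /v1/completions endpoint. The URL, model name, prompt, and token budget below are illustrative assumptions, not values taken from this diff:

```python
import asyncio
import os

import aiohttp


async def send_completion_request() -> None:
    # Hypothetical endpoint and request values; the real benchmark builds
    # these from request_func_input and the parsed CLI args.
    api_url = "http://localhost:30000/v1/completions"
    payload = {
        "model": "default",
        "prompt": "The capital of France is",
        "max_tokens": 32,    # cf. request_func_input.output_len
        "stream": True,      # cf. not args.disable_stream
        "ignore_eos": True,  # cf. not args.disable_ignore_eos
    }
    headers = {"Authorization": f"Bearer {os.environ.get('OPENAI_API_KEY')}"}

    async with aiohttp.ClientSession() as session:
        async with session.post(api_url, json=payload, headers=headers) as resp:
            # With "stream": True the server answers with server-sent events;
            # print each chunk as it arrives.
            async for chunk in resp.content:
                print(chunk.decode(errors="ignore"), end="")


asyncio.run(send_completion_request())
```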
@@ -16,6 +16,7 @@
 import gc
 import json
 import logging
+import time
 from typing import Optional

 import torch
@@ -129,7 +130,7 @@ class ModelRunner:
         # Global vars
         if server_args.show_time_cost:
             enable_show_time_cost()
-        if server_args.disable_disk_cache:
+        if server_args.disable_outlines_disk_cache:
             from outlines.caching import disable_cache

             disable_cache()
@@ -623,8 +624,10 @@ class ModelRunner:
         if self.server_args.disable_cuda_graph:
             return

+        tic = time.time()
         logger.info("Capture cuda graph begin. This can take up to several minutes.")
         self.cuda_graph_runner = CudaGraphRunner(self)
+        logger.info(f"Capture cuda graph end. Time elapsed: {time.time() - tic:.2f}s")

     def apply_torch_tp(self):
         logger.info(f"Enabling torch tensor parallelism on {self.tp_size} devices.")
...
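The two added lines implement a simple tic/toc timing pattern around the capture step. A self-contained sketch of the same pattern (the capture function here is a stand-in, not the real CudaGraphRunner):

```python
import logging
import time

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)


def capture_cuda_graphs() -> None:
    # Stand-in for the expensive CudaGraphRunner(self) construction.
    time.sleep(0.5)


tic = time.time()
logger.info("Capture cuda graph begin. This can take up to several minutes.")
capture_cuda_graphs()
logger.info(f"Capture cuda graph end. Time elapsed: {time.time() - tic:.2f}s")
```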
@@ -122,7 +122,7 @@ class ServerArgs:
     disable_jump_forward: bool = False
     disable_cuda_graph: bool = False
     disable_cuda_graph_padding: bool = False
-    disable_disk_cache: bool = False
+    disable_outlines_disk_cache: bool = False
     disable_custom_all_reduce: bool = False
     disable_mla: bool = False
     disable_overlap_schedule: bool = False
@@ -159,7 +159,7 @@ class ServerArgs:
         if self.tp_size >= 16:
             self.mem_fraction_static = 0.79
         elif self.tp_size >= 8:
-            self.mem_fraction_static = 0.82
+            self.mem_fraction_static = 0.81
         elif self.tp_size >= 4:
             self.mem_fraction_static = 0.85
         elif self.tp_size >= 2:
@@ -192,7 +192,7 @@ class ServerArgs:
             )
         if self.attention_backend == "torch_native":
-            logger.info(
+            logger.warning(
                 "Cuda graph is disabled because of using torch native attention backend"
             )
             self.disable_cuda_graph = True
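The switch from logger.info to logger.warning matters when the configured log level filters out INFO records, as in many production setups; a small illustration:

```python
import logging

# With the level set to WARNING, info() calls are silently dropped.
logging.basicConfig(level=logging.WARNING)
logger = logging.getLogger(__name__)

logger.info("Cuda graph is disabled ...")     # suppressed
logger.warning("Cuda graph is disabled ...")  # still emitted
```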
@@ -204,12 +204,12 @@ class ServerArgs:
             self.cuda_graph_max_bs = min(self.cuda_graph_max_bs, 96)
             self.schedule_conservativeness = self.schedule_conservativeness * 0.3
             self.disable_overlap_schedule = True
-            logger.info(
+            logger.warning(
                 f"DP attention is enabled. The chunked prefill size is adjusted to {self.chunked_prefill_size} to avoid MoE kernel issues. "
                 f"The CUDA graph max batch size is adjusted to {self.cuda_graph_max_bs}. "
                 f"The schedule conservativeness is adjusted to {self.schedule_conservativeness}. "
                 "Data parallel size is adjusted to be the same as tensor parallel size. "
-                "Overlap schedule is disabled."
+                "Overlap scheduler is disabled."
             )

         # GGUF
@@ -642,9 +642,9 @@ class ServerArgs:
             help="Disable cuda graph when padding is needed. Still uses cuda graph when padding is not needed.",
         )
         parser.add_argument(
-            "--disable-disk-cache",
+            "--disable-outlines-disk-cache",
             action="store_true",
-            help="Disable disk cache to avoid possible crashes related to file system or high concurrency.",
+            help="Disable disk cache of outlines to avoid possible crashes related to file system or high concurrency.",
         )
         parser.add_argument(
             "--disable-custom-all-reduce",
@@ -745,6 +745,11 @@ class ServerArgs:
             action=DeprecatedAction,
             help="'--disable-flashinfer-sampling' is deprecated. Please use '--sampling-backend pytorch' instead.",
         )
+        parser.add_argument(
+            "--disable-disk-cache",
+            action=DeprecatedAction,
+            help="'--disable-disk-cache' is deprecated. Please use '--disable-outlines-disk-cache' instead.",
+        )

     @classmethod
     def from_cli_args(cls, args: argparse.Namespace):
...
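The diff uses DeprecatedAction but does not show its definition. Below is a minimal sketch of how such an argparse action can be written; this is a hypothetical re-implementation for illustration, and the real class may warn rather than raise:

```python
import argparse


class DeprecatedAction(argparse.Action):
    """Reject a deprecated flag and point the user at its replacement."""

    def __init__(self, option_strings, dest, nargs=0, **kwargs):
        # nargs=0 makes this behave like a boolean flag (no value expected).
        super().__init__(option_strings, dest, nargs=nargs, **kwargs)

    def __call__(self, parser, namespace, values, option_string=None):
        # The flag's help text doubles as the error message.
        raise ValueError(self.help)


parser = argparse.ArgumentParser()
parser.add_argument(
    "--disable-disk-cache",
    action=DeprecatedAction,
    help="'--disable-disk-cache' is deprecated. Please use '--disable-outlines-disk-cache' instead.",
)
parser.parse_args([])  # fine: the deprecated flag was not passed
# parser.parse_args(["--disable-disk-cache"])  # would raise ValueError
```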