"docs/vscode:/vscode.git/clone" did not exist on "8db3c9bc9fb3aed9fb8392945ec75e5237a351ba"
Unverified commit 325a06c2 authored by Ying Sheng, committed by GitHub

Fix logging (#796)

parent 79f81629
@@ -6,7 +6,7 @@ Achieving a large batch size is the most important thing for attaining high thro
When the server is running at full load, look for the following in the log:
-```[gpu_id=0] Decode batch. #running-req: 233, #token: 370959, token usage: 0.82, gen throughput (token/s): 4594.01, #queue-req: 417```
+```[gpu=0] Decode batch. #running-req: 233, #token: 370959, token usage: 0.82, gen throughput (token/s): 4594.01, #queue-req: 417```
### Tune Your Request Submission Speed
`#queue-req` indicates the number of requests in the queue. If you frequently see `#queue-req == 0`, it suggests you are bottlenecked by the request submission speed.
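One way to avoid a submission-speed bottleneck is to keep many requests in flight at once. A minimal client-side sketch, assuming a server reachable over HTTP; the URL and payload below are placeholders rather than part of this patch:

```python
# Keep enough requests in flight that the server's #queue-req stays above zero.
from concurrent.futures import ThreadPoolExecutor

import requests

URL = "http://localhost:30000/generate"  # placeholder endpoint
PAYLOAD = {"text": "Say hello.", "sampling_params": {"max_new_tokens": 64}}  # placeholder body


def send_one(_):
    return requests.post(URL, json=PAYLOAD, timeout=600).json()


# Submit far more requests than the running batch size so the queue never drains.
with ThreadPoolExecutor(max_workers=256) as pool:
    results = list(pool.map(send_one, range(2048)))
```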
......
@@ -18,7 +18,7 @@ except ImportError as e:
openai = tiktoken = e
-logger = logging.getLogger("openai")
+logger = logging.getLogger(__name__)
def create_logit_bias_int(tokenizer):
......
@@ -15,6 +15,7 @@ limitations under the License.
"""Meta data for requests and batches"""
+import logging
import warnings
from dataclasses import dataclass
from enum import IntEnum, auto
@@ -40,6 +41,9 @@ global_server_args_dict = {
}
+logger = logging.getLogger(__name__)
class ForwardMode(IntEnum):
# Prefill a new sequence. This is deprecated now. "EXTEND" covers this case.
PREFILL = auto()
@@ -379,7 +383,7 @@ class Batch:
out_cache_loc = self.token_to_kv_pool.alloc(extend_num_tokens)
if out_cache_loc is None:
-print("Prefill out of memory. This should never happen.")
+logger.error("Prefill out of memory. This should never happen.")
self.tree_cache.pretty_print()
exit()
@@ -613,7 +617,7 @@ class Batch:
self.out_cache_loc = self.token_to_kv_pool.alloc(bs)
if self.out_cache_loc is None:
-print("Decode out of memory. This should never happen.")
+logger.error("Decode out of memory. This should never happen.")
self.tree_cache.pretty_print()
exit()
......
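The two hunks above replace bare `print` calls with `logger.error`, so the out-of-memory messages pass through the process-wide logging configuration (format, level, destination) instead of always landing unformatted on stdout. A small illustration of the difference; the logger name is hypothetical:

```python
import logging
import sys

# Send log records to stderr with a timestamped format; print() bypasses all of this.
logging.basicConfig(
    stream=sys.stderr,
    level=logging.INFO,
    format="%(asctime)s %(levelname)s %(name)s: %(message)s",
)
logger = logging.getLogger("sglang.srt.managers.infer_batch")  # hypothetical module name

print("Decode out of memory. This should never happen.")         # bare line on stdout
logger.error("Decode out of memory. This should never happen.")  # formatted, leveled, on stderr
```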
@@ -39,7 +39,7 @@ from sglang.srt.server_args import PortArgs, ServerArgs
from sglang.srt.utils import kill_parent_process
from sglang.utils import get_exception_traceback
-logger = logging.getLogger("srt.controller")
+logger = logging.getLogger(__name__)
class LoadBalanceMethod(Enum):
......
@@ -31,7 +31,7 @@ from sglang.srt.server_args import PortArgs, ServerArgs
from sglang.srt.utils import kill_parent_process
from sglang.utils import get_exception_traceback
-logger = logging.getLogger("srt.controller")
+logger = logging.getLogger(__name__)
class ControllerSingle:
......
@@ -57,7 +57,7 @@ from sglang.srt.utils import (
monkey_patch_vllm_qvk_linear_loader,
)
-logger = logging.getLogger("srt.model_runner")
+logger = logging.getLogger(__name__)
class ModelRunner:
@@ -90,7 +90,7 @@ class ModelRunner:
# Init torch distributed
torch.cuda.set_device(self.gpu_id)
-logger.info(f"[gpu_id={self.gpu_id}] Init nccl begin.")
+logger.info(f"[gpu={self.gpu_id}] Init nccl begin.")
if not server_args.enable_p2p_check:
monkey_patch_vllm_p2p_access_check(self.gpu_id)
@@ -130,7 +130,7 @@ class ModelRunner:
def load_model(self):
logger.info(
-f"[gpu_id={self.gpu_id}] Load weight begin. "
+f"[gpu={self.gpu_id}] Load weight begin. "
f"avail mem={get_available_gpu_memory(self.gpu_id):.2f} GB"
)
@@ -178,7 +178,7 @@ class ModelRunner:
cache_config=None,
)
logger.info(
-f"[gpu_id={self.gpu_id}] Load weight end. "
+f"[gpu={self.gpu_id}] Load weight end. "
f"type={type(self.model).__name__}, "
f"dtype={self.dtype}, "
f"avail mem={get_available_gpu_memory(self.gpu_id):.2f} GB"
@@ -229,7 +229,7 @@ class ModelRunner:
layer_num=self.model_config.num_hidden_layers,
)
logger.info(
-f"[gpu_id={self.gpu_id}] Memory pool end. "
+f"[gpu={self.gpu_id}] Memory pool end. "
f"avail mem={get_available_gpu_memory(self.gpu_id):.2f} GB"
)
@@ -280,7 +280,7 @@ class ModelRunner:
return
logger.info(
-f"[gpu_id={self.gpu_id}] Capture cuda graph begin. This can take up to several minutes."
+f"[gpu={self.gpu_id}] Capture cuda graph begin. This can take up to several minutes."
)
batch_size_list = [1, 2, 4] + [i * 8 for i in range(1, 17)]
self.cuda_graph_runner = CudaGraphRunner(
......
@@ -55,7 +55,7 @@ from sglang.srt.utils import (
)
from sglang.utils import get_exception_traceback
-logger = logging.getLogger("srt.tp_worker")
+logger = logging.getLogger(__name__)
class ModelTpServer:
@@ -132,7 +132,7 @@ class ModelTpServer:
# Print info
logger.info(
-f"[gpu_id={self.gpu_id}] "
+f"[gpu={self.gpu_id}] "
f"max_total_num_tokens={self.max_total_num_tokens}, "
f"max_prefill_tokens={self.max_prefill_tokens}, "
f"max_running_requests={self.max_running_requests}, "
@@ -256,7 +256,7 @@ class ModelTpServer:
self.num_generated_tokens = 0
self.last_stats_tic = time.time()
logger.info(
-f"[gpu_id={self.gpu_id}] Decode batch. "
+f"[gpu={self.gpu_id}] Decode batch. "
f"#running-req: {len(self.running_batch.reqs)}, "
f"#token: {num_used}, "
f"token usage: {num_used / self.max_total_num_tokens:.2f}, "
@@ -434,7 +434,7 @@ class ModelTpServer:
self.tree_cache_metrics["hit"] / self.tree_cache_metrics["total"]
)
logger.info(
-f"[gpu_id={self.gpu_id}] Prefill batch. "
+f"[gpu={self.gpu_id}] Prefill batch. "
f"#new-seq: {len(can_run_list)}, "
f"#new-token: {new_batch_input_tokens}, "
f"#cached-token: {hit_tokens}, "
......
@@ -38,7 +38,7 @@ from vllm.model_executor.layers.quantization.base_config import QuantizationConf
from sglang.srt.layers.quantization import get_quantization_config
-logger = logging.getLogger("srt.model_loader")
+logger = logging.getLogger(__name__)
temp_dir = tempfile.gettempdir()
......
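Across all of these files, the commit converges on `logging.getLogger(__name__)`, so logger names mirror the actual module paths and inherit settings from their package. A minimal sketch of how that hierarchy behaves; the module name used here is only illustrative:

```python
import logging

# A single root configuration is enough; __name__-based loggers propagate to it.
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s %(levelname)s %(name)s: %(message)s",
)

# Each module in the patch now does `logger = logging.getLogger(__name__)`;
# this stands in for one of them (illustrative name).
logger = logging.getLogger("sglang.srt.managers.tp_worker")

logger.info("emitted at INFO through the root handler")

# Because names follow package paths, a whole subtree can be tuned at once:
logging.getLogger("sglang.srt").setLevel(logging.WARNING)
logger.info("suppressed: effective level now comes from the 'sglang.srt' parent")
```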